diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000000..10eef953d5 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,28 @@ +--- +name: Bug report +about: Create a report to help us improve the repository or project +title: "" +labels: bug +assignees: '' + +--- + +**Describe the bug** + +A clear and concise description of what the bug is. + +**Steps/Code to reproduce bug** + +Please list *minimal* steps or code snippet for us to be able to reproduce the bug. + +A helpful guide on how to craft a minimal bug report http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports. + + +**Expected behavior** + +A clear and concise description of what you expected to happen. + + +**Additional context** + +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000..99d680b0ab --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,2 @@ +blank_issues_enabled: false + diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000000..7334f687d1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: "" +labels: enhancement +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. 
diff --git a/.github/ISSUE_TEMPLATE/model-support-request.md b/.github/ISSUE_TEMPLATE/model-support-request.md new file mode 100644 index 0000000000..52d2e017ef --- /dev/null +++ b/.github/ISSUE_TEMPLATE/model-support-request.md @@ -0,0 +1,31 @@ +--- +name: Model Support Request +about: Request conversion support and training recipes for a new model +title: " Model Support" +labels: '' +assignees: '' + +--- + +Add support for \ model: + +**Please include a link to the model's HuggingFace repo** +HF repo: + +**These checklist items are required for all models in Megatron Bridge** + +- [ ] Model providers +- [ ] Model bridge for HF conversion +- [ ] Unit tests (config and bridge) +- [ ] Model conversion functional tests + +**For flagship models, these items are also needed** + +- [ ] Optimal pretraining recipe +- [ ] Optimal finetuning recipe +- [ ] Recipe unit tests +- [ ] Recipe functional tests +- [ ] End to end CI tests + +**Additional context** +Add any other context or screenshots about the model request here. 
diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml index 42dbf5026a..6c8a7d3572 100644 --- a/.github/workflows/build-docs.yml +++ b/.github/workflows/build-docs.yml @@ -23,7 +23,7 @@ on: jobs: pre-flight: - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.53.0 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2 build-docs: needs: [pre-flight] diff --git a/.github/workflows/build-test-publish-wheel.yml b/.github/workflows/build-test-publish-wheel.yml index a77c50cca7..c03b93bb5f 100644 --- a/.github/workflows/build-test-publish-wheel.yml +++ b/.github/workflows/build-test-publish-wheel.yml @@ -31,7 +31,7 @@ permissions: jobs: pre-flight: - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.53.0 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2 build-test-publish-wheel: needs: [pre-flight] diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 49009f8d5f..248ecf66ed 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -10,7 +10,7 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License +# limitations under the License. 
name: CICD NeMo on: schedule: @@ -31,7 +31,7 @@ permissions: jobs: pre-flight: - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.53.0 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2 lint-check: name: Lint check diff --git a/.github/workflows/copyright-check.yml b/.github/workflows/copyright-check.yml index b7e007ac9a..7d0e00493d 100644 --- a/.github/workflows/copyright-check.yml +++ b/.github/workflows/copyright-check.yml @@ -23,7 +23,7 @@ on: jobs: pre-flight: - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.53.0 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2 copyright-check: needs: [pre-flight] diff --git a/.github/workflows/install-test.yml b/.github/workflows/install-test.yml index 8ad2601def..41936ad65f 100644 --- a/.github/workflows/install-test.yml +++ b/.github/workflows/install-test.yml @@ -26,20 +26,19 @@ on: jobs: pre-flight: - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.53.0 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2 pip-test-bare-metal: needs: [pre-flight] if: | !(needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true') - runs-on: ${{ matrix.arch }} - name: Pip - Python${{ matrix.python-version }} - ${{ matrix.arch == 'ubuntu-latest' && 'AMD64/Linux' || 'ARM64/Darwin' }} - Bare Metal + runs-on: linux-amd64-cpu16 + name: Pip - Python${{ matrix.python-version }} - AMD64/Linux - Bare Metal container: ubuntu:24.04 strategy: fail-fast: false matrix: - arch: ["ubuntu-latest"] python-version: ["3.10", "3.11", "3.12"] steps: - name: Checkout repository diff --git a/.gitignore b/.gitignore index 7e7db08e4c..d755ce3aa9 100644 --- a/.gitignore +++ b/.gitignore @@ -182,3 +182,5 @@ slurm*.out # UV package manager .uv/ + +*.mp4 \ No newline at end of file diff --git a/annotators/Inpainting/automatic_segmentation.py 
b/annotators/Inpainting/automatic_segmentation.py new file mode 100644 index 0000000000..f3fc79d916 --- /dev/null +++ b/annotators/Inpainting/automatic_segmentation.py @@ -0,0 +1,720 @@ +""" +Automatic Instance Segmentation Pipeline using RAM + Grounding DINO + SAM2 + +This script performs fully automatic instance segmentation without any manual annotation: +1. RAM (Recognize Anything Model) - Automatically generates image tags +2. Grounding DINO - Detects objects based on generated tags +3. SAM2 - Segments detected objects using bounding boxes as prompts + +No human annotation required! +""" +import os +os.environ['HF_HOME'] = '/home/tanya/.huggingface' +os.environ['HUGGINGFACE_HUB_CACHE'] = '/home/tanya/.huggingface/hub' +os.environ['TRANSFORMERS_CACHE'] = '/home/tanya/.huggingface/hub' + +import torch +import numpy as np +import cv2 +import os +from pathlib import Path +import argparse +from tqdm import tqdm +from PIL import Image +import supervision as sv +from typing import List, Dict, Tuple + +# SAM2 imports +from sam2.build_sam import build_sam2, build_sam2_video_predictor +from sam2.sam2_image_predictor import SAM2ImagePredictor +import tempfile +import shutil + +# Optional: RAM and Grounding DINO imports (will check if available) +try: + from groundingdino.util.inference import Model as GroundingDINOModel + GROUNDING_DINO_AVAILABLE = True +except ImportError: + print("Warning: Grounding DINO not available. Install with:") + print("pip install groundingdino-py") + GROUNDING_DINO_AVAILABLE = False + +try: + from ram.models import ram_plus + from ram import inference_ram as inference + from ram.transform import get_transform as ram_transform + RAM_AVAILABLE = True +except ImportError: + print("Warning: RAM not available. 
Will use manual text prompts.") + RAM_AVAILABLE = False + + +class AutomaticSegmentationPipeline: + """Pipeline for automatic instance segmentation using RAM + Grounding DINO + SAM2""" + + def __init__( + self, + sam2_checkpoint: str, + sam2_config: str = "sam2_hiera_l.yaml", + grounding_dino_config: str = None, + grounding_dino_checkpoint: str = None, + ram_checkpoint: str = None, + device: str = "cuda" + ): + self.device = device + self.sam2_checkpoint = sam2_checkpoint + self.sam2_config = sam2_config + + # Load SAM2 for images + print("Loading SAM2...") + self.sam2_predictor = SAM2ImagePredictor( + build_sam2(sam2_config, sam2_checkpoint, device=device) + ) + + # Load Grounding DINO + self.grounding_dino = None + if GROUNDING_DINO_AVAILABLE and grounding_dino_config and grounding_dino_checkpoint: + print("Loading Grounding DINO...") + self.grounding_dino = GroundingDINOModel( + model_config_path=grounding_dino_config, + model_checkpoint_path=grounding_dino_checkpoint, + device=device + ) + + # Load RAM + self.ram_model = None + if RAM_AVAILABLE and ram_checkpoint: + print("Loading RAM...") + self.ram_model = ram_plus( + pretrained=ram_checkpoint, + image_size=384, + vit='swin_l' + ) + self.ram_model.eval() + self.ram_model = self.ram_model.to(device) + + def generate_tags_with_ram(self, image: np.ndarray) -> List[str]: + """Generate image tags using RAM model""" + if self.ram_model is None: + return [] + + # Convert BGR to RGB + image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + image_pil = Image.fromarray(image_rgb) + + # Preprocess image for RAM using the transform + transform = ram_transform(image_size=384) + image_tensor = transform(image_pil).unsqueeze(0).to(self.device) + + # Generate tags using the model + with torch.no_grad(): + tags, tags_chinese = self.ram_model.generate_tag(image_tensor) + + # Parse tags - they come as a string separated by | + tag_list = [tag.strip() for tag in tags[0].split('|') if tag.strip()] + + return tag_list + + def 
detect_objects_with_grounding_dino( + self, + image: np.ndarray, + text_prompt: str, + box_threshold: float = 0.25, + text_threshold: float = 0.25, + min_area_ratio: float = 0.20, + max_area_ratio: float = 0.50 + ) -> Tuple[np.ndarray, np.ndarray, List[str]]: + """ + Detect objects using Grounding DINO + + Args: + min_area_ratio: Minimum box area as ratio of image area (default: 0.20) + max_area_ratio: Maximum box area as ratio of image area (default: 0.50) + + Returns: + boxes: (N, 4) array of bounding boxes in xyxy format + scores: (N,) array of confidence scores + labels: List of N labels + """ + if self.grounding_dino is None: + return np.array([]), np.array([]), [] + + # Detect objects + detections = self.grounding_dino.predict_with_classes( + image=image, + classes=[text_prompt], + box_threshold=box_threshold, + text_threshold=text_threshold + ) + + # Extract results + boxes = detections.xyxy if len(detections) > 0 else np.array([]) + scores = detections.confidence if len(detections) > 0 else np.array([]) + labels = detections.class_id if len(detections) > 0 else [] + + # Filter boxes by area + if len(boxes) > 0: + image_area = image.shape[0] * image.shape[1] + box_areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + area_ratios = box_areas / image_area + + # Keep boxes within area ratio range + valid_mask = (area_ratios >= min_area_ratio) & (area_ratios <= max_area_ratio) + boxes = boxes[valid_mask] + scores = scores[valid_mask] + labels = [label for i, label in enumerate(labels) if valid_mask[i]] + + print(f"Filtered boxes: {valid_mask.sum()}/{len(valid_mask)} boxes kept (area between {min_area_ratio*100}% and {max_area_ratio*100}%)") + + # Keep only the box with highest score + if len(boxes) > 0: + best_idx = np.argmax(scores) + boxes = boxes[best_idx:best_idx+1] + scores = scores[best_idx:best_idx+1] + labels = [labels[best_idx]] + print(f"Selected box with highest score: {scores[0]:.3f}") + + return boxes, scores, labels + + def 
segment_with_sam2( + self, + image: np.ndarray, + boxes: np.ndarray + ) -> List[np.ndarray]: + """ + Segment objects using SAM2 with bounding box prompts + + Args: + image: Input image (H, W, 3) + boxes: Bounding boxes in xyxy format (N, 4) + + Returns: + List of binary masks, one for each box + """ + if len(boxes) == 0: + return [] + + # Set image + self.sam2_predictor.set_image(image) + + masks = [] + for box in boxes: + # SAM expects box in xyxy format + mask, score, _ = self.sam2_predictor.predict( + point_coords=None, + point_labels=None, + box=box[None, :], # Add batch dimension + multimask_output=False, + ) + masks.append(mask[0]) # Take first (and only) mask + + return masks + + def process_image( + self, + image: np.ndarray, + text_prompt: str = None, + use_ram: bool = True, + box_threshold: float = 0.25, + text_threshold: float = 0.25, + min_area_ratio: float = 0.20, + max_area_ratio: float = 0.50 + ) -> Dict: + """ + Process a single image through the full pipeline + + Args: + image: Input image (H, W, 3) in BGR format + text_prompt: Optional text prompt. If None and use_ram=True, will generate automatically + use_ram: Whether to use RAM for automatic tag generation + box_threshold: Grounding DINO box threshold + text_threshold: Grounding DINO text threshold + + Returns: + Dictionary containing: + - tags: Generated or provided tags + - boxes: Detected bounding boxes + - scores: Detection confidence scores + - masks: Instance segmentation masks + - labels: Object labels + """ + # Step 1: Generate tags with RAM (if enabled and no prompt provided) + tags = [] + if text_prompt is None and use_ram: + tags = self.generate_tags_with_ram(image) + text_prompt = " . 
".join(tags) if tags else "object" + print(f"Generated tags: {tags}") + elif text_prompt is None: + text_prompt = "object" + + # Step 2: Detect objects with Grounding DINO + boxes, scores, labels = self.detect_objects_with_grounding_dino( + image, + text_prompt, + box_threshold=box_threshold, + text_threshold=text_threshold, + min_area_ratio=min_area_ratio, + max_area_ratio=max_area_ratio + ) + + print(f"Detected {len(boxes)} objects") + + # Step 3: Segment with SAM2 + masks = self.segment_with_sam2(image, boxes) + + return { + 'tags': tags, + 'text_prompt': text_prompt, + 'boxes': boxes, + 'scores': scores, + 'masks': masks, + 'labels': labels + } + + +def visualize_results( + image: np.ndarray, + boxes: np.ndarray, + masks: List[np.ndarray], + scores: np.ndarray, + labels: List[str] = None +) -> np.ndarray: + """Visualize detection and segmentation results""" + vis_image = image.copy() + + # Generate random colors for each instance + np.random.seed(42) + colors = np.random.randint(0, 255, size=(len(masks), 3), dtype=np.uint8) + + # Draw masks + for idx, mask in enumerate(masks): + color = colors[idx].tolist() + # Create colored mask + colored_mask = np.zeros_like(image) + colored_mask[mask] = color + # Overlay with transparency + vis_image = cv2.addWeighted(vis_image, 1.0, colored_mask, 0.5, 0) + + # Draw bounding boxes + for idx, (box, score) in enumerate(zip(boxes, scores)): + x1, y1, x2, y2 = box.astype(int) + color = colors[idx].tolist() + cv2.rectangle(vis_image, (x1, y1), (x2, y2), color, 2) + + # Add label + label_text = f"{labels[idx] if labels else 'obj'}: {score:.2f}" + cv2.putText(vis_image, label_text, (x1, y1 - 10), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2) + + return vis_image + + +def extract_video_frames(video_path: str, output_dir: str, start_frame: int, end_frame: int) -> Tuple[float, int, int]: + """Extract frames from video to directory""" + cap = cv2.VideoCapture(video_path) + fps = cap.get(cv2.CAP_PROP_FPS) + width = 
int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + if end_frame == -1: + end_frame = total_frames + + os.makedirs(output_dir, exist_ok=True) + + frame_idx = 0 + extracted_count = 0 + + print(f"Extracting frames {start_frame} to {end_frame}...") + pbar = tqdm(total=end_frame - start_frame, desc="Extracting frames") + + while cap.isOpened(): + ret, frame = cap.read() + if not ret or frame_idx >= end_frame: + break + + if frame_idx >= start_frame: + # Save frame with relative index (starting from 0) + frame_filename = os.path.join(output_dir, f"{extracted_count:05d}.jpg") + cv2.imwrite(frame_filename, frame) + extracted_count += 1 + pbar.update(1) + + frame_idx += 1 + + cap.release() + pbar.close() + + return fps, width, height + + +def process_video( + video_path: str, + output_dir: str, + pipeline: AutomaticSegmentationPipeline, + text_prompt: str = None, + use_ram: bool = True, + start_frame: int = 0, + end_frame: int = -1, + start_time: float = None, + end_time: float = None, + box_threshold: float = 0.25, + text_threshold: float = 0.25, + min_area_ratio: float = 0.20, + max_area_ratio: float = 0.50 +): + """ + Process video with automatic segmentation and propagation. + Uses RAM + Grounding DINO on first frame, then SAM2 propagates through remaining frames. 
+ + Args: + video_path: Path to input video + output_dir: Output directory + pipeline: AutomaticSegmentationPipeline instance + text_prompt: Manual text prompt (if not using RAM) + use_ram: Whether to use RAM for tag generation + start_frame: Starting frame index (overridden by start_time if provided) + end_frame: Ending frame index (overridden by end_time if provided) + start_time: Starting timestamp in seconds + end_time: Ending timestamp in seconds + box_threshold: Grounding DINO box threshold + text_threshold: Grounding DINO text threshold + """ + os.makedirs(output_dir, exist_ok=True) + + # Convert timestamps to frame indices if provided + cap_temp = cv2.VideoCapture(video_path) + fps = cap_temp.get(cv2.CAP_PROP_FPS) + total_frames = int(cap_temp.get(cv2.CAP_PROP_FRAME_COUNT)) + cap_temp.release() + + if start_time is not None: + start_frame = int(start_time * fps) + print(f"Start time {start_time}s -> frame {start_frame}") + + if end_time is not None: + end_frame = int(end_time * fps) + print(f"End time {end_time}s -> frame {end_frame}") + + if end_frame == -1: + end_frame = total_frames + + # Create temporary directory for extracted frames + temp_dir = tempfile.mkdtemp(prefix="sam2_frames_") + frames_dir = os.path.join(temp_dir, "frames") + + try: + # Extract frames + fps, width, height = extract_video_frames(video_path, frames_dir, start_frame, end_frame) + + # Read first frame for detection + first_frame_path = os.path.join(frames_dir, "00000.jpg") + first_frame = cv2.imread(first_frame_path) + + print("\n=== Step 1: Detecting objects in first frame ===") + + # Step 1: Generate tags with RAM (if enabled) + tags = [] + if text_prompt is None and use_ram: + tags = pipeline.generate_tags_with_ram(first_frame) + text_prompt = " . 
".join(tags) if tags else "object" + print(f"Generated tags: {tags}") + elif text_prompt is None: + text_prompt = "object" + + # Step 2: Detect objects with Grounding DINO + boxes, scores, labels = pipeline.detect_objects_with_grounding_dino( + first_frame, + text_prompt, + box_threshold=box_threshold, + text_threshold=text_threshold, + min_area_ratio=min_area_ratio, + max_area_ratio=max_area_ratio + ) + + print(f"Detected {len(boxes)} objects") + + if len(boxes) == 0: + print("Warning: No objects detected! Try lowering thresholds or providing specific prompts.") + return + + # Print detection summary + for i, (box, score) in enumerate(zip(boxes, scores)): + print(f" Object {i+1}: confidence={score:.3f}, box={box.astype(int)}") + + print("\n=== Step 2: Initializing SAM2 video propagation ===") + + # Step 3: Initialize SAM2 video predictor + video_predictor = build_sam2_video_predictor( + pipeline.sam2_config, + pipeline.sam2_checkpoint, + device=pipeline.device + ) + + inference_state = video_predictor.init_state(video_path=frames_dir) + + # Add all detected objects to the first frame + for obj_id, box in enumerate(boxes, start=1): + # Convert box to center point + box format for SAM2 + x1, y1, x2, y2 = box + + # Add box prompt to SAM2 + _, out_obj_ids, out_mask_logits = video_predictor.add_new_points_or_box( + inference_state=inference_state, + frame_idx=0, # First frame + obj_id=obj_id, + box=box, + ) + + print(f"Added {len(boxes)} objects to track") + print("\n=== Step 3: Propagating masks through video ===") + + # Step 4: Propagate through video + video_segments = {} + for out_frame_idx, out_obj_ids, out_mask_logits in video_predictor.propagate_in_video(inference_state): + video_segments[out_frame_idx] = { + out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy() + for i, out_obj_id in enumerate(out_obj_ids) + } + + print("\n=== Step 4: Saving results ===\n") + + # Step 5: Create output videos and save masks + # Setup video writers + fourcc = 
cv2.VideoWriter_fourcc(*'mp4v') + output_video_path = os.path.join(output_dir, "original_video.mp4") + video_writer = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height)) + + # Setup mask video writers for each object + mask_video_writers = {} + src_video_writers = {} + for obj_id in range(1, len(boxes) + 1): + mask_video_path = os.path.join(output_dir, f"mask_obj_{obj_id}.mp4") + mask_video_writers[obj_id] = cv2.VideoWriter(mask_video_path, fourcc, fps, (width, height), isColor=False) + + # Setup source video writer with inverse mask applied (for inpainting) + src_video_path = os.path.join(output_dir, f"src_video_obj_{obj_id}.mp4") + src_video_writers[obj_id] = cv2.VideoWriter(src_video_path, fourcc, fps, (width, height)) + + # Generate random colors for each object + np.random.seed(42) + colors = np.random.randint(0, 255, size=(len(boxes), 3), dtype=np.uint8) + + num_frames = end_frame - start_frame + for frame_idx in tqdm(range(num_frames), desc="Saving results"): + # Read frame + frame_path = os.path.join(frames_dir, f"{frame_idx:05d}.jpg") + frame = cv2.imread(frame_path) + + # Write original frame to video + video_writer.write(frame) + + # Process masks if available + if frame_idx in video_segments: + for obj_id in sorted(video_segments[frame_idx].keys()): + mask = video_segments[frame_idx][obj_id][0] # Get mask + + # Write mask frame to video + mask_img = (mask * 255).astype(np.uint8) + mask_video_writers[obj_id].write(mask_img) + + # Create source video with inverse mask applied (zeroing out the object for inpainting) + bool_mask = mask > 0 + src_frame = frame.copy() + src_frame[bool_mask] = 128 # Gray out the masked region + src_video_writers[obj_id].write(src_frame) + + video_writer.release() + for obj_id, mask_writer in mask_video_writers.items(): + mask_writer.release() + for obj_id, src_writer in src_video_writers.items(): + src_writer.release() + + print(f"\n{'='*60}") + print("Processing complete!") + print(f"{'='*60}") + print(f"Video 
segment: frames {start_frame} to {end_frame}") + if start_time is not None or end_time is not None: + print(f"Time segment: {start_time if start_time else 0}s to {end_time if end_time else end_frame/fps}s") + print(f"Detected and tracked {len(boxes)} objects") + print(f"\nOutputs:") + print(f" Original video: {output_video_path}") + for obj_id in range(1, len(boxes) + 1): + mask_video_path = os.path.join(output_dir, f"mask_obj_{obj_id}.mp4") + src_video_path = os.path.join(output_dir, f"src_video_obj_{obj_id}.mp4") + print(f" Mask video (obj {obj_id}): {mask_video_path}") + print(f" Source video with inverse mask (obj {obj_id}): {src_video_path}") + + # Save detection info + info_path = os.path.join(output_dir, "detection_info.txt") + with open(info_path, 'w') as f: + f.write(f"Video: {video_path}\n") + f.write(f"Frames: {start_frame} to {end_frame}\n") + if start_time is not None or end_time is not None: + f.write(f"Time: {start_time if start_time else 0}s to {end_time if end_time else end_frame/fps}s\n") + f.write(f"FPS: {fps}\n") + f.write(f"\nGenerated tags: {', '.join(tags) if tags else 'N/A'}\n") + f.write(f"Text prompt used: {text_prompt}\n") + f.write(f"\nDetected {len(boxes)} objects:\n") + for i, (box, score) in enumerate(zip(boxes, scores)): + f.write(f" Object {i+1}: confidence={score:.3f}, box={box.astype(int).tolist()}\n") + print(f" Detection info: {info_path}") + + finally: + # Clean up temporary directory + if os.path.exists(temp_dir): + shutil.rmtree(temp_dir) + print(f"\nCleaned up temporary files") + +def process_image_single( + image_path: str, + output_dir: str, + pipeline: AutomaticSegmentationPipeline, + text_prompt: str = None, + use_ram: bool = True, + box_threshold: float = 0.25, + text_threshold: float = 0.25, + min_area_ratio: float = 0.20, + max_area_ratio: float = 0.50 +): + """Process a single image""" + os.makedirs(output_dir, exist_ok=True) + + # Load image + image = cv2.imread(image_path) + # Process + results = 
pipeline.process_image( + image, + text_prompt=text_prompt, + use_ram=use_ram, + box_threshold=box_threshold, + text_threshold=text_threshold, + min_area_ratio=min_area_ratio, + max_area_ratio=max_area_ratio + ) + + # Visualize + vis_image = visualize_results( + image, + results['boxes'], + results['masks'], + results['scores'], + results['labels'] + ) + + # Save results + output_path = os.path.join(output_dir, "segmentation_result.jpg") + cv2.imwrite(output_path, vis_image) + + # Save individual masks + for mask_idx, mask in enumerate(results['masks']): + mask_img = (mask * 255).astype(np.uint8) + mask_path = os.path.join(output_dir, f"mask_{mask_idx}.png") + cv2.imwrite(mask_path, mask_img) + + print(f"\nProcessing complete!") + print(f"Detected objects: {len(results['boxes'])}") + print(f"Tags used: {results['text_prompt']}") + print(f"Output: {output_path}") + print(f"Masks saved to: {output_dir}/") + + +def main(): + parser = argparse.ArgumentParser( + description="Automatic instance segmentation using RAM + Grounding DINO + SAM2" + ) + + # Input/Output + parser.add_argument("--input", type=str, required=True, + help="Path to input image or video") + parser.add_argument("--output-dir", type=str, default="auto_segmentation_output", + help="Output directory") + parser.add_argument("--mode", type=str, choices=["image", "video"], default="image", + help="Processing mode") + + # SAM2 arguments + parser.add_argument("--sam2-checkpoint", type=str, required=True, + help="Path to SAM2 checkpoint") + parser.add_argument("--sam2-config", type=str, default="sam2_hiera_l.yaml", + help="SAM2 config file") + + # Grounding DINO arguments + parser.add_argument("--grounding-dino-config", type=str, + help="Path to Grounding DINO config file") + parser.add_argument("--grounding-dino-checkpoint", type=str, + help="Path to Grounding DINO checkpoint") + + # RAM arguments + parser.add_argument("--ram-checkpoint", type=str, + help="Path to RAM checkpoint") + 
parser.add_argument("--no-ram", action="store_true", default=False, + help="Disable RAM and use manual text prompt") + # Detection parameters + parser.add_argument("--text-prompt", type=str, default=None, + help="Text prompt for detection (if not using RAM)") + parser.add_argument("--box-threshold", type=float, default=0.25, + help="Grounding DINO box threshold") + parser.add_argument("--text-threshold", type=float, default=0.25, + help="Grounding DINO text threshold") + parser.add_argument("--min-area-ratio", type=float, default=0.20, + help="Minimum box area as ratio of image area (default: 0.20)") + parser.add_argument("--max-area-ratio", type=float, default=0.50, + help="Maximum box area as ratio of image area (default: 0.50)") + # NOTE: a duplicate --text-threshold definition was removed here; argparse + # raises "ArgumentError: conflicting option string" when an option repeats. + + # Video-specific arguments + parser.add_argument("--start-frame", type=int, default=0, + help="Starting frame for video processing (overridden by --start-time)") + parser.add_argument("--end-frame", type=int, default=-1, + help="Ending frame for video processing (-1 for end, overridden by --end-time)") + parser.add_argument("--start-time", type=float, default=None, + help="Starting timestamp in seconds (overrides --start-frame)") + parser.add_argument("--end-time", type=float, default=None, + help="Ending timestamp in seconds (overrides --end-frame)") + + # Device + parser.add_argument("--device", type=str, default="cuda", + choices=["cuda", "cpu"], help="Device to run on") + + args = parser.parse_args() + + # Initialize pipeline (pass ram_checkpoint and device so --ram-checkpoint/--device take effect) + pipeline = AutomaticSegmentationPipeline( + sam2_checkpoint=args.sam2_checkpoint, + sam2_config=args.sam2_config, + grounding_dino_config=args.grounding_dino_config, + grounding_dino_checkpoint=args.grounding_dino_checkpoint, ram_checkpoint=args.ram_checkpoint, device=args.device) + # Process based on mode + if args.mode == "image": + process_image_single( + args.input, + args.output_dir, + pipeline, + text_prompt=args.text_prompt, + use_ram=not
args.no_ram, + box_threshold=args.box_threshold, + text_threshold=args.text_threshold, + min_area_ratio=args.min_area_ratio, + max_area_ratio=args.max_area_ratio + ) + else: # video + process_video( + args.input, + args.output_dir, + pipeline, + text_prompt=args.text_prompt, + use_ram=not args.no_ram, + start_frame=args.start_frame, + end_frame=args.end_frame, + start_time=args.start_time, + end_time=args.end_time, + box_threshold=args.box_threshold, + text_threshold=args.text_threshold, + min_area_ratio=args.min_area_ratio, + max_area_ratio=args.max_area_ratio + ) + + +if __name__ == "__main__": + main() diff --git a/annotators/Inpainting/batch_process_videos.py b/annotators/Inpainting/batch_process_videos.py new file mode 100644 index 0000000000..f0f4805ecb --- /dev/null +++ b/annotators/Inpainting/batch_process_videos.py @@ -0,0 +1,253 @@ +#!/usr/bin/env python3 +""" +Batch process all videos from all_mixkit subdirectories using RAM + Grounding DINO + SAM2 +Reads frame ranges from video_mixkit.json files in subdirectories +""" +import os +import sys +import json +from pathlib import Path +import subprocess +from tqdm import tqdm + +# Import the segmentation pipeline +sys.path.insert(0, str(Path.home() / 'RAM_DINO_SAM')) +from automatic_segmentation import AutomaticSegmentationPipeline, process_video + +def find_json_files(root_dir): + """Find all video_mixkit.json files in subdirectories""" + json_files = [] + root_path = Path(root_dir).expanduser() + + for json_path in root_path.rglob('video_mixkit.json'): + if json_path.is_file(): + json_files.append(json_path) + + return sorted(json_files) + +def process_videos_from_json(json_path, input_base_dir, output_base_dir, ram_dino_sam_dir, pipeline): + """Process all videos listed in a JSON file with their frame ranges""" + print(f"\n{'='*80}") + print(f"Processing JSON: {json_path.relative_to(input_base_dir)}") + print(f"{'='*80}\n") + + with open(json_path, 'r') as f: + video_entries = json.load(f) + + # Group 
entries by video path + video_groups = {} + for entry in video_entries: + video_path = entry['path'] + if video_path not in video_groups: + video_groups[video_path] = [] + video_groups[video_path].append(entry) + + stats = {'successful': 0, 'failed': 0, 'skipped': 0} + + # Collect all meta entries for the meta.json + all_meta_entries = [] + + # Process each video + for video_path, entries in video_groups.items(): + full_video_path = input_base_dir / video_path + + if not full_video_path.exists(): + print(f"⚠️ Video not found: {full_video_path}") + stats['failed'] += len(entries) + continue + + # Create output directory maintaining the subdirectory structure + relative_path = Path(video_path).parent + video_name = Path(video_path).stem + output_dir = output_base_dir / relative_path / video_name + output_dir.mkdir(parents=True, exist_ok=True) + + print(f"\n{'='*80}") + print(f"Video: {video_path}") + print(f"Segments: {len(entries)}") + print(f"Output: {output_dir.relative_to(output_base_dir)}") + print(f"{'='*80}\n") + + # Process each segment + for idx, entry in enumerate(entries, 1): + frame_range = entry['frame_idx'] + start_frame, end_frame = frame_range.split(':') + + # Create segment-specific output directory + segment_output_dir = output_dir / f"segment_{start_frame}_{end_frame}" + + # Calculate frame index for meta.json (0-based relative to segment) + segment_length = int(end_frame) - int(start_frame) + + # Check if already processed + if (segment_output_dir / "segmentation_complete.txt").exists(): + print(f" [{idx}/{len(entries)}] ✓ Skipped (already done): frames {start_frame}-{end_frame}") + stats['skipped'] += 1 + + # Still add to meta entries if successful + original_video_path = segment_output_dir / "original_video.mp4" + if original_video_path.exists(): + relative_output_path = original_video_path.relative_to(output_base_dir) + meta_entry = { + "path": str(relative_output_path), + "frame_idx": f"0:{segment_length}", + "cap": entry.get('cap', '') + } + 
all_meta_entries.append(meta_entry) + continue + + segment_output_dir.mkdir(parents=True, exist_ok=True) + + print(f" [{idx}/{len(entries)}] Processing: frames {start_frame}-{end_frame}") + + # Process video segment directly using shared pipeline + try: + process_video( + video_path=str(full_video_path), + output_dir=str(segment_output_dir), + pipeline=pipeline, + text_prompt=None, + use_ram=True, + start_frame=int(start_frame), + end_frame=int(end_frame), + start_time=None, + end_time=None, + box_threshold=0.25, + text_threshold=0.25 + ) + + # Mark as complete + with open(segment_output_dir / "segmentation_complete.txt", "w") as f: + f.write(f"Video: {video_path}\n") + f.write(f"Frames: {start_frame}-{end_frame}\n") + f.write(f"Status: Success\n") + + print(f" [{idx}/{len(entries)}] ✅ Success: frames {start_frame}-{end_frame}") + stats['successful'] += 1 + + # Add to meta entries + original_video_path = segment_output_dir / "original_video.mp4" + if original_video_path.exists(): + relative_output_path = original_video_path.relative_to(output_base_dir) + meta_entry = { + "path": str(relative_output_path), + "frame_idx": f"0:{segment_length}", + "cap": entry.get('cap', '') + } + all_meta_entries.append(meta_entry) + + except Exception as e: + print(f" [{idx}/{len(entries)}] ❌ Failed: frames {start_frame}-{end_frame}") + print(f" Error: {str(e)}") + + # Log error + with open(segment_output_dir / "segmentation_error.txt", "w") as f: + f.write(f"Video: {video_path}\n") + f.write(f"Frames: {start_frame}-{end_frame}\n") + f.write(f"Status: Failed\n") + f.write(f"Error: {str(e)}\n") + + stats['failed'] += 1 + continue + + except KeyboardInterrupt: + print("\n\n⚠️ Processing interrupted by user") + raise + + print(f"\n✅ Completed all segments for {video_path}") + + # Save meta.json for this JSON file's output + if all_meta_entries: + # Determine the output directory for the meta.json + # Use the parent directory of the first entry to determine where to save + if 
all_meta_entries: + # Save meta.json in the same directory as the JSON file's processed outputs + json_relative = json_path.relative_to(input_base_dir).parent + meta_output_dir = output_base_dir / json_relative + meta_output_path = meta_output_dir / "meta.json" + + with open(meta_output_path, 'w') as f: + json.dump(all_meta_entries, f, indent=2) + + print(f"\n📝 Created meta.json with {len(all_meta_entries)} entries: {meta_output_path.relative_to(output_base_dir)}") + + return stats + + +def main(): + # Set up directories + input_base_dir = Path.home() / 'all_mixkit' + output_base_dir = Path.home() / 'all_mixkit_segmented' + ram_dino_sam_dir = Path.home() / 'RAM_DINO_SAM' + + # Create output directory + output_base_dir.mkdir(parents=True, exist_ok=True) + + print(f"\n{'='*80}") + print(f"Batch Video Processing with RAM + Grounding DINO + SAM2") + print(f"{'='*80}") + print(f"Input directory: {input_base_dir}") + print(f"Output directory: {output_base_dir}") + print(f"{'='*80}\n") + + # Find all JSON files + print("Searching for video_mixkit.json files...") + json_files = find_json_files(input_base_dir) + + if not json_files: + print(f"❌ No video_mixkit.json files found in {input_base_dir}") + sys.exit(1) + + print(f"\nFound {len(json_files)} JSON file(s):") + for json_file in json_files: + print(f" - {json_file.relative_to(input_base_dir)}") + + # Initialize pipeline once for all processing + print(f"\n{'='*80}") + print("Initializing models (RAM + Grounding DINO + SAM2)...") + print(f"{'='*80}\n") + + pipeline = AutomaticSegmentationPipeline( + sam2_checkpoint=str(ram_dino_sam_dir / 'models/sam2_hiera_large.pt'), + sam2_config='sam2_hiera_l.yaml', + grounding_dino_config=str(ram_dino_sam_dir / 'models/GroundingDINO_SwinT_OGC.py'), + grounding_dino_checkpoint=str(ram_dino_sam_dir / 'models/groundingdino_swint_ogc.pth'), + ram_checkpoint=str(ram_dino_sam_dir / 'models/ram_plus_swin_large_14m.pth'), + device='cuda' + ) + + print("\n✅ Models loaded successfully! 
Processing videos...\n") + + # Process each JSON file + total_stats = {'successful': 0, 'failed': 0, 'skipped': 0} + + for json_file in json_files: + try: + stats = process_videos_from_json(json_file, input_base_dir, output_base_dir, ram_dino_sam_dir, pipeline) + total_stats['successful'] += stats['successful'] + total_stats['failed'] += stats['failed'] + total_stats['skipped'] += stats['skipped'] + except KeyboardInterrupt: + print("\n\n⚠️ Processing interrupted by user") + break + except Exception as e: + print(f"\n❌ Error processing {json_file}: {e}") + import traceback + traceback.print_exc() + continue + + # Final summary + print(f"\n{'='*80}") + print(f"BATCH PROCESSING COMPLETE") + print(f"{'='*80}") + print(f"Total segments processed:") + print(f" ✅ Successful: {total_stats['successful']}") + print(f" ❌ Failed: {total_stats['failed']}") + print(f" ⏭️ Skipped: {total_stats['skipped']}") + print(f" 📊 Total: {sum(total_stats.values())}") + print(f"\nResults saved to: {output_base_dir}") + print(f"{'='*80}\n") + +if __name__ == "__main__": + main() + diff --git a/annotators/Inpainting/install_auto_seg.sh b/annotators/Inpainting/install_auto_seg.sh new file mode 100755 index 0000000000..b74baab9aa --- /dev/null +++ b/annotators/Inpainting/install_auto_seg.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +# Automatic Installation Script for RAM + Grounding DINO + SAM2 Pipeline +# This script will download all necessary models and install dependencies + +set -e # Exit on error + +echo "================================================" +echo "Installing Automatic Segmentation Pipeline" +echo "================================================" + +# Create directories +mkdir -p models +cd models + +# Install Python dependencies +echo "" +echo "[1/5] Installing base dependencies..." +pip install torch torchvision opencv-python pillow numpy tqdm supervision matplotlib scipy timm transformers + +# Install SAM2 +echo "" +echo "[2/5] Installing SAM2..." 
+pip install git+https://github.com/facebookresearch/segment-anything-2.git + +# Download SAM2 checkpoints +echo "" +echo "[3/5] Downloading SAM2 checkpoints..." +if [ ! -f "sam2_hiera_large.pt" ]; then + echo "Downloading SAM2 Large..." + wget https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.pt +fi + +if [ ! -f "sam2_hiera_base_plus.pt" ]; then + echo "Downloading SAM2 Base+..." + wget https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_base_plus.pt +fi + +# Install and setup Grounding DINO +echo "" +echo "[4/5] Installing Grounding DINO..." +pip install groundingdino-py + +# Download Grounding DINO checkpoint +if [ ! -f "groundingdino_swint_ogc.pth" ]; then + echo "Downloading Grounding DINO checkpoint..." + wget https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth +fi + +# Download Grounding DINO config +if [ ! -f "GroundingDINO_SwinT_OGC.py" ]; then + echo "Downloading Grounding DINO config..." + wget https://raw.githubusercontent.com/IDEA-Research/GroundingDINO/main/groundingdino/config/GroundingDINO_SwinT_OGC.py +fi + +# Install RAM (optional) +echo "" +echo "[5/5] Installing RAM (optional - for automatic tag generation)..." +read -p "Do you want to install RAM for automatic tag generation? (y/n) " -n 1 -r +echo +if [[ $REPLY =~ ^[Yy]$ ]]; then + # Clone and install RAM + if [ ! -d "recognize-anything" ]; then + git clone https://github.com/xinyu1205/recognize-anything.git + cd recognize-anything + pip install -e . + cd .. + fi + + # Download RAM checkpoint + if [ ! -f "ram_plus_swin_large_14m.pth" ]; then + echo "Downloading RAM checkpoint..." + wget https://huggingface.co/xinyu1205/recognize-anything-plus-model/resolve/main/ram_plus_swin_large_14m.pth + fi + echo "RAM installed successfully!" +else + echo "Skipping RAM installation. You can use manual text prompts instead." +fi + +cd .. 
+ +echo "" +echo "================================================" +echo "Installation Complete!" +echo "================================================" +echo "" +echo "Model checkpoints are in: ./models/" +echo "" +echo "Quick Start Examples:" +echo "" +echo "1. With RAM (fully automatic):" +echo " python automatic_segmentation.py \\" +echo " --input image.jpg \\" +echo " --mode image \\" +echo " --sam2-checkpoint models/sam2_hiera_large.pt \\" +echo " --grounding-dino-config models/GroundingDINO_SwinT_OGC.py \\" +echo " --grounding-dino-checkpoint models/groundingdino_swint_ogc.pth \\" +echo " --ram-checkpoint models/ram_plus_swin_large_14m.pth" +echo "" +echo "2. Without RAM (manual prompts):" +echo " python automatic_segmentation.py \\" +echo " --input image.jpg \\" +echo " --mode image \\" +echo " --sam2-checkpoint models/sam2_hiera_large.pt \\" +echo " --grounding-dino-config models/GroundingDINO_SwinT_OGC.py \\" +echo " --grounding-dino-checkpoint models/groundingdino_swint_ogc.pth \\" +echo " --text-prompt 'person . car . dog' \\" +echo " --no-ram" +echo "" +echo "See README_AUTO_SEGMENTATION.md for more examples!" diff --git a/annotators/Inpainting/run_batch_process.sh b/annotators/Inpainting/run_batch_process.sh new file mode 100644 index 0000000000..c9aaa25f13 --- /dev/null +++ b/annotators/Inpainting/run_batch_process.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# Activate conda environment and run batch processing +source ~/miniconda3/bin/activate ram_dino_sam + +echo "Environment activated: $CONDA_DEFAULT_ENV" +echo "" + +# Run the batch processing script +python batch_process_videos.py + +echo "" +echo "Batch processing completed!" 
diff --git a/docs/conf.py b/docs/conf.py index d926eedf89..af1ceb97c3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -27,7 +27,7 @@ project = "Megatron Bridge" copyright = "2025, NVIDIA Corporation" author = "NVIDIA Corporation" -release = "0.2.0" +release = "0.1.0" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/docs/index.md b/docs/index.md index c5a9bebca8..5d472c6a98 100644 --- a/docs/index.md +++ b/docs/index.md @@ -7,6 +7,7 @@ :hidden: parallelisms.md +performance-summary.md performance-guide.md recipe-usage.md ``` @@ -37,6 +38,7 @@ training/attention-optimizations.md training/activation-recomputation.md training/cpu-offloading.md training/peft.md +training/packed-sequences.md ``` ```{toctree} diff --git a/docs/performance-summary.md b/docs/performance-summary.md new file mode 100644 index 0000000000..9e146ff8d2 --- /dev/null +++ b/docs/performance-summary.md @@ -0,0 +1,58 @@ +# Performance + +As part of the NVIDIA NeMo Framework, Megatron Bridge provides optimal performance for training advanced generative AI models by incorporating the most recent training techniques, such as model parallelization, optimized attention mechanisms, and more, to achieve high training throughput. + +This page provides performance benchmarks for large language models using Megatron-Bridge across different GPU systems and configurations.
+ +## Nomenclature + +- **GBS**: Global Batch Size +- **MBS**: Micro Batch Size +- **FSDP**: Fully Sharded Data Parallel + - FSDP = 1: use FSDP + - FSDP = 0: use DDP (Distributed Data Parallel) +- **TP**: Tensor Parallel Size +- **PP**: Pipeline Parallel Size +- **CP**: Context Parallel Size +- **VP**: Virtual Pipeline Parallel Size +- **EP**: Expert Parallel Size +- **GA**: Number of Gradient Accumulations + +## Performance Metrics + +Performance is measured using: +- **Tokens/sec/GPU**: Throughput per GPU +- **Model TFLOP/sec/GPU**: Model floating-point operations per second per GPU + +```{contents} +:local: +:depth: 2 +``` + +## Performance Summary for Large Language Models + +Below are performance benchmarks for various large language models organized by release version. These results were obtained using performance recipes available [here](https://github.com/NVIDIA/Megatron-Bridge/tree/main/scripts/performance). + +The performance data includes: + +- **Pre-training Performance**: Throughput metrics for various model sizes and architectures +- **System Configurations**: Results across different GPU systems (DGX-GB200, DGX-B200, DGX-H100) +- **Precision Options**: Performance comparisons between different precision modes (BF16, FP8, MXFP8) + +--- + +## 25.09 NeMo Container + +### Pre-Training Performance + +#### System: DGX-GB200 + +*Performance tables will be added here* + +#### System: DGX-B200 + +*Performance tables will be added here* + +#### System: DGX-H100 + +*Performance tables will be added here* diff --git a/docs/project.json b/docs/project.json index 549f0f296f..274f2907bb 100644 --- a/docs/project.json +++ b/docs/project.json @@ -1 +1,4 @@ -{"name": "megatron-bridge", "version": "0.2.0"} +{ + "name": "megatron-bridge", + "version": "0.1.0" +} \ No newline at end of file diff --git a/docs/training/images/canonical_lora.png b/docs/training/images/canonical_lora.png new file mode 100644 index 0000000000..69e8dacf09 Binary files /dev/null and 
b/docs/training/images/canonical_lora.png differ diff --git a/docs/training/images/performant_lora.png b/docs/training/images/performant_lora.png new file mode 100644 index 0000000000..00c12df247 Binary files /dev/null and b/docs/training/images/performant_lora.png differ diff --git a/docs/training/packed-sequences.md b/docs/training/packed-sequences.md new file mode 100644 index 0000000000..11220ed913 --- /dev/null +++ b/docs/training/packed-sequences.md @@ -0,0 +1,183 @@ +# Packed Sequences + +This guide explains how to use packed sequences in Megatron Bridge for efficient supervised fine-tuning (SFT) and parameter-efficient fine-tuning (PEFT). + +## Overview + +When fine-tuning large language models, GPU under-utilization often occurs due to inefficient input data structure. This inefficiency arises because many fine-tuning datasets have a skewed distribution of sequence lengths, with many short sequences and a few long ones, following [Zipf's Law](https://en.wikipedia.org/wiki/Zipf%27s_law). Since transformer models require fixed-length inputs, shorter sequences must be padded with many padding tokens. + +This leads to two main inefficiencies: + +- Computation performed on the pad tokens is eventually masked out, resulting in wasted GPU computation. +- Micro batch size is often limited by the batch which contains longer sequences, so that most other micro batches have under-utilized GPU memory. + +Packed sequences is a training technique where multiple training sequences (examples) are concatenated into one long sequence (pack). This technique greatly reduces the number of padding tokens, allowing more meaningful tokens to be processed in each micro batch. As a result, it maximizes both GPU compute and GPU memory utilization. + +**Note:** Sequence packing is primarily beneficial for fine-tuning workloads. 
Megatron-style pretraining datasets (using `IndexedDataset` and `GPTDataset`) already concatenate documents during sampling to fill sequences to the target length, eliminating padding tokens without requiring the boundary-aware packing infrastructure described here. For supervised fine-tuning, however, naive concatenation is insufficient—each training example must be treated individually to preserve data quality. + +The conventional solution is to build a custom attention mask (specifically, a block triangular mask) to mask out attention values between sequences. However, this increases the complexity of attention from $\sum_i {s_i}^2$ to $\Big({\sum_i {s_i}}\Big)^2$, where $s_i$ is the length of the $i$th subsequence. In practice, the conventional solution puts a limit on the packed sequence size. + +Instead, Megatron Bridge provides a highly optimized version of sequence packing which makes use of variable-length attention kernels in FlashAttention and TransformerEngine. Instead of providing a custom attention mask, information about sequence boundaries is passed in with the `cu_seqlens` variable (short for cumulative sequence length). With this approach, attention values between sequences are never calculated, so the complexity of attention remains at $\sum_i {s_i}^2$. This allows the packed sequence size to increase to arbitrary lengths without affecting the memory complexity, so that GPU memory can be fully utilized. + +The packed sequence implementation automatically creates {py:class}`bridge.data.datasets.sft.GPTSFTPackedDataset` instances when `.npy` files are detected, providing optimized data loading and batching for packed sequences. + +## Using Packed Sequences + +### Prepare the Dataset + +In Megatron Bridge, the packed dataset is automatically prepared before training using the {py:func}`bridge.data.datasets.packed_sequence.prepare_packed_sequence_data` function, eliminating the need for any additional preprocessing steps. 
+ +### Configure Packed Sequences + +Packed sequences are configured through the {py:class}`bridge.training.config.FinetuningDatasetConfig` by specifying `packed_sequence_specs`: + +```python +from megatron.bridge.training.config import ConfigContainer, FinetuningDatasetConfig +from megatron.bridge.data.datasets.packed_sequence import PackedSequenceSpecs + +config = ConfigContainer( + # ... other configurations + dataset=FinetuningDatasetConfig( + dataset_root="/path/to/your/dataset", + seq_length=2048, + packed_sequence_specs=PackedSequenceSpecs( + packed_sequence_size=2048, + tokenizer_model_name="your_tokenizer_name", + ), + ), + # ... other configurations +) +``` + +### PackedSequenceSpecs Configuration + +The {py:class}`bridge.data.datasets.packed_sequence.PackedSequenceSpecs` class provides the following configuration options: + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `packed_sequence_size` | `int` | `-1` | If positive, enables sequence packing with the specified pack size. If ≤ 0, sequence packing is disabled. | +| `tokenizer_model_name` | `str` | `None` | Tokenizer model name for tracking, since different tokenizers produce different packed datasets. | +| `packed_train_data_path` | `str` | `None` | Custom path for packed training dataset file (`.npy` format). | +| `packed_val_data_path` | `str` | `None` | Custom path for packed validation dataset file (`.npy` format). | +| `packed_metadata_path` | `str` | `None` | Custom path for packing metadata file (`.jsonl` format). | +| `pad_cu_seqlens` | `bool` | `False` | Whether to pad `cu_seqlens` to constant size, required for CUDA graphs. | + +### Batch Size Considerations + +When using packed sequences, you must adjust your batch sizes: + +1. **Micro batch size must be set to 1**: This constraint arises because samples in a micro batch are no longer stacked; they are now concatenated during the data preparation step. 
Consequently, micro batch size becomes irrelevant when using packed sequences. + +2. **Global batch size must be adjusted**: Since each pack now contains multiple sequences, the global batch size needs to be divided by the average number of sequences per pack `n` where `n = num_sequences_in_dataset / num_packs` (equivalently, `n = packed_sequence_size / average_seq_len`). This ensures that each gradient iteration sees, on average, the same number of tokens. The value of `n` is printed out during the data preparation step. You may need to run training once, obtain the value of `n` from the logs, then run your training script again with the updated global batch size. + +### Full Configuration Example + +```python +from megatron.bridge.training.config import ( + ConfigContainer, TrainingConfig, CheckpointConfig, SchedulerConfig +) +from megatron.bridge.training.config import FinetuningDatasetConfig +from megatron.bridge.data.datasets.packed_sequence import PackedSequenceSpecs +from megatron.bridge.peft.lora import LoRA +from megatron.core.optimizer import OptimizerConfig + +config = ConfigContainer( + model=model_provider, + train=TrainingConfig( + train_iters=1000, + global_batch_size=32, # Reduced from original due to packing + micro_batch_size=1, # Required for packed sequences + eval_interval=100, + ), + optimizer=OptimizerConfig( + optimizer="adam", + lr=1e-4, + weight_decay=0.01, + bf16=True, + use_distributed_optimizer=True, + ), + scheduler=SchedulerConfig( + lr_decay_style="cosine", + lr_warmup_iters=100, + lr_decay_iters=1000, + ), + dataset=FinetuningDatasetConfig( + dataset_root="/path/to/dataset", + seq_length=2048, + packed_sequence_specs=PackedSequenceSpecs( + packed_sequence_size=2048, + tokenizer_model_name="llama2_tokenizer", + ), + ), + checkpoint=CheckpointConfig( + pretrained_checkpoint="/path/to/pretrained/model", + save="/path/to/checkpoints", + save_interval=200, + ), + peft=LoRA( + target_modules=["linear_qkv", "linear_proj", "linear_fc1",
"linear_fc2"], + dim=16, + alpha=32, + dropout=0.1, + ), + # ... other configurations +) +``` + +## File Organization + +When using packed sequences, the {py:class}`bridge.data.builders.finetuning_dataset.FinetuningDatasetBuilder` automatically organizes files in your dataset directory: + +``` +dataset_root/ +├── training.jsonl # Original training data +├── validation.jsonl # Original validation data +└── packed/ + └── {tokenizer_name}/ + ├── training_{packed_size}.npy # Packed training data + ├── validation_{packed_size}.npy # Packed validation data + └── {packed_size}_metadata.jsonl # Packing metadata +``` + +The tokenizer name and packed sequence size are automatically incorporated into the file paths to avoid conflicts when using different configurations. + +## Advanced Configuration + +### Custom File Paths + +You can specify custom paths for packed data files: + +```python +packed_sequence_specs = PackedSequenceSpecs( + packed_sequence_size=4096, + tokenizer_model_name="custom_tokenizer", + packed_train_data_path="/custom/path/training_packed.npy", + packed_val_data_path="/custom/path/validation_packed.npy", + packed_metadata_path="/custom/path/metadata.jsonl", +) +``` + +### CUDA Graphs Support + +For CUDA graphs compatibility, enable `pad_cu_seqlens`: + +```python +packed_sequence_specs = PackedSequenceSpecs( + packed_sequence_size=2048, + pad_cu_seqlens=True, # Required for CUDA graphs + tokenizer_model_name="your_tokenizer", +) +``` + +When `pad_cu_seqlens=True`, you must also set `pad_to_max_length=True` in your dataset configuration. 
+ +## API Reference + +For detailed API documentation, see: + +- {py:class}`bridge.training.config.FinetuningDatasetConfig` - Main dataset configuration class +- {py:class}`bridge.data.datasets.packed_sequence.PackedSequenceSpecs` - Packed sequence configuration +- {py:func}`bridge.data.datasets.packed_sequence.prepare_packed_sequence_data` - Data preparation function +- {py:class}`bridge.data.datasets.sft.GPTSFTPackedDataset` - Packed sequence dataset implementation +- {py:class}`bridge.data.builders.finetuning_dataset.FinetuningDatasetBuilder` - Dataset builder with packing support +- {py:func}`bridge.training.gpt_step.get_packed_seq_params` - Packed sequence parameter extraction for training diff --git a/docs/training/peft.md b/docs/training/peft.md index 9bfa3d2f4f..d9e5c2547a 100644 --- a/docs/training/peft.md +++ b/docs/training/peft.md @@ -96,6 +96,118 @@ lora_config = LoRA( ) ``` +### Canonical LoRA: Performant vs Canonical Variants + +There are two variants of LoRA implemented in Megatron Bridge: "performant LoRA" (`LoRA`) and "canonical LoRA" (`CanonicalLoRA`). + +The distinction comes from the fact that Megatron Core optimizes the implementation of the following two linear modules by fusing multiple linear layers into one layer. When these layers are adapted with LoRA, the performant version also uses only one adapter for the linear module. The two linear modules are: + +1. `linear_qkv`: The projection matrix in self attention that transforms hidden state to query, key and value. Megatron Core fuses these three projection matrices into a single matrix to efficiently parallelize the matrix multiplication. Hence, performant LoRA applies a single adapter to the qkv projection matrix, whereas canonical LoRA applies three adapters. +2. `linear_fc1`: The first linear layer in the MLP module before the intermediate activation. 
For gated linear activations, Megatron Core fuses the up and gate projection matrices into a single matrix for efficient parallelization. Hence, performant LoRA applies a single adapter to the up and gate projection matrices, whereas canonical LoRA applies two adapters. + +The following two figures illustrate the difference between canonical and performant LoRA, using the `linear_qkv` layer as an example. Canonical LoRA runs three adapters sequentially, while performant LoRA runs one adapter. + +```{image} images/canonical_lora.png +:width: 640 +:align: center +``` + +```{image} images/performant_lora.png +:width: 400 +:align: center +``` + +Canonical LoRA conforms more closely to reference implementations, though it is slower in comparison since it performs several matrix multiplications sequentially, as described above. Performant LoRA has fewer parameters than canonical LoRA and can often achieve the same level of accuracy as canonical LoRA. + +Though not immediately apparent, performant LoRA is mathematically equivalent to canonical LoRA when the $A_q$, $A_k$, $A_v$ matrices are tied (i.e. forced to share the same weight during training) in `linear_qkv`, and similarly when the $A_{up}$, $A_{gate}$ matrices are tied in `linear_fc1`. + +```{admonition} Mathematical Proof: Performant LoRA Equivalence to Canonical LoRA with Tied Weights +:class: dropdown + +Let $[x \quad y]$ denote matrix concatenation. (In Megatron Bridge, this concatenation is done in an interleaved fashion, but this does not affect the proof below.) 
+ +Let $A_q = A_k = A_v = A_{qkv}$ (weight tying) + +Then + +$$ +\begin{align} +& [query \quad key \quad value] \\ += & [W_q x + B_q A_q x \quad W_k x + B_k A_k x \quad W_v x + B_v A_v x] \quad\quad \text{(canonical formulation)} \\ += & [W_q x + B_q (A_{qkv} x) \quad W_k x + B_k (A_{qkv} x) \quad W_v x + B_v (A_{qkv} x)] \\ += & [W_q \quad W_k \quad W_v] x + [B_q \quad B_k \quad B_v]A_{qkv} x \\ += & W_{qkv} x + B_{qkv} A_{qkv} x \quad\quad \text{(performant formulation)} +\end{align} +$$ + +Note: dimensions of weight matrices are as follows: + +$$ +\begin{align} +W_q: &\ h \times n_q d \qquad & A_q: &\ h \times r \qquad & B_q: &\ r \times n_q d \\ +W_k: &\ h \times n_{kv} d \qquad & A_k: &\ h \times r \qquad & B_k: &\ r \times n_{kv} d \\ +W_v: &\ h \times n_{kv} d \qquad & A_v: &\ h \times r \qquad & B_v: &\ r \times n_{kv} d \\ +W_{qkv}: &\ h \times (n_q+2n_{kv})d \qquad & A_{qkv}: &\ h \times r \qquad & B_{qkv}: &\ r \times (n_q+2n_{kv})d +\end{align} +$$ + +Where: +- $n_q$: Number of attention heads (`num_attention_heads`). +- $n_{kv}$: Number of key value heads (`num_query_groups`). Note that if grouped query attention (GQA) is not used, $n_{kv} = n_q$. +- $h$: Transformer hidden size (`hidden_size`). +- $d$: Transformer head dimension (`kv_channels`). +- $r$: LoRA rank. 
+ +``` + +#### Using Canonical LoRA + +```python +from megatron.bridge.peft.canonical_lora import CanonicalLoRA + +canonical_lora_config = CanonicalLoRA( + target_modules=[ + "linear_q", "linear_k", "linear_v", # Individual Q, K, V projections + "linear_proj", # Attention output projection + "linear_fc1_up", "linear_fc1_gate", # Individual up and gate projections + "linear_fc2" # Second MLP layer + ], + dim=16, # Rank of adaptation + alpha=32, # Scaling parameter + dropout=0.1, # Dropout rate +) +``` + +#### Key Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `target_modules` | `List[str]` | All canonical linear layers | Modules to apply canonical LoRA to | +| `dim` | `int` | `32` | Rank of the low-rank adaptation | +| `alpha` | `float` | `32` | Scaling parameter for LoRA | +| `dropout` | `float` | `0.0` | Dropout rate for LoRA layers | +| `dropout_position` | `Literal["pre", "post"]` | `"pre"` | Position for applying dropout | +| `lora_A_init_method` | `str` | `"xavier"` | Initialization method for LoRA A matrix | +| `lora_B_init_method` | `str` | `"zero"` | Initialization method for LoRA B matrix | + +#### Target Modules for Canonical LoRA + +The following table lists specific submodules within transformer architectures that are targeted for canonical LoRA: + +| Module | Description | +|--------|-------------| +| `linear_q` | Query projection in attention | +| `linear_k` | Key projection in attention | +| `linear_v` | Value projection in attention | +| `linear_proj` | Attention output projection | +| `linear_fc1_up` | Up projection in MLP | +| `linear_fc1_gate` | Gate projection in MLP | +| `linear_fc2` | Second MLP layer | + +```{note} +Canonical LoRA does not support `linear_qkv` or `linear_fc1` targets. Use the individual component targets (`linear_q`, `linear_k`, `linear_v` for QKV and `linear_fc1_up`, `linear_fc1_gate` for FC1) instead. 
+``` + ### [DoRA: Weight-Decomposed Low-Rank Adaptation](https://arxiv.org/abs/2402.09353) DoRA decomposes the pre-trained weight into magnitude and direction. It learns a separate magnitude parameter while employing LoRA for directional updates, efficiently minimizing the number of trainable parameters. DoRA enhances both the learning capacity and training stability of LoRA, while avoiding any additional inference overhead. DoRA has been shown to consistently outperform LoRA on various downstream tasks. diff --git a/docs/versions1.json b/docs/versions1.json index 35b654b99d..e4ee6022ef 100644 --- a/docs/versions1.json +++ b/docs/versions1.json @@ -1,11 +1,7 @@ [ { "preferred": true, - "version": "0.2.0", - "url": "../0.2.0" - }, - { "version": "0.1.0", "url": "../0.1.0" } -] +] \ No newline at end of file diff --git a/example_commands.sh b/example_commands.sh new file mode 100644 index 0000000000..d4a4ceb511 --- /dev/null +++ b/example_commands.sh @@ -0,0 +1,130 @@ +# ### set path to Megatron-Bridge +# export MBRIDGE_PATH=/path/to/Megatron-Bridge +# export PYTHONPATH="${MBRIDGE_PATH}/.:${MBRIDGE_PATH}/src/.:/opt/NeMo-Framework-Launcher/launcher_scripts" + +export CUDA_VISIBLE_DEVICES=0,1 + +# ### install dependencies +# pip install --upgrade git+https://github.com/NVIDIA/Megatron-LM.git@ce8185cbbe04f38beb74360e878450f2e8525885 +# python3 -m pip install --upgrade diffusers +# pip install easydict +# pip install imageio +# pip install imageio-ffmpeg + + +# ### Convert checkpoint +# See examples/conversion/convert_wan_checkpoints.py for details. + + +# ### Finetuning +# export HF_TOKEN=... +# export WANDB_API_KEY=... +# EXP_NAME=... 
+# PRETRAINED_CHECKPOINT=/path/to/pretrained_checkpoint +# CHECKPOINT_DIR=/path/to/checkpoint_dir +# DATASET_PATH=/path/to/dataset +# cd $MBRIDGE_PATH +NVTE_FUSED_ATTN=1 torchrun --nproc_per_node=2 examples/recipes/wan/pretrain_wan.py \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + model.context_parallel_size=2 \ + model.sequence_parallel=false \ + model.qkv_format=thd \ + dataset.num_workers=0 \ + dataset.path=${DATASET_PATH} \ + checkpoint.save=${CHECKPOINT_DIR} \ + checkpoint.load=${PRETRAINED_CHECKPOINT} \ + checkpoint.load_optim=false \ + checkpoint.save_interval=200 \ + optimizer.lr=5e-6 \ + optimizer.min_lr=5e-6 \ + train.eval_iters=0 \ + scheduler.lr_decay_style=constant \ + scheduler.lr_warmup_iters=0 \ + model.seq_length=2048 \ + dataset.seq_length=2048 \ + train.global_batch_size=1 \ + train.micro_batch_size=1 \ + dataset.global_batch_size=1 \ + dataset.micro_batch_size=1 \ + logger.log_interval=1 \ + logger.wandb_project="wan" \ + logger.wandb_exp_name=${EXP_NAME} \ + logger.wandb_save_dir=${CHECKPOINT_DIR} + + +### Inferencing +# Download T5 weights and VAE weights from "https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B/tree/main" +# T5: models_t5_umt5-xxl-enc-bf16.pth, google +# VAE: Wan2.1_VAE.pth + +CHECKPOINT_DIR=/opt/megatron_checkpoint_WAN +T5_DIR=/opt/Wan2.1-T2V-1.3B +VAE_DIR=/opt/Wan2.1-T2V-1.3B +# cd $MBRIDGE_PATH +# NVTE_FUSED_ATTN=1 torchrun --nproc_per_node=1 examples/recipes/wan/inference_wan.py \ +# --task t2v-1.3B \ +# --sizes 832*480 \ +# --checkpoint_dir ${CHECKPOINT_DIR} \ +# --checkpoint_step 0000 \ +# --t5_checkpoint_dir ${T5_DIR} \ +# --vae_checkpoint_dir ${VAE_DIR} \ +# --prompts "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." 
\ +# --frame_nums 81 \ +# --tensor_parallel_size 1 \ +# --context_parallel_size 1 \ +# --pipeline_parallel_size 1 \ +# --sequence_parallel False \ +# --base_seed 42 \ +# --sample_steps 50 + + +NVTE_FUSED_ATTN=1 torchrun --nproc_per_node=1 --rdzv-backend=c10d --rdzv-endpoint=localhost:0 examples/recipes/wan/inference_wan.py \ + --task t2v-1.3B \ + --sizes 832*480 \ + --checkpoint_dir ${CHECKPOINT_DIR} \ + --checkpoint_step 0000 \ + --t5_checkpoint_dir ${T5_DIR} \ + --vae_checkpoint_dir ${VAE_DIR} \ + --prompts "Beautiful maple leaves across the mountain during the autumn." \ + --frame_nums 81 \ + --tensor_parallel_size 1 \ + --context_parallel_size 1 \ + --pipeline_parallel_size 1 \ + --sequence_parallel False \ + --base_seed 42 \ + --sample_steps 50 + + + # NVTE_FUSED_ATTN=1 torchrun --nproc_per_node=2 examples/recipes/wan/inference_wan.py \ + # --task t2v-1.3B \ + # --sizes 832*480 \ + # --checkpoint_dir ${CHECKPOINT_DIR} \ + # --checkpoint_step 0000 \ + # --t5_checkpoint_dir ${T5_DIR} \ + # --vae_checkpoint_dir ${VAE_DIR} \ + # --prompts "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \ + # --frame_nums 81 \ + # --tensor_parallel_size 1 \ + # --context_parallel_size 2 \ + # --pipeline_parallel_size 1 \ + # --sequence_parallel False \ + # --base_seed 42 \ + # --sample_steps 50 + + + # NVTE_FUSED_ATTN=1 torchrun --nproc_per_node=2 examples/recipes/wan/inference_wan.py \ + # --task t2v-1.3B \ + # --sizes 832*480 \ + # --checkpoint_dir ${CHECKPOINT_DIR} \ + # --checkpoint_step 0000 \ + # --t5_checkpoint_dir ${T5_DIR} \ + # --vae_checkpoint_dir ${VAE_DIR} \ + # --prompts "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." 
\ + # --frame_nums 81 \ + # --tensor_parallel_size 1 \ + # --context_parallel_size 1 \ + # --pipeline_parallel_size 2 \ + # --sequence_parallel False \ + # --base_seed 42 \ + # --sample_steps 50 \ No newline at end of file diff --git a/examples/conversion/compare_hf_and_megatron/compare.py b/examples/conversion/compare_hf_and_megatron/compare.py index 9f46b27f17..8449fcda2a 100644 --- a/examples/conversion/compare_hf_and_megatron/compare.py +++ b/examples/conversion/compare_hf_and_megatron/compare.py @@ -20,46 +20,46 @@ Run Script Examples: # Regular LLM comparison between HF and Megatron models: - python examples/models/compare_hf_and_megatron/compare.py \ + python examples/conversion/compare_hf_and_megatron/compare.py \ --hf_model_path "Qwen/Qwen3-1.7B" \ --prompt "Hello, how are you?" # Vision-language comparison with image from URL: - python examples/models/compare_hf_and_megatron/compare.py \ + python examples/conversion/compare_hf_and_megatron/compare.py \ --hf_model_path "Qwen/Qwen2.5-VL-3B-Instruct" \ --model_class "Qwen2_5_VLForConditionalGeneration" \ --image_path "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" \ --prompt "Describe this image." # Vision-language comparison with local image: - python examples/models/compare_hf_and_megatron/compare.py \ + python examples/conversion/compare_hf_and_megatron/compare.py \ --hf_model_path "Qwen/Qwen2.5-VL-3B-Instruct" \ --model_class "Qwen2_5_VLForConditionalGeneration" \ --image_path "/path/to/local/image.jpg" \ --prompt "What do you see in this image?" 
# Multi-GPU comparison with tensor parallelism (regular LLM): - torchrun --nproc_per_node=2 examples/models/compare_hf_and_megatron/compare.py \ + torchrun --nproc_per_node=2 examples/conversion/compare_hf_and_megatron/compare.py \ --hf_model_path "Qwen/Qwen3-1.7B" \ --prompt "Hello world" \ --tp 2 # Pipeline parallel comparison (VL model): - torchrun --nproc_per_node=2 examples/models/compare_hf_and_megatron/compare.py \ + torchrun --nproc_per_node=2 examples/conversion/compare_hf_and_megatron/compare.py \ --hf_model_path "Qwen/Qwen2.5-VL-3B-Instruct" \ --model_class "Qwen2_5_VLForConditionalGeneration" \ --prompt "Hello world" \ --pp 2 # Compare with pre-converted Megatron checkpoint: - python examples/models/compare_hf_and_megatron/compare.py \ + python examples/conversion/compare_hf_and_megatron/compare.py \ --hf_model_path "Qwen/Qwen3-1.7B" \ --megatron_model_path "/path/to/megatron/checkpoint" \ --prompt "Hello world" # Enable debug hooks to inspect forward pass intermediate results: - python examples/models/compare_hf_and_megatron/compare.py \ + python examples/conversion/compare_hf_and_megatron/compare.py \ --hf_model_path "Qwen/Qwen3-1.7B" \ --prompt "Hello world" \ --enable_debug_hooks @@ -491,7 +491,16 @@ def _load_megatron_model(args): model_provider.expert_tensor_parallel_size = etp model_provider.pipeline_dtype = torch.bfloat16 model_provider.initialize_model_parallel(seed=0) - megatron_model = bridge.load_megatron_model(args.megatron_model_path, wrap_with_ddp=False) + megatron_model = bridge.load_megatron_model( + args.megatron_model_path, + mp_overrides={ + "tensor_model_parallel_size": tp, + "pipeline_model_parallel_size": pp, + "expert_model_parallel_size": ep, + "expert_tensor_parallel_size": etp, + }, + wrap_with_ddp=False, + ) else: # Convert from HF to Megatron bridge = AutoBridge.from_hf_pretrained(args.hf_model_path) diff --git a/examples/conversion/convert_checkpoints.py b/examples/conversion/convert_checkpoints.py index 
5bd341e248..4e6ad4b7d7 100644 --- a/examples/conversion/convert_checkpoints.py +++ b/examples/conversion/convert_checkpoints.py @@ -258,6 +258,10 @@ def main(): else: raise RuntimeError(f"Unknown command: {args.command}") + if torch.distributed.is_initialized(): + torch.distributed.barrier() + torch.distributed.destroy_process_group() + if __name__ == "__main__": sys.exit(main()) diff --git a/examples/conversion/convert_vace_checkpoints.py b/examples/conversion/convert_vace_checkpoints.py new file mode 100644 index 0000000000..dd0eb6e378 --- /dev/null +++ b/examples/conversion/convert_vace_checkpoints.py @@ -0,0 +1,49 @@ +import os, random, multiprocessing as mp + +def main(): + from megatron.bridge.models.hf_pretrained.wan import PreTrainedVACE + from megatron.bridge.models.wan.wan_bridge import VACEBridge + from megatron.bridge.training.model_load_save import save_megatron_model + + # --- minimal torch.distributed single-rank env --- + os.environ.setdefault("MASTER_ADDR", "127.0.0.1") + os.environ.setdefault("MASTER_PORT", str(29500 + random.randint(0, 1000))) + os.environ.setdefault("RANK", "0") + os.environ.setdefault("WORLD_SIZE", "1") + os.environ.setdefault("LOCAL_RANK", "0") + + # --- build & load --- + hf = PreTrainedVACE("Wan-AI/Wan2.1-VACE-1.3B-Diffusers") + # hf = PreTrainedVACE("Wan-AI/Wan2.1-VACE-14B-Diffusers") + + bridge = VACEBridge() + provider = bridge.provider_bridge(hf) + provider.perform_initialization = False + + # If you're on GPU but want CPU init to reduce peak mem: + megatron_models = provider.provide_distributed_model( + wrap_with_ddp=False, use_cpu_initialization=True + ) + + bridge.load_weights_hf_to_megatron(hf, megatron_models) + + # Save Megatron-format checkpoint (this triggers async writer internally) + save_megatron_model( + megatron_models, + "/opt/megatron_checkpoint_VACE", + hf_tokenizer_path=None + ) + +if __name__ == "__main__": + # On Linux, prefer 'fork' to avoid re-importing the module on spawn. 
+ try: + mp.set_start_method("fork") + except RuntimeError: + # already set (fine on re-entry or non-Linux) + pass + + # If you’re on macOS/Windows and still want to be extra safe: + # mp.freeze_support() + + main() + diff --git a/examples/conversion/convert_wan_checkpoints.py b/examples/conversion/convert_wan_checkpoints.py new file mode 100644 index 0000000000..c4cf0bfcf3 --- /dev/null +++ b/examples/conversion/convert_wan_checkpoints.py @@ -0,0 +1,74 @@ +# from megatron.bridge.models.hf_pretrained.wan import PreTrainedWAN +# from megatron.bridge.models.wan.wan_bridge import WanBridge +# from megatron.bridge.training.model_load_save import save_megatron_model +# import os, random +# os.environ["MASTER_ADDR"] = "127.0.0.1" +# os.environ["MASTER_PORT"] = str(29500 + random.randint(0, 1000)) +# os.environ["RANK"] = "0" +# os.environ["WORLD_SIZE"] = "1" +# os.environ["LOCAL_RANK"] = "0" +# # +# hf = PreTrainedWAN("Wan-AI/Wan2.1-T2V-1.3B-Diffusers") +# # hf = PreTrainedWAN("Wan-AI/Wan2.1-T2V-14B-Diffusers") +# bridge = WanBridge() +# # +# provider = bridge.provider_bridge(hf) +# provider.perform_initialization = False +# megatron_models = provider.provide_distributed_model(wrap_with_ddp=False, use_cpu_initialization=True) +# # +# bridge.load_weights_hf_to_megatron(hf, megatron_models) +# save_megatron_model(megatron_models, "/opt/megatron_checkpoint", hf_tokenizer_path=None) + + +# convert_wan_checkpoints.py + +import os, random, multiprocessing as mp + +def main(): + from megatron.bridge.models.hf_pretrained.wan import PreTrainedWAN + from megatron.bridge.models.wan.wan_bridge import WanBridge + from megatron.bridge.training.model_load_save import save_megatron_model + + # --- minimal torch.distributed single-rank env --- + os.environ.setdefault("MASTER_ADDR", "127.0.0.1") + os.environ.setdefault("MASTER_PORT", str(29500 + random.randint(0, 1000))) + os.environ.setdefault("RANK", "0") + os.environ.setdefault("WORLD_SIZE", "1") + os.environ.setdefault("LOCAL_RANK", 
"0") + + # --- build & load --- + hf = PreTrainedWAN("Wan-AI/Wan2.1-T2V-1.3B-Diffusers") + # hf = PreTrainedWAN("Wan-AI/Wan2.1-T2V-14B-Diffusers") + + bridge = WanBridge() + provider = bridge.provider_bridge(hf) + provider.perform_initialization = False + + # If you're on GPU but want CPU init to reduce peak mem: + megatron_models = provider.provide_distributed_model( + wrap_with_ddp=False, use_cpu_initialization=True + ) + print(megatron_models[0]) + bridge.load_weights_hf_to_megatron(hf, megatron_models) + + + # Save Megatron-format checkpoint (this triggers async writer internally) + save_megatron_model( + megatron_models, + "/opt/megatron_checkpoint_WAN", + hf_tokenizer_path=None + ) + +if __name__ == "__main__": + # On Linux, prefer 'fork' to avoid re-importing the module on spawn. + try: + mp.set_start_method("fork") + except RuntimeError: + # already set (fine on re-entry or non-Linux) + pass + + # If you’re on macOS/Windows and still want to be extra safe: + # mp.freeze_support() + + main() + diff --git a/examples/conversion/hf_megatron_roundtrip_multi_gpu.py b/examples/conversion/hf_megatron_roundtrip_multi_gpu.py index 9de063739c..cf184c5912 100644 --- a/examples/conversion/hf_megatron_roundtrip_multi_gpu.py +++ b/examples/conversion/hf_megatron_roundtrip_multi_gpu.py @@ -33,8 +33,8 @@ in Megatron's native checkpoint format by specifying the `--megatron-save-path` argument. 
Usage: -torchrun --nproc_per_node 1 examples/models/hf_megatron_roundtrip_multi_gpu.py -torchrun --nproc_per_node 1 examples/models/hf_megatron_roundtrip_multi_gpu.py --megatron-save-path ./megatron_checkpoint +torchrun --nproc_per_node 1 examples/conversion/hf_megatron_roundtrip_multi_gpu.py +torchrun --nproc_per_node 1 examples/conversion/hf_megatron_roundtrip_multi_gpu.py --megatron-save-path ./megatron_checkpoint """ import argparse @@ -89,7 +89,16 @@ def main( # Once all overrides are set, finalize the model provider to ensure the post initialization logic is run model_provider.finalize() model_provider.initialize_model_parallel(seed=0) - megatron_model = bridge.load_megatron_model(megatron_load_path, wrap_with_ddp=False) + megatron_model = bridge.load_megatron_model( + megatron_load_path, + mp_overrides={ + "tensor_model_parallel_size": tp, + "pipeline_model_parallel_size": pp, + "expert_model_parallel_size": ep, + "expert_tensor_parallel_size": etp, + }, + wrap_with_ddp=False, + ) megatron_model = [m.cuda() for m in megatron_model] else: diff --git a/examples/conversion/hf_to_megatron_generate_text.py b/examples/conversion/hf_to_megatron_generate_text.py index 743420c3c4..144313b4ec 100644 --- a/examples/conversion/hf_to_megatron_generate_text.py +++ b/examples/conversion/hf_to_megatron_generate_text.py @@ -15,10 +15,10 @@ """ Example: # Load from HuggingFace model: - python examples/models/hf_to_megatron_generate_text.py --hf_model_path="meta-llama/Llama-3.2-1B" --prompt="Hello, how are you?" + python examples/conversion/hf_to_megatron_generate_text.py --hf_model_path="meta-llama/Llama-3.2-1B" --prompt="Hello, how are you?" # Load from Megatron checkpoint: - python examples/models/hf_to_megatron_generate_text.py --hf_model_path="meta-llama/Llama-3.2-1B" --megatron_model_path="/path/to/megatron/checkpoint" --prompt="Hello, how are you?" 
+ python examples/conversion/hf_to_megatron_generate_text.py --hf_model_path="meta-llama/Llama-3.2-1B" --megatron_model_path="/path/to/megatron/checkpoint" --prompt="Hello, how are you?" """ import argparse @@ -127,7 +127,16 @@ def main(args) -> None: model_provider.initialize_model_parallel(seed=0) # Load the Megatron model directly - model = bridge.load_megatron_model(args.megatron_model_path, wrap_with_ddp=False) + model = bridge.load_megatron_model( + args.megatron_model_path, + mp_overrides={ + "tensor_model_parallel_size": tp, + "pipeline_model_parallel_size": pp, + "expert_model_parallel_size": ep, + "expert_tensor_parallel_size": etp, + }, + wrap_with_ddp=False, + ) else: # Load from HuggingFace and convert to Megatron diff --git a/examples/conversion/hf_to_megatron_generate_vlm.py b/examples/conversion/hf_to_megatron_generate_vlm.py index 9055e42431..c2bdb1be36 100644 --- a/examples/conversion/hf_to_megatron_generate_vlm.py +++ b/examples/conversion/hf_to_megatron_generate_vlm.py @@ -209,7 +209,16 @@ def main(args) -> None: model_provider.initialize_model_parallel(seed=0) # Load the Megatron model directly - model = bridge.load_megatron_model(args.megatron_model_path, wrap_with_ddp=False) + model = bridge.load_megatron_model( + args.megatron_model_path, + mp_overrides={ + "tensor_model_parallel_size": tp, + "pipeline_model_parallel_size": pp, + "expert_model_parallel_size": ep, + "expert_tensor_parallel_size": etp, + }, + wrap_with_ddp=False, + ) else: # Load from HuggingFace and convert to Megatron diff --git a/examples/recipes/llama/conf/llama3_8b_pretrain_override_example.yaml b/examples/recipes/llama/conf/llama3_8b_pretrain_override_example.yaml index 96d5a29615..9124fe65a0 100644 --- a/examples/recipes/llama/conf/llama3_8b_pretrain_override_example.yaml +++ b/examples/recipes/llama/conf/llama3_8b_pretrain_override_example.yaml @@ -18,6 +18,7 @@ # and its sub-configurations (e.g., model, train, etc.) 
# Top-level ConfigContainer fields are dataclasses themselves
+backend: mbridge
 model:
   seq_length: 4096
diff --git a/examples/recipes/llama/pretrain_DiT_Model.py b/examples/recipes/llama/pretrain_DiT_Model.py
new file mode 100644
index 0000000000..15c14907fd
--- /dev/null
+++ b/examples/recipes/llama/pretrain_DiT_Model.py
@@ -0,0 +1,179 @@
+
+#!/usr/bin/env python3
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+DiT Model Pretraining Script with YAML and CLI Configuration Overrides.
+
+This script provides a flexible way to pretrain DiT models using Megatron-Bridge with support for
+both YAML configuration files and command-line overrides using Hydra-style syntax.
+
+Examples:
+    Basic usage with default configuration:
+        $ torchrun --nproc_per_node=8 pretrain_DiT_Model.py
+
+    Using a custom YAML config file:
+        $ torchrun --nproc_per_node=8 pretrain_DiT_Model.py --config-file my_custom_config.yaml
+
+    Using CLI overrides only:
+        $ torchrun --nproc_per_node=8 pretrain_DiT_Model.py model.tensor_model_parallel_size=4 train.train_iters=100000
+
+    Combining YAML and CLI overrides (CLI takes precedence):
+        $ torchrun --nproc_per_node=8 pretrain_DiT_Model.py --config-file conf/my_config.yaml \
+            model.pipeline_dtype=torch.float16 \
+            train.global_batch_size=512
+
+Configuration Precedence:
+    1. Base configuration from pretrain_config() recipe
+    2. YAML overrides from --config-file (if provided)
+    3. 
CLI overrides (highest precedence)
+
+Supported Override Syntax:
+    - Standard assignment: key=value
+    - Nested assignment: section.subsection.key=value
+    - Addition: +new_key=value
+    - Deletion: ~key_to_remove
+    - Type conversion: Automatic for basic types (int, float, bool, str)
+    - Complex types: torch.dtype, enums, etc. are supported
+"""
+
+import argparse
+import logging
+import os
+import sys
+from pathlib import Path
+from typing import Tuple
+
+from omegaconf import OmegaConf
+
+from megatron.bridge.recipes.DiTModel.dit import pretrain_config
+from megatron.bridge.training.config import ConfigContainer
+from megatron.bridge.models.DiTModel.dit_step import DITForwardStep
+from megatron.bridge.training.pretrain import pretrain
+from megatron.bridge.training.utils.omegaconf_utils import (
+    apply_overrides,
+    create_omegaconf_dict_config,
+    parse_hydra_overrides,
+)
+from megatron.bridge.utils.common_utils import get_rank_safe
+
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+# Define paths relative to this script's location
+# Assumes this script (pretrain_DiT_Model.py) is in Megatron-Bridge/examples/recipes/llama/
+# and the config is in a 'conf' subdirectory.
+SCRIPT_DIR: Path = Path(__file__).parent.resolve()
+DEFAULT_CONFIG_FILENAME: str = "llama3_8b_pretrain_override_example.yaml"
+DEFAULT_CONFIG_FILE_PATH: Path = SCRIPT_DIR / "conf" / DEFAULT_CONFIG_FILENAME
+
+
+def parse_cli_args() -> Tuple[argparse.Namespace, list[str]]:
+    """Parse command line arguments, separating known script args from OmegaConf overrides."""
+    parser = argparse.ArgumentParser(
+        description="Pretrain DiT model using Megatron-Bridge with YAML and CLI overrides",
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+    parser.add_argument(
+        "--config-file",
+        type=str,
+        default=str(DEFAULT_CONFIG_FILE_PATH),
+        help="Path to the YAML OmegaConf override file. 
Default: conf/llama3_8b_pretrain_override_example.yaml", + ) + parser.add_argument("--debug", action="store_true", help="Enable debug logging") + + # Parse known args for the script, remaining will be treated as overrides + args, cli_dotlist_overrides = parser.parse_known_args() + return args, cli_dotlist_overrides + + +def main() -> None: + """ + Entry point for the Llama3 8B pretraining script. + + This function orchestrates the complete configuration workflow: + 1. Loads the base configuration from pretrain_config() recipe + 2. Applies YAML overrides from --config-file (if exists) + 3. Applies CLI overrides using Hydra-style syntax + 4. Starts Megatron pretraining with the final merged configuration + + Configuration merging preserves callable fields (like activation functions) + and handles type conversions automatically. + + Examples of CLI usage: + # Use default config with custom learning rate + torchrun --nproc_per_node=8 pretrain_llama3_8b.py optimizer.lr=0.0002 + + # Custom config file with additional overrides + torchrun --nproc_per_node=8 pretrain_llama3_8b.py --config-file my_config.yaml train.train_iters=50000 + + # Multiple overrides for distributed training + torchrun --nproc_per_node=8 pretrain_llama3_8b.py \ + model.tensor_model_parallel_size=4 \ + model.pipeline_model_parallel_size=2 \ + train.global_batch_size=512 + """ + args, cli_overrides = parse_cli_args() + + logger.info("Megatron-Bridge Llama3 8B Pretraining Script with YAML & CLI Overrides") + logger.info("------------------------------------------------------------------") + + # Load base configuration from the recipe as a Python dataclass + cfg: ConfigContainer = pretrain_config() + logger.info("Loaded base configuration") + + # Print configuration on rank 0 + if get_rank_safe() == 0: + cfg.print_yaml() + + # # Convert the initial Python dataclass to an OmegaConf DictConfig for merging + # merged_omega_conf, excluded_fields = create_omegaconf_dict_config(cfg) + + # # Load and merge YAML 
overrides if a config file is provided + # if args.config_file: + # logger.debug(f"Loading YAML overrides from: {args.config_file}") + # if not os.path.exists(args.config_file): + # logger.error(f"Override YAML file not found: {args.config_file}") + # sys.exit(1) + # yaml_overrides_omega = OmegaConf.load(args.config_file) + # merged_omega_conf = OmegaConf.merge(merged_omega_conf, yaml_overrides_omega) + # logger.debug("YAML overrides merged successfully.") + + # # Apply command-line overrides using Hydra-style parsing + # if cli_overrides: + # logger.debug(f"Applying Hydra-style command-line overrides: {cli_overrides}") + # merged_omega_conf = parse_hydra_overrides(merged_omega_conf, cli_overrides) + # logger.debug("Hydra-style command-line overrides applied successfully.") + + # # Apply the final merged OmegaConf configuration back to the original ConfigContainer + # logger.debug("Applying final merged configuration back to Python ConfigContainer...") + # final_overrides_as_dict = OmegaConf.to_container(merged_omega_conf, resolve=True) + # # Apply overrides while preserving excluded fields + # apply_overrides(cfg, final_overrides_as_dict, excluded_fields) + + # Display final configuration + if get_rank_safe() == 0: + logger.info("--- Final Merged Configuration ---") + cfg.print_yaml() + logger.info("----------------------------------") + + # Start training + logger.debug("Starting pretraining...") + pretrain(config=cfg, forward_step_func=DITForwardStep()) + + +if __name__ == "__main__": + main() diff --git a/examples/recipes/llama/pretrain_llama3_8b.py b/examples/recipes/llama/pretrain_llama3_8b.py index b7523bef8b..9757d747be 100644 --- a/examples/recipes/llama/pretrain_llama3_8b.py +++ b/examples/recipes/llama/pretrain_llama3_8b.py @@ -55,9 +55,10 @@ from pathlib import Path from typing import Tuple +import torch from omegaconf import OmegaConf -from megatron.bridge.recipes.llama.llama3_8b import pretrain_config +from megatron.bridge.recipes.llama import 
llama3_8b_pretrain_config as pretrain_config from megatron.bridge.training.config import ConfigContainer from megatron.bridge.training.gpt_step import forward_step from megatron.bridge.training.pretrain import pretrain @@ -173,6 +174,11 @@ def main() -> None: logger.debug("Starting pretraining...") pretrain(config=cfg, forward_step_func=forward_step) + # Cleanup process group + if torch.distributed.is_initialized(): + torch.distributed.barrier() + torch.distributed.destroy_process_group() + if __name__ == "__main__": main() diff --git a/examples/recipes/llama/pretrain_llama3_8b_nemo_run_partial.py b/examples/recipes/llama/pretrain_llama3_8b_nemo_run_partial.py index 5b081970f4..915f0f56db 100644 --- a/examples/recipes/llama/pretrain_llama3_8b_nemo_run_partial.py +++ b/examples/recipes/llama/pretrain_llama3_8b_nemo_run_partial.py @@ -18,7 +18,7 @@ import nemo_run as run -from megatron.bridge.recipes.llama.llama3_8b import pretrain_config +from megatron.bridge.recipes.llama import llama3_8b_pretrain_config as pretrain_config from megatron.bridge.recipes.utils.nemo_run_utils import get_partial_fn from megatron.bridge.training.config import ConfigContainer, ProfilingConfig from megatron.bridge.training.gpt_step import forward_step diff --git a/examples/recipes/wan/inference_vace.py b/examples/recipes/wan/inference_vace.py new file mode 100644 index 0000000000..b0ce120fec --- /dev/null +++ b/examples/recipes/wan/inference_vace.py @@ -0,0 +1,378 @@ +import argparse +import logging +import os +import sys +import warnings +from datetime import datetime + +warnings.filterwarnings('ignore') + +import random + +import torch +import torch.distributed as dist +from PIL import Image + +from megatron.bridge.models.wan.flow_matching.flow_inference_pipeline import VACEFlowInferencePipeline +from megatron.bridge.models.wan.inference.configs import SIZE_CONFIGS, SUPPORTED_SIZES, MAX_AREA_CONFIGS, WAN_CONFIGS +from megatron.bridge.models.wan.inference.utils.utils import cache_video, 
cache_image, str2bool + + +EXAMPLE_PROMPT = { + "vace-1.3B": { + "src_ref_images": 'assets/images/girl.png,assets/images/snake.png', + "prompt": "在一个欢乐而充满节日气氛的场景中,穿着鲜艳红色春服的小女孩正与她的可爱卡通蛇嬉戏。她的春服上绣着金色吉祥图案,散发着喜庆的气息,脸上洋溢着灿烂的笑容。蛇身呈现出亮眼的绿色,形状圆润,宽大的眼睛让它显得既友善又幽默。小女孩欢快地用手轻轻抚摸着蛇的头部,共同享受着这温馨的时刻。周围五彩斑斓的灯笼和彩带装饰着环境,阳光透过洒在她们身上,营造出一个充满友爱与幸福的新年氛围。" + }, + "vace-14B": { + "src_ref_images": 'assets/images/girl.png,assets/images/snake.png', + "prompt": "在一个欢乐而充满节日气氛的场景中,穿着鲜艳红色春服的小女孩正与她的可爱卡通蛇嬉戏。她的春服上绣着金色吉祥图案,散发着喜庆的气息,脸上洋溢着灿烂的笑容。蛇身呈现出亮眼的绿色,形状圆润,宽大的眼睛让它显得既友善又幽默。小女孩欢快地用手轻轻抚摸着蛇的头部,共同享受着这温馨的时刻。周围五彩斑斓的灯笼和彩带装饰着环境,阳光透过洒在她们身上,营造出一个充满友爱与幸福的新年氛围。" + } +} + + + + +def validate_args(args): + # Basic check + assert args.checkpoint_dir is not None, "Please specify the checkpoint directory." + assert args.model_name in WAN_CONFIGS, f"Unsupport model name: {args.model_name}" + assert args.model_name in EXAMPLE_PROMPT, f"Unsupport model name: {args.model_name}" + + # The default sampling steps are 40 for image-to-video tasks and 50 for text-to-video tasks. + if args.sample_steps is None: + args.sample_steps = 50 + + if args.sample_shift is None: + args.sample_shift = 16 + + # The default number of frames are 1 for text-to-image tasks and 81 for other tasks. 
+ if args.frame_nums is None: + args.frame_nums = 81 + + args.base_seed = args.base_seed if args.base_seed >= 0 else random.randint(0, sys.maxsize) + # Size check + if args.sizes is not None and len(args.sizes) > 0: + for s in args.sizes: + assert s in SUPPORTED_SIZES[args.model_name], f"Unsupport size {s} for model name {args.model_name}, supported sizes are: {', '.join(SUPPORTED_SIZES[args.model_name])}" + return args + + +def _parse_args(): + parser = argparse.ArgumentParser( + description="Generate a image or video from a text prompt or image using Wan" + ) + parser.add_argument( + "--model_name", + type=str, + default="vace-1.3B", + choices=list(WAN_CONFIGS.keys()), + help="The model name to run.") + parser.add_argument( + "--sizes", + type=str, + nargs="+", + default=None, + choices=list(SIZE_CONFIGS.keys()), + help="List of sizes to generate multiple images or videos (WIDTH*HEIGHT). Example: --sizes 1280*720 1920*1080" + ) + parser.add_argument( + "--frame_nums", + type=int, + nargs="+", + default=None, + help="List of frame counts (each should be 4n+1). Broadcasts if single value." + ) + parser.add_argument( + "--checkpoint_dir", + type=str, + default=None, + help="The path to the main VACE checkpoint directory.") + parser.add_argument( + "--checkpoint_step", + type=int, + default=None, + help=( + "Optional training step to load, e.g. 1800 -> iter_0001800. " + "If not provided, the latest (largest) step in --checkpoint_dir is used.") + ) + parser.add_argument( + "--t5_checkpoint_dir", + type=str, + default=None, + help="Optional directory containing T5 checkpoint/tokenizer") + parser.add_argument( + "--vae_checkpoint_dir", + type=str, + default=None, + help="Optional directory containing VAE checkpoint") + parser.add_argument( + "--offload_model", + type=str2bool, + default=None, + help="Whether to offload the model to CPU after each model forward, reducing GPU memory usage." 
+ ) + parser.add_argument( + "--t5_cpu", + action="store_true", + default=False, + help="Whether to place T5 model on CPU.") + parser.add_argument( + "--save_file", + type=str, + default=None, + help="The file to save the generated image or video to.") + parser.add_argument( + "--src_video", + type=str, + nargs="+", + default=None, + help="List of name of the source video. Default None.") + parser.add_argument( + "--src_mask", + type=str, + nargs="+", + default=None, + help="List of name of the source mask. Default None.") + parser.add_argument( + "--src_ref_images", + type=str, + nargs="+", + default=None, + help="List of list of the source reference images. Separated by ','. Default None.") + parser.add_argument( + "--prompts", + type=str, + nargs="+", + default=None, + help="List of prompt to generate the image or video from.") + parser.add_argument( + "--base_seed", + type=int, + default=-1, + help="The seed to use for generating the image or video.") + parser.add_argument( + "--sample_solver", + type=str, + default='unipc', + choices=['unipc', 'dpm++'], + help="The solver used to sample.") + parser.add_argument( + "--sample_steps", type=int, default=None, help="The sampling steps.") + parser.add_argument( + "--sample_shift", + type=float, + default=None, + help="Sampling shift factor for flow matching schedulers.") + parser.add_argument( + "--sample_guide_scale", + type=float, + default=5.0, + help="Classifier free guidance scale.") + parser.add_argument( + "--tensor_parallel_size", + type=int, + default=1, + help="Tensor parallel size.") + parser.add_argument( + "--context_parallel_size", + type=int, + default=1, + help="Context parallel size.") + parser.add_argument( + "--pipeline_parallel_size", + type=int, + default=1, + help="Pipeline parallel size.") + parser.add_argument( + "--sequence_parallel", + type=str2bool, + default=False, + help="Sequence parallel.") + + args = parser.parse_args() + + validate_args(args) + + return args + + +def 
_init_logging(rank): + # logging + if rank == 0: + # set format + logging.basicConfig( + level=logging.INFO, + format="[%(asctime)s] %(levelname)s: %(message)s", + handlers=[logging.StreamHandler(stream=sys.stdout)]) + else: + logging.basicConfig(level=logging.ERROR) + + +def generate(args): + rank = int(os.getenv("RANK", 0)) + world_size = int(os.getenv("WORLD_SIZE", 1)) + local_rank = int(os.getenv("LOCAL_RANK", 0)) + device = local_rank + _init_logging(rank) + + if args.offload_model is None: + args.offload_model = False if world_size > 1 else True + logging.info( + f"offload_model is not specified, set to {args.offload_model}.") + if world_size > 1: + torch.cuda.set_device(local_rank) + dist.init_process_group( + backend="nccl", + init_method="env://", + rank=rank, + world_size=world_size) + + cfg = WAN_CONFIGS[args.model_name] + + logging.info(f"Generation job args: {args}") + logging.info(f"Generation model config: {cfg}") + + if dist.is_initialized(): + base_seed = [args.base_seed] if rank == 0 else [None] + dist.broadcast_object_list(base_seed, src=0) + args.base_seed = base_seed[0] + + if args.prompts is None: + prompts = [None] + else: + prompts = args.prompts * 8 + + if args.src_video is None: + src_video = [None] * len(prompts) + else: + src_video = args.src_video * 8 + + if args.src_mask is None: + src_mask = [None] * len(prompts) + else: + src_mask = args.src_mask * 8 + + if args.src_ref_images is None: + src_ref_images = [None] * len(prompts) + else: + src_ref_images = args.src_ref_images * 8 + + # Resolve sizes list (default to first supported size for task) + if args.sizes is not None and len(args.sizes) > 0: + size_keys = args.sizes * 8 + else: + size_keys = [SUPPORTED_SIZES[args.model_name][0]] + + # Resolve frame counts list (default 81) + if args.frame_nums is not None and len(args.frame_nums) > 0: + frame_nums = args.frame_nums * 8 + else: + frame_nums = [81] + + # Enforce 1:1 pairing across lists + assert len(prompts) == len(size_keys) == 
len(frame_nums), ( + f"prompts ({len(prompts)}), sizes ({len(size_keys)}), and frame_nums ({len(frame_nums)}) " + f"must have the same length") + + logging.info("Creating VACE flow inference pipeline.") + pipeline = VACEFlowInferencePipeline( + config=cfg, + checkpoint_dir=args.checkpoint_dir, + checkpoint_step=args.checkpoint_step, + t5_checkpoint_dir=args.t5_checkpoint_dir, + vae_checkpoint_dir=args.vae_checkpoint_dir, + device_id=device, + rank=rank, + t5_cpu=args.t5_cpu, + tensor_parallel_size=args.tensor_parallel_size, + context_parallel_size=args.context_parallel_size, + pipeline_parallel_size=args.pipeline_parallel_size, + sequence_parallel=args.sequence_parallel, + pipeline_dtype=torch.float32, + ) + + # DEBUGGING + rank = dist.get_rank() + if rank == 0: + print("tensor_parallel_size:", args.tensor_parallel_size) + print("context_parallel_size:", args.context_parallel_size) + print("pipeline_parallel_size:", args.pipeline_parallel_size) + print("sequence_parallel:", args.sequence_parallel) + print("\n\n\n") + + for i in range(len(src_video)): + sub_src_video, sub_src_mask, sub_src_ref_images = pipeline.prepare_source([src_video[i]], + [src_mask[i]], + [src_ref_images[i]], + frame_nums[i], SIZE_CONFIGS[size_keys[i]], device) + src_video[i], src_mask[i], src_ref_images[i] = *sub_src_video, *sub_src_mask, *sub_src_ref_images + + + logging.info( + f"Generating videos ...") + videos = pipeline.generate( + prompts=prompts, + input_frames=src_video, + input_masks=src_mask, + input_ref_images=src_ref_images, + sizes=[SIZE_CONFIGS[size] for size in size_keys], + frame_nums=frame_nums, + shift=args.sample_shift, + sample_solver=args.sample_solver, + sampling_steps=args.sample_steps, + guide_scale=args.sample_guide_scale, + seed=args.base_seed, + offload_model=args.offload_model) + + if rank == 0: + for i, video in enumerate(videos): + formatted_experiment_name = (args.save_file) if args.save_file is not None else "DefaultExp" + formatted_time = 
datetime.now().strftime("%Y%m%d_%H%M%S") + formatted_prompt = prompts[i].replace(" ", "_").replace("/", + "_")[:50] + suffix = '.mp4' + formatted_save_file = f"{args.model_name}_{formatted_experiment_name}_videoindex{int(i)}_size{size_keys[i].replace('*','x') if sys.platform=='win32' else size_keys[i]}_{formatted_prompt}_{formatted_time}" + suffix + + # if "t2v" in args.task: + logging.info(f"Saving generated video to {formatted_save_file}") + cache_video( + tensor=video[None], + save_file=formatted_save_file, + fps=cfg.sample_fps, + nrow=1, + normalize=True, + value_range=(-1, 1)) + + cache_video( + tensor=src_video[i][None], + save_file=f'{args.model_name}_{formatted_experiment_name}_index{i}_src_video_{formatted_time}.mp4', + fps=cfg.sample_fps, + nrow=1, + normalize=True, + value_range=(-1, 1)) + logging.info(f"Saving src_video to {args.model_name}_{formatted_experiment_name}_index{i}_src_video_{formatted_time}.mp4") + + cache_video( + tensor=src_mask[i][None], + save_file=f'{args.model_name}_{formatted_experiment_name}_index{i}_src_mask_{formatted_time}.mp4', + fps=cfg.sample_fps, + nrow=1, + normalize=True, + value_range=(0, 1)) + logging.info(f"Saving src_mask to {args.model_name}_{formatted_experiment_name}_index{i}_src_mask_{formatted_time}.mp4") + + if src_ref_images[i] is not None: + for j, ref_img in enumerate(src_ref_images[i]): + cache_image( + tensor=ref_img[:, 0, ...], + save_file=f'{args.model_name}_{formatted_experiment_name}_index{i}_src_ref_image_{j}_{formatted_time}.png', + nrow=1, + normalize=True, + value_range=(-1, 1)) + logging.info(f"Saving src_ref_image_{j} to {args.model_name}_{formatted_experiment_name}_index{i}_src_ref_image_{j}_{formatted_time}.png") + logging.info("Finished.") + + +if __name__ == "__main__": + args = _parse_args() + generate(args) diff --git a/examples/recipes/wan/inference_wan.py b/examples/recipes/wan/inference_wan.py new file mode 100644 index 0000000000..61f38ecdea --- /dev/null +++ 
b/examples/recipes/wan/inference_wan.py @@ -0,0 +1,324 @@ +# Example of running script for Wan inference. +# NVTE_FUSED_ATTN=1 torchrun --nproc_per_node=1 examples/recipes/wan/inference_wan.py \ +# --task t2v-1.3B \ +# --sizes 480*832 \ +# --checkpoint_dir /path/to/wan_checkpoint_dir \ +# --t5_checkpoint_dir /path/to/t5_checkpoint_dir \ +# --vae_checkpoint_dir /path/to/vae_checkpoint_dir \ +# --frame_nums 81 \ +# --prompts "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \ +# --tensor_parallel_size 1 \ +# --context_parallel_size 1 \ +# --pipeline_parallel_size 1 \ +# --sequence_parallel False \ +# --base_seed 42 \ +# --sample_steps 50 + +import argparse +import logging +import os +import sys +import warnings +from datetime import datetime + +warnings.filterwarnings('ignore') + +import random + +import torch +import torch.distributed as dist +from PIL import Image + +from megatron.bridge.models.wan.flow_matching.flow_inference_pipeline import FlowInferencePipeline +from megatron.bridge.models.wan.inference.configs import SIZE_CONFIGS, SUPPORTED_SIZES, WAN_CONFIGS +from megatron.bridge.models.wan.inference.utils.utils import cache_video, str2bool + +EXAMPLE_PROMPT = { + "t2v-1.3B": { + "prompt": + "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage.", + }, + "t2v-14B": { + "prompt": + "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage.", + }, +} + + +def _validate_args(args): + # Basic check + assert args.checkpoint_dir is not None, "Please specify the checkpoint directory." + assert args.t5_checkpoint_dir is not None, "Please specify the T5 checkpoint directory." + assert args.vae_checkpoint_dir is not None, "Please specify the VAE checkpoint directory." 
+ assert args.task in WAN_CONFIGS, f"Unsupport task: {args.task}" + assert args.task in EXAMPLE_PROMPT, f"Unsupport task: {args.task}" + + # The default sampling steps are 40 for image-to-video tasks and 50 for text-to-video tasks. + if args.sample_steps is None: + args.sample_steps = 50 + + if args.sample_shift is None: + args.sample_shift = 5.0 + + # Frames default handled later; no single frame arg anymore + + args.base_seed = args.base_seed if args.base_seed >= 0 else random.randint( + 0, sys.maxsize) + # Size check: only validate provided --sizes; default handled later + if args.sizes is not None and len(args.sizes) > 0: + for s in args.sizes: + assert s in SUPPORTED_SIZES[args.task], ( + f"Unsupport size {s} for task {args.task}, supported sizes are: " + f"{', '.join(SUPPORTED_SIZES[args.task])}") + + +def _parse_args(): + parser = argparse.ArgumentParser( + description="Generate a image or video from a text prompt or image using Wan" + ) + parser.add_argument( + "--task", + type=str, + default="t2v-14B", + choices=list(WAN_CONFIGS.keys()), + help="The task to run.") + parser.add_argument( + "--sizes", + type=str, + nargs="+", + default=None, + choices=list(SIZE_CONFIGS.keys()), + help="A list of sizes to generate multiple images or videos (WIDTH*HEIGHT). Example: --sizes 1280*720 1920*1080" + ) + parser.add_argument( + "--frame_nums", + type=int, + nargs="+", + default=None, + help="List of frame counts (each should be 4n+1). Broadcasts if single value." + ) + parser.add_argument( + "--checkpoint_dir", + type=str, + default=None, + help="The path to the main WAN checkpoint directory.") + parser.add_argument( + "--checkpoint_step", + type=int, + default=None, + help=( + "Optional training step to load, e.g. 1800 -> iter_0001800. 
" + "If not provided, the latest (largest) step in --checkpoint_dir is used.") + ) + parser.add_argument( + "--t5_checkpoint_dir", + type=str, + default=None, + help="Optional directory containing T5 checkpoint/tokenizer") + parser.add_argument( + "--vae_checkpoint_dir", + type=str, + default=None, + help="Optional directory containing VAE checkpoint") + parser.add_argument( + "--offload_model", + type=str2bool, + default=None, + help="Whether to offload the model to CPU after each model forward, reducing GPU memory usage." + ) + parser.add_argument( + "--t5_cpu", + action="store_true", + default=False, + help="Whether to place T5 model on CPU.") + parser.add_argument( + "--save_file", + type=str, + default=None, + help="The file to save the generated image or video to.") + parser.add_argument( + "--prompts", + type=str, + nargs="+", + default=None, + help="A list of prompts to generate multiple images or videos. Example: --prompts 'a cat' 'a dog'" + ) + parser.add_argument( + "--base_seed", + type=int, + default=-1, + help="The seed to use for generating the image or video.") + parser.add_argument( + "--sample_solver", + type=str, + default='unipc', + choices=['unipc', 'dpm++'], + help="The solver used to sample.") + parser.add_argument( + "--sample_steps", type=int, default=None, help="The sampling steps.") + parser.add_argument( + "--sample_shift", + type=float, + default=None, + help="Sampling shift factor for flow matching schedulers.") + parser.add_argument( + "--sample_guide_scale", + type=float, + default=5.0, + help="Classifier free guidance scale.") + parser.add_argument( + "--tensor_parallel_size", + type=int, + default=1, + help="Tensor parallel size.") + parser.add_argument( + "--context_parallel_size", + type=int, + default=1, + help="Context parallel size.") + parser.add_argument( + "--pipeline_parallel_size", + type=int, + default=1, + help="Pipeline parallel size.") + parser.add_argument( + "--sequence_parallel", + type=str2bool, + default=False, + 
help="Sequence parallel.") + + args = parser.parse_args() + + _validate_args(args) + + return args + + +def _init_logging(rank): + # logging + if rank == 0: + # set format + logging.basicConfig( + level=logging.INFO, + format="[%(asctime)s] %(levelname)s: %(message)s", + handlers=[logging.StreamHandler(stream=sys.stdout)]) + else: + logging.basicConfig(level=logging.ERROR) + + +def generate(args): + rank = int(os.getenv("RANK", 0)) + world_size = int(os.getenv("WORLD_SIZE", 1)) + local_rank = int(os.getenv("LOCAL_RANK", 0)) + device = local_rank + _init_logging(rank) + + if args.offload_model is None: + args.offload_model = False if world_size > 1 else True + logging.info( + f"offload_model is not specified, set to {args.offload_model}.") + if world_size > 1: + torch.cuda.set_device(local_rank) + dist.init_process_group( + backend="nccl", + init_method="env://", + rank=rank, + world_size=world_size) + + cfg = WAN_CONFIGS[args.task] + + logging.info(f"Generation job args: {args}") + logging.info(f"Generation model config: {cfg}") + + if dist.is_initialized(): + base_seed = [args.base_seed] if rank == 0 else [None] + dist.broadcast_object_list(base_seed, src=0) + args.base_seed = base_seed[0] + + if "t2v" in args.task: + # Resolve prompts list (default to example prompt) + if args.prompts is not None and len(args.prompts) > 0: + prompts = args.prompts + else: + prompts = [EXAMPLE_PROMPT[args.task]["prompt"]] + + # Resolve sizes list (default to first supported size for task) + if args.sizes is not None and len(args.sizes) > 0: + size_keys = args.sizes + else: + size_keys = [SUPPORTED_SIZES[args.task][0]] + + # Resolve frame counts list (default 81) + if args.frame_nums is not None and len(args.frame_nums) > 0: + frame_nums = args.frame_nums + else: + frame_nums = [81] + + # Enforce 1:1 pairing across lists + assert len(prompts) == len(size_keys) == len(frame_nums), ( + f"prompts ({len(prompts)}), sizes ({len(size_keys)}), and frame_nums ({len(frame_nums)}) " + f"must 
have the same length") + + logging.info("Creating flow inference pipeline.") + pipeline = FlowInferencePipeline( + config=cfg, + checkpoint_dir=args.checkpoint_dir, + checkpoint_step=args.checkpoint_step, + t5_checkpoint_dir=args.t5_checkpoint_dir, + vae_checkpoint_dir=args.vae_checkpoint_dir, + device_id=device, + rank=rank, + t5_cpu=args.t5_cpu, + tensor_parallel_size=args.tensor_parallel_size, + context_parallel_size=args.context_parallel_size, + pipeline_parallel_size=args.pipeline_parallel_size, + sequence_parallel=args.sequence_parallel, + pipeline_dtype=torch.float32, + ) + + # DEBUGGING + rank = dist.get_rank() + if rank == 0: + print("tensor_parallel_size:", args.tensor_parallel_size) + print("context_parallel_size:", args.context_parallel_size) + print("pipeline_parallel_size:", args.pipeline_parallel_size) + print("sequence_parallel:", args.sequence_parallel) + print("\n\n\n") + + logging.info( + f"Generating videos ...") + videos = pipeline.generate( + prompts=prompts, + sizes=[SIZE_CONFIGS[size] for size in size_keys], + frame_nums=frame_nums, + shift=args.sample_shift, + sample_solver=args.sample_solver, + sampling_steps=args.sample_steps, + guide_scale=args.sample_guide_scale, + seed=args.base_seed, + offload_model=args.offload_model) + + if rank == 0: + for i, video in enumerate(videos): + formatted_experiment_name = (args.save_file) if args.save_file is not None else "DefaultExp" + formatted_time = datetime.now().strftime("%Y%m%d_%H%M%S") + formatted_prompt = prompts[i].replace(" ", "_").replace("/", + "_")[:50] + suffix = '.mp4' + formatted_save_file = f"{args.task}_{formatted_experiment_name}_videoindex{int(i)}_size{size_keys[i].replace('*','x') if sys.platform=='win32' else size_keys[i]}_{formatted_prompt}_{formatted_time}" + suffix + + if "t2v" in args.task: + logging.info(f"Saving generated video to {formatted_save_file}") + cache_video( + tensor=video[None], + save_file=formatted_save_file, + fps=cfg.sample_fps, + nrow=1, + normalize=True, + 
value_range=(-1, 1)) + logging.info("Finished.") + + +if __name__ == "__main__": + args = _parse_args() + generate(args) diff --git a/examples/recipes/wan/pretrain_vace.py b/examples/recipes/wan/pretrain_vace.py new file mode 100644 index 0000000000..ac4c27b3d3 --- /dev/null +++ b/examples/recipes/wan/pretrain_vace.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +VACE Finetuning Script with YAML and CLI Configuration Overrides. + +This script provides a flexible way to pretrain VACE models using Megatron-Bridge with support for +both YAML configuration files and command-line overrides using Hydra-style syntax. + +Examples: + Basic usage with default configuration: + $ torchrun --nproc_per_node=8 pretrain_vace.py + + Using a custom YAML config file: + $ torchrun --nproc_per_node=8 pretrain_vace.py --config-file my_custom_config.yaml + + Using CLI overrides only: + $ torchrun --nproc_per_node=8 pretrain_vace.py model.tensor_model_parallel_size=4 train.train_iters=100000 + + Combining YAML and CLI overrides (CLI takes precedence): + $ torchrun --nproc_per_node=8 pretrain_vace.py --config-file conf/my_config.yaml \ + model.pipeline_dtype=torch.float16 \ + train.global_batch_size=512 + +Configuration Precedence: + 1. Base configuration from vace_pretrain_config() recipe + 2. YAML overrides from --config-file (if provided) + 3. 
CLI overrides (highest precedence) + +Supported Override Syntax: + - Standard assignment: key=value + - Nested assignment: section.subsection.key=value + - Addition: +new_key=value + - Deletion: ~key_to_remove + - Type conversion: Automatic for basic types (int, float, bool, str) + - Complex types: torch.dtype, enums, etc. are supported +""" + +import argparse +import logging +import os +import sys +from pathlib import Path +from typing import Tuple + +from omegaconf import OmegaConf +import wandb + +from megatron.bridge.recipes.wan.vace import vace_pretrain_config +from megatron.bridge.training.config import ConfigContainer +from megatron.bridge.models.wan.wan_step import WanForwardStep, VACEForwardStep +from megatron.bridge.training.pretrain import pretrain +from megatron.bridge.training.utils.omegaconf_utils import ( + apply_overrides, + create_omegaconf_dict_config, + parse_hydra_overrides, +) +from megatron.bridge.utils.common_utils import get_rank_safe + + +logger: logging.Logger = logging.getLogger(__name__) + + +# Define paths relative to this script's location +# Assumes this script (pretrain_vace.py) is in Megatron-Bridge/examples/recipes/wan/ +# and the config is in a 'conf' subdirectory. 
+SCRIPT_DIR: Path = Path(__file__).parent.resolve() +DEFAULT_CONFIG_FILENAME: str = "vace_pretrain_override_example.yaml" +DEFAULT_CONFIG_FILE_PATH: Path = SCRIPT_DIR / "conf" / DEFAULT_CONFIG_FILENAME + +# DEBUGGING +import numpy as np +import torch +np.set_printoptions(precision=10, suppress=False) +torch.set_printoptions(precision=10, sci_mode=False) + +def parse_cli_args() -> Tuple[argparse.Namespace, list[str]]: + """Parse command line arguments, separating known script args from OmegaConf overrides.""" + parser = argparse.ArgumentParser( + description="pretrain VACE model using Megatron-Bridge with YAML and CLI overrides", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "--config-file", + type=str, + default=str(DEFAULT_CONFIG_FILE_PATH), + help="Path to the YAML OmegaConf override file. Default: conf/vace_pretrain_override_example.yaml", + ) + parser.add_argument("--debug", action="store_true", help="Enable debug logging") + + # Parse known args for the script, remaining will be treated as overrides + args, cli_dotlist_overrides = parser.parse_known_args() + return args, cli_dotlist_overrides + + +def main() -> None: + """ + Entry point for the VACE finetuning script. + + This function orchestrates the complete configuration workflow: + 1. Loads the base configuration from vace_pretrain_config() recipe + 2. Applies YAML overrides from --config-file (if exists) + 3. Applies CLI overrides using Hydra-style syntax + 4. Starts Megatron finetuning with the final merged configuration + + Configuration merging preserves callable fields (like activation functions) + and handles type conversions automatically. 
+ + Examples of CLI usage: + # Use default config with custom learning rate + torchrun --nproc_per_node=8 pretrain_vace.py optimizer.lr=0.0002 + + # Custom config file with additional overrides + torchrun --nproc_per_node=8 pretrain_vace.py --config-file my_config.yaml train.train_iters=50000 + + # Multiple overrides for distributed training + torchrun --nproc_per_node=8 pretrain_vace.py \ + model.tensor_model_parallel_size=4 \ + model.pipeline_model_parallel_size=2 \ + train.global_batch_size=512 + """ + args, cli_overrides = parse_cli_args() + + logger.info("Megatron-Bridge VACE Finetuning Script with YAML & CLI Overrides") + logger.info("------------------------------------------------------------------") + + # Load base configuration from the recipe as a Python dataclass + cfg: ConfigContainer = vace_pretrain_config() + logger.info("Loaded base configuration for VACE finetuning") + + # Print configuration on rank 0 + if get_rank_safe() == 0: + cfg.print_yaml() + + # Convert the initial Python dataclass to an OmegaConf DictConfig for merging + merged_omega_conf, excluded_fields = create_omegaconf_dict_config(cfg) + + # Load and merge YAML overrides if a config file is provided + # if args.config_file: + # logger.debug(f"Loading YAML overrides from: {args.config_file}") + # if not os.path.exists(args.config_file): + # logger.error(f"Override YAML file not found: {args.config_file}") + # sys.exit(1) + # yaml_overrides_omega = OmegaConf.load(args.config_file) + # merged_omega_conf = OmegaConf.merge(merged_omega_conf, yaml_overrides_omega) + # logger.debug("YAML overrides merged successfully.") + + # Apply command-line overrides using Hydra-style parsing + if cli_overrides: + logger.debug(f"Applying Hydra-style command-line overrides: {cli_overrides}") + merged_omega_conf = parse_hydra_overrides(merged_omega_conf, cli_overrides) + logger.debug("Hydra-style command-line overrides applied successfully.") + + # Apply the final merged OmegaConf configuration back to the 
original ConfigContainer + logger.debug("Applying final merged configuration back to Python ConfigContainer...") + final_overrides_as_dict = OmegaConf.to_container(merged_omega_conf, resolve=True) + # Apply overrides while preserving excluded fields + apply_overrides(cfg, final_overrides_as_dict, excluded_fields) + + # Display final configuration + if get_rank_safe() == 0: + logger.info("--- Final Merged Configuration ---") + cfg.print_yaml() + logger.info("----------------------------------") + + # Initialize W&B if configured (only on rank 0) + if get_rank_safe() == 0 and hasattr(cfg, 'logger') and hasattr(cfg.logger, 'wandb_project'): + if cfg.logger.wandb_project: + wandb_config = { + 'project': cfg.logger.wandb_project, + 'name': getattr(cfg.logger, 'wandb_exp_name', None), + 'dir': getattr(cfg.logger, 'wandb_save_dir', None), + 'config': OmegaConf.to_container(merged_omega_conf, resolve=True) + } + # Remove None values + wandb_config = {k: v for k, v in wandb_config.items() if v is not None} + + wandb.init(**wandb_config) + logger.info(f"W&B initialized: project={cfg.logger.wandb_project}, name={wandb_config.get('name', 'N/A')}") + + # Start finetuning + logger.debug("Starting VACE finetuning...") + pretrain(config=cfg, forward_step_func=VACEForwardStep()) + + # Finish W&B run + if get_rank_safe() == 0: + wandb.finish() + + +if __name__ == "__main__": + main() diff --git a/examples/recipes/wan/pretrain_wan.py b/examples/recipes/wan/pretrain_wan.py new file mode 100644 index 0000000000..72a693ee64 --- /dev/null +++ b/examples/recipes/wan/pretrain_wan.py @@ -0,0 +1,174 @@ + +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Wan Pretraining Script with YAML and CLI Configuration Overrides. + +This script provides a flexible way to pretrain Wan models using Megatron-Bridge with support for +both YAML configuration files and command-line overrides using Hydra-style syntax. + +Examples: + Basic usage with default configuration: + $ torchrun --nproc_per_node=8 pretrain_wan.py + + Using a custom YAML config file: + $ torchrun --nproc_per_node=8 pretrain_wan.py --config-file my_custom_config.yaml + + Using CLI overrides only: + $ torchrun --nproc_per_node=8 pretrain_wan.py model.tensor_model_parallel_size=4 train.train_iters=100000 + + Combining YAML and CLI overrides (CLI takes precedence): + $ torchrun --nproc_per_node=8 pretrain_wan.py --config-file conf/my_config.yaml \ + model.pipeline_dtype=torch.float16 \ + train.global_batch_size=512 + +Configuration Precedence: + 1. Base configuration from pretrain_config() recipe + 2. YAML overrides from --config-file (if provided) + 3. CLI overrides (highest precedence) + +Supported Override Syntax: + - Standard assignment: key=value + - Nested assignment: section.subsection.key=value + - Addition: +new_key=value + - Deletion: ~key_to_remove + - Type conversion: Automatic for basic types (int, float, bool, str) + - Complex types: torch.dtype, enums, etc. 
are supported +""" + +import argparse +import logging +import os +import sys +from pathlib import Path +from typing import Tuple + +from omegaconf import OmegaConf + +from megatron.bridge.recipes.wan.wan import pretrain_config +from megatron.bridge.training.config import ConfigContainer +from megatron.bridge.models.wan.wan_step import WanForwardStep +from megatron.bridge.training.pretrain import pretrain +from megatron.bridge.training.utils.omegaconf_utils import ( + apply_overrides, + create_omegaconf_dict_config, + parse_hydra_overrides, +) +from megatron.bridge.utils.common_utils import get_rank_safe + + +logger: logging.Logger = logging.getLogger(__name__) + + +# Define paths relative to this script's location +# Assumes this script (pretrain_wan.py) is in Megatron-Bridge/examples/recipes/wan/ +# and the config is in a 'conf' subdirectory. +SCRIPT_DIR: Path = Path(__file__).parent.resolve() +DEFAULT_CONFIG_FILENAME: str = "wan_pretrain_override_example.yaml" +DEFAULT_CONFIG_FILE_PATH: Path = SCRIPT_DIR / "conf" / DEFAULT_CONFIG_FILENAME + +# DEBUGGING +import numpy as np +import torch +np.set_printoptions(precision=10, suppress=False) +torch.set_printoptions(precision=10, sci_mode=False) + +def parse_cli_args() -> Tuple[argparse.Namespace, list[str]]: + """Parse command line arguments, separating known script args from OmegaConf overrides.""" + parser = argparse.ArgumentParser( + description="Pretrain Wan model using Megatron-Bridge with YAML and CLI overrides", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "--config-file", + type=str, + default=str(DEFAULT_CONFIG_FILE_PATH), + help="Path to the YAML OmegaConf override file. 
Default: conf/wan_pretrain_override_example.yaml", + ) + parser.add_argument("--debug", action="store_true", help="Enable debug logging") + + # Parse known args for the script, remaining will be treated as overrides + args, cli_dotlist_overrides = parser.parse_known_args() + return args, cli_dotlist_overrides + + +def main() -> None: + """ + Entry point for the Wan pretraining script. + + This function orchestrates the complete configuration workflow: + 1. Loads the base configuration from pretrain_config() recipe + 2. Applies YAML overrides from --config-file (if exists) + 3. Applies CLI overrides using Hydra-style syntax + 4. Starts Megatron pretraining with the final merged configuration + + Configuration merging preserves callable fields (like activation functions) + and handles type conversions automatically. + + Examples of CLI usage: + # Use default config with custom learning rate + torchrun --nproc_per_node=8 pretrain_wan.py optimizer.lr=0.0002 + + # Custom config file with additional overrides + torchrun --nproc_per_node=8 pretrain_wan.py --config-file my_config.yaml train.train_iters=50000 + + # Multiple overrides for distributed training + torchrun --nproc_per_node=8 pretrain_wan.py \ + model.tensor_model_parallel_size=4 \ + model.pipeline_model_parallel_size=2 \ + train.global_batch_size=512 + """ + args, cli_overrides = parse_cli_args() + + logger.info("Megatron-Bridge Wan Pretraining Script with YAML & CLI Overrides") + logger.info("------------------------------------------------------------------") + + # Load base configuration from the recipe as a Python dataclass + cfg: ConfigContainer = pretrain_config() + logger.info("Loaded base configuration") + + # Print configuration on rank 0 + if get_rank_safe() == 0: + cfg.print_yaml() + + # Convert the initial Python dataclass to an OmegaConf DictConfig for merging + merged_omega_conf, excluded_fields = create_omegaconf_dict_config(cfg) + + # Apply command-line overrides using Hydra-style parsing + if 
cli_overrides: + logger.debug(f"Applying Hydra-style command-line overrides: {cli_overrides}") + merged_omega_conf = parse_hydra_overrides(merged_omega_conf, cli_overrides) + logger.debug("Hydra-style command-line overrides applied successfully.") + + # Apply the final merged OmegaConf configuration back to the original ConfigContainer + logger.debug("Applying final merged configuration back to Python ConfigContainer...") + final_overrides_as_dict = OmegaConf.to_container(merged_omega_conf, resolve=True) + # Apply overrides while preserving excluded fields + apply_overrides(cfg, final_overrides_as_dict, excluded_fields) + + # Display final configuration + if get_rank_safe() == 0: + logger.info("--- Final Merged Configuration ---") + cfg.print_yaml() + logger.info("----------------------------------") + + # Start training + logger.debug("Starting pretraining...") + pretrain(config=cfg, forward_step_func=WanForwardStep()) + + +if __name__ == "__main__": + main() diff --git a/examples/recipes/wan/run_vace_pretrain.sh b/examples/recipes/wan/run_vace_pretrain.sh new file mode 100644 index 0000000000..b5f46d786a --- /dev/null +++ b/examples/recipes/wan/run_vace_pretrain.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# VACE Finetuning Script +# This script demonstrates how to finetune the VACE video editing model + +export MBRIDGE_PATH=/workspace/vace/Megatron-Bridge +export PYTHONPATH="${MBRIDGE_PATH}/.:${MBRIDGE_PATH}/src/.:/opt/NeMo-Framework-Launcher/launcher_scripts" + +### Prepare energon dataset +python src/megatron/bridge/data/wan/prepare_energon_dataset_vace.py \ + --video_dir /workspace/all_mixkit_segmented \ + --output_dir /workspace/all_mixkit_energon_vace_V2V \ + --checkpoint_dir /opt/megatron_checkpoint_VACE \ + --t5_checkpoint_dir /workspace/checkpoints/T5 \ + --vae_checkpoint_dir /workspace/checkpoints/ \ + --vace_mode V2V \ + --device cuda \ + --height 224 --width 224 --resize_mode bilinear --center-crop \ + --shard_maxcount 100 2>&1 | tee /tmp/prepare_log.txt + 
+energon prepare /workspace/all_mixkit_energon_vace_V2V + +# ============================ +# Configuration Parameters +# ============================ + +DATASET_PATH="/workspace/all_mixkit_energon_vace_V2V" +PRETRAINED_CHECKPOINT="/opt/megatron_checkpoint_VACE" +CHECKPOINT_DIR="/workspace/checkpoints_vace_ft_V2V" +EXP_NAME=wan_vace_ft_V2V + +# ============================ +# Launch Training +# ============================ + +echo "Starting VACE finetuning..." +echo "" + +NVTE_FUSED_ATTN=1 torchrun --nproc_per_node=2 examples/recipes/wan/pretrain_vace.py \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + model.context_parallel_size=1 \ + model.sequence_parallel=false \ + model.qkv_format=thd \ + dataset.path=${DATASET_PATH} \ + dataset.num_workers=0 \ + checkpoint.save=${CHECKPOINT_DIR} \ + checkpoint.load=${PRETRAINED_CHECKPOINT} \ + checkpoint.load_optim=false \ + checkpoint.save_interval=500 \ + optimizer.lr=5e-6 \ + optimizer.min_lr=5e-6 \ + train.eval_iters=0 \ + scheduler.lr_decay_style=constant \ + scheduler.lr_warmup_iters=0 \ + model.seq_length=512 \ + dataset.seq_length=512 \ + train.global_batch_size=2 \ + train.micro_batch_size=1 \ + dataset.global_batch_size=2 \ + dataset.micro_batch_size=1 \ + logger.log_interval=1 \ + logger.wandb_project="vace" \ + logger.wandb_exp_name=${EXP_NAME} \ + logger.wandb_save_dir=${CHECKPOINT_DIR} + # train.train_iters=$TRAIN_ITERS \ + # train.eval_interval=$EVAL_INTERVAL \ +echo "" +echo "==========================================" +echo "VACE Finetuning Complete!" 
+echo "Checkpoints saved to: $CHECKPOINT_DIR" +echo "==========================================" diff --git a/scripts/performance/run_script.py b/scripts/performance/run_script.py index e2ead72df9..8fa2f5a402 100644 --- a/scripts/performance/run_script.py +++ b/scripts/performance/run_script.py @@ -16,16 +16,21 @@ import os import sys +import torch from argument_parser import parse_cli_args from omegaconf import OmegaConf from utils.helpers import COMM_OVERLAP_CONFIG_MAP, apply_perf_matrix_overrides, get_precision_config from megatron.bridge.recipes.deepseek.deepseek_v3 import pretrain_config as deepseek_v3_pretrain_config -from megatron.bridge.recipes.llama.llama3_8b import pretrain_config as llama3_8b_pretrain_config -from megatron.bridge.recipes.llama.llama3_70b import pretrain_config as llama3_70b_pretrain_config -from megatron.bridge.recipes.llama.llama31_405b import pretrain_config as llama31_405b_pretrain_config -from megatron.bridge.recipes.qwen.qwen3_30b_a3b import pretrain_config as qwen3_30b_a3b_pretrain_config -from megatron.bridge.recipes.qwen.qwen3_235b_a22b import pretrain_config as qwen3_235b_a22b_pretrain_config +from megatron.bridge.recipes.llama import ( + llama3_8b_pretrain_config, + llama3_70b_pretrain_config, + llama31_405b_pretrain_config, +) +from megatron.bridge.recipes.qwen import ( + qwen3_30b_a3b_pretrain_config, + qwen3_235b_a22b_pretrain_config, +) from megatron.bridge.training.comm_overlap import CommOverlapConfig from megatron.bridge.training.gpt_step import forward_step from megatron.bridge.training.pretrain import pretrain @@ -165,6 +170,10 @@ def main(): pretrain(config=recipe, forward_step_func=forward_step) + if torch.distributed.is_initialized(): + torch.distributed.barrier() + torch.distributed.destroy_process_group() + if __name__ == "__main__": main() diff --git a/src/megatron/bridge/data/Dit/base.py b/src/megatron/bridge/data/Dit/base.py new file mode 100644 index 0000000000..413dc6860c --- /dev/null +++ 
b/src/megatron/bridge/data/Dit/base.py @@ -0,0 +1,343 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from copy import deepcopy +from typing import Any, Dict, Literal, Optional + +from megatron.core import parallel_state +from megatron.energon import WorkerConfig, get_savable_loader, get_train_dataset +from torch.utils.data import DataLoader +from typing_extensions import Self +import logging +logger = logging.getLogger(__name__) + + +class EnergonMultiModalDataModule: + """ + A PyTorch Lightning DataModule for handling multimodal datasets with images and text. + + This data module is designed to work with multimodal datasets that involve both images and text. + It provides a seamless interface to load training and validation data, manage batching, and handle + the state of the data pipeline across training epochs. The module integrates with the Megatron-Energon + framework for efficient data handling in large-scale distributed training. + + Attributes: + path (str): Path to the energon dataset. + tokenizer (Tokenizer): The tokenizer used for processing text. + image_processor (ImageProcessor): The image processor used for preprocessing images. + seq_length (int): The maximum sequence length for tokenized text. + micro_batch_size (int): The batch size for training and validation. + num_workers (int): Number of workers for data loading. + pin_memory (bool): Whether to pin memory in the DataLoader. 
+ multimodal_sample_config (MultiModalSampleConfig): Configuration object for multimodal samples. + task_encoder (MultiModalTaskEncoder): Encoder responsible for encoding and batching samples. + init_global_step (int): The initial global step for the trainer, used for resuming training. + data_sampler (SequentialMegatronSampler): Sampler responsible for generating sequential samples. + train_dataloader_object (Optional): The DataLoader object for training data. + val_dataloader_object (Optional): The DataLoader object for validation data. + """ + + def __init__( + self, + path: str, + tokenizer, + image_processor, + seq_length: int = 2048, + micro_batch_size: int = 1, + global_batch_size: int = 1, + num_workers: int = 1, + num_val_workers: int | None = None, + pin_memory: bool = True, + shuffle_buffer_size: int = 100, + max_samples_per_sequence: int | None = None, + multimodal_sample_config: Optional[Any] = None, + task_encoder: Optional[Any] = None, + decoder_seq_length: Optional[int] = None, + packing_buffer_size: Optional[int] = None, + validation_task_encoder: Optional[Any] = None, + **kwargs, + ) -> None: + """ + Initialize the EnergonMultiModalDataModule. + + Parameters: + path (str): Path to the dataset. + tokenizer (Tokenizer): The tokenizer used for processing text. + image_processor (ImageProcessor): The image processor used for preprocessing images. + seq_length (int, optional): The maximum sequence length for tokenized text. Defaults to 2048. + micro_batch_size (int, optional): The batch size for training and validation. Defaults to 1. + num_workers (int, optional): Number of workers for data loading. Defaults to 1. + num_val_workers (int, optional): Number of workers for validation data loading. Defaults to num_workers. + pin_memory (bool, optional): Whether to pin memory in the DataLoader. Defaults to True. + multimodal_sample_config (MultiModalSampleConfig, optional): Configuration object for multimodal samples. 
+ Defaults to MultiModalSampleConfig(). + shuffle_buffer_size (int, optional): Size of the shuffle buffer. Defaults to 100. + max_samples_per_sequence (int, optional): Maximum number of samples per sequence to load from memory. + Defaults to None (loads the whole tar file at once). + task_encoder (MultiModalTaskEncoder, optional): Encoder responsible for encoding and batching samples. + If not provided, a default (MultimodalTaskEncoder) encoder will be created. Defaults to None. + decoder_seq_length (int, optional): The max sequence length for the decoder. Used in encoder-decoder models + packing_buffer_size (int, optional): Size of the packing buffer for batched samples. Defaults to None. + validation_task_encoder (MultiModalTaskEncoder, optional): Encoder responsible for encoding + and batching samples for validation. Defaults to None and will be the same as task_encoder. + **kwargs: Additional keyword arguments. Will be passed to get_train_dataset() of Energon + """ + + super().__init__() + self.path = path + self.tokenizer = tokenizer + self.image_processor = image_processor + self.seq_length = seq_length + self.decoder_seq_length = decoder_seq_length + self.micro_batch_size = micro_batch_size + self.global_batch_size = global_batch_size + self.num_workers = num_workers + self.pin_memory = pin_memory + self.multimodal_sample_config = multimodal_sample_config + self.shuffle_buffer_size = shuffle_buffer_size + self.max_samples_per_sequence = max_samples_per_sequence + self.task_encoder = task_encoder + self.init_global_step = 0 + self.train_dataloader_object = None + self.val_dataloader_object = None + self.packing_buffer_size = packing_buffer_size + self.validation_task_encoder = validation_task_encoder or self.task_encoder + self.num_val_workers = num_val_workers or self.num_workers + self.kwargs = kwargs + + + def datasets_provider(self, worker_config, split: Literal['train', 'val'] = 'val'): + """ + Provide the dataset for training or validation. 
+ + This method retrieves the dataset for the specified split (either 'train' or 'val') and configures + it according to the worker configuration. + + Parameters: + worker_config: Configuration for the data loader workers. + split (Literal['train', 'val'], optional): The data split to retrieve ('train' or 'val'). Defaults to 'val'. + + Returns: + Dataset: The dataset configured for the specified split. + """ + + if split not in {'train', 'val'}: + raise ValueError("Invalid value for split. Allowed values are 'train' or 'val'.") + + if split == "train": + task_encoder = self.task_encoder + else: + task_encoder = self.validation_task_encoder + + _dataset = get_train_dataset( + self.path, + batch_size=self.micro_batch_size, + task_encoder=task_encoder, + worker_config=worker_config, + packing_buffer_size=self.packing_buffer_size, + split_part=split, + shuffle_buffer_size=self.shuffle_buffer_size, + max_samples_per_sequence=self.max_samples_per_sequence, + **self.kwargs, + ) + + return _dataset + + def build(self): + return self.train_dataloader(), self.val_dataloader() + + def train_dataloader(self) -> Any: + """ + Initialize and return the training DataLoader. + + This method initializes the DataLoader for the training dataset. It uses the global step + from the trainer to configure the data sampler and ensures that the parallel state is initialized + correctly for distributed training. + + Returns: + TRAIN_DATALOADERS: The DataLoader for the training dataset. 
+ """ + + logger.info(f"Multimodal train dataloader initializing with init_global_step {self.init_global_step}") + if self.train_dataloader_object: + return self.train_dataloader_object + if not parallel_state.is_initialized(): + logger.info( + f"Muiltimodal data loader parallel state is not initialized," + f"using default worker config with no_workers {self.num_workers}" + ) + worker_config = WorkerConfig.default_worker_config(self.num_workers) + else: + rank = parallel_state.get_data_parallel_rank() + world_size = parallel_state.get_data_parallel_world_size() + data_parallel_group = parallel_state.get_data_parallel_group() + logger.info( + f" Multimodal train dataloader initializing with" + f"rank {rank} world_size {world_size} data_parallel_group {data_parallel_group} ****** " + ) + worker_config = WorkerConfig( + rank=rank, + world_size=world_size, + num_workers=self.num_workers, + data_parallel_group=data_parallel_group, + worker_debug_path=None, + worker_log_level=0, + ) + train_dataset = self.datasets_provider(worker_config, split='train') + energon_dataloader = get_savable_loader(train_dataset, worker_config=worker_config) + self.train_dataloader_object = energon_dataloader + return self.train_dataloader_object + + def val_dataloader(self): + """ + Initialize and return the validation DataLoader. + + This method initializes the DataLoader for the validation dataset. It ensures that the parallel state + is initialized correctly for distributed training and returns a configured DataLoader object. + + Returns: + EVAL_DATALOADERS: The DataLoader for the validation dataset. 
+ """ + if self.val_dataloader_object: + return self.val_dataloader_object + + if not parallel_state.is_initialized(): + logger.info( + f"Muiltimodal val data loader parallel state is not initialized," + f"using default worker config with no_workers {self.num_workers}" + ) + worker_config = WorkerConfig.default_worker_config(self.num_val_workers) + else: + rank = parallel_state.get_data_parallel_rank() + world_size = parallel_state.get_data_parallel_world_size() + data_parallel_group = parallel_state.get_data_parallel_group() + + logger.info(f"rank {rank} world_size {world_size} data_parallel_group {data_parallel_group}") + worker_config = WorkerConfig( + rank=rank, + world_size=world_size, + num_workers=self.num_workers, + data_parallel_group=data_parallel_group, + worker_debug_path=None, + worker_log_level=0, + ) + val_dataset = self.datasets_provider(worker_config, split='val') + energon_loader = get_savable_loader(val_dataset, worker_config=worker_config) + self.val_dataloader_object = energon_loader + return self.val_dataloader_object + + def test_dataloader(self) -> None: + """ + Return None as test dataset split does not exist. + + This method overrides the test_dataloader method and returns None since the test dataset split + is not defined or used in this module. + + Returns: + None + """ + logger.warning("Multimodal dataloader test dataset split does not exist") + return None + + def state_dict(self) -> Dict[str, Any]: + """ + Save the state of the data module. + + This method is called when saving a checkpoint. It generates and saves the state of the data module, + including the state of the dataloader and the number of consumed samples. + + Returns: + Dict[str, Any]: A dictionary containing the state of the data module. + """ + + if self.trainer: + dataloader_obj = self.trainer.train_dataloader + + state = [] + # All ranks should be zero except the dp rank. 
+ if ( + parallel_state.get_context_parallel_rank() + or parallel_state.get_pipeline_model_parallel_rank() + or parallel_state.get_tensor_model_parallel_rank() + or parallel_state.get_expert_model_parallel_rank() + ) == 0: + # Save_state_global in energon assumes that we call it for only the first rank within each group that + # shares the same dataloader state. By making sure that current rank is the first rank in a model + # parallel group, we ensure this. + state = dataloader_obj.save_state_global(global_dst_rank=0) + + consumed_samples = self.data_sampler.compute_consumed_samples( + self.trainer.global_step - self.init_global_step + ) + + if state is None: + state = [] # Megatron core requires all the states on all the ranks to have same python + # type. Energon sends the state as a list + logger.info(f"Multimodal data loader saving dataloader state dict consumed samples {consumed_samples}") + return {'dataloader_state': state, 'consumed_samples': consumed_samples} + + logger.warning("trainer object not connected to data module object returning empty state") + return {} + + def load_state_dict(self, state_dict: Dict[str, Any]) -> None: + """ + Load the state of the data module from a checkpoint. + + This method is called when loading a checkpoint. It restores the state of the data module, + including the state of the dataloader and the number of consumed samples. + + Parameters: + state_dict (Dict[str, Any]): The state dictionary containing the saved state of the data module. + """ + if not 'dataloader_state' in state_dict: + logger.warning( + f"Data loader state cannot be resumed from state_dict, " + f"it does not have the required key dataloader_state. 
It has {state_dict.keys()}" + ) + return + + state = state_dict['dataloader_state'] + try: + if self.trainer: + self.trainer.datamodule.train_dataloader().restore_state_global(state) + logger.info("Multimodal dataloader state restored") + else: + logger.error(f"Cannot restore state from state_dict {state_dict}") + raise ValueError( + "Cannot restore state from state_dict: " + "Is the trainer object is initialized and attached to datamodule???" + ) + except Exception as e: + logger.warning( + f"Failed to dataloader restore state due to [Please ensure you are using same version " + f"of energon while saving and loading, Continuing without restoring data loader] : {e}" + ) + + try: + from megatron.core.num_microbatches_calculator import update_num_microbatches + + except (ImportError, ModuleNotFoundError): + logger.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import update_num_microbatches + + consumed_samples = state_dict['consumed_samples'] + self.data_sampler.init_consumed_samples = consumed_samples + self.data_sampler.prev_consumed_samples = consumed_samples + logger.info(f"Multimodal dataloader load state dict with consumed_samples {consumed_samples}") + update_num_microbatches( + consumed_samples=consumed_samples, + consistency_check=False, + ) + + diff --git a/tests/unit_tests/recipes/qwen/__init__.py b/src/megatron/bridge/data/Dit/data/__init__.py similarity index 89% rename from tests/unit_tests/recipes/qwen/__init__.py rename to src/megatron/bridge/data/Dit/data/__init__.py index 341a77c5bc..d9155f923f 100644 --- a/tests/unit_tests/recipes/qwen/__init__.py +++ b/src/megatron/bridge/data/Dit/data/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/src/megatron/bridge/data/Dit/data/camera.py b/src/megatron/bridge/data/Dit/data/camera.py new file mode 100644 index 0000000000..3297ddc4d7 --- /dev/null +++ b/src/megatron/bridge/data/Dit/data/camera.py @@ -0,0 +1,639 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pylint: disable=C0115,C0116,C0301 + +import numpy as np +import torch + + +class Pose: + """ + A class of operations on camera poses (PyTorch tensors with shape [...,3,4]). + Each [3,4] camera pose takes the form of [R|t]. + """ + + def __call__(self, R=None, t=None): + # Construct a camera pose from the given R and/or t. 
+ assert R is not None or t is not None + if R is None: + if not isinstance(t, torch.Tensor): + t = torch.tensor(t) + R = torch.eye(3, device=t.device).repeat(*t.shape[:-1], 1, 1) + elif t is None: + if not isinstance(R, torch.Tensor): + R = torch.tensor(R) + t = torch.zeros(R.shape[:-1], device=R.device) + else: + if not isinstance(R, torch.Tensor): + R = torch.tensor(R) + if not isinstance(t, torch.Tensor): + t = torch.tensor(t) + assert R.shape[:-1] == t.shape and R.shape[-2:] == (3, 3) + R = R.float() + t = t.float() + pose = torch.cat([R, t[..., None]], dim=-1) # [...,3,4] + assert pose.shape[-2:] == (3, 4) + return pose + + def invert(self, pose, use_inverse=False): + # Invert a camera pose. + R, t = pose[..., :3], pose[..., 3:] + R_inv = R.inverse() if use_inverse else R.transpose(-1, -2) + t_inv = (-R_inv @ t)[..., 0] + pose_inv = self(R=R_inv, t=t_inv) + return pose_inv + + def compose(self, pose_list): + # Compose a sequence of poses together. + # pose_new(x) = poseN o ... o pose2 o pose1(x) + pose_new = pose_list[0] + for pose in pose_list[1:]: + pose_new = self.compose_pair(pose_new, pose) + return pose_new + + def compose_pair(self, pose_a, pose_b): + # pose_new(x) = pose_b o pose_a(x) + R_a, t_a = pose_a[..., :3], pose_a[..., 3:] + R_b, t_b = pose_b[..., :3], pose_b[..., 3:] + R_new = R_b @ R_a + t_new = (R_b @ t_a + t_b)[..., 0] + pose_new = self(R=R_new, t=t_new) + return pose_new + + def scale_center(self, pose, scale): + """Scale the camera center from the origin. + 0 = R@c+t --> c = -R^T@t (camera center in world coordinates) + 0 = R@(sc)+t' --> t' = -R@(sc) = -R@(-R^T@st) = st + """ + R, t = pose[..., :3], pose[..., 3:] + pose_new = torch.cat([R, t * scale], dim=-1) + return pose_new + + def interpolate(self, pose_a, pose_b, alpha): + """Interpolate between two poses with Slerp. + Args: + pose_a (tensor [...,3,4]): Pose at time t=0. + pose_b (tensor [...,3,4]): Pose at time t=1. + alpha (tensor [...,1]): Interpolation parameter. 
+ Returns: + pose (tensor [...,3,4]): Pose at time t. + """ + R_a, t_a = pose_a[..., :3], pose_a[..., 3:] + R_b, t_b = pose_b[..., :3], pose_b[..., 3:] + q_a = quaternion.R_to_q(R_a) # [...,4] + q_b = quaternion.R_to_q(R_b) # [...,4] + q_intp = quaternion.interpolate(q_a, q_b, alpha) # [...,4] + R_intp = quaternion.q_to_R(q_intp) # [...,3,3] + t_intp = (1 - alpha) * t_a + alpha * t_b # [...,3] + pose_intp = torch.cat([R_intp, t_intp], dim=-1) # [...,3,4] + return pose_intp + + def to_4x4(self, pose): + last_row = torch.tensor([0, 0, 0, 1], device=pose.device)[None, None].expand(pose.shape[0], 1, 4) + return torch.cat([pose, last_row], dim=-2) + + +class Lie: + """ + Lie algebra for SO(3) and SE(3) operations in PyTorch. + """ + + def so3_to_SO3(self, w): # [..., 3] + wx = self.skew_symmetric(w) + theta = w.norm(dim=-1)[..., None, None] + eye = torch.eye(3, device=w.device, dtype=torch.float32) + A = self.taylor_A(theta) + B = self.taylor_B(theta) + R = eye + A * wx + B * wx @ wx + return R + + def SO3_to_so3(self, R, eps=1e-7): # [..., 3, 3] + trace = R[..., 0, 0] + R[..., 1, 1] + R[..., 2, 2] + theta = ((trace - 1) / 2).clamp(-1 + eps, 1 - eps).acos_()[ + ..., None, None + ] % np.pi # ln(R) will explode if theta==pi + lnR = 1 / (2 * self.taylor_A(theta) + 1e-8) * (R - R.transpose(-2, -1)) # FIXME: wei-chiu finds it weird + w0, w1, w2 = lnR[..., 2, 1], lnR[..., 0, 2], lnR[..., 1, 0] + w = torch.stack([w0, w1, w2], dim=-1) + return w + + def se3_to_SE3(self, wu): # [...,3] + w, u = wu.split([3, 3], dim=-1) + wx = self.skew_symmetric(w) + theta = w.norm(dim=-1)[..., None, None] + eye = torch.eye(3, device=w.device, dtype=torch.float32) + A = self.taylor_A(theta) + B = self.taylor_B(theta) + C = self.taylor_C(theta) + R = eye + A * wx + B * wx @ wx + V = eye + B * wx + C * wx @ wx + Rt = torch.cat([R, (V @ u[..., None])], dim=-1) + return Rt + + def SE3_to_se3(self, Rt, eps=1e-8): # [...,3,4] + R, t = Rt.split([3, 1], dim=-1) + w = self.SO3_to_so3(R) + wx = 
self.skew_symmetric(w) + theta = w.norm(dim=-1)[..., None, None] + eye = torch.eye(3, device=w.device, dtype=torch.float32) + A = self.taylor_A(theta) + B = self.taylor_B(theta) + invV = eye - 0.5 * wx + (1 - A / (2 * B)) / (theta**2 + eps) * wx @ wx + u = (invV @ t)[..., 0] + wu = torch.cat([w, u], dim=-1) + return wu + + def skew_symmetric(self, w): + w0, w1, w2 = w.unbind(dim=-1) + zero = torch.zeros_like(w0) + wx = torch.stack( + [ + torch.stack([zero, -w2, w1], dim=-1), + torch.stack([w2, zero, -w0], dim=-1), + torch.stack([-w1, w0, zero], dim=-1), + ], + dim=-2, + ) + return wx + + def taylor_A(self, x, nth=10): + # Taylor expansion of sin(x)/x. + ans = torch.zeros_like(x) + denom = 1.0 + for i in range(nth + 1): + if i > 0: + denom *= (2 * i) * (2 * i + 1) + ans = ans + (-1) ** i * x ** (2 * i) / denom + return ans + + def taylor_B(self, x, nth=10): + # Taylor expansion of (1-cos(x))/x**2. + ans = torch.zeros_like(x) + denom = 1.0 + for i in range(nth + 1): + denom *= (2 * i + 1) * (2 * i + 2) + ans = ans + (-1) ** i * x ** (2 * i) / denom + return ans + + def taylor_C(self, x, nth=10): + # Taylor expansion of (x-sin(x))/x**3. 
+ ans = torch.zeros_like(x) + denom = 1.0 + for i in range(nth + 1): + denom *= (2 * i + 2) * (2 * i + 3) + ans = ans + (-1) ** i * x ** (2 * i) / denom + return ans + + +class Quaternion: + def q_to_R(self, q): # [...,4] + # https://en.wikipedia.org/wiki/Rotation_matrix#Quaternion + qa, qb, qc, qd = q.unbind(dim=-1) + R = torch.stack( + [ + torch.stack([1 - 2 * (qc**2 + qd**2), 2 * (qb * qc - qa * qd), 2 * (qa * qc + qb * qd)], dim=-1), + torch.stack([2 * (qb * qc + qa * qd), 1 - 2 * (qb**2 + qd**2), 2 * (qc * qd - qa * qb)], dim=-1), + torch.stack([2 * (qb * qd - qa * qc), 2 * (qa * qb + qc * qd), 1 - 2 * (qb**2 + qc**2)], dim=-1), + ], + dim=-2, + ) + return R + + def R_to_q(self, R, eps=1e-6): # [...,3,3] + # https://en.wikipedia.org/wiki/Rotation_matrix#Quaternion + row0, row1, row2 = R.unbind(dim=-2) + R00, R01, R02 = row0.unbind(dim=-1) + R10, R11, R12 = row1.unbind(dim=-1) + R20, R21, R22 = row2.unbind(dim=-1) + t = R[..., 0, 0] + R[..., 1, 1] + R[..., 2, 2] + r = (1 + t + eps).sqrt() + qa = 0.5 * r + qb = (R21 - R12).sign() * 0.5 * (1 + R00 - R11 - R22 + eps).sqrt() + qc = (R02 - R20).sign() * 0.5 * (1 - R00 + R11 - R22 + eps).sqrt() + qd = (R10 - R01).sign() * 0.5 * (1 - R00 - R11 + R22 + eps).sqrt() + q = torch.stack([qa, qb, qc, qd], dim=-1) + return q + + def invert(self, q): # [...,4] + qa, qb, qc, qd = q.unbind(dim=-1) + norm = q.norm(dim=-1, keepdim=True) + q_inv = torch.stack([qa, -qb, -qc, -qd], dim=-1) / norm**2 + return q_inv + + def product(self, q1, q2): # [...,4] + q1a, q1b, q1c, q1d = q1.unbind(dim=-1) + q2a, q2b, q2c, q2d = q2.unbind(dim=-1) + hamil_prod = torch.stack( + [ + q1a * q2a - q1b * q2b - q1c * q2c - q1d * q2d, + q1a * q2b + q1b * q2a + q1c * q2d - q1d * q2c, + q1a * q2c - q1b * q2d + q1c * q2a + q1d * q2b, + q1a * q2d + q1b * q2c - q1c * q2b + q1d * q2a, + ], + dim=-1, + ) + return hamil_prod + + def interpolate(self, q1, q2, alpha): # [...,4],[...,4],[...,1] + # https://en.wikipedia.org/wiki/Slerp + cos_angle = (q1 * 
q2).sum(dim=-1, keepdim=True) # [...,1] + flip = cos_angle < 0 + q1 = q1 * (~flip) - q1 * flip # [...,4] + theta = cos_angle.abs().acos() # [...,1] + slerp = (((1 - alpha) * theta).sin() * q1 + (alpha * theta).sin() * q2) / theta.sin() # [...,4] + return slerp + + +pose = Pose() +lie = Lie() +quaternion = Quaternion() + + +def to_hom(X): + # Get homogeneous coordinates of the input. + X_hom = torch.cat([X, torch.ones_like(X[..., :1])], dim=-1) + return X_hom + + +# Basic operations of transforming 3D points between world/camera/image coordinates. +def world2cam(X, pose): # [B,N,3] + X_hom = to_hom(X) + return X_hom @ pose.transpose(-1, -2) + + +def cam2img(X, cam_intr): + return X @ cam_intr.transpose(-1, -2) + + +def img2cam(X, cam_intr): + _dtype = cam_intr.dtype + X = X.float() + cam_intr = cam_intr.float() + result = X @ cam_intr.inverse().transpose(-1, -2) + return result.to(dtype=_dtype) + + +def cam2world(X, pose): + _dtype = pose.dtype + X = X.float() + pose = pose.float() + X_hom = to_hom(X) + pose_inv = Pose().invert(pose) + result = X_hom @ pose_inv.transpose(-1, -2) + return result.to(dtype=_dtype) + + +def angle_to_rotation_matrix(a, axis): + # Get the rotation matrix from Euler angle around specific axis. + roll = dict(X=1, Y=2, Z=0)[axis] + if isinstance(a, float): + a = torch.tensor(a) + zero = torch.zeros_like(a) + eye = torch.ones_like(a) + M = torch.stack( + [ + torch.stack([a.cos(), -a.sin(), zero], dim=-1), + torch.stack([a.sin(), a.cos(), zero], dim=-1), + torch.stack([zero, zero, eye], dim=-1), + ], + dim=-2, + ) + M = M.roll((roll, roll), dims=(-2, -1)) + return M + + +def get_center_and_ray(pose, intr, image_size): + """ + Args: + pose (tensor [3,4]/[B,3,4]): Camera pose. + intr (tensor [3,3]/[B,3,3]): Camera intrinsics. + image_size (list of int): Image size. + Returns: + center_3D (tensor [HW,3]/[B,HW,3]): Center of the camera. + ray (tensor [HW,3]/[B,HW,3]): Ray of the camera with depth=1 (note: not unit ray). 
+ """ + assert pose.dtype == torch.float32 and intr.dtype == torch.float32, ( + f"pose and intr should be float32, got {pose.dtype} and {intr.dtype}" + ) + + H, W = image_size + # Given the intrinsic/extrinsic matrices, get the camera center and ray directions. + with torch.no_grad(): + # Compute image coordinate grid. + X, Y = get_pixel_grid(W, H, pose.device, normalized_coordinate=False) # [H,W] + xy_grid = torch.stack([X, Y], dim=-1).view(-1, 2) # [HW,2] + # Compute center and ray. + if len(pose.shape) == 3: + batch_size = len(pose) + xy_grid = xy_grid.repeat(batch_size, 1, 1) # [B,HW,2] + grid_3D = img2cam(to_hom(xy_grid), intr) # [HW,3]/[B,HW,3] + center_3D = torch.zeros_like(grid_3D) # [HW,3]/[B,HW,3] + # Transform from camera to world coordinates. + grid_3D = cam2world(grid_3D, pose) # [HW,3]/[B,HW,3] + center_3D = cam2world(center_3D, pose) # [HW,3]/[B,HW,3] + ray = grid_3D - center_3D # [B,HW,3] + return center_3D, ray + + +def get_pixel_grid(width: int, height: int, device: torch.device, normalized_coordinate: bool = False): + """Generate pixel grid given the image size. + + Args: + width (int): image width + height (int): image height + device (torch.device) + normalized_coordinate (bool, optional): normalized coordinate is between 0 and 1. Defaults to False. + + Returns: + torch.tensor: x,y pixel grid + """ + y_range = torch.arange(height, dtype=torch.float32, device=device).add_(0.5) + x_range = torch.arange(width, dtype=torch.float32, device=device).add_(0.5) + if normalized_coordinate: + y_range = y_range / height + x_range = x_range / width + y, x = torch.meshgrid(y_range, x_range, indexing="ij") # [H, W] + return x, y + + +def get_3D_points_from_dist( + center: torch.tensor, ray_unit: torch.tensor, dist: torch.tensor, multiple_samples_per_ray: bool = False +): + """Convert dist to 3D points in the world coordinate. 
+ + Args: + center (torch.tensor): camer center in world coordinates, [..., 3] + ray_unit (torch.tensor): ray directions (unit vector), [..., 3] + dist (torch.tensor): distance along the ray, [..., 1] or [..., N_samples, 1] + if sampling muliple points along rays + multiple_samples_per_ray (bool): If True, dist is [..., N_samples, 1] + + Returns: + torch.tensor: [..., 3] or [..., N_samples, 3] + """ + assert torch.allclose(ray_unit.norm(dim=-1), torch.ones_like(ray_unit.norm(dim=-1))), ( + f"ray_unit norm is not equal to 1, max {ray_unit.norm(dim=-1).max()} min {ray_unit.norm(dim=-1).min()}" + ) + if multiple_samples_per_ray: + assert len(dist.shape) == len(center.shape) + 1 + center, ray_unit = center[..., None, :], ray_unit[..., None, :] # [...,1,3] + else: + assert len(dist.shape) == len(center.shape), f"dist shape {dist.shape} center shape {center.shape}" + points_3D = center + ray_unit * dist # [...,3]/[...,N_samples,3] + return points_3D + + +def get_3D_points_from_depth( + center: torch.tensor, ray: torch.tensor, depth: torch.tensor, multiple_samples_per_ray: bool = False +): + """Convert depth to 3D points in the world coordinate. + NOTE: this function assuems the ray is NOT noramlized and returned directly from get_center_and_ray()!! 
+ + Args: + center (torch.tensor): camer center in world coordinates, [..., 3] + ray (torch.tensor): ray directions (z component is 1), [..., 3] + depth (torch.tensor): z depth from camera center, [..., 1] or [..., N_samples, 1] + if sampling muliple points along rays + multiple_samples_per_ray (bool): If True, depth is [..., N_samples, 1] + + Returns: + torch.tensor: [..., 3] or [..., N_samples, 3] + """ + if multiple_samples_per_ray: + assert len(depth.shape) == len(center.shape) + 1 + center, ray = center[..., None, :], ray[..., None, :] # [...,1,3] + else: + assert len(depth.shape) == len(center.shape) + points_3D = center + ray * depth # [...,3]/[...,N,3] + return points_3D + + +def convert_NDC(center, ray, intr, near=1): + # Shift camera center (ray origins) to near plane (z=1). + # (Unlike conventional NDC, we assume the cameras are facing towards the +z direction.) + center = center + (near - center[..., 2:]) / ray[..., 2:] * ray + # Projection. + cx, cy, cz = center.unbind(dim=-1) # [...,R] + rx, ry, rz = ray.unbind(dim=-1) # [...,R] + scale_x = intr[..., 0, 0] / intr[..., 0, 2] # [...] + scale_y = intr[..., 1, 1] / intr[..., 1, 2] # [...] + cnx = scale_x[..., None] * (cx / cz) + cny = scale_y[..., None] * (cy / cz) + cnz = 1 - 2 * near / cz + rnx = scale_x[..., None] * (rx / rz - cx / cz) + rny = scale_y[..., None] * (ry / rz - cy / cz) + rnz = 2 * near / cz + center_ndc = torch.stack([cnx, cny, cnz], dim=-1) # [...,R,3] + ray_ndc = torch.stack([rnx, rny, rnz], dim=-1) # [...,R,3] + return center_ndc, ray_ndc + + +def convert_NDC2(center, ray, intr): + # Similar to convert_NDC() but shift the ray origins to its own image plane instead of the global near plane. + # Also this version is much more interpretable. + scale_x = intr[..., 0, 0] / intr[..., 0, 2] # [...] + scale_y = intr[..., 1, 1] / intr[..., 1, 2] # [...] + # Get the metric image plane (i.e. new "center"): (sx*cx/cz, sy*cy/cz, 1-2/cz). + center = center + ray # This is the key difference. 
+ cx, cy, cz = center.unbind(dim=-1) # [...,R] + image_plane = torch.stack([scale_x[..., None] * cx / cz, scale_x[..., None] * cy / cz, 1 - 2 / cz], dim=-1) + # Get the infinity plane: (sx*rx/rz, sy*ry/rz, 1). + rx, ry, rz = ray.unbind(dim=-1) # [...,R] + inf_plane = torch.stack([scale_x[..., None] * rx / rz, scale_y[..., None] * ry / rz, torch.ones_like(rz)], dim=-1) + # The NDC ray is the difference between the two planes, assuming t \in [0,1]. + ndc_ray = inf_plane - image_plane + return image_plane, ndc_ray + + +def rotation_distance(R1, R2, eps=1e-7): + # http://www.boris-belousov.net/2016/12/01/quat-dist/ + R_diff = R1 @ R2.transpose(-2, -1) + trace = R_diff[..., 0, 0] + R_diff[..., 1, 1] + R_diff[..., 2, 2] + angle = ((trace - 1) / 2).clamp(-1 + eps, 1 - eps).acos_() # numerical stability near -1/+1 + return angle + + +def get_oscil_novel_view_poses(N=60, angle=0.05, dist=5): + # Create circular viewpoints (small oscillations). + theta = torch.arange(N) / N * 2 * np.pi + R_x = angle_to_rotation_matrix((theta.sin() * angle).asin(), "X") + R_y = angle_to_rotation_matrix((theta.cos() * angle).asin(), "Y") + pose_rot = pose(R=R_y @ R_x) + pose_shift = pose(t=[0, 0, dist]) + pose_oscil = pose.compose([pose.invert(pose_shift), pose_rot, pose_shift]) + return pose_oscil + + +def cross_product_matrix(x): + """Matrix form of cross product opertaion. + + param x: [3,] tensor. + return: [3, 3] tensor representing the matrix form of cross product. + """ + return torch.tensor( + [ + [0, -x[2], x[1]], + [x[2], 0, -x[0]], + [ + -x[1], + x[0], + 0, + ], + ] + ) + + +def essential_matrix(poses): + """Compute Essential Matrix from a relative pose. + + param poses: [views, 3, 4] tensor representing relative poses. + return: [views, 3, 3] tensor representing Essential Matrix. 
+ """ + r = poses[..., 0:3] + t = poses[..., 3] + tx = torch.stack([cross_product_matrix(tt) for tt in t], axis=0) + return tx @ r + + +def fundamental_matrix(poses, intr1, intr2): + """Compute Fundamental Matrix from a relative pose and intrinsics. + + param poses: [views, 3, 4] tensor representing relative poses. + intr1: [3, 3] tensor. Camera intrinsic of reference image. + intr2: [views, 3, 3] tensor. Camera Intrinsic of target image. + return: [views, 3, 3] tensor representing Fundamental Matrix. + """ + return intr2.inverse().transpose(-1, -2) @ essential_matrix(poses) @ intr1.inverse() + + +def get_ray_depth_plane_intersection(center, ray, depths): + """Compute the intersection of a ray with a depth plane. + Args: + center (tensor [B,HW,3]): Camera center of the target pose. + ray (tensor [B,HW,3]): Ray direction of the target pose. + depth (tensor [L]): The depth values from the source view (e.g. for MPI planes). + Returns: + intsc_points (tensor [B,HW,L,3]): Intersecting 3D points with the MPI. + """ + # Each 3D point x along the ray v from center c can be written as x = c+t*v. + # Plane equation: n@x = d, where normal n = (0,0,1), d = depth. + # --> t = (d-n@c)/(n@v). + # --> x = c+t*v = c+(d-n@c)/(n@v)*v. + center, ray = center[:, :, None], ray[:, :, None] # [B,HW,L,3], [B,HW,1,3] + depths = depths[None, None, :, None] # [1,1,L,1] + intsc_points = center + (depths - center[..., 2:]) / ray[..., 2:] * ray # [B,HW,L,3] + return intsc_points + + +def unit_view_vector_to_rotation_matrix(v, axes="ZYZ"): + """ + Args: + v (tensor [...,3]): Unit vectors on the view sphere. + axes: rotation axis order. + + Returns: + rotation_matrix (tensor [...,3,3]): rotation matrix R @ v + [0, 0, 1] = 0. + """ + alpha = torch.arctan2(v[..., 1], v[..., 0]) # [...] + beta = np.pi - v[..., 2].arccos() # [...] 
+ euler_angles = torch.stack([torch.ones_like(alpha) * np.pi / 2, -beta, alpha], dim=-1) # [...,3] + rot2 = angle_to_rotation_matrix(euler_angles[..., 2], axes[2]) # [...,3,3] + rot1 = angle_to_rotation_matrix(euler_angles[..., 1], axes[1]) # [...,3,3] + rot0 = angle_to_rotation_matrix(euler_angles[..., 0], axes[0]) # [...,3,3] + rot = rot2 @ rot1 @ rot0 # [...,3,3] + return rot.transpose(-2, -1) + + +def sample_on_spherical_cap(anchor, N, max_angle, min_angle=0.0): + """Sample n points on the view hemisphere within the angle to x. + Args: + anchor (tensor [...,3]): Reference 3-D unit vector on the view hemisphere. + N (int): Number of sampled points. + max_angle (float): Sampled points should have max angle to x. + Returns: + sampled_points (tensor [...,N,3]): Sampled points on the spherical caps. + """ + batch_shape = anchor.shape[:-1] + # First, sample uniformly on a unit 2D disk. + radius = torch.rand(*batch_shape, N, device=anchor.device) # [...,N] + h_max = 1 - np.cos(max_angle) # spherical cap height + h_min = 1 - np.cos(min_angle) # spherical cap height + radius = (radius * (h_max - h_min) + h_min) / h_max + theta = torch.rand(*batch_shape, N, device=anchor.device) * 2 * np.pi # [...,N] + x = radius.sqrt() * theta.cos() # [...,N] + y = radius.sqrt() * theta.sin() # [...,N] + # Reparametrize to a unit spherical cap with height h. + # http://marc-b-reynolds.github.io/distribution/2016/11/28/Uniform.html + k = h_max * radius # [...,N] + s = (h_max * (2 - k)).sqrt() # [...,N] + points = torch.stack([s * x, s * y, 1 - k], dim=-1) # [...,N,3] + # Transform to center around the anchor. 
+ ref_z = torch.tensor([0.0, 0.0, 1.0], device=anchor.device) + v = -anchor.cross(ref_z) # [...,3] + ss_v = lie.skew_symmetric(v) # [...,3,3] + R = torch.eye(3, device=anchor.device) + ss_v + ss_v @ ss_v / (1 + anchor @ ref_z)[..., None, None] # [...,3,3] + points = points @ R.transpose(-2, -1) # [...,N,3] + return points + + +def sample_on_spherical_cap_northern(anchor, N, max_angle, away_from=None, max_reject_count=None): + """Sample n points only the northern view hemisphere within the angle to x.""" + + def find_invalid_points(points): + southern = points[..., 2] < 0 # [...,N] + if away_from is not None: + cosine_ab = (away_from * anchor).sum(dim=-1, keepdim=True) # [...,1] + cosine_ac = (away_from[..., None, :] * points).sum(dim=-1) # [...,N] + not_outwards = cosine_ab < cosine_ac # [...,N] + invalid = southern | not_outwards + else: + invalid = southern + return invalid + + assert (anchor[..., 2] > 0).all() + assert anchor.norm(dim=-1).allclose(torch.ones_like(anchor[..., 0])) + points = sample_on_spherical_cap(anchor, N, max_angle) # [...,N,3] + invalid = find_invalid_points(points) + count = 0 + while invalid.any(): + # Reject and resample. + points_resample = sample_on_spherical_cap(anchor, N, max_angle) + points[invalid] = points_resample[invalid] + invalid = find_invalid_points(points) + count += 1 + if max_reject_count and count > max_reject_count: + points = anchor.repeat(N, 1) + return points + + +def depth_to_pointcloud(depth: torch.tensor, intr: torch.tensor, extr: torch.tensor): + """Convert depth to pointcloud. 
+ Args: + depth (torch.tensor): [1,H,W]/[B,1,H,W] + intr (torch.tensor): [3,3]/[B,3,3] + extr (torch.tensor): [3,4]/[B,3,4] + + Returns: + pc (torch.tensor): [HW,3] + """ + + assert len(depth.shape) == len(intr.shape) + 1, ( + f"dist ({depth.shape}) and intr ({intr.shape}) should have the same batch size" + ) + # convert depth to pointcloud + center, ray = get_center_and_ray(extr, intr, depth.shape[-2:]) + depth = depth.view(*center.shape[:-1], 1) # [HW, 1]/[B,HW,1] + pc = get_3D_points_from_depth(center, ray, depth) + return pc # HW,3/B,HW,3 diff --git a/src/megatron/bridge/data/Dit/data/camera_ctrl_utils.py b/src/megatron/bridge/data/Dit/data/camera_ctrl_utils.py new file mode 100644 index 0000000000..7d7db44a5b --- /dev/null +++ b/src/megatron/bridge/data/Dit/data/camera_ctrl_utils.py @@ -0,0 +1,159 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pylint: disable=C0115,C0116,C0301 + +import numpy as np +import torch +from megatron.bridge.data.Dit.data import camera +from megatron.bridge.data.Dit.data.camera import get_center_and_ray + + +def plucker_coordinates(pose: torch.tensor, intr: torch.tensor, width: int, height: int): + """Return plücker coordinates from pose and intrinsics. Plücker coordinates are defined as + [(rx,ry,rz),(rx,ry,rz)x(cx,cy,cz)] where (cx,cy,cz) is the camera origin + and (rx,ry,rz) is the direction of the ray. 
    Plücker coordinates are used to represent a line in 3D space.

    Useful references:
    - https://www.euclideanspace.com/maths/geometry/elements/line/plucker/index.htm


    Args:
        pose (torch.tensor): Extrinsics [B,3,4]
        intr (torch.tensor): Intrinsics [B,3,3]
        width (int): Image width
        height (int): Image height

    Returns:
        torch.tensor: plücker coordinates of shape [B,HW,6], laid out as [center x ray, ray].
    """
    center, ray = get_center_and_ray(pose, intr, [height, width])  # [B,HW,3]
    ray = ray / torch.norm(ray, dim=-1, keepdim=True)  # [B,HW,3], unit length
    plucker_coords = torch.cat([torch.cross(center, ray, dim=-1), ray], dim=-1)  # [B,HW,6]
    return plucker_coords


def get_relative_pose(pose_list: list[torch.Tensor | np.ndarray]) -> np.ndarray:
    """
    Convert a list of 3x4 world to camera pose to relative pose to the first frame
    Args:
        pose_list (list[torch.Tensor | np.ndarray]): List of 3x4 world to camera pose
    Returns:
        ret_poses (np.ndarray): [N,4,4] array of relative poses in homogeneous (4x4) form.
            (Note: a single stacked array is returned, not a list.)
    """
    if isinstance(pose_list[0], np.ndarray):
        poses = torch.from_numpy(np.stack(list(pose_list), axis=0))  # [N,3,4]
    else:
        poses = torch.stack(list(pose_list), dim=0)  # [N,3,4]
    # Left-compose every pose with the inverse of the first pose so that
    # frame 0 becomes the identity.
    pose_0 = poses[:1]
    pose_0_inv = camera.pose.invert(pose_0)
    rel_poses = camera.pose.compose_pair(pose_0_inv, poses)
    # Homogeneous form (4x4)
    rel_poses_4x4 = torch.eye(4).repeat(len(rel_poses), 1, 1)
    rel_poses_4x4[:, :3, :] = rel_poses
    return rel_poses_4x4.numpy()


def estimate_pose_list_to_plucker_embedding(
    pose_list: list,
    latent_compression_ratio_h: int,
    latent_compression_ratio_w: int,
    image_size: torch.tensor,
    use_relative_pose: bool = True,
) -> tuple[torch.Tensor, int, int]:
    """
    Convert a list of pose to plücker coordinates
    Args:
        pose_list (list): List of pose, each element is a dict with keys "intrinsics", "rotation", "translation"
            e.g.
            {'intrinsics': [[0.4558800160884857, 0.0, 0.5], [0.0, 0.8124798536300659, 0.5], [0.0, 0.0, 0.0]],
             'rotation': [[0.5067835450172424, 0.4129045605659485, -0.7567564249038696],
                          [-0.41741496324539185, 0.8855977654457092, 0.20366966724395752],
                          [0.7542779445648193, 0.21266502141952515, 0.6211589574813843]
                          ],
             'translation': [1.5927585363388062, -0.41845059394836426, 0.6559827327728271]}
        image_size (torch.tensor): Image size of the current video after processing, the input is
            h_after_padded, w_after_padded, h_after_resize, w_after_resize,
            e.g. [ 704., 1280., 704., 1252.] for input with raw shape [720, 1280]
        latent_compression_ratio_h (int): compression height of the plücker embedding image
        latent_compression_ratio_w (int): compression width of the plücker embedding image
        use_relative_pose (bool): Whether to use relative pose
    Returns:
        plucker_coords (torch.tensor): Plücker embedding of shape [num_frame, HW, 6]
        H (int): latent height (h_after_padded // latent_compression_ratio_h)
        W (int): latent width (w_after_padded // latent_compression_ratio_w)
    """
    num_frame = len(pose_list)
    # e.g. 704, 1280, 704, 1252
    h_after_padded, w_after_padded, h_after_resize, w_after_resize = image_size
    H = h_after_padded.item() // latent_compression_ratio_h  # e.g. 704 / 8 = 88
    W = w_after_padded.item() // latent_compression_ratio_w  # e.g. 1280 / 8 = 160
    ratio_w = w_after_resize.item() / w_after_padded.item()
    ratio_h = h_after_resize.item() / h_after_padded.item()

    H = int(H)
    W = int(W)
    # Compute mv_intr_denormalized
    # NOTE(review): assumes the stored intrinsics are normalized by image size
    # (focal/principal point in [0,1]) — confirm against the data producer.
    mv_intr_denormalized = []
    for p in pose_list:
        intrinsic = torch.tensor(p["intrinsics"])
        intrinsic[2, 2] = 1  # example data carries 0 in the homogeneous slot; force a valid K
        intrinsic[0, :] *= W * ratio_w
        intrinsic[1, :] *= H * ratio_h
        mv_intr_denormalized.append(intrinsic)

    mv_pose = [
        torch.cat([torch.tensor(p["rotation"]), torch.tensor(p["translation"]).unsqueeze(1)], dim=1) for p in pose_list
    ]

    # Convert to pose relative to the first frame
    if use_relative_pose:
        mv_pose = get_relative_pose(mv_pose)
    mv_intr_denormalized = torch.stack(mv_intr_denormalized)
    mv_pose = torch.tensor(np.stack(mv_pose))
    mv_pose = mv_pose[:, :3]  # B*N,3,4
    mv_intr_denormalized = mv_intr_denormalized.view(num_frame, 3, 3)  # B*N,3,3

    # plucker coordinates to encode pose
    plucker_coords = plucker_coordinates(mv_pose, mv_intr_denormalized, W, H)  # [B,HW,6]

    return plucker_coords, H, W


def normalize_camera_trajectory_to_unit_sphere(pose_list: list[dict]) -> None:
    """
    Normalize the camera trajectory to fit within a unit sphere.
    This function takes a list of camera poses, each represented as a dictionary with a "translation" key,
    and normalizes the translation vectors such that the maximum distance between any two cameras is 1.
    The normalization is done in-place.
    Args:
        pose_list (list[dict]): A list of dictionaries, where each dictionary contains a "translation" key
            with a list or array of three floats representing the camera translation vector.
    Returns:
        None
    """
    translation = np.array([pose["translation"] for pose in pose_list])  # [N,3]

    # Find the max distance between any two cameras. It is equivalent to the max distance of translation vectors.
    def _longest_distance(points):
        # Compute the pairwise distances.
        diff = points[:, None, :] - points[None, :, :]
        distances = np.linalg.norm(diff, axis=-1)
        # Find the maximum distance
        max_distance = np.max(distances)
        return max_distance

    # NOTE(review): if all cameras share one translation, max_distance is 0 and
    # the division below produces inf/nan — confirm upstream guarantees motion.
    max_distance = _longest_distance(translation)
    for pose in pose_list:
        trans = np.array(pose["translation"])
        trans /= max_distance
        pose["translation"] = trans.tolist()
diff --git a/src/megatron/bridge/data/Dit/data/diffusion_energon_datamodule.py b/src/megatron/bridge/data/Dit/data/diffusion_energon_datamodule.py
new file mode 100644
index 0000000000..fa38e9c6c8
--- /dev/null
+++ b/src/megatron/bridge/data/Dit/data/diffusion_energon_datamodule.py
@@ -0,0 +1,176 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+ +# pylint: disable=C0115,C0116,C0301 + +from dataclasses import dataclass +import logging +from typing import Any, Dict, Literal + +from torch import int_repr + +from megatron.bridge.data.Dit.data.diffusion_taskencoder import BasicDiffusionTaskEncoder +from megatron.bridge.data.utils import DatasetBuildContext, DatasetProvider +from megatron.energon import DefaultTaskEncoder, get_train_dataset +from megatron.bridge.data.Dit.base import EnergonMultiModalDataModule + +@dataclass(kw_only=True) +class DiffusionDataModuleConfig(DatasetProvider): + path: str + seq_length: int + micro_batch_size: int + task_encoder_seq_length: int + global_batch_size: int + num_workers: int_repr + dataloader_type: str = "external" + + def __post_init__(self): + self.dataset = DiffusionDataModule( + path=self.path, + seq_length=self.seq_length, + task_encoder=BasicDiffusionTaskEncoder(seq_length=self.task_encoder_seq_length), + micro_batch_size=self.micro_batch_size, + global_batch_size=self.global_batch_size, + num_workers=self.num_workers) + self.sequence_length = self.dataset.seq_length + + def build_datasets(self, context: DatasetBuildContext): + return self.dataset.train_dataloader(), self.dataset.train_dataloader(), self.dataset.train_dataloader() + + + + +class DiffusionDataModule(EnergonMultiModalDataModule): + """ + A PyTorch Lightning DataModule for handling multimodal datasets with images and text. + + This data module is designed to work with multimodal datasets that involve both images and text. + It provides a seamless interface to load training and validation data, manage batching, and handle + the state of the data pipeline across training epochs. The module integrates with the Megatron-Energon + framework for efficient data handling in large-scale distributed training. + + Attributes: + path (str): Path to the energon dataset. + tokenizer (Tokenizer): The tokenizer used for processing text. 
+ image_processor (ImageProcessor): The image processor used for preprocessing images. + seq_length (int): The maximum sequence length for tokenized text. + micro_batch_size (int): The batch size for training and validation. + num_workers (int): Number of workers for data loading. + pin_memory (bool): Whether to pin memory in the DataLoader. + multimodal_sample_config (MultiModalSampleConfig): Configuration object for multimodal samples. + task_encoder (MultiModalTaskEncoder): Encoder responsible for encoding and batching samples. + init_global_step (int): The initial global step for the trainer, used for resuming training. + data_sampler (SequentialMegatronSampler): Sampler responsible for generating sequential samples. + train_dataloader_object (Optional): The DataLoader object for training data. + val_dataloader_object (Optional): The DataLoader object for validation data. + """ + + def __init__( + self, + path: str, + seq_length: int = 2048, + micro_batch_size: int = 1, + global_batch_size: int = 8, + num_workers: int = 1, + pin_memory: bool = True, + task_encoder: DefaultTaskEncoder = None, + use_train_split_for_val: bool = False, + ) -> None: + """ + Initialize the SimpleMultiModalDataModule. + + Parameters: + path (str): Path to the dataset. + tokenizer (Tokenizer): The tokenizer used for processing text. + image_processor (ImageProcessor): The image processor used for preprocessing images. + seq_length (int, optional): The maximum sequence length for tokenized text. Defaults to 2048. + micro_batch_size (int, optional): The batch size for training and validation. Defaults to 1. + num_workers (int, optional): Number of workers for data loading. Defaults to 1. + pin_memory (bool, optional): Whether to pin memory in the DataLoader. Defaults to True. 
+ """ + + super().__init__( + path=path, + tokenizer=None, + image_processor=None, + seq_length=seq_length, + micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size, + num_workers=num_workers, + pin_memory=pin_memory, + task_encoder=task_encoder, + ) + self.use_train_split_for_val = use_train_split_for_val + + def datasets_provider(self, worker_config, split: Literal["train", "val"] = "val"): + """ + Provide the dataset for training or validation. + + This method retrieves the dataset for the specified split (either 'train' or 'val') and configures + it according to the worker configuration. + + Parameters: + worker_config: Configuration for the data loader workers. + split (Literal['train', 'val'], optional): The data split to retrieve ('train' or 'val'). Defaults to 'val'. + + Returns: + Dataset: The dataset configured for the specified split. + """ + if split not in {"train", "val"}: + raise ValueError("Invalid value for split. Allowed values are 'train' or 'val'.") + if self.use_train_split_for_val: + split = "train" + _dataset = get_train_dataset( + self.path, + batch_size=self.micro_batch_size, + task_encoder=self.task_encoder, + worker_config=worker_config, + max_samples_per_sequence=None, + shuffle_buffer_size=100, + split_part=split, + batch_drop_last=True, + virtual_epoch_length=1_000_000_000, # a hack to avoid energon end of epoch warning + ) + return _dataset + + def val_dataloader(self): + """ + Configure the validation DataLoader. + + This method configures the DataLoader for validation data. + + Parameters: + worker_config: Configuration for the data loader workers. + + Returns: + DataLoader: The DataLoader for validation data. + """ + if self.use_train_split_for_val: + return self.train_dataloader() + return super().val_dataloader() + + def load_state_dict(self, state_dict: Dict[str, Any]) -> None: + """ + Load the state of the data module from a checkpoint. + + This method is called when loading a checkpoint. 
It restores the state of the data module, + including the state of the dataloader and the number of consumed samples. + + Parameters: + state_dict (Dict[str, Any]): The state dictionary containing the saved state of the data module. + """ + try: + super().load_state_dict(state_dict) + except Exception as e: + logging.warning(f"datamodule.load_state_dict failed {e}") diff --git a/src/megatron/bridge/data/Dit/data/diffusion_fake_datamodule.py b/src/megatron/bridge/data/Dit/data/diffusion_fake_datamodule.py new file mode 100644 index 0000000000..e85907e0b7 --- /dev/null +++ b/src/megatron/bridge/data/Dit/data/diffusion_fake_datamodule.py @@ -0,0 +1,215 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import lightning.pytorch as pl +import torch +from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from megatron.bridge.models.DiTModel.dit_provider import DiTModelProvider as DiTConfig +from nemo.lightning.pytorch.plugins import MegatronDataSampler +from torch.utils.data import DataLoader + + +class PosEmb3D: + """Generates and provides 3D positional embeddings for video data.""" + + def __init__(self, *, max_t=96, max_h=960, max_w=960): + self.max_t = max_t + self.max_h = max_h + self.max_w = max_w + self.generate_pos_id() + + def generate_pos_id(self): + """Generates the positional ID grid based on max_t, max_h, and max_w.""" + self.grid = torch.stack( + torch.meshgrid( + torch.arange(self.max_t, device="cpu"), + torch.arange(self.max_h, device="cpu"), + torch.arange(self.max_w, device="cpu"), + ), + dim=-1, + ) + + def get_pos_id_3d(self, *, t, h, w): + """Retrieves a subset of the positional IDs for the specified dimensions. + + Parameters: + t (int): Number of time frames. + h (int): Height dimension. + w (int): Width dimension. + + Returns: + torch.Tensor: The positional IDs tensor with shape (t, h, w, 3). 
+ """ + if t > self.max_t or h > self.max_h or w > self.max_w: + self.max_t = max(self.max_t, t) + self.max_h = max(self.max_h, h) + self.max_w = max(self.max_w, w) + self.generate_pos_id() + return self.grid[:t, :h, :w] + + +class DiTVideoLatentFakeDataset(torch.utils.data.Dataset): + """A fake dataset for generating synthetic video latent data.""" + + def __init__( + self, + n_frames, + max_h, + max_w, + patch_size, + in_channels, + crossattn_emb_size, + max_text_seqlen=512, + seq_length=8192, + ): + self.max_t = n_frames + self.max_height = max_h + self.max_width = max_w + self.patch_size = patch_size + self.in_channels = in_channels + self.text_dim = crossattn_emb_size + self.text_seqlen = max_text_seqlen + self.seq_length = seq_length + + def __len__(self): + """Returns the total number of samples.""" + return 100000000 + + def __getitem__(self, idx): + """Generates a single sample of data. + + Parameters: + idx (int): Index of the data sample. + + Returns: + dict: A dictionary containing video latent data and related information. + """ + # t = self.max_t + # h = self.max_height + # w = self.max_width + p = self.patch_size + c = self.in_channels + + video_latent = torch.ones(self.seq_length, c * p**2, dtype=torch.bfloat16) * 0.5 + text_embedding = torch.randn(self.text_seqlen, self.text_dim, dtype=torch.bfloat16) + # pos_emb = pos_id_3d.get_pos_id_3d(t=t, h=h // p, w=w // p).reshape(-1, 3) + + return { + "video": video_latent, + "t5_text_embeddings": text_embedding, + "seq_len_q": torch.tensor([video_latent.shape[0]], dtype=torch.int32).squeeze(), + "seq_len_kv": torch.tensor([self.text_seqlen], dtype=torch.int32).squeeze(), + "pos_ids": torch.zeros((self.seq_length, 3), dtype=torch.int32), + "loss_mask": torch.ones(video_latent.shape[0], dtype=torch.bfloat16), + } + + def _collate_fn(self, batch): + """A default implementation of a collation function. + + Users should override this method to define custom data loaders. 
+ """ + return torch.utils.data.dataloader.default_collate(batch) + + def collate_fn(self, batch): + """Method that user passes as a functor to DataLoader. + + The method optionally performs neural type checking and adds types to the outputs. + + Please note, subclasses of Dataset should not implement `input_types`. + + Usage: + dataloader = torch.utils.data.DataLoader( + ...., + collate_fn=dataset.collate_fn, + .... + ) + + Returns: + Collated batch, with or without types. + """ + return self._collate_fn(batch) + + +class VideoLatentFakeDataModule(pl.LightningDataModule): + """A LightningDataModule for generating fake video latent data for training.""" + + def __init__( + self, + model_config: DiTConfig, + seq_length: int = 2048, + micro_batch_size: int = 1, + global_batch_size: int = 8, + num_workers: int = 1, + pin_memory: bool = True, + task_encoder=None, + use_train_split_for_val: bool = False, + ) -> None: + super().__init__() + self.seq_length = seq_length + self.micro_batch_size = micro_batch_size + self.global_batch_size = global_batch_size + self.num_workers = num_workers + self.model_config = model_config + + self.data_sampler = MegatronDataSampler( + seq_len=self.seq_length, + micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size, + ) + + def setup(self, stage: str = "") -> None: + """Sets up the dataset for training and validation. + + Parameters: + stage (str): Optional stage argument (unused). 
+ """ + self._train_ds = DiTVideoLatentFakeDataset( + n_frames=self.model_config.max_frames, + max_h=self.model_config.max_img_h, + max_w=self.model_config.max_img_w, + patch_size=self.model_config.patch_spatial, + in_channels=self.model_config.in_channels, + crossattn_emb_size=self.model_config.crossattn_emb_size, + ) + + def train_dataloader(self) -> TRAIN_DATALOADERS: + """Returns the training DataLoader.""" + if not hasattr(self, "_train_ds"): + self.setup() + return self._create_dataloader(self._train_ds) + + def val_dataloader(self) -> EVAL_DATALOADERS: + """Returns the validation DataLoader.""" + if not hasattr(self, "_train_ds"): + self.setup() + return self._create_dataloader(self._train_ds) + + def _create_dataloader(self, dataset, **kwargs) -> DataLoader: + """Creates a DataLoader for the given dataset. + + Parameters: + dataset (Dataset): The dataset to load. + **kwargs: Additional arguments for DataLoader. + + Returns: + DataLoader: The DataLoader instance. + """ + return DataLoader( + dataset, + num_workers=self.num_workers, + pin_memory=True, + persistent_workers=True, + collate_fn=dataset.collate_fn, + **kwargs, + ) diff --git a/src/megatron/bridge/data/Dit/data/diffusion_mock_datamodule.py b/src/megatron/bridge/data/Dit/data/diffusion_mock_datamodule.py new file mode 100644 index 0000000000..73c4208a17 --- /dev/null +++ b/src/megatron/bridge/data/Dit/data/diffusion_mock_datamodule.py @@ -0,0 +1,277 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# pylint: disable=C0115,C0116,C0301 + +from typing import List, Optional + +import lightning.pytorch as pl +import torch +from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from nemo.lightning.pytorch.plugins import MegatronDataSampler +from torch.utils.data import DataLoader, Dataset + + +class MockDataModule(pl.LightningDataModule): + """ + A PyTorch Lightning DataModule for creating mock datasets for training, validation, and testing. + + Args: + image_h (int): Height of the images in the dataset. Default is 1024. + image_w (int): Width of the images in the dataset. Default is 1024. + micro_batch_size (int): Micro batch size for the data sampler. Default is 4. + global_batch_size (int): Global batch size for the data sampler. Default is 8. + rampup_batch_size (Optional[List[int]]): Ramp-up batch size for the data sampler. Default is None. + num_train_samples (int): Number of training samples. Default is 10,000. + num_val_samples (int): Number of validation samples. Default is 10,000. + num_test_samples (int): Number of testing samples. Default is 10,000. + num_workers (int): Number of worker threads for data loading. Default is 8. + pin_memory (bool): Whether to use pinned memory for data loading. Default is True. + persistent_workers (bool): Whether to use persistent workers for data loading. Default is False. + image_precached (bool): Whether the images are pre-cached. Default is False. + text_precached (bool): Whether the text data is pre-cached. Default is False. 
+ """ + + def __init__( + self, + image_h: int = 1024, + image_w: int = 1024, + micro_batch_size: int = 4, + global_batch_size: int = 8, + rampup_batch_size: Optional[List[int]] = None, + num_train_samples: int = 10_000, + num_val_samples: int = 10_000, + num_test_samples: int = 10_000, + num_workers: int = 8, + pin_memory: bool = True, + persistent_workers: bool = False, + image_precached=False, + text_precached=False, + ): + super().__init__() + self.image_h = image_h + self.image_w = image_w + self.num_train_samples = num_train_samples + self.num_val_samples = num_val_samples + self.num_test_samples = num_test_samples + self.num_workers = num_workers + self.pin_memory = pin_memory + self.persistent_workers = persistent_workers + self.image_precached = image_precached + self.text_precached = text_precached + self.global_batch_size = global_batch_size + + self.data_sampler = MegatronDataSampler( + seq_len=10, + micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size, + rampup_batch_size=rampup_batch_size, + ) + + def setup(self, stage: str = "") -> None: + """ + Sets up datasets for training, validation, and testing. + + Args: + stage (str): The stage of the process (e.g., 'fit', 'test'). Default is an empty string. + """ + self._train_ds = _MockT2IDataset( + image_H=1024, + image_W=1024, + length=self.num_train_samples, + image_precached=self.image_precached, + text_precached=self.text_precached, + ) + self._validation_ds = _MockT2IDataset( + image_H=1024, + image_W=1024, + length=self.num_val_samples, + image_precached=self.image_precached, + text_precached=self.text_precached, + ) + self._test_ds = _MockT2IDataset( + image_H=1024, + image_W=1024, + length=self.num_test_samples, + image_precached=self.image_precached, + text_precached=self.text_precached, + ) + + def train_dataloader(self) -> TRAIN_DATALOADERS: + """ + Returns the training DataLoader. + + Returns: + TRAIN_DATALOADERS: DataLoader for the training dataset. 
+ """ + if not hasattr(self, "_train_ds"): + self.setup() + return self._create_dataloader(self._train_ds) + + def val_dataloader(self) -> EVAL_DATALOADERS: + """ + Returns the validation DataLoader. + + Returns: + EVAL_DATALOADERS: DataLoader for the validation dataset. + """ + if not hasattr(self, "_validation_ds"): + self.setup() + return self._create_dataloader(self._validation_ds) + + def test_dataloader(self) -> EVAL_DATALOADERS: + """ + Returns the testing DataLoader. + + Returns: + EVAL_DATALOADERS: DataLoader for the testing dataset. + """ + if not hasattr(self, "_test_ds"): + self.setup() + return self._create_dataloader(self._test_ds) + + def _create_dataloader(self, dataset, **kwargs) -> DataLoader: + """ + Creates a DataLoader for the given dataset. + + Args: + dataset: The dataset to load. + **kwargs: Additional arguments for the DataLoader. + + Returns: + DataLoader: Configured DataLoader for the dataset. + """ + return DataLoader( + dataset, + num_workers=self.num_workers, + pin_memory=self.pin_memory, + persistent_workers=self.persistent_workers, + **kwargs, + ) + + +class _MockT2IDataset(Dataset): + """ + A mock dataset class for text-to-image tasks, simulating data samples for training and testing. + + This dataset generates synthetic data for both image and text inputs, with options to use + pre-cached latent representations or raw data. The class is designed for use in testing and + prototyping machine learning models. + + Attributes: + image_H (int): Height of the generated images. + image_W (int): Width of the generated images. + length (int): Total number of samples in the dataset. + image_key (str): Key for accessing image data in the output dictionary. + txt_key (str): Key for accessing text data in the output dictionary. + hint_key (str): Key for accessing hint data in the output dictionary. + image_precached (bool): Whether to use pre-cached latent representations for images. 
+ text_precached (bool): Whether to use pre-cached embeddings for text. + prompt_seq_len (int): Sequence length for text prompts. + pooled_prompt_dim (int): Dimensionality of pooled text embeddings. + context_dim (int): Dimensionality of the text embedding context. + vae_scale_factor (int): Scaling factor for the VAE latent representation. + vae_channels (int): Number of channels in the VAE latent representation. + latent_shape (tuple): Shape of the latent representation for images (if pre-cached). + prompt_embeds_shape (tuple): Shape of the text prompt embeddings (if pre-cached). + pooped_prompt_embeds_shape (tuple): Shape of pooled text embeddings (if pre-cached). + text_ids_shape (tuple): Shape of the text token IDs (if pre-cached). + + Methods: + __getitem__(index): + Retrieves a single sample from the dataset based on the specified index. + __len__(): + Returns the total number of samples in the dataset. + """ + + def __init__( + self, + image_H, + image_W, + length=100000, + image_key="images", + txt_key="txt", + hint_key="hint", + image_precached=False, + text_precached=False, + prompt_seq_len=256, + pooled_prompt_dim=768, + context_dim=4096, + vae_scale_factor=8, + vae_channels=16, + ): + super().__init__() + self.length = length + self.H = image_H + self.W = image_W + self.image_key = image_key + self.txt_key = txt_key + self.hint_key = hint_key + self.image_precached = image_precached + self.text_precached = text_precached + if self.image_precached: + self.latent_shape = (vae_channels, int(image_H // vae_scale_factor), int(image_W // vae_scale_factor)) + if self.text_precached: + self.prompt_embeds_shape = (prompt_seq_len, context_dim) + self.pooped_prompt_embeds_shape = (pooled_prompt_dim,) + self.text_ids_shape = (prompt_seq_len, 3) + + def __getitem__(self, index): + """ + Retrieves a single sample from the dataset. + + The sample can include raw image and text data or pre-cached latent representations, + depending on the configuration. 
+ + Args: + index (int): Index of the sample to retrieve. + + Returns: + dict: A dictionary containing the generated data sample. The keys and values + depend on whether `image_precached` and `text_precached` are set. + Possible keys include: + - 'latents': Pre-cached latent representation of the image. + - 'control_latents': Pre-cached control latent representation. + - 'images': Raw image tensor. + - 'hint': Hint tensor for the image. + - 'prompt_embeds': Pre-cached text prompt embeddings. + - 'pooled_prompt_embeds': Pooled text prompt embeddings. + - 'text_ids': Text token IDs. + - 'txt': Text input string (if text is not pre-cached). + """ + item = {} + if self.image_precached: + item["latents"] = torch.randn(self.latent_shape) + item["control_latents"] = torch.randn(self.latent_shape) + else: + item[self.image_key] = torch.randn(3, self.H, self.W) + item[self.hint_key] = torch.randn(3, self.H, self.W) + + if self.text_precached: + item["prompt_embeds"] = torch.randn(self.prompt_embeds_shape) + item["pooled_prompt_embeds"] = torch.randn(self.pooped_prompt_embeds_shape) + item["text_ids"] = torch.randn(self.text_ids_shape) + else: + item[self.txt_key] = "This is a sample caption input" + + return item + + def __len__(self): + """ + Returns the total number of samples in the dataset. + + Returns: + int: Total number of samples (`length` attribute). + """ + return self.length diff --git a/src/megatron/bridge/data/Dit/data/diffusion_taskencoder.py b/src/megatron/bridge/data/Dit/data/diffusion_taskencoder.py new file mode 100644 index 0000000000..7faa1aaae3 --- /dev/null +++ b/src/megatron/bridge/data/Dit/data/diffusion_taskencoder.py @@ -0,0 +1,256 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pylint: disable=C0115,C0116,C0301 + +import torch +import torch.nn.functional as F +from einops import rearrange +from megatron.energon import DefaultTaskEncoder, SkipSample +from megatron.energon.task_encoder.cooking import Cooker, basic_sample_keys + + +def cook(sample: dict) -> dict: + """ + Processes a raw sample dictionary from energon dataset and returns a new dictionary with specific keys. + + Args: + sample (dict): The input dictionary containing the raw sample data. + + Returns: + dict: A new dictionary containing the processed sample data with the following keys: + - All keys from the result of `basic_sample_keys(sample)` + - 'json': The contains meta data like resolution, aspect ratio, fps, etc. + - 'pth': contains video latent tensor + - 'pickle': contains text embeddings + """ + return dict( + **basic_sample_keys(sample), + json=sample[".json"], + pth=sample[".pth"], + pickle=sample[".pickle"], + ) + + +class BasicDiffusionTaskEncoder(DefaultTaskEncoder): + """ + BasicDiffusionTaskEncoder is a class that encodes image/video samples for diffusion tasks. + Attributes: + cookers (list): A list of Cooker objects used for processing. + max_frames (int, optional): The maximum number of frames to consider from the video. Defaults to None. + text_embedding_padding_size (int): The padding size for text embeddings. Defaults to 512. + Methods: + __init__(*args, max_frames=None, text_embedding_padding_size=512, **kwargs): + Initializes the BasicDiffusionTaskEncoder with optional maximum frames and text embedding padding size. 
+ encode_sample(sample: dict) -> dict: + Encodes a given sample dictionary containing video and text data. + Args: + sample (dict): A dictionary containing 'pth' for video latent and 'json' for additional info. + Returns: + dict: A dictionary containing encoded video, text embeddings, text mask, and loss mask. + Raises: + SkipSample: If the video latent contains NaNs, Infs, or is not divisible by the tensor parallel size. + """ + + cookers = [ + Cooker(cook), + ] + + def __init__( + self, + *args, + max_frames: int = None, + text_embedding_padding_size: int = 512, + seq_length: int = None, + patch_spatial: int = 2, + patch_temporal: int = 1, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.max_frames = max_frames + self.text_embedding_padding_size = text_embedding_padding_size + self.seq_length = seq_length + self.patch_spatial = patch_spatial + self.patch_temporal = patch_temporal + + def encode_sample(self, sample: dict) -> dict: + video_latent = sample["pth"] + + if torch.isnan(video_latent).any() or torch.isinf(video_latent).any(): + raise SkipSample() + if torch.max(torch.abs(video_latent)) > 1e3: + raise SkipSample() + + info = sample["json"] + # remove batch dimension + video_latent = video_latent.squeeze(0) + # print(f"video_latent shape at start: {video_latent.shape}") + C, T, H, W = video_latent.shape + seq_len = ( + video_latent.shape[-1] + * video_latent.shape[-2] + * video_latent.shape[-3] + // self.patch_spatial**2 + // self.patch_temporal + ) + # seq_len = 1536 + is_image = T == 1 + + # print(f"Skipping sample {sample['__key__']} because seq_len {seq_len} > self.seq_length {self.seq_length}") + if seq_len > self.seq_length: + print(f"Skipping sample {sample['__key__']} because seq_len {seq_len} > self.seq_length {self.seq_length}") + raise SkipSample() + + if self.max_frames is not None: + video_latent = video_latent[:, : self.max_frames, :, :] + + # tpcp_size = parallel_state.get_tensor_model_parallel_world_size() + # if 
parallel_state.get_context_parallel_world_size() > 1: + # tpcp_size *= parallel_state.get_context_parallel_world_size() * 2 + # if (T * H * W) % tpcp_size != 0: + # warnings.warn(f'skipping {video_latent.shape=} not divisible by {tpcp_size=}') + # raise SkipSample() + # print(f"video_latent shape before rearrange: {video_latent.shape}") + # video_latent shape before rearrange: torch.Size([16, 1, 64, 96]) + video_latent = rearrange( + video_latent, + "C (T pt) (H ph) (W pw) -> (T H W) (ph pw pt C)", + ph=self.patch_spatial, + pw=self.patch_spatial, + pt=self.patch_temporal, + ) + # print(f"video_latent shape after rearrange: {video_latent.shape}") + # After reaaranging: video_latent shape after rearrange: torch.Size([1536, 64]) + # convert sample["pickle"] to numpy, and remove batch dimension + sample["pickle"] = sample["pickle"].cpu().float().numpy().squeeze(0) + if is_image: + t5_text_embeddings = torch.from_numpy(sample["pickle"]).to(torch.bfloat16) + else: + t5_text_embeddings = torch.from_numpy(sample["pickle"][0]).to(torch.bfloat16) + t5_text_embeddings_seq_length = t5_text_embeddings.shape[0] + + if t5_text_embeddings_seq_length > self.text_embedding_padding_size: + t5_text_embeddings = t5_text_embeddings[: self.text_embedding_padding_size] + else: + t5_text_embeddings = F.pad( + t5_text_embeddings, + ( + 0, + 0, + 0, + self.text_embedding_padding_size - t5_text_embeddings_seq_length, + ), + ) + t5_text_mask = torch.ones(t5_text_embeddings_seq_length, dtype=torch.bfloat16) + + if is_image: + h, w = info["image_height"], info["image_width"] + fps = torch.tensor([30] * 1, dtype=torch.bfloat16) + num_frames = torch.tensor([1] * 1, dtype=torch.bfloat16) + else: + h, w = info["height"], info["width"] + fps = torch.tensor([info["framerate"]] * 1, dtype=torch.bfloat16) + num_frames = torch.tensor([info["num_frames"]] * 1, dtype=torch.bfloat16) + image_size = torch.tensor([[h, w, h, w]] * 1, dtype=torch.bfloat16) + + pos_ids = rearrange( + pos_id_3d.get_pos_id_3d(t=T 
// self.patch_temporal, h=H // self.patch_spatial, w=W // self.patch_spatial), + "T H W d -> (T H W) d", + ) + + if self.seq_length is not None: + pos_ids = F.pad(pos_ids, (0, 0, 0, self.seq_length - seq_len)) + loss_mask = torch.zeros(self.seq_length, dtype=torch.bfloat16) + loss_mask[:seq_len] = 1 + video_latent = F.pad(video_latent, (0, 0, 0, self.seq_length - seq_len)) + else: + loss_mask = torch.ones(seq_len, dtype=torch.bfloat16) + + print(f"Loss mask shape: {loss_mask.shape}") + print(f"video_latent shape final: {video_latent.shape}") + return dict( + video=video_latent, + t5_text_embeddings=t5_text_embeddings, + t5_text_mask=t5_text_mask, + image_size=image_size, + fps=fps, + num_frames=num_frames, + loss_mask=loss_mask, + seq_len_q=torch.tensor(seq_len, dtype=torch.int32), + seq_len_kv=torch.tensor(self.text_embedding_padding_size, dtype=torch.int32), + pos_ids=pos_ids, + latent_shape=torch.tensor([C, T, H, W], dtype=torch.int32), + ) + + +class PosID3D: + def __init__(self, *, max_t=32, max_h=128, max_w=128): + self.max_t = max_t + self.max_h = max_h + self.max_w = max_w + self.generate_pos_id() + + def generate_pos_id(self): + self.grid = torch.stack( + torch.meshgrid( + torch.arange(self.max_t, device="cpu"), + torch.arange(self.max_h, device="cpu"), + torch.arange(self.max_w, device="cpu"), + ), + dim=-1, + ) + + def get_pos_id_3d(self, *, t, h, w): + if t > self.max_t or h > self.max_h or w > self.max_w: + self.max_t = max(self.max_t, t) + self.max_h = max(self.max_h, h) + self.max_w = max(self.max_w, w) + self.generate_pos_id() + return self.grid[:t, :h, :w] + + +pos_id_3d = PosID3D() + + +def cook_raw_iamges(sample: dict) -> dict: + """ + Processes a raw sample dictionary from energon dataset and returns a new dictionary with specific keys. + + Args: + sample (dict): The input dictionary containing the raw sample data. 
+ + Returns: + dict: A new dictionary containing the processed sample data with the following keys: + - All keys from the result of `basic_sample_keys(sample)` + - 'jpg': original images + - 'png': contains control images + - 'txt': contains raw text + """ + return dict( + **basic_sample_keys(sample), + images=sample["jpg"], + hint=sample["png"], + txt=sample["txt"], + ) + + +class RawImageDiffusionTaskEncoder(DefaultTaskEncoder): + """ + Dummy task encoder takes raw image input on CrudeDataset. + """ + + cookers = [ + # Cooker(cook), + Cooker(cook_raw_iamges), + ] diff --git a/src/megatron/bridge/data/Dit/data/prepare_energon_dataset.py b/src/megatron/bridge/data/Dit/data/prepare_energon_dataset.py new file mode 100644 index 0000000000..56e57684bd --- /dev/null +++ b/src/megatron/bridge/data/Dit/data/prepare_energon_dataset.py @@ -0,0 +1,117 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pylint: disable=C0115,C0116,C0301 + +import os +import pickle +from typing import Callable, List + +import nemo_run as run +import numpy as np +import torch +import torch.distributed as dist +import webdataset as wds + + +def get_start_end_idx_for_this_rank(dataset_size, rank, world_size): + """ + Calculate the start and end indices for a given rank in a distributed setting. + + Args: + dataset_size (int): The total size of the dataset. + rank (int): The rank of the current process. 
def get_start_end_idx_for_this_rank(dataset_size, rank, world_size):
    """
    Compute the half-open [start, end) slice of the dataset owned by `rank`.

    The dataset is split into `world_size` contiguous chunks of equal size;
    any remainder (dataset_size % world_size) is absorbed by the last rank.

    Args:
        dataset_size (int): The total size of the dataset.
        rank (int): The rank of the current process (0-based).
        world_size (int): The total number of processes.

    Returns:
        tuple: (start index, end index) for the given rank.
    """
    chunk = dataset_size // world_size
    start_idx = rank * chunk
    if rank == world_size - 1:
        # Last rank also takes the remainder samples.
        end_idx = dataset_size
    else:
        end_idx = start_idx + chunk
    return start_idx, end_idx


def dummy_process_func(input):
    """
    Build a synthetic webdataset sample keyed by `input`.

    Args:
        input (str): The key to be used in the sample dictionary.

    Returns:
        dict: A dictionary with:
            - "__key__": the given key.
            - ".pth": random bfloat16 image latent of shape (3, 1, 720, 1280).
            - ".pickle": pickled random text embedding of shape (512, 2048).
            - ".json": image height/width metadata.
    """
    channels, frames, height, width = 3, 1, 720, 1280
    latent = torch.randn(channels, frames, height, width, dtype=torch.bfloat16)
    embedding = np.random.randn(512, 2048)
    return {
        "__key__": input,
        ".pth": latent,
        ".pickle": pickle.dumps(embedding),
        ".json": {
            "image_height": height,
            "image_width": width,
        },
    }
+ """ + rank = dist.get_rank() + world_size = torch.distributed.get_world_size() + + start_idx, end_idx = get_start_end_idx_for_this_rank(len(inputs), rank, world_size) + os.makedirs(output_dir, exist_ok=True) + output_tar = os.path.join(output_dir, f"rank{rank}-%06d.tar") + with wds.ShardWriter(output_tar, maxcount=10000) as sink: + for i in range(start_idx, end_idx): + sample = process_func(inputs[i]) + # Write the sample to the tar file + sink.write(sample) + + +@run.cli.factory(target=prepare) +def prepare_dummy_image_dataset() -> run.Partial: + recipe = run.Partial( + prepare, + process_func=dummy_process_func, + inputs=list(str(i) for i in range(1000)), + ) + return recipe + + +if __name__ == "__main__": + dist.init_process_group("nccl") + local_rank = int(os.environ["LOCAL_RANK"]) + torch.cuda.set_device(local_rank) + run.cli.main(prepare, default_factory=prepare_dummy_image_dataset) diff --git a/src/megatron/bridge/data/Dit/data/prepare_energon_dataset_butterfly.py b/src/megatron/bridge/data/Dit/data/prepare_energon_dataset_butterfly.py new file mode 100644 index 0000000000..f4b95f4409 --- /dev/null +++ b/src/megatron/bridge/data/Dit/data/prepare_energon_dataset_butterfly.py @@ -0,0 +1,301 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 

import os
import pickle
from typing import Callable

import nemo_run as run
import numpy as np
import pandas as pd
import torch
import torch.distributed as dist
import webdataset as wds
from einops import rearrange
from transformers import T5EncoderModel, T5TokenizerFast

from nemo.collections.common.video_tokenizers.cosmos_tokenizer import CausalVideoTokenizer
from nemo.collections.common.video_tokenizers.utils import read_image, resize_video


def initialize_text_encoder(t5_cache_dir):
    """
    Load the T5-11B tokenizer and encoder model from/into a cache directory.

    The encoder is moved to CUDA and switched to eval mode.

    Args:
        t5_cache_dir (str): Path to the cache directory for the pretrained model files.

    Returns:
        tuple: (tokenizer, text_encoder) instances.
    """
    tokenizer = T5TokenizerFast.from_pretrained("google-t5/t5-11b", cache_dir=t5_cache_dir)
    text_encoder = T5EncoderModel.from_pretrained("google-t5/t5-11b", cache_dir=t5_cache_dir)
    text_encoder.to("cuda")
    text_encoder.eval()
    return tokenizer, text_encoder


# Module-level setup: runs at import time and is heavyweight (downloads data
# and models). Load dataset from HuggingFace.
df = pd.read_parquet("hf://datasets/huggan/smithsonian_butterflies_subset/data/train-00000-of-00001.parquet")
# Load Cosmos tokenizer from HuggingFace.
autoencoder = CausalVideoTokenizer.from_pretrained("Cosmos-0.1-Tokenizer-CV4x8x8")

# Load T5-XXL text encoder.
t5_cache_dir = ''  # Use your own custom cache path
tokenizer, text_encoder = initialize_text_encoder(t5_cache_dir)
+ """ + + def __init__(self, encoded_text: np.ndarray, length: int, attn_mask: np.ndarray, offset_mappings: np.ndarray): + self.encoded_text = encoded_text + self.length = length + self.attn_mask = attn_mask + self.offset_mappings = offset_mappings + + def truncate(self) -> None: + """ + Truncates the encoded text, attention mask, and offset mappings to the specified length. + """ + self.encoded_text = self.encoded_text[0 : self.length].astype(np.float16) + self.attn_mask = self.attn_mask[0 : self.length].astype(np.int32) + if self.offset_mappings is not None: + self.offset_mappings = self.offset_mappings[0 : self.length].astype(np.int32) + + +@torch.no_grad() +def encode_for_batch( + tokenizer, encoder, prompts: list[str], truncate: bool = True, max_length=512, output_mapping=True +): + """ + Encodes a batch of text prompts into T5 embeddings. + + Args: + tokenizer: Tokenizer instance for encoding. + encoder: T5 encoder model instance. + prompts (list[str]): List of text prompts to encode. + truncate (bool): If True, truncates the output embeddings. + max_length (int): Maximum length for each encoded prompt. + output_mapping (bool): If True, returns offset mappings for each prompt. + + Returns: + list[EncodedSample]: A list of encoded samples containing text encodings and masks. + """ + batch_encoding = tokenizer.batch_encode_plus( + prompts, + return_tensors="pt", + truncation=True, + padding="max_length", + max_length=max_length, + return_length=True, + return_offsets_mapping=output_mapping, + ) + + # We expect all the processing is done in GPU. 
+ input_ids = batch_encoding.input_ids.cuda() + attn_mask = batch_encoding.attention_mask.cuda() + if output_mapping: + offsets_mapping = batch_encoding["offset_mapping"] + offsets_mapping = offsets_mapping.cpu().numpy() + else: + offsets_mapping = None + + outputs = encoder(input_ids=input_ids, attention_mask=attn_mask) # type: ignore + encoded_text = outputs.last_hidden_state + + lengths = attn_mask.sum(dim=1).cpu() + for batch_id in range(encoded_text.shape[0]): + encoded_text[batch_id][lengths[batch_id] :] = 0 + + encoded_text = encoded_text.cpu().numpy() + attn_mask = attn_mask.cpu().numpy() + + encoded_text = encoded_text[:, :max_length] + attn_mask = attn_mask[:, :max_length] + + out = [] + for idx in range(encoded_text.shape[0]): + if output_mapping: + offsets = offsets_mapping[idx] + else: + offsets = None + + out.append(EncodedSample(encoded_text[idx].astype(np.float16), lengths[idx], attn_mask[idx], offsets)) + if truncate: + for x in out: + x.truncate() + return out + + +def generate_t5_embed(tokenizer, text_encoder, prompt, t5_embeding_max_length=512): + """ + Generates a T5 embedding for a single text prompt. + + Args: + tokenizer: T5 tokenizer instance. + text_encoder: T5 encoder model instance. + prompt (str): The text prompt to encode. + t5_embeding_max_length (int): Maximum length for the embedding. + + Returns: + torch.Tensor: Padded T5 embedding tensor. + """ + # encode text to t5 embedding + out = encode_for_batch(tokenizer, text_encoder, [prompt])[0] + encoded_text = torch.tensor(out.encoded_text, dtype=torch.bfloat16) + + # padding t5 embedding to t5_embeding_max_length + L, C = encoded_text.shape + t5_embed = torch.zeros(1, t5_embeding_max_length, C, dtype=torch.bfloat16) + t5_embed[0, :L] = encoded_text + + return t5_embed + + +def get_start_end_idx_for_this_rank(dataset_size, rank, world_size): + """ + Calculates the start and end indices for distributed processing based on rank. + + Args: + dataset_size (int): Total dataset size. 
def get_start_end_idx_for_this_rank(dataset_size, rank, world_size):
    """
    Calculates the start and end indices for distributed processing based on rank.

    NOTE(review): duplicate of the helper in prepare_energon_dataset.py —
    consider moving it to a shared module.

    Args:
        dataset_size (int): Total dataset size.
        rank (int): Current process rank.
        world_size (int): Total number of processes.

    Returns:
        tuple: (start index, end index) for the rank.
    """
    split_size = dataset_size // world_size
    start_idx = rank * split_size
    # The last rank takes the remainder.
    end_idx = start_idx + split_size if rank != world_size - 1 else dataset_size
    return start_idx, end_idx


def butterfly_process_func(index, rank):
    """
    Build one webdataset sample (image latent + caption embedding + metadata).

    Args:
        index (int): Index of the dataset row.
        rank (int): Current process rank, used for GPU device selection.

    Returns:
        dict: "__key__", ".pth" (bfloat16 latent), ".pickle" (pickled T5
        embedding), and ".json" (image height/width metadata).
    """
    # Local import: mediapy is an optional dependency used only here.
    import mediapy as media

    # Access the data from the dataframe.
    row = df.iloc[index]
    image_url = row["image_url"]
    image_caption = row["name"]

    # Process image: (h, w, c) -> (t=1, h, w, c).
    video = read_image(image_url)
    video = rearrange(video, 'h w (t c) -> t h w c', t=1)
    video = resize_video(video, short_size=512)

    # Ensure that h and w are divisible by 16.
    h, w = video.shape[1:3]
    video = media.resize_video(video, shape=(h // 16 * 16, w // 16 * 16))
    batch_video = video[np.newaxis, ...]

    # B x 3 x T x H x W
    batch_video = rearrange(batch_video, 'b t h w c -> b c t h w')
    # Rescale from [0, 255] to [-1, 1] before encoding.
    batch_video = (batch_video / 255.0) * 2 - 1

    # Run the autoencoder to get latents.
    image_latent = autoencoder.encode(torch.from_numpy(batch_video).to(torch.bfloat16).cuda(device=rank))[0]
    image_latent = image_latent.cpu()

    text_embedding = generate_t5_embed(tokenizer, text_encoder, image_caption)

    # Construct the sample dictionary. Bugfix: after the rearrange above the
    # layout is (b, c, t, h, w), so height/width are shape[3]/shape[4];
    # shape[2]/shape[3] previously stored T (=1) and H instead.
    sample = {
        "__key__": f"{index:06}",
        ".pth": image_latent.to(dtype=torch.bfloat16),
        ".pickle": pickle.dumps(text_embedding),
        ".json": {
            "image_height": batch_video.shape[3],
            "image_width": batch_video.shape[4],
            # Add additional score as metadata
        },
    }
    return sample


@torch.no_grad()
@run.cli.entrypoint
def prepare(process_func: Callable, output_dir: str = 'output_butterfly'):
    """
    Prepares a WebDataset using the specified processing function, for distributed settings.

    Args:
        process_func (Callable): Function to process each dataset entry.
        output_dir (str): Output directory to save the processed dataset.
    """
    from tqdm import tqdm

    rank = dist.get_rank()
    world_size = dist.get_world_size()
    print(f"Rank {rank} of {world_size} processing {len(df)} samples")
    start_idx, end_idx = get_start_end_idx_for_this_rank(len(df), rank, world_size)
    print(f"Rank {rank} of {world_size} processing {end_idx - start_idx} samples, from {start_idx} to {end_idx}")
    os.makedirs(output_dir, exist_ok=True)
    output_tar = os.path.join(output_dir, f"rank{rank}-%06d.tar")

    with wds.ShardWriter(output_tar, maxcount=10000) as sink:
        for i in tqdm(range(start_idx, end_idx)):
            sample = process_func(i, rank)
            # Write sample to the current tar shard.
            sink.write(sample)
+ + Returns: + run.Partial: Partially configured run for WebDataset preparation. + """ + recipe = run.Partial(prepare, process_func=butterfly_process_func, output_dir='butterfly_webdataset') + return recipe + + +if __name__ == '__main__': + dist.init_process_group("nccl") + local_rank = int(os.environ['LOCAL_RANK']) + torch.cuda.set_device(local_rank) + run.cli.main(prepare, default_factory=prepare_butterfly_dataset) diff --git a/src/megatron/bridge/data/Dit/data/readme.rst b/src/megatron/bridge/data/Dit/data/readme.rst new file mode 100644 index 0000000000..57a1737988 --- /dev/null +++ b/src/megatron/bridge/data/Dit/data/readme.rst @@ -0,0 +1,26 @@ +Preparing Image / Video Megatron Energon WebDataset with Cosmos Tokenizer +=========================== + +This script is an example on preparing a WebDataset for an image / video + text dataset using distributed processing with the Cosmos Tokenizer. It processes each sample by generating a **continuous** image / video latent using the Cosmos video tokenizer and a T5 embedding from the text caption. Then, the processed data is stored in a WebDataset-compatible format. + +Requirements +------------ +- **Dependencies**: + - Please use the latest NeMo dev container: ``nvcr.io/nvidia/nemo:dev`` + - You may also need to install ``jammy`` and ``mediapy`` depending on your dev container version. + +- **Data**: + - The script uses an example dataset that comes in parquet format. To use a custom, you will need to write a custom ``process_func`` and create a new factory recipe that uses your new ``process_func``. + +Usage +----- +1. **Set up your environment**: + Pull and launch the NeMo dev container to run your script. + +2. **Customize Cache Path**: + Set the T5 cache directory path in the script by specifying the `t5_cache_dir` variable. + +3. 
**Running the Script**: + To run the script on 8 GPUs, use the following command: + + ``bash torchrun --nproc_per_node=8 nemo/collections/diffusion/data/prepare_energon_dataset.py`` diff --git a/src/megatron/bridge/data/Dit/data/test_datamodule.py b/src/megatron/bridge/data/Dit/data/test_datamodule.py new file mode 100644 index 0000000000..7507960046 --- /dev/null +++ b/src/megatron/bridge/data/Dit/data/test_datamodule.py @@ -0,0 +1,95 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# pylint: disable=C0115,C0116,C0301 + +import os +import time + +import fiddle as fdl +import numpy as np +import pytest +import torch +from megatron.core import parallel_state +from megatron.bridge.data.Dit.data.diffusion_taskencoder import BasicDiffusionTaskEncoder +# from nemo_vfm.diffusion.train import multimodal_datamodule +from tqdm import tqdm + + +# Fixture to initialize distributed training only once +@pytest.fixture(scope="session", autouse=True) +def initialize_distributed(): + if not torch.distributed.is_initialized(): + rank = int(os.environ["LOCAL_RANK"]) + world_size = torch.cuda.device_count() + torch.cuda.set_device(rank) + torch.distributed.init_process_group(backend="nccl", world_size=world_size, rank=rank) + parallel_state.initialize_model_parallel() + + +# Fixture to get the value of the custom command-line option +@pytest.fixture +def path(): + return os.getenv("DATA_DIR") + + +def test_datamodule(path): + # config = multimodal_datamodule() + # config.path = path + # config.num_workers = 120 + # config.seq_length = 260 + # config.task_encoder.seq_length = 260 + # datamodule = fdl.build(config) + # Note: multimodal_datamodule is not available - commented out to fix import issues + print("test_datamodule function needs to be updated with available datamodule class") + return + # datamodule = SimpleMultiModalDataModule( + # path=path, + # seq_length=260, + # micro_batch_size=1, + # num_workers=256, + # tokenizer=None, + # image_processor=None, + # task_encoder=BasicDiffusionTaskEncoder(seq_length=260, text_embedding_padding_size=512, + # ), + # ) + + for i, batch in enumerate(datamodule.train_dataloader()): + print(batch["seq_len_q"]) + if i == 1: + start_time = time.time() + if i > 100: + break + + elapsed_time = time.time() - start_time + print(f"Elapsed time for loading 100 batches: {elapsed_time} seconds, {elapsed_time / 100} seconds per batch") + + +def test_taskencoder(): + taskencoder = BasicDiffusionTaskEncoder( + 
text_embedding_padding_size=512, + seq_length=260, + ) + + start_time = time.time() + for _ in tqdm(range(100)): + sample = { + "pth": torch.randn(3, 1, 30, 30), + "pickle": np.random.randn(256, 1024), + "json": {"image_height": 1, "image_width": 1}, + } + taskencoder.encode_sample(sample) + + elapsed_time = time.time() - start_time + print(f"Elapsed time for loading 100 batches: {elapsed_time} seconds") diff --git a/src/megatron/bridge/data/Dit/data/utils.py b/src/megatron/bridge/data/Dit/data/utils.py new file mode 100644 index 0000000000..dbe8ebadee --- /dev/null +++ b/src/megatron/bridge/data/Dit/data/utils.py @@ -0,0 +1,203 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pylint: disable=C0115,C0116,C0301 + +import numpy as np + + +def minimal_crop(tensor, target_divisor): + """ + Crops the input tensor minimally so that the total number of elements + (T * H * W) is divisible by the specified target_divisor. 
+ + Parameters: + - tensor: NumPy array of shape (C, T, H, W) + - target_divisor: Positive integer specifying the desired divisor + + Returns: + - cropped_tensor: Cropped tensor meeting the divisibility requirement + + Raises: + - ValueError: If it's impossible to meet the divisibility requirement + """ + if not isinstance(target_divisor, int) or target_divisor <= 0: + raise ValueError("target_divisor must be a positive integer greater than zero.") + + C, T, H, W = tensor.shape + total_elements = T * H * W + remainder = total_elements % target_divisor + + if remainder == 0: + return tensor # No cropping needed + + # Elements per unit length in each dimension + elements_per_T = H * W + elements_per_H = T * W + elements_per_W = T * H + + min_elements_removed = None + optimal_deltas = None + + # Limit the search range to avoid unnecessary computations + max_delta_T = min(T - 1, (remainder // elements_per_T) + 1) + max_delta_H = min(H - 1, (remainder // elements_per_H) + 1) + max_delta_W = min(W - 1, (remainder // elements_per_W) + 1) + + for delta_T in range(0, max_delta_T + 1): + for delta_H in range(0, max_delta_H + 1): + for delta_W in range(0, max_delta_W + 1): + if delta_T == delta_H == delta_W == 0: + continue # No cropping + + new_T = T - delta_T + new_H = H - delta_H + new_W = W - delta_W + + if new_T <= 0 or new_H <= 0 or new_W <= 0: + continue # Invalid dimensions + + new_total_elements = new_T * new_H * new_W + if new_total_elements % target_divisor == 0: + elements_removed = delta_T * elements_per_T + delta_H * elements_per_H + delta_W * elements_per_W + if min_elements_removed is None or elements_removed < min_elements_removed: + min_elements_removed = elements_removed + optimal_deltas = (delta_T, delta_H, delta_W) + + if optimal_deltas is None: + raise ValueError("Cannot crop tensor to meet divisibility requirement.") + + delta_T, delta_H, delta_W = optimal_deltas + + # Perform the cropping + # T dimension: crop from the end + end_T = T - delta_T + + # H 
dimension: center crop + start_H = delta_H // 2 + end_H = H - (delta_H - delta_H // 2) + + # W dimension: center crop + start_W = delta_W // 2 + end_W = W - (delta_W - delta_W // 2) + + cropped_tensor = tensor[:, :end_T, start_H:end_H, start_W:end_W] + return cropped_tensor + + +def test_no_cropping_needed(): + """Test when the tensor already meets the divisibility requirement.""" + C, T, H, W = 3, 8, 8, 8 + target_divisor = 8 + tensor = np.zeros((C, T, H, W)) + cropped_tensor = minimal_crop(tensor, target_divisor) + assert cropped_tensor.shape == (C, T, H, W) + assert (T * H * W) % target_divisor == 0 + + +def test_minimal_cropping_T_dimension(): + """Test minimal cropping along the T dimension.""" + C, T, H, W = 3, 9, 7, 6 + target_divisor = 8 + tensor = np.zeros((C, T, H, W)) + cropped_tensor = minimal_crop(tensor, target_divisor) + new_T = cropped_tensor.shape[1] + assert new_T == T - 1, cropped_tensor.shape + assert (new_T * H * W) % target_divisor == 0 + + +def test_minimal_cropping_H_dimension(): + """Test minimal cropping along the H dimension.""" + C, T, H, W = 3, 7, 9, 6 + target_divisor = 8 + tensor = np.zeros((C, T, H, W)) + cropped_tensor = minimal_crop(tensor, target_divisor) + new_H = cropped_tensor.shape[2] + assert new_H == H - 1, cropped_tensor.shape + assert (T * new_H * W) % target_divisor == 0 + + +def test_minimal_cropping_W_dimension(): + """Test minimal cropping along the W dimension.""" + C, T, H, W = 3, 4, 3, 9 + target_divisor = 8 + tensor = np.zeros((C, T, H, W)) + cropped_tensor = minimal_crop(tensor, target_divisor) + new_W = cropped_tensor.shape[3] + assert new_W == W - 1, cropped_tensor.shape + assert (T * H * new_W) % target_divisor == 0 + + +def test_cropping_multiple_dimensions(): + """Test when minimal cropping requires adjustments on multiple dimensions.""" + C, T, H, W = 3, 9, 9, 8 + target_divisor = 16 + tensor = np.zeros((C, T, H, W)) + cropped_tensor = minimal_crop(tensor, target_divisor) + new_T, new_H, new_W = 
cropped_tensor.shape[1:] + assert new_T <= T and new_H <= H and new_W <= W + assert (new_T * new_H * new_W) % target_divisor == 0 + + +def test_large_tensor_high_divisor(): + """Test with a larger tensor and higher target_divisor.""" + C, T, H, W = 3, 50, 50, 50 + target_divisor = 1024 + tensor = np.zeros((C, T, H, W)) + cropped_tensor = minimal_crop(tensor, target_divisor) + total_elements = cropped_tensor.shape[1] * cropped_tensor.shape[2] * cropped_tensor.shape[3] + assert total_elements % target_divisor == 0 + + +def test_impossible_cropping(): + """Test that an error is raised when it's impossible to meet the requirement.""" + C, T, H, W = 3, 1, 1, 1 + target_divisor = 2 + tensor = np.zeros((C, T, H, W)) + try: + minimal_crop(tensor, target_divisor) + except ValueError: + pass + + +def test_invalid_target_divisor(): + """Test that an error is raised when target_divisor is invalid.""" + C, T, H, W = 3, 8, 8, 8 + tensor = np.zeros((C, T, H, W)) + try: + minimal_crop(tensor, -1) + except ValueError: + pass + + +def test_minimal_elements_removed(): + """Test that the minimal number of elements are removed.""" + C, T, H, W = 3, 7, 7, 7 + target_divisor = 8 + tensor = np.zeros((C, T, H, W)) + cropped_tensor = minimal_crop(tensor, target_divisor) + elements_removed = (T * H * W) - (cropped_tensor.shape[1] * cropped_tensor.shape[2] * cropped_tensor.shape[3]) + print(cropped_tensor.shape) + assert elements_removed > 0 + assert (cropped_tensor.shape[1] * cropped_tensor.shape[2] * cropped_tensor.shape[3]) % target_divisor == 0 + + +test_no_cropping_needed() +test_minimal_elements_removed() +test_cropping_multiple_dimensions() +test_minimal_cropping_T_dimension() +test_minimal_cropping_H_dimension() +test_minimal_cropping_W_dimension() +test_impossible_cropping() +test_invalid_target_divisor() diff --git a/src/megatron/bridge/data/loaders.py b/src/megatron/bridge/data/loaders.py index 6c3aeda95c..7d45114436 100644 --- a/src/megatron/bridge/data/loaders.py +++ 
b/src/megatron/bridge/data/loaders.py @@ -219,7 +219,11 @@ def worker_init_fn(_): valid_dataloader = build_pretraining_data_loader( valid_ds, train_state.consumed_valid_samples, - "cyclic", + # DEBUGGING + # known issue: + # https://nvidia.slack.com/archives/C09MX7UEB0W/p1761316355203679 + # "cyclic", + "external", cfg.train.micro_batch_size, cfg.dataset.num_workers, cfg.dataset.data_sharding, diff --git a/src/megatron/bridge/data/wan/prepare_energon_dataset_vace.py b/src/megatron/bridge/data/wan/prepare_energon_dataset_vace.py new file mode 100644 index 0000000000..69b948813b --- /dev/null +++ b/src/megatron/bridge/data/wan/prepare_energon_dataset_vace.py @@ -0,0 +1,408 @@ +import os +import json +import pickle +from pathlib import Path +from typing import Dict, List, Optional, Tuple +import torch +import webdataset as wds +import cv2 +import numpy as np +from tqdm import tqdm + +from megatron.bridge.models.wan.flow_matching.flow_inference_pipeline import VACEFlowInferencePipeline +from megatron.bridge.models.wan.inference.configs import WAN_CONFIGS +from megatron.bridge.models.wan.utils.utils import patchify +from diffusers import AutoencoderKLWan +from transformers import AutoTokenizer, UMT5EncoderModel + +def _map_interpolation(resize_mode: str) -> int: + interpolation_map = { + "bilinear": cv2.INTER_LINEAR, + "bicubic": cv2.INTER_CUBIC, + "nearest": cv2.INTER_NEAREST, + "area": cv2.INTER_AREA, + "lanczos": cv2.INTER_LANCZOS4, + } + if resize_mode not in interpolation_map: + raise ValueError(f"Invalid resize_mode '{resize_mode}'. 
Choose from: {list(interpolation_map.keys())}") + return interpolation_map[resize_mode] + + +def _calculate_resize_dimensions( + original_height: int, + original_width: int, + target_size: Optional[Tuple[int, int]], + maintain_aspect_ratio: bool, +) -> Tuple[int, int]: + if target_size is None: + return original_height, original_width + + target_height, target_width = target_size + if not maintain_aspect_ratio: + return target_height, target_width + + original_aspect = original_width / max(1, original_height) + target_aspect = target_width / max(1, target_height) + + if original_aspect > target_aspect: + new_width = target_width + new_height = int(round(target_width / max(1e-6, original_aspect))) + else: + new_height = target_height + new_width = int(round(target_height * original_aspect)) + + return new_height, new_width + + +def _resize_frame( + frame: np.ndarray, + target_size: Optional[Tuple[int, int]], + resize_mode: str, + maintain_aspect_ratio: bool, + center_crop: bool, +) -> np.ndarray: + if target_size is None: + return frame + + original_height, original_width = frame.shape[:2] + resize_height, resize_width = _calculate_resize_dimensions( + original_height, original_width, target_size, maintain_aspect_ratio + ) + + interpolation = _map_interpolation(resize_mode) + resized_frame = cv2.resize(frame, (resize_width, resize_height), interpolation=interpolation) + + if maintain_aspect_ratio and center_crop: + target_height, target_width = target_size + if resize_height != target_height or resize_width != target_width: + y_start = max(0, (resize_height - target_height) // 2) + x_start = max(0, (resize_width - target_width) // 2) + y_end = min(resize_height, y_start + target_height) + x_end = min(resize_width, x_start + target_width) + resized_frame = resized_frame[y_start:y_end, x_start:x_end] + + if resized_frame.shape[0] < target_height or resized_frame.shape[1] < target_width: + pad_height = max(0, target_height - resized_frame.shape[0]) + pad_width = max(0, 
target_width - resized_frame.shape[1]) + # Handle both 2D (grayscale/mask) and 3D (RGB) frames + if resized_frame.ndim == 2: + pad_spec = ((0, pad_height), (0, pad_width)) + else: + pad_spec = ((0, pad_height), (0, pad_width), (0, 0)) + resized_frame = np.pad( + resized_frame, pad_spec, mode="constant", constant_values=0 + ) + + return resized_frame + +def _read_sidecar_caption(jsonl_path: Path) -> str: + if not jsonl_path.exists(): + return "" + try: + with open(jsonl_path, "r") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + obj = json.loads(line) + except Exception: + continue + # Prefer keys used across datasets + for key in ("vila_caption", "gemini_v2_caption", "caption", "text"): + if key in obj and isinstance(obj[key], str): + return obj[key] + # If no known key, try first string value + for v in obj.values(): + if isinstance(v, str): + return v + break + except Exception: + return "" + return "" + + +def _get_total_frames(video_path: str) -> int: + cap = cv2.VideoCapture(video_path) + total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + cap.release() + return max(0, total) + + +def _load_metadata(video_folder: Path) -> List[Dict]: + meta_path = video_folder / "meta.json" + if meta_path.exists(): + with open(meta_path, "r") as f: + return json.load(f) + + # Fallback: scan for .mp4 files with sidecar .jsonl; use full frame range + items: List[Dict] = [] + for entry in sorted(video_folder.iterdir()): + if not entry.is_file(): + continue + if entry.suffix.lower() != ".mp4": + continue + video_name = entry.name + video_path = str(entry) + total_frames = _get_total_frames(video_path) + start_frame = 0 + end_frame = max(0, total_frames - 1) + sidecar = entry.with_suffix("") + # Handle names with additional dots gracefully + sidecar_jsonl = Path(str(entry).rsplit(".", 1)[0] + ".jsonl") + caption = _read_sidecar_caption(sidecar_jsonl) + items.append( + { + "file_name": video_name, + "start_frame": start_frame, + "end_frame": end_frame, + 
"vila_caption": caption, + } + ) + if not items: + raise FileNotFoundError(f"No meta.json and no .mp4 files found in {video_folder}") + return items + +def _load_frames_cv2( + video_path: str, + start_frame: int, + end_frame: int, + target_size: Optional[Tuple[int, int]], + resize_mode: str, + maintain_aspect_ratio: bool, + center_crop: bool, + target_dtype: torch.dtype, + is_mask: bool = False, +) -> torch.Tensor: + cap = cv2.VideoCapture(video_path) + frames: List[np.ndarray] = [] + + cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame) + for frame_idx in range(start_frame, end_frame + 1): + ret, frame = cap.read() + if not ret: + break + if is_mask: + if frame.ndim == 3: + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) + else: + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + frame = _resize_frame(frame, target_size, resize_mode, maintain_aspect_ratio, center_crop) + frame = frame.astype(np.float32) / 255.0 + frames.append(frame) + cap.release() + + if not frames: + raise ValueError(f"No frames loaded from {video_path}") + + video_array = np.array(frames) # T, H, W, C (RGB) or T, H, W (mask) in [0,1] + video_tensor = torch.from_numpy(video_array) + + if is_mask: + # For masks: T, H, W -> 1, 1, T, H, W + video_tensor = video_tensor.unsqueeze(0).unsqueeze(0) # 1, 1, T, H, W + else: + # For RGB: T, H, W, C -> 1, C, T, H, W + video_tensor = video_tensor.permute(3, 0, 1, 2).unsqueeze(0) # 1, C, T, H, W + + video_tensor = video_tensor.to(dtype=target_dtype) + return video_tensor + + +@torch.no_grad() +def _encode_video_latents( + vae: AutoencoderKLWan, + device: str, + video_tensor: torch.Tensor, + # deterministic_latents: bool, +) -> torch.Tensor: + video_tensor = video_tensor.to(device=device, dtype=vae.dtype) + video_tensor = video_tensor * 2.0 - 1.0 # [0,1] -> [-1,1] + + latent_dist = vae.encode(video_tensor) + # if deterministic_latents: + # video_latents = latent_dist[0].mean + # else: + # video_latents = latent_dist[0].sample() + video_latents = latent_dist[0] + + 
latent_mean = video_latents.mean().item() + latent_std = video_latents.std().item() + + if abs(latent_mean) < 0.5 and 0.5 < latent_std < 2.0: + final_latents = video_latents + else: + if not hasattr(vae.config, "latents_mean") or not hasattr(vae.config, "latents_std"): + raise ValueError("Wan2.1 VAE requires latents_mean and latents_std in config") + latents_mean = torch.tensor(vae.config.latents_mean, device=device, dtype=vae.dtype).view(1, -1, 1, 1, 1) + latents_std = torch.tensor(vae.config.latents_std, device=device, dtype=vae.dtype).view(1, -1, 1, 1, 1) + final_latents = (video_latents - latents_mean) / latents_std + + return final_latents + +def main(): + import argparse + parser = argparse.ArgumentParser(description="Prepare VACE WebDataset shards using VACEFlowInferencePipeline") + parser.add_argument("--video_dir", type=str, required=True, help="Directory containing *_src_video.mp4 and *_mask.mp4 files") + parser.add_argument("--output_dir", type=str, required=True, help="Directory to write webdataset shards") + parser.add_argument("--checkpoint_dir", type=str, required=True, help="VACE checkpoint directory") + parser.add_argument("--checkpoint_step", type=int, default=0000, help="Checkpoint step (optional)") + parser.add_argument("--vae_checkpoint_dir", type=str, default=None, help="VAE checkpoint directory (optional)") + parser.add_argument("--t5_checkpoint_dir", type=str, default=None, help="T5 checkpoint directory (optional)") + parser.add_argument("--shard_maxcount", type=int, default=10000, help="Max samples per shard") + parser.add_argument("--device", type=str, default="cuda:0", help="Device to run the pipeline on") + parser.add_argument("--height", type=int, default=None, help="Target height for resizing frames") + parser.add_argument("--width", type=int, default=None, help="Target width for resizing frames") + parser.add_argument( + "--resize_mode", + default="bilinear", + choices=["bilinear", "bicubic", "nearest", "area", "lanczos"], + 
help="Interpolation mode for resizing", + ) + parser.add_argument("--no-aspect-ratio", action="store_true", help="Disable aspect ratio preservation") + parser.add_argument("--center-crop", action="store_true", help="Center crop to exact target size after resize") + parser.add_argument("--stochastic", action="store_true", help="Use stochastic latents from VAE encoder") + parser.add_argument("--model_name", type=str, default="vace-1.3B", choices=list(WAN_CONFIGS.keys()), help="The model name to run.") + parser.add_argument("--vace_mode", default="T2V", choices=["T2V", "I2V", "V2V"], help="VACE mode: T2V, I2V or V2V") + args = parser.parse_args() + + video_folder = Path(args.video_dir) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + shard_pattern = str(output_dir / "shard-%06d.tar") + model_dtype = torch.float16 if args.device.startswith("cuda") else torch.float32 + + # Target size + target_size = None + if args.height is not None and args.width is not None: + target_size = (args.height, args.width) + elif (args.height is None) ^ (args.width is None): + parser.error("Both --height and --width must be specified together") + + cfg = WAN_CONFIGS[args.model_name] + pipeline = VACEFlowInferencePipeline( + config=cfg, # You may need to load config as in your training/inference scripts + checkpoint_dir=args.checkpoint_dir, + checkpoint_step=args.checkpoint_step, + t5_checkpoint_dir=args.t5_checkpoint_dir, + vae_checkpoint_dir=args.vae_checkpoint_dir, + device_id=0, + rank=0, + t5_cpu=False, + tensor_parallel_size=1, + context_parallel_size=1, + pipeline_parallel_size=1, + sequence_parallel=False, + pipeline_dtype=torch.float32, + ) + pipeline.text_encoder.model.to(pipeline.device) + # Load metadata list + metadata_list = _load_metadata(video_folder) + with wds.ShardWriter(shard_pattern, maxcount=args.shard_maxcount) as sink: + written = 0 + for idx, meta in enumerate(tqdm(metadata_list)): + video_name = meta["path"] + start_frame = 
int(meta['frame_idx'].split(':')[0]) # inclusive + end_frame = int(meta['frame_idx'].split(':')[1]) # inclusive + prompt = meta["cap"] + + video_path = os.path.join(args.video_dir, video_name) + video_base = os.path.split(video_path)[0] + src_video_path = os.path.join(video_base, "src_video_obj_1.mp4") + mask_path = os.path.join(video_base, "mask_obj_1.mp4") + + video_tensor = _load_frames_cv2( + video_path=video_path, + start_frame=start_frame, + end_frame=end_frame, + target_size=target_size, + resize_mode=args.resize_mode, + maintain_aspect_ratio=not args.no_aspect_ratio, + center_crop=args.center_crop, + target_dtype=model_dtype, + ) + T, H, W = video_tensor.shape[2:5] + if not os.path.exists(src_video_path) or not os.path.exists(mask_path): + if args.vace_mode == "T2V": + src_video_tensor = torch.zeros((1, 3, T, H, W), device=pipeline.device) + mask_tensor = torch.ones((1, 1, T, H, W), device=pipeline.device).div(255.0) + elif args.vace_mode == "I2V": + #Read first frame from video as src_video and remaining frames as zeros + src_video_tensor = torch.zeros((3, T, H, W), device=video_tensor.device, dtype=video_tensor.dtype) + src_video_tensor[:, 0] = video_tensor[0, :, 0, :, :] # C, T, H, W + src_video_tensor = src_video_tensor.unsqueeze(0).to(pipeline.device) # 1, C, T, H, W + mask_tensor = torch.ones((1, 1, T, H, W), device=pipeline.device).div(255.0) # 1, 1, T, H, W + elif args.vace_mode == "V2V": + print(f"Failed to context read frames for {src_video_path}") + continue + else: + src_video_tensor = _load_frames_cv2(src_video_path, + start_frame=start_frame, + end_frame=end_frame, + target_size=target_size, + resize_mode=args.resize_mode, + maintain_aspect_ratio=not args.no_aspect_ratio, + center_crop=args.center_crop, + target_dtype=model_dtype, + ).to(pipeline.device) + mask_tensor = _load_frames_cv2(mask_path, + start_frame=start_frame, + end_frame=end_frame, + target_size=target_size, + resize_mode=args.resize_mode, + maintain_aspect_ratio=not 
args.no_aspect_ratio, + center_crop=args.center_crop, + target_dtype=model_dtype, + is_mask=True + ).to(pipeline.device) + + # Use pipeline to encode frames/masks and get vace_context + text_embed = pipeline.text_encoder([prompt], pipeline.device)[0] + latents = _encode_video_latents( + vae=pipeline.vae, + device=pipeline.device, + video_tensor=video_tensor, + # deterministic_latents=not args.stochastic, + ) + vace_context0 = pipeline.vace_encode_frames(src_video_tensor, ref_images=None, masks=mask_tensor) + mask0 = pipeline.vace_encode_masks(mask_tensor, ref_images=None) + vace_context_latent = pipeline.vace_latent(vace_context0, mask0)[0] + + vace_context_patchified = patchify([vace_context_latent], patch_size=(1,2,2))[0] + + # Move to CPU for saving and convert to float16 to reduce file size + text_embed_cpu = text_embed.detach().cpu() + latents_cpu = latents.detach().cpu() + vace_context_cpu = vace_context_patchified.detach().to(dtype=torch.float16).cpu() + + # Build JSON side-info similar to prepare_energon script + C, T, H, W = video_tensor.shape[1:] # 1,C,T,H,W + json_data = { + "video_path": video_path, + "processed_frames": int(T), + "processed_height": int(H), + "processed_width": int(W), + "caption": prompt, + "deterministic_latents": bool(not args.stochastic), + "model_version": "wan2.1", + "resize_settings": { + "target_size": target_size, + "resize_mode": args.resize_mode, + "maintain_aspect_ratio": bool(not args.no_aspect_ratio), + "center_crop": bool(args.center_crop), + }, + } + sample = { + "__key__": f"{idx:06}", + "pickle": pickle.dumps(text_embed_cpu), + "pth": latents_cpu, + "context.pth": vace_context_cpu, + "json": json_data, + } + sink.write(sample) + written += 1 + + print(f"Done writing {written} VACE samples as shards.") + +if __name__ == "__main__": + main() diff --git a/src/megatron/bridge/data/wan/prepare_energon_dataset_wan.py b/src/megatron/bridge/data/wan/prepare_energon_dataset_wan.py new file mode 100644 index 
0000000000..a8464aa6ec --- /dev/null +++ b/src/megatron/bridge/data/wan/prepare_energon_dataset_wan.py @@ -0,0 +1,404 @@ +import os +import json +import pickle +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +import cv2 +import numpy as np +import torch +import webdataset as wds + +from diffusers import AutoencoderKLWan +from transformers import AutoTokenizer, UMT5EncoderModel + + +def _map_interpolation(resize_mode: str) -> int: + interpolation_map = { + "bilinear": cv2.INTER_LINEAR, + "bicubic": cv2.INTER_CUBIC, + "nearest": cv2.INTER_NEAREST, + "area": cv2.INTER_AREA, + "lanczos": cv2.INTER_LANCZOS4, + } + if resize_mode not in interpolation_map: + raise ValueError(f"Invalid resize_mode '{resize_mode}'. Choose from: {list(interpolation_map.keys())}") + return interpolation_map[resize_mode] + + +def _calculate_resize_dimensions( + original_height: int, + original_width: int, + target_size: Optional[Tuple[int, int]], + maintain_aspect_ratio: bool, +) -> Tuple[int, int]: + if target_size is None: + return original_height, original_width + + target_height, target_width = target_size + if not maintain_aspect_ratio: + return target_height, target_width + + original_aspect = original_width / max(1, original_height) + target_aspect = target_width / max(1, target_height) + + if original_aspect > target_aspect: + new_width = target_width + new_height = int(round(target_width / max(1e-6, original_aspect))) + else: + new_height = target_height + new_width = int(round(target_height * original_aspect)) + + return new_height, new_width + + +def _resize_frame( + frame: np.ndarray, + target_size: Optional[Tuple[int, int]], + resize_mode: str, + maintain_aspect_ratio: bool, + center_crop: bool, +) -> np.ndarray: + if target_size is None: + return frame + + original_height, original_width = frame.shape[:2] + resize_height, resize_width = _calculate_resize_dimensions( + original_height, original_width, target_size, maintain_aspect_ratio + ) + + 
interpolation = _map_interpolation(resize_mode) + resized_frame = cv2.resize(frame, (resize_width, resize_height), interpolation=interpolation) + + if maintain_aspect_ratio and center_crop: + target_height, target_width = target_size + if resize_height != target_height or resize_width != target_width: + y_start = max(0, (resize_height - target_height) // 2) + x_start = max(0, (resize_width - target_width) // 2) + y_end = min(resize_height, y_start + target_height) + x_end = min(resize_width, x_start + target_width) + resized_frame = resized_frame[y_start:y_end, x_start:x_end] + + if resized_frame.shape[0] < target_height or resized_frame.shape[1] < target_width: + pad_height = max(0, target_height - resized_frame.shape[0]) + pad_width = max(0, target_width - resized_frame.shape[1]) + resized_frame = np.pad( + resized_frame, ((0, pad_height), (0, pad_width), (0, 0)), mode="constant", constant_values=0 + ) + + return resized_frame + + +def _read_sidecar_caption(jsonl_path: Path) -> str: + if not jsonl_path.exists(): + return "" + try: + with open(jsonl_path, "r") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + obj = json.loads(line) + except Exception: + continue + # Prefer keys used across datasets + for key in ("vila_caption", "gemini_v2_caption", "caption", "text"): + if key in obj and isinstance(obj[key], str): + return obj[key] + # If no known key, try first string value + for v in obj.values(): + if isinstance(v, str): + return v + break + except Exception: + return "" + return "" + + +def _get_total_frames(video_path: str) -> int: + cap = cv2.VideoCapture(video_path) + total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + cap.release() + return max(0, total) + + +def _load_metadata(video_folder: Path) -> List[Dict]: + meta_path = video_folder / "meta.json" + if meta_path.exists(): + with open(meta_path, "r") as f: + return json.load(f) + + # Fallback: scan for .mp4 files with sidecar .jsonl; use full frame range + items: List[Dict] = 
[] + for entry in sorted(video_folder.iterdir()): + if not entry.is_file(): + continue + if entry.suffix.lower() != ".mp4": + continue + video_name = entry.name + video_path = str(entry) + total_frames = _get_total_frames(video_path) + start_frame = 0 + end_frame = max(0, total_frames - 1) + sidecar = entry.with_suffix("") + # Handle names with additional dots gracefully + sidecar_jsonl = Path(str(entry).rsplit(".", 1)[0] + ".jsonl") + caption = _read_sidecar_caption(sidecar_jsonl) + items.append( + { + "file_name": video_name, + "start_frame": start_frame, + "end_frame": end_frame, + "vila_caption": caption, + } + ) + if not items: + raise FileNotFoundError(f"No meta.json and no .mp4 files found in {video_folder}") + return items + + +def _load_frames_cv2( + video_path: str, + start_frame: int, + end_frame: int, + target_size: Optional[Tuple[int, int]], + resize_mode: str, + maintain_aspect_ratio: bool, + center_crop: bool, + target_dtype: torch.dtype, +) -> torch.Tensor: + cap = cv2.VideoCapture(video_path) + frames: List[np.ndarray] = [] + + cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame) + for frame_idx in range(start_frame, end_frame + 1): + ret, frame = cap.read() + if not ret: + break + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + frame = _resize_frame(frame, target_size, resize_mode, maintain_aspect_ratio, center_crop) + frame = frame.astype(np.float32) / 255.0 + frames.append(frame) + cap.release() + + if not frames: + raise ValueError(f"No frames loaded from {video_path}") + + video_array = np.array(frames) # T, H, W, C in [0,1] + video_tensor = torch.from_numpy(video_array) # T, H, W, C + video_tensor = video_tensor.permute(3, 0, 1, 2).unsqueeze(0) # 1, C, T, H, W + video_tensor = video_tensor.to(dtype=target_dtype) + return video_tensor + + +@torch.no_grad() +def _init_hf_models( + model_id: str, + device: str, + enable_memory_optimization: bool, +): + dtype = torch.float16 if device.startswith("cuda") else torch.float32 + + text_encoder = 
UMT5EncoderModel.from_pretrained( + model_id, + subfolder="text_encoder", + torch_dtype=dtype, + ) + text_encoder.to(device) + text_encoder.eval() + + vae = AutoencoderKLWan.from_pretrained( + model_id, + subfolder="vae", + torch_dtype=dtype, + ) + vae.to(device) + vae.eval() + if enable_memory_optimization: + vae.enable_slicing() + vae.enable_tiling() + + tokenizer = AutoTokenizer.from_pretrained(model_id, subfolder="tokenizer") + + return vae, text_encoder, tokenizer, dtype + + +@torch.no_grad() +def _encode_text( + tokenizer: AutoTokenizer, + text_encoder: UMT5EncoderModel, + device: str, + caption: str, +) -> torch.Tensor: + caption = caption.strip() + inputs = tokenizer( + caption, + max_length=512, + padding="max_length", + truncation=True, + return_tensors="pt", + return_attention_mask=True, + ) + inputs = {k: v.to(device) for k, v in inputs.items()} + outputs = text_encoder(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]).last_hidden_state + return outputs + + +@torch.no_grad() +def _encode_video_latents( + vae: AutoencoderKLWan, + device: str, + video_tensor: torch.Tensor, + deterministic_latents: bool, +) -> torch.Tensor: + video_tensor = video_tensor.to(device=device, dtype=vae.dtype) + video_tensor = video_tensor * 2.0 - 1.0 # [0,1] -> [-1,1] + + latent_dist = vae.encode(video_tensor) + if deterministic_latents: + video_latents = latent_dist.latent_dist.mean + else: + video_latents = latent_dist.latent_dist.sample() + + latent_mean = video_latents.mean().item() + latent_std = video_latents.std().item() + + if abs(latent_mean) < 0.5 and 0.5 < latent_std < 2.0: + final_latents = video_latents + else: + if not hasattr(vae.config, "latents_mean") or not hasattr(vae.config, "latents_std"): + raise ValueError("Wan2.1 VAE requires latents_mean and latents_std in config") + latents_mean = torch.tensor(vae.config.latents_mean, device=device, dtype=vae.dtype).view(1, -1, 1, 1, 1) + latents_std = torch.tensor(vae.config.latents_std, 
device=device, dtype=vae.dtype).view(1, -1, 1, 1, 1) + final_latents = (video_latents - latents_mean) / latents_std + + return final_latents + + +def main(): + import argparse + + parser = argparse.ArgumentParser( + description="Prepare WAN WebDataset shards using HF automodel encoders and resizing" + ) + parser.add_argument("--video_folder", type=str, required=True, help="Folder containing videos and meta.json") + parser.add_argument("--output_dir", type=str, required=True, help="Directory to write webdataset shards") + parser.add_argument( + "--model", + default="Wan-AI/Wan2.1-T2V-14B-Diffusers", + help="Wan2.1 model ID (e.g., Wan-AI/Wan2.1-T2V-14B-Diffusers or Wan-AI/Wan2.1-T2V-1.3B-Diffusers)", + ) + parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="Device to use") + parser.add_argument( + "--stochastic", + action="store_true", + help="Use stochastic encoding (sampling) instead of deterministic posterior mean", + ) + parser.add_argument("--no-memory-optimization", action="store_true", help="Disable VAE slicing/tiling") + parser.add_argument("--shard_maxcount", type=int, default=10000, help="Max samples per shard") + + # Resize arguments (match automodel) + parser.add_argument("--height", type=int, default=None, help="Target height for video frames") + parser.add_argument("--width", type=int, default=None, help="Target width for video frames") + parser.add_argument( + "--resize_mode", + default="bilinear", + choices=["bilinear", "bicubic", "nearest", "area", "lanczos"], + help="Interpolation mode for resizing", + ) + parser.add_argument("--no-aspect-ratio", action="store_true", help="Disable aspect ratio preservation") + parser.add_argument("--center-crop", action="store_true", help="Center crop to exact target size after resize") + + args = parser.parse_args() + + video_folder = Path(args.video_folder) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + shard_pattern = str(output_dir 
/ "shard-%06d.tar") + + # Target size + target_size = None + if args.height is not None and args.width is not None: + target_size = (args.height, args.width) + elif (args.height is None) ^ (args.width is None): + parser.error("Both --height and --width must be specified together") + + # Init HF models + vae, text_encoder, tokenizer, model_dtype = _init_hf_models( + model_id=args.model, + device=args.device, + enable_memory_optimization=not args.no_memory_optimization, + ) + + # Load metadata list + metadata_list = _load_metadata(video_folder) + + with wds.ShardWriter(shard_pattern, maxcount=args.shard_maxcount) as sink: + written = 0 + for index, meta in enumerate(metadata_list): + video_name = meta["file_name"] + start_frame = int(meta["start_frame"]) # inclusive + end_frame = int(meta["end_frame"]) # inclusive + caption_text = meta.get("vila_caption", "") + + video_path = str(video_folder / video_name) + # Load frames using the same OpenCV + resize path as automodel + video_tensor = _load_frames_cv2( + video_path=video_path, + start_frame=start_frame, + end_frame=end_frame, + target_size=target_size, + resize_mode=args.resize_mode, + maintain_aspect_ratio=not args.no_aspect_ratio, + center_crop=args.center_crop, + target_dtype=model_dtype, + ) + + # Encode text and video with HF models exactly like automodel + text_embed = _encode_text(tokenizer, text_encoder, args.device, caption_text) + latents = _encode_video_latents(vae, args.device, video_tensor, deterministic_latents=not args.stochastic) + + # Move to CPU without changing dtype; keep exact values to match automodel outputs + text_embed_cpu = text_embed.detach().to(device="cpu") + latents_cpu = latents.detach().to(device="cpu") + + # Reshape to match Mcore's Wan input format + text_embed_cpu = text_embed_cpu[0] + latents_cpu = latents_cpu[0] + + # Build JSON side-info similar to prepare_energon script + C, T, H, W = video_tensor.shape[1:] # 1,C,T,H,W + json_data = { + "video_path": video_path, + 
"processed_frames": int(T), + "processed_height": int(H), + "processed_width": int(W), + "caption": caption_text, + "deterministic_latents": bool(not args.stochastic), + "memory_optimization": bool(not args.no_memory_optimization), + "model_version": "wan2.1", + "resize_settings": { + "target_size": target_size, + "resize_mode": args.resize_mode, + "maintain_aspect_ratio": bool(not args.no_aspect_ratio), + "center_crop": bool(args.center_crop), + }, + } + + sample = { + "__key__": f"{index:06}", + "pth": latents_cpu, + "pickle": pickle.dumps(text_embed_cpu), + "json": json_data, + } + sink.write(sample) + written += 1 + + print("Done writing shards using HF automodel encoders.") + + +if __name__ == "__main__": + main() + + diff --git a/src/megatron/bridge/data/wan/wan_energon_datamodule.py b/src/megatron/bridge/data/wan/wan_energon_datamodule.py new file mode 100644 index 0000000000..0f38ea00f6 --- /dev/null +++ b/src/megatron/bridge/data/wan/wan_energon_datamodule.py @@ -0,0 +1,69 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# pylint: disable=C0115,C0116,C0301 + +from dataclasses import dataclass +import logging +from typing import Any, Dict, Literal + +from torch import int_repr + +from megatron.bridge.data.Dit.data.diffusion_energon_datamodule import DiffusionDataModule +from megatron.bridge.data.wan.wan_taskencoder import WanTaskEncoder, VaceTaskEncoder +from megatron.bridge.data.utils import DatasetBuildContext, DatasetProvider + +@dataclass(kw_only=True) +class WanDataModuleConfig(DatasetProvider): + path: str + seq_length: int + micro_batch_size: int + global_batch_size: int + num_workers: int_repr + dataloader_type: str = "external" + + def __post_init__(self): + self.dataset = DiffusionDataModule( + path=self.path, + seq_length=self.seq_length, + task_encoder=WanTaskEncoder(seq_length=self.seq_length), + micro_batch_size=self.micro_batch_size, + global_batch_size=self.global_batch_size, + num_workers=self.num_workers) + self.sequence_length = self.dataset.seq_length + + def build_datasets(self, context: DatasetBuildContext): + return self.dataset.train_dataloader(), self.dataset.train_dataloader(), self.dataset.train_dataloader() + +@dataclass(kw_only=True) +class VaceDataModuleConfig(DatasetProvider): + path: str + seq_length: int + micro_batch_size: int + global_batch_size: int + num_workers: int_repr + dataloader_type: str = "external" + + def __post_init__(self): + self.dataset = DiffusionDataModule( + path=self.path, + seq_length=self.seq_length, + task_encoder=VaceTaskEncoder(seq_length=self.seq_length), + micro_batch_size=self.micro_batch_size, + global_batch_size=self.global_batch_size, + num_workers=self.num_workers) + self.sequence_length = self.dataset.seq_length + + def build_datasets(self, context: DatasetBuildContext): + return self.dataset.train_dataloader(), self.dataset.train_dataloader(), self.dataset.train_dataloader() \ No newline at end of file diff --git a/src/megatron/bridge/data/wan/wan_taskencoder.py b/src/megatron/bridge/data/wan/wan_taskencoder.py 
new file mode 100644 index 0000000000..36e22287a0 --- /dev/null +++ b/src/megatron/bridge/data/wan/wan_taskencoder.py @@ -0,0 +1,319 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pylint: disable=C0115,C0116,C0301 + +import torch +import torch.nn.functional as F +from megatron.energon import DefaultTaskEncoder, SkipSample +from megatron.energon.task_encoder.cooking import Cooker, basic_sample_keys +from megatron.bridge.models.wan.utils.utils import grid_sizes_calculation, patchify +from megatron.core import parallel_state + + +def cook(sample: dict) -> dict: + """ + Processes a raw sample dictionary from energon dataset and returns a new dictionary with specific keys. + + Args: + sample (dict): The input dictionary containing the raw sample data. + + Returns: + dict: A new dictionary containing the processed sample data with the following keys: + - All keys from the result of `basic_sample_keys(sample)` + - 'json': The contains meta data like resolution, aspect ratio, fps, etc. + - 'pth': contains video latent tensor + - 'pickle': contains text embeddings + """ + return dict( + **basic_sample_keys(sample), + json=sample["json"], + pth=sample["pth"], + pickle=sample["pickle"], + ) + +def cook_vace(sample: dict) -> dict: + """ + Processes a raw sample dictionary from energon dataset and returns a new dictionary with specific keys. 
+ + Args: + sample (dict): The input dictionary containing the raw sample data. + + Returns: + dict: A new dictionary containing the processed sample data with the following keys: + - All keys from the result of `basic_sample_keys(sample)` + - 'json': The contains meta data like resolution, aspect ratio, fps, etc. + - 'pth': contains video latent tensor + - 'pickle': contains text embeddings + """ + return dict( + **basic_sample_keys(sample), + json=sample["json"], + pth=sample["pth"], + pickle=sample["pickle"], + context_pth=sample["context.pth"], + ) + + +class WanTaskEncoder(DefaultTaskEncoder): + """ + Task encoder for Wan dataset. + Attributes: + cookers (list): A list of Cooker objects used for processing. + patch_spatial (int): The spatial patch size. Defaults to 2. + patch_temporal (int): The temporal patch size. Defaults to 1. + seq_length (int): The sequence length. Defaults to 1024. + """ + + cookers = [ + Cooker(cook), + ] + + def __init__( + self, + *args, + max_frames: int = None, + patch_spatial: int = 2, + patch_temporal: int = 1, + seq_length: int = 1024, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.patch_spatial = patch_spatial + self.patch_temporal = patch_temporal + self.seq_length = seq_length + + + def encode_sample(self, sample: dict) -> dict: + + video_latent = sample["pth"] + context_embeddings = sample["pickle"] + video_metadata = sample["json"] + + # sanity quality check + if torch.isnan(video_latent).any() or torch.isinf(video_latent).any(): + raise SkipSample() + if torch.max(torch.abs(video_latent)) > 1e3: + raise SkipSample() + + # calculate grid size + grid_size = grid_sizes_calculation( + input_shape = video_latent.shape[1:], + patch_size = (self.patch_temporal, self.patch_spatial, self.patch_spatial), + ) + + ### Note: shape of sample's values + # video_latent: [latents_channels, F_latents, W_latents, H_latents] + # grid_size: [F_patches, W_patches, H_patches] + # context_embeddings: [context_seq_len, 
text_embedding_dim] + + return dict( + video_latent=video_latent, + grid_size=grid_size, + context_embeddings=context_embeddings, + video_metadata=video_metadata, + ) + + + # def encode_sample(self, sample: dict) -> dict: + + # # mock encode sample + # video_latent = torch.tensor(torch.randn(16, 3, 104, 60), dtype=torch.float32) + # # video_latent = torch.tensor(torch.randn(16, 24, 104, 60), dtype=torch.float32) + # grid_size = torch.tensor([video_latent.shape[1] // self.patch_temporal, video_latent.shape[2] // self.patch_spatial, video_latent.shape[3] // self.patch_spatial], dtype=torch.int32) + # context_embeddings = torch.tensor(torch.randn(512, 4096), dtype=torch.float32) + # video_metadata = {} + + # return dict( + # video_latent=video_latent, + # grid_size=grid_size, + # context_embeddings=context_embeddings, + # video_metadata=video_metadata, + # ) + + + def batch(self, samples: list[dict]) -> dict: + + # process video latents + # do padding here for video latents + self.patch_size = (self.patch_temporal, self.patch_spatial, self.patch_spatial) + + # running patchify + video_latents = patchify([sample["video_latent"] for sample in samples], self.patch_size) + + # build per-sample loss masks (1 for valid tokens pre-padding) + loss_masks = [torch.ones(v.shape[0]) for v in video_latents] + # calculate all sequence lengths of video latents for self-attention (for videos, we do this before padding to get original seq len) + seq_len_q = [v.shape[0] for v in video_latents] + seq_len_q = torch.tensor(seq_len_q, dtype=torch.int32) + + + # padding and stack video latents + max_video_seq_len = max([video_latent.shape[0] for video_latent in video_latents]) + # CAVEAT: + # when using pipeline parallelism, we need to set batch sequence length to DataModule's seq_length because + # because pipeline parallelism requires pre-specified sequence length to create buffer + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + if max_video_seq_len > self.seq_length: + 
raise ValueError(f"max_video_seq_len {max_video_seq_len} is greater than DataModule's seq_length {self.seq_length}") + else: + # set max_video_seq_len to DataModule's seq_length + max_video_seq_len = self.seq_length + # CAVEAT: + # when using context parallelism, we need to pad batch sequence length to be divisible by [cp_rank*2] + # (because TransformerEngine's context parallelism requires "AssertionError: Sequence length per GPU needs to be divisible by 2!") + if parallel_state.get_context_parallel_world_size() > 1: + batch_size = len(video_latents) + assert batch_size == 1, "Error: Batch size must be 1 when using context parallelism" + sharding_factor = parallel_state.get_context_parallel_world_size() * 2 + max_video_seq_len = ((max_video_seq_len + sharding_factor - 1) // sharding_factor) * sharding_factor + video_latents = [F.pad(video_latent, (0, 0, 0, max_video_seq_len - video_latent.shape[0])) for video_latent in video_latents] + video_latents = torch.stack(video_latents, dim=1) + # pad and stack loss masks to shape [S_max, B] + loss_masks = [F.pad(m, (0, max_video_seq_len - m.shape[0])) for m in loss_masks] + loss_masks = torch.stack(loss_masks, dim=1) + + # process grid sizes + grid_sizes = [torch.tensor(sample["grid_size"], dtype=torch.int32) for sample in samples] + grid_sizes = torch.stack(grid_sizes, dim=0) + + # process text embeddings + # pad here for text embeddings + context_max_len = 512 + context_embeddings = [sample["context_embeddings"] for sample in samples] + context_embeddings = [F.pad(context_embedding, (0, 0, 0, context_max_len - context_embedding.shape[0])) for context_embedding in context_embeddings] + # calculate all sequence lengths of context embeddings for cross-attention (for videos, we do this after padding to get padded seq len) + seq_len_kv = [c.shape[0] for c in context_embeddings] + seq_len_kv = torch.tensor(seq_len_kv, dtype=torch.int32) + # stack context embeddings + context_embeddings = torch.stack(context_embeddings, dim=1) 
+ + # process video metadata + video_metadata = [sample["video_metadata"] for sample in samples] + + return dict( + video_latents = video_latents, + max_video_seq_len = max_video_seq_len, + grid_sizes = grid_sizes, + context_embeddings = context_embeddings, + loss_mask = loss_masks, + seq_len_q = seq_len_q, + seq_len_kv = seq_len_kv, + video_metadata = video_metadata, + ) + +class VaceTaskEncoder(WanTaskEncoder): + """ + Task encoder for VACE datasets. + + Extends WanTaskEncoder by additionally reading `vace_context` from the + energon sample (stored as `context.pth`) and batching it alongside the + video latents, text embeddings, and metadata. + """ + + # Use a cooker that extracts the additional `context.pth` key + cookers = [ + Cooker(cook_vace), + ] + + def encode_sample(self, sample: dict) -> dict: + """Encode single VACE sample, including vace_context. + + Expected sample keys (post-cook): + - pth: video latents tensor + - pickle: text embeddings + - json: metadata + - context_pth: vace context latents tensor + """ + + video_latent = sample["pth"] + context_embeddings = sample["pickle"] + video_metadata = sample["json"] + vace_context = sample.get("context_pth", None) + + # Sanity checks on video latents + if torch.isnan(video_latent).any() or torch.isinf(video_latent).any(): + raise SkipSample() + if torch.max(torch.abs(video_latent)) > 1e3: + raise SkipSample() + + # calculate grid size for video latents + grid_size = grid_sizes_calculation( + input_shape=video_latent.shape[1:], + patch_size=(self.patch_temporal, self.patch_spatial, self.patch_spatial), + ) + + encoded = dict( + video_latent=video_latent, + grid_size=grid_size, + context_embeddings=context_embeddings, + video_metadata=video_metadata, + ) + + # Optional: include vace_context if present + if vace_context is not None: + encoded["vace_context"] = vace_context + + return encoded + + def batch(self, samples: list[dict]) -> dict: + """Batch VACE samples, padding vace_context to match sequence 
length. + + The vace_context is expected to have its first dimension aligned with the + patchified sequence dimension of video latents. If shapes are incompatible, + the sample is skipped. + """ + + # First, run base batching for video/text/metadata + base = super().batch(samples) + + # If none of the samples include vace_context, return base + if not any("vace_context" in s for s in samples): + return base + + # Prepare/pad vace_context to [S_max, B, ...] like video_latents + vace_context_list = [] + seq_lengths = [] + for s in samples: + vc = s.get("vace_context", None) + if vc is None: + raise SkipSample() + + # Dataset provides pre-patchified 2D tensors [num_patches, feature_dim] + if vc.ndim != 2: + raise SkipSample(f"Expected 2D vace_context, got shape {vc.shape}") + + # Ensure tensor dtype/device consistency + vc = vc.to(dtype=base["video_latents"].dtype, device=base["video_latents"].device) + seq_lengths.append(vc.shape[0]) + vace_context_list.append(vc) + + # Determine max sequence length used for video_latents in base (after padding) + S_max = base["max_video_seq_len"] + + # Pad each vace_context to S_max along the first dimension and stack to [S_max, B, ...] + # vace_context tensors are 2D [S, D] for the model + if not all(vc.ndim == 2 for vc in vace_context_list): + raise SkipSample() + vace_context_list = [F.pad(vc, (0, 0, 0, S_max - vc.shape[0])) for vc in vace_context_list] + + # Stack along batch dim 1 for consistency with video_latents [S_max, B, ...] 
+ try: + vace_context = torch.stack(vace_context_list, dim=1) + except Exception: + # If stacking fails due to mismatched trailing dims, skip these samples + raise SkipSample() + + base["vace_context"] = vace_context + return base \ No newline at end of file diff --git a/src/megatron/bridge/models/DiTModel/diffusers_vae.py b/src/megatron/bridge/models/DiTModel/diffusers_vae.py new file mode 100644 index 0000000000..04b34446ca --- /dev/null +++ b/src/megatron/bridge/models/DiTModel/diffusers_vae.py @@ -0,0 +1,36 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# pylint: disable=C0115,C0116,C0301 + +import torch +from diffusers import AutoencoderKL +from einops import rearrange + + +class AutoencoderKLVAE(torch.nn.Module): + def __init__(self, path): + super().__init__() + self.vae = AutoencoderKL.from_pretrained(path, torch_dtype=torch.bfloat16) + + @torch.no_grad() + def decode(self, x): + B, C, T, H, W = x.shape + if T == 1: + x = rearrange(x, "b c t h w -> (b t) c h w") + x = x / self.vae.config.scaling_factor + out = self.vae.decode(x, return_dict=False)[0] + if T == 1: + return rearrange(out, "(b t) c h w -> b c t h w", t=1) + return out diff --git a/src/megatron/bridge/models/DiTModel/dit_attention.py b/src/megatron/bridge/models/DiTModel/dit_attention.py new file mode 100644 index 0000000000..c0336529bf --- /dev/null +++ b/src/megatron/bridge/models/DiTModel/dit_attention.py @@ -0,0 +1,460 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +# pylint: disable=C0115,C0116,C0301 + +from dataclasses import dataclass +from typing import Union + +import torch +from megatron.core.models.common.embeddings.rotary_pos_embedding import apply_rotary_pos_emb +from megatron.core.transformer.attention import Attention, SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.custom_layers.transformer_engine import SplitAlongDim +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig + + +@dataclass +class JointSelfAttentionSubmodules: + linear_qkv: Union[ModuleSpec, type] = None + added_linear_qkv: Union[ModuleSpec, type] = None + core_attention: Union[ModuleSpec, type] = None + linear_proj: Union[ModuleSpec, type] = None + q_layernorm: Union[ModuleSpec, type] = None + k_layernorm: Union[ModuleSpec, type] = None + added_q_layernorm: Union[ModuleSpec, type] = None + added_k_layernorm: Union[ModuleSpec, type] = None + + +# pylint: disable=C0116 
+class JointSelfAttention(Attention): + """Joint Self-attention layer class + + Used for MMDIT-like transformer block. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: JointSelfAttentionSubmodules, + layer_number: int, + attn_mask_type=AttnMaskType.padding, + context_pre_only: bool = False, + ): + super().__init__( + config=config, + submodules=submodules, + layer_number=layer_number, + attn_mask_type=attn_mask_type, + attention_type="self", + ) + + self.linear_qkv = build_module( + submodules.linear_qkv, + self.config.hidden_size, + self.query_projection_size + 2 * self.kv_projection_size, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=self.config.add_bias_linear or self.config.add_qkv_bias, + skip_bias_add=False, + is_expert=False, + tp_comm_buffer_name="qkv", + ) + + if submodules.added_linear_qkv is not None: + self.added_linear_qkv = build_module( + submodules.added_linear_qkv, + self.config.hidden_size, + self.query_projection_size + 2 * self.kv_projection_size, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=self.config.add_qkv_bias, + skip_bias_add=False, + is_expert=False, + tp_comm_buffer_name="qkv", + ) + + if not context_pre_only: + self.added_linear_proj = build_module( + submodules.linear_proj, + self.query_projection_size, + self.config.hidden_size, + config=self.config, + init_method=self.config.output_layer_init_method, + bias=self.config.add_bias_linear, + input_is_parallel=True, + skip_bias_add=True, + is_expert=False, + tp_comm_buffer_name="proj", + ) + + if submodules.q_layernorm is not None: + self.q_layernorm = build_module( + submodules.q_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.q_layernorm = None + + if submodules.k_layernorm is not None: + self.k_layernorm = build_module( + submodules.k_layernorm, + 
hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.k_layernorm = None + + if submodules.added_q_layernorm is not None: + self.added_q_layernorm = build_module( + submodules.added_q_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.added_q_layernorm = None + + if submodules.added_k_layernorm is not None: + self.added_k_layernorm = build_module( + submodules.added_k_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.added_k_layernorm = None + + def _split_qkv(self, mixed_qkv): + # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn] + new_tensor_shape = mixed_qkv.size()[:-1] + ( + self.num_query_groups_per_partition, + ( + (self.num_attention_heads_per_partition // self.num_query_groups_per_partition + 2) + * self.hidden_size_per_attention_head + ), + ) + mixed_qkv = mixed_qkv.view(*new_tensor_shape) + + split_arg_list = [ + ( + self.num_attention_heads_per_partition + // self.num_query_groups_per_partition + * self.hidden_size_per_attention_head + ), + self.hidden_size_per_attention_head, + self.hidden_size_per_attention_head, + ] + + if SplitAlongDim is not None: + # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] + (query, key, value) = SplitAlongDim( + mixed_qkv, + 3, + split_arg_list, + ) + else: + # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] + (query, key, value) = torch.split( + mixed_qkv, + split_arg_list, + dim=3, + ) + + # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] + query = query.reshape(query.size(0), query.size(1), -1, self.hidden_size_per_attention_head) + return query, key, value + + def get_query_key_value_tensors(self, hidden_states, key_value_states=None): + """ + Derives `query`, `key` and `value` tensors from 
`hidden_states`. + """ + # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] + mixed_qkv, _ = self.linear_qkv(hidden_states) + + query, key, value = self._split_qkv(mixed_qkv) + + if self.config.test_mode: + self.run_realtime_tests() + + if self.q_layernorm is not None: + query = self.q_layernorm(query) + + if self.k_layernorm is not None: + key = self.k_layernorm(key) + + return query, key, value + + def get_added_query_key_value_tensors(self, added_hidden_states, key_value_states=None): + """ + Derives `query`, `key` and `value` tensors from `hidden_states`. + """ + # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] + mixed_qkv, _ = self.added_linear_qkv(added_hidden_states) + + query, key, value = self._split_qkv(mixed_qkv) + + if self.config.test_mode: + self.run_realtime_tests() + + if self.added_q_layernorm is not None: + query = self.added_q_layernorm(query) + + if self.added_k_layernorm is not None: + key = self.added_k_layernorm(key) + + return query, key, value + + def forward( + self, + hidden_states, + attention_mask, + key_value_states=None, + inference_params=None, + rotary_pos_emb=None, + packed_seq_params=None, + additional_hidden_states=None, + ): + # hidden_states: [sq, b, h] + # For self attention we just duplicate the rotary_pos_emb if it isn't already + if rotary_pos_emb is not None and not isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = (rotary_pos_emb,) * 2 + # ===================== + # Query, Key, and Value + # ===================== + # Get the query, key and value tensors based on the type of attention - + # self or cross attn. 
+ + query, key, value = self.get_query_key_value_tensors(hidden_states) + added_query, added_key, added_value = self.get_added_query_key_value_tensors(additional_hidden_states) + + query = torch.cat([added_query, query], dim=0) + key = torch.cat([added_key, key], dim=0) + value = torch.cat([added_value, value], dim=0) + + # =================================================== + # Adjust key, value, and rotary_pos_emb for inference + # =================================================== + query, key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference( + inference_params, query, key, value, rotary_pos_emb + ) + + if packed_seq_params is not None: + query = query.squeeze(1) + key = key.squeeze(1) + value = value.squeeze(1) + + # ================================================ + # relative positional embedding (rotary embedding) + # ================================================ + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + + if packed_seq_params is not None: + cu_seqlens_q = packed_seq_params.cu_seqlens_q + cu_seqlens_kv = packed_seq_params.cu_seqlens_kv + else: + cu_seqlens_q = cu_seqlens_kv = None + query = apply_rotary_pos_emb( + query, + q_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_q, + ) + key = apply_rotary_pos_emb( + key, + k_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_kv, + ) + + # TODO, can apply positional embedding to value_layer so it has + # absolute positional embedding. 
+ # otherwise, only relative positional embedding takes effect + # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb) + + # ================================== + # core attention computation + # ================================== + if self.checkpoint_core_attention and self.training: + core_attn_out = self._checkpointed_attention_forward( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + else: + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + + if packed_seq_params is not None: + # reshape to same output shape as unpacked case + # (t, np, hn) -> (t, b=1, h=np*hn) + # t is the pack size = sum (sq_i) + # note that batch is a dummy dimension in the packed case + core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1) + + # ================= + # Output. [sq, b, h] + # ================= + encoder_attention_output = core_attn_out[: additional_hidden_states.shape[0], :, :] + attention_output = core_attn_out[additional_hidden_states.shape[0] :, :, :] + + output, bias = self.linear_proj(attention_output) + encoder_output, encoder_bias = self.added_linear_proj(encoder_attention_output) + + output = output + bias + encoder_output = encoder_output + encoder_bias + + return output, encoder_output + + +class FluxSingleAttention(SelfAttention): + """Self-attention layer class + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. 
+ """ + + def __init__( + self, + config: TransformerConfig, + submodules: SelfAttentionSubmodules, + layer_number: int, + attn_mask_type=AttnMaskType.padding, + cp_comm_type: str = None, + ): + super().__init__( + config=config, + submodules=submodules, + layer_number=layer_number, + attn_mask_type=attn_mask_type, + cp_comm_type=cp_comm_type, + ) + self.linear_proj = build_module( + submodules.linear_proj, + self.query_projection_size, + self.config.hidden_size, + config=self.config, + init_method=self.config.output_layer_init_method, + bias=False, + input_is_parallel=True, + skip_bias_add=True, + is_expert=False, + tp_comm_buffer_name="proj", + ) + + def forward( + self, + hidden_states, + attention_mask, + key_value_states=None, + inference_params=None, + rotary_pos_emb=None, + packed_seq_params=None, + ): + # hidden_states: [sq, b, h] + + # For self attention we just duplicate the rotary_pos_emb if it isn't already + if rotary_pos_emb is not None and not isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = (rotary_pos_emb,) * 2 + + # ===================== + # Query, Key, and Value + # ===================== + # Get the query, key and value tensors based on the type of attention - + # self or cross attn. 
+ query, key, value = self.get_query_key_value_tensors(hidden_states, key_value_states) + # print(f'megatron q before ln: {query.transpose(0, 1).contiguous()}, {query.transpose(0, 1).contiguous().shape}') + # print(f'megatron k before ln: {key.transpose(0, 1).contiguous()}, {key.transpose(0, 1).contiguous().shape}') + # print(f'megatron v before ln: {value.transpose(0, 1).contiguous()}, {value.transpose(0, 1).contiguous().shape}') + + # =================================================== + # Adjust key, value, and rotary_pos_emb for inference + # =================================================== + query, key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference( + inference_params, query, key, value, rotary_pos_emb + ) + + if packed_seq_params is not None: + query = query.squeeze(1) + key = key.squeeze(1) + value = value.squeeze(1) + + # ================================================ + # relative positional embedding (rotary embedding) + # ================================================ + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + + if packed_seq_params is not None: + cu_seqlens_q = packed_seq_params.cu_seqlens_q + cu_seqlens_kv = packed_seq_params.cu_seqlens_kv + else: + cu_seqlens_q = cu_seqlens_kv = None + query = apply_rotary_pos_emb( + query, + q_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_q, + ) + key = apply_rotary_pos_emb( + key, + k_pos_emb, + config=self.config, + cu_seqlens=cu_seqlens_kv, + ) + + # TODO, can apply positional embedding to value_layer so it has + # absolute positional embedding. 
+ # otherwise, only relative positional embedding takes effect + # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb) + + # ================================== + # core attention computation + # ================================== + + if self.checkpoint_core_attention and self.training: + core_attn_out = self._checkpointed_attention_forward( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + else: + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + + if packed_seq_params is not None: + # reshape to same output shape as unpacked case + # (t, np, hn) -> (t, b=1, h=np*hn) + # t is the pack size = sum (sq_i) + # note that batch is a dummy dimension in the packed case + core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1) + + output, _ = self.linear_proj(core_attn_out) + return output + + +# pylint: disable=C0116 diff --git a/src/megatron/bridge/models/DiTModel/dit_embeddings.py b/src/megatron/bridge/models/DiTModel/dit_embeddings.py new file mode 100644 index 0000000000..5bbfd5db6b --- /dev/null +++ b/src/megatron/bridge/models/DiTModel/dit_embeddings.py @@ -0,0 +1,247 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# pylint: disable=C0115,C0116,C0301 + + +import logging +from typing import Optional + +import torch +from diffusers.models.embeddings import TimestepEmbedding, get_3d_sincos_pos_embed +from einops import rearrange +from megatron.core import parallel_state +from megatron.core.transformer.module import MegatronModule +from torch import nn + + +log = logging.getLogger(__name__) + + +class SDXLTimestepEmbedding(nn.Module): + def __init__(self, in_features: int, out_features: int, use_adaln_lora: bool = False): + super().__init__() + log.critical( + f"Using AdaLN LoRA Flag: {use_adaln_lora}. We enable bias if no AdaLN LoRA for backward compatibility." + ) + self.linear_1 = nn.Linear(in_features, out_features, bias=not use_adaln_lora) + self.activation = nn.SiLU() + self.use_adaln_lora = use_adaln_lora + if use_adaln_lora: + self.linear_2 = nn.Linear(out_features, 3 * out_features, bias=False) + else: + self.linear_2 = nn.Linear(out_features, out_features, bias=True) + + def forward(self, sample: torch.Tensor) -> torch.Tensor: + emb = self.linear_1(sample) + emb = self.activation(emb) + emb = self.linear_2(emb) + + if self.use_adaln_lora: + adaln_lora_B_3D = emb + emb_B_D = sample + else: + emb_B_D = emb + adaln_lora_B_3D = None + + return emb_B_D, adaln_lora_B_3D + + +class ParallelSDXLTimestepEmbedding(SDXLTimestepEmbedding): + def __init__( + self, + in_features: int, + out_features: int, + use_adaln_lora: bool = False, + seed: Optional[int] = None, + ): + super().__init__( + in_features=in_features, + out_features=out_features, + use_adaln_lora=use_adaln_lora, + ) + if seed is not None: + with torch.random.fork_rng(): + torch.manual_seed(seed) + self.linear_1.reset_parameters() + self.linear_2.reset_parameters() + + # Check for pipeline model parallelism and set attributes accordingly + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + setattr(self.linear_1.weight, "pipeline_parallel", True) + if self.linear_1.bias is not None: + 
setattr(self.linear_1.bias, "pipeline_parallel", True) + setattr(self.linear_2.weight, "pipeline_parallel", True) + if self.linear_2.bias is not None: + setattr(self.linear_2.bias, "pipeline_parallel", True) + + def forward(self, sample: torch.Tensor) -> torch.Tensor: + sample = sample.to(torch.bfloat16, non_blocking=True) + return super().forward(sample) + + +class ParallelTimestepEmbedding(TimestepEmbedding): + """ + ParallelTimestepEmbedding is a subclass of TimestepEmbedding that initializes + the embedding layers with an optional random seed for syncronization. + + Args: + in_channels (int): Number of input channels. + time_embed_dim (int): Dimension of the time embedding. + seed (int, optional): Random seed for initializing the embedding layers. + If None, no specific seed is set. + + Attributes: + linear_1 (nn.Module): First linear layer for the embedding. + linear_2 (nn.Module): Second linear layer for the embedding. + + Methods: + __init__(in_channels, time_embed_dim, seed=None): Initializes the embedding layers. + """ + + def __init__(self, in_channels: int, time_embed_dim: int, seed=None): + super().__init__(in_channels=in_channels, time_embed_dim=time_embed_dim) + if seed is not None: + with torch.random.fork_rng(): + torch.manual_seed(seed) + self.linear_1.reset_parameters() + self.linear_2.reset_parameters() + + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + setattr(self.linear_1.weight, "pipeline_parallel", True) + setattr(self.linear_1.bias, "pipeline_parallel", True) + setattr(self.linear_2.weight, "pipeline_parallel", True) + setattr(self.linear_2.bias, "pipeline_parallel", True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Computes the positional embeddings for the input tensor. + + Args: + x (torch.Tensor): Input tensor of shape (B, T, H, W, C). + + Returns: + torch.Tensor: Positional embeddings of shape (B, T, H, W, C). 
+ """ + return super().forward(x.to(torch.bfloat16, non_blocking=True)) + + +def get_pos_emb_on_this_cp_rank(pos_emb, seq_dim): + """ + Adjusts the positional embeddings tensor to the current context parallel rank. + + Args: + pos_emb (torch.Tensor): The positional embeddings tensor. + seq_dim (int): The sequence dimension index in the positional embeddings tensor. + + Returns: + torch.Tensor: The adjusted positional embeddings tensor for the current context parallel rank. + """ + cp_size = parallel_state.get_context_parallel_world_size() + cp_rank = parallel_state.get_context_parallel_rank() + cp_idx = torch.tensor([cp_rank], device="cpu", pin_memory=True).cuda(non_blocking=True) + pos_emb = pos_emb.view(*pos_emb.shape[:seq_dim], cp_size, -1, *pos_emb.shape[(seq_dim + 1) :]) + pos_emb = pos_emb.index_select(seq_dim, cp_idx) + pos_emb = pos_emb.view(*pos_emb.shape[:seq_dim], -1, *pos_emb.shape[(seq_dim + 2) :]) + return pos_emb + + +class SinCosPosEmb3D(MegatronModule): + """ + SinCosPosEmb3D is a 3D sine-cosine positional embedding module. + + Args: + model_channels (int): Number of channels in the model. + h (int): Length of the height dimension. + w (int): Length of the width dimension. + t (int): Length of the temporal dimension. + spatial_interpolation_scale (float, optional): Scale factor for spatial interpolation. Default is 1.0. + temporal_interpolation_scale (float, optional): Scale factor for temporal interpolation. Default is 1.0. + + Methods: + forward(pos_ids: torch.Tensor) -> torch.Tensor: + Computes the positional embeddings for the input tensor. + + Args: + pos_ids (torch.Tensor): Input tensor of shape (B S 3). + + Returns: + torch.Tensor: Positional embeddings of shape (B S D). 
+ """ + + def __init__( + self, + config, + h: int, + w: int, + t: int, + spatial_interpolation_scale=1.0, + temporal_interpolation_scale=1.0, + ): + super().__init__(config=config) + self.h = h + self.w = w + self.t = t + # h w t + param = get_3d_sincos_pos_embed( + config.hidden_size, [h, w], t, spatial_interpolation_scale, temporal_interpolation_scale + ) + param = rearrange(param, "t hw c -> (t hw) c") + self.pos_embedding = torch.nn.Embedding(param.shape[0], config.hidden_size) + self.pos_embedding.weight = torch.nn.Parameter(torch.tensor(param), requires_grad=False) + + def forward(self, pos_ids: torch.Tensor): + # pos_ids: t h w + pos_id = pos_ids[..., 0] * self.h * self.w + pos_ids[..., 1] * self.w + pos_ids[..., 2] + return self.pos_embedding(pos_id) + + +class FactorizedLearnable3DEmbedding(MegatronModule): + def __init__( + self, + config, + t: int, + h: int, + w: int, + **kwargs, + ): + super().__init__(config=config) + self.emb_t = torch.nn.Embedding(t, config.hidden_size) + self.emb_h = torch.nn.Embedding(h, config.hidden_size) + self.emb_w = torch.nn.Embedding(w, config.hidden_size) + + if "seed" in kwargs.keys(): + seed = kwargs["seed"] + with torch.random.fork_rng(): + torch.manual_seed(seed) + if config.perform_initialization: + self.customize_init_param() + else: + self.reset_parameters() + else: + if config.perform_initialization: + self.customize_init_param() + + def customize_init_param(self): + self.config.init_method(self.emb_t.weight) + self.config.init_method(self.emb_h.weight) + self.config.init_method(self.emb_w.weight) + + def reset_parameters(self): + self.emb_t.reset_parameters() + self.emb_h.reset_parameters() + self.emb_w.reset_parameters() + + def forward(self, pos_ids: torch.Tensor): + return self.emb_t(pos_ids[..., 0]) + self.emb_h(pos_ids[..., 1]) + self.emb_w(pos_ids[..., 2]) diff --git a/src/megatron/bridge/models/DiTModel/dit_layer_spec.py b/src/megatron/bridge/models/DiTModel/dit_layer_spec.py new file mode 100644 index 
0000000000..f6fccfa59a --- /dev/null +++ b/src/megatron/bridge/models/DiTModel/dit_layer_spec.py @@ -0,0 +1,851 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pylint: disable=C0115,C0116,C0301 + +import copy +from dataclasses import dataclass +from typing import Literal, Optional, Union + +import torch +import torch.nn as nn +from megatron.core.jit import jit_fuser +from megatron.core.process_groups_config import ProcessGroupCollection +from megatron.core.transformer.attention import ( + CrossAttention, + CrossAttentionSubmodules, + SelfAttention, + SelfAttentionSubmodules, +) +from megatron.core.transformer.cuda_graphs import CudaGraphManager +from megatron.core.transformer.custom_layers.transformer_engine import ( + TEColumnParallelLinear, + TEDotProductAttention, + TENorm, + TERowParallelLinear, +) +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_block import TransformerConfig +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules +from megatron.core.utils import 
make_viewless_tensor +from megatron.bridge.models.DiTModel.dit_attention import ( + JointSelfAttention, + JointSelfAttentionSubmodules, +) + + +# pylint: disable=C0116 +@dataclass +class DiTWithAdaLNSubmodules(TransformerLayerSubmodules): + temporal_self_attention: Union[ModuleSpec, type] = IdentityOp + full_self_attention: Union[ModuleSpec, type] = IdentityOp + + +@dataclass +class STDiTWithAdaLNSubmodules(TransformerLayerSubmodules): + spatial_self_attention: Union[ModuleSpec, type] = IdentityOp + temporal_self_attention: Union[ModuleSpec, type] = IdentityOp + full_self_attention: Union[ModuleSpec, type] = IdentityOp + + +class RMSNorm(nn.Module): + def __init__(self, hidden_size: int, config, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(hidden_size)) + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + output = self._norm(x.float()).type_as(x) + return output * self.weight + + +class AdaLN(MegatronModule): + """ + Adaptive Layer Normalization Module for DiT. 
+ """ + + def __init__( + self, config: TransformerConfig, n_adaln_chunks=9, use_adaln_lora=True, adaln_lora_dim=256, norm=nn.LayerNorm + ): + super().__init__(config) + if norm == TENorm: + self.ln = norm(config, config.hidden_size, config.layernorm_epsilon) + else: + self.ln = norm(config.hidden_size, elementwise_affine=False, eps=self.config.layernorm_epsilon) + self.n_adaln_chunks = n_adaln_chunks + if use_adaln_lora: + self.adaLN_modulation = nn.Sequential( + nn.SiLU(), + nn.Linear(config.hidden_size, adaln_lora_dim, bias=False), + nn.Linear(adaln_lora_dim, self.n_adaln_chunks * config.hidden_size, bias=False), + ) + else: + self.adaLN_modulation = nn.Sequential( + nn.SiLU(), nn.Linear(config.hidden_size, self.n_adaln_chunks * config.hidden_size, bias=False) + ) + nn.init.constant_(self.adaLN_modulation[-1].weight, 0) + + setattr(self.adaLN_modulation[-1].weight, "sequence_parallel", config.sequence_parallel) + + def forward(self, timestep_emb): + return self.adaLN_modulation(timestep_emb).chunk(self.n_adaln_chunks, dim=-1) + + def modulate(self, x, shift, scale): + return x * (1 + scale) + shift + + def scale_add(self, residual, x, gate): + return residual + gate * x + + def modulated_layernorm(self, x, shift, scale): + # Optional Input Layer norm + # import pdb; pdb.set_trace() + input_layernorm_output = self.ln(x).type_as(x) + + # DiT block specific + return self.modulate(input_layernorm_output, shift, scale) + + # @jit_fuser + def scaled_modulated_layernorm(self, residual, x, gate, shift, scale): + hidden_states = self.scale_add(residual, x, gate) + shifted_pre_mlp_layernorm_output = self.modulated_layernorm(hidden_states, shift, scale) + return hidden_states, shifted_pre_mlp_layernorm_output + + +class AdaLNContinuous(MegatronModule): + """ + A variant of AdaLN used for flux models. 
+ """ + + def __init__( + self, + config: TransformerConfig, + conditioning_embedding_dim: int, + modulation_bias: bool = True, + norm_type: str = "layer_norm", + ): + super().__init__(config) + self.adaLN_modulation = nn.Sequential( + nn.SiLU(), nn.Linear(conditioning_embedding_dim, config.hidden_size * 2, bias=modulation_bias) + ) + if norm_type == "layer_norm": + self.norm = nn.LayerNorm(config.hidden_size, elementwise_affine=False, eps=1e-6, bias=modulation_bias) + elif norm_type == "rms_norm": + self.norm = RMSNorm(config.hidden_size, eps=1e-6) + else: + raise ValueError("Unknown normalization type {}".format(norm_type)) + + def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor: + emb = self.adaLN_modulation(conditioning_embedding) + scale, shift = torch.chunk(emb, 2, dim=1) + x = self.norm(x) * (1 + scale) + shift + return x + + +class STDiTLayerWithAdaLN(TransformerLayer): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + Spatial-Temporal DiT with Adapative Layer Normalization. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + hidden_dropout: float = None, + position_embedding_type: Literal["learned_absolute", "rope"] = "learned_absolute", + ): + def _replace_no_cp_submodules(submodules): + modified_submods = copy.deepcopy(submodules) + modified_submods.cross_attention = IdentityOp + modified_submods.spatial_self_attention = IdentityOp + return modified_submods + + # Replace any submodules that will have CP disabled and build them manually later after TransformerLayer init. + modified_submods = _replace_no_cp_submodules(submodules) + super().__init__( + config=config, submodules=modified_submods, layer_number=layer_number, hidden_dropout=hidden_dropout + ) + + # Override Spatial Self Attention and Cross Attention to disable CP. + # Disable TP Comm overlap as well. 
Not disabling will attempt re-use of buffer size same as Q and lead to + # incorrect tensor shapes. + sa_cp_override_config = copy.deepcopy(config) + sa_cp_override_config.context_parallel_size = 1 + sa_cp_override_config.tp_comm_overlap = False + self.spatial_self_attention = build_module( + submodules.spatial_self_attention, config=sa_cp_override_config, layer_number=layer_number + ) + self.cross_attention = build_module( + submodules.cross_attention, + config=sa_cp_override_config, + layer_number=layer_number, + ) + + self.temporal_self_attention = build_module( + submodules.temporal_self_attention, + config=self.config, + layer_number=layer_number, + ) + + self.full_self_attention = build_module( + submodules.full_self_attention, + config=self.config, + layer_number=layer_number, + ) + + self.adaLN = AdaLN(config=self.config, n_adaln_chunks=3) + + def forward( + self, + hidden_states, + attention_mask, + context=None, + context_mask=None, + rotary_pos_emb=None, + inference_params=None, + packed_seq_params=None, + ): + # timestep embedding + timestep_emb = attention_mask + + # ******************************************** spatial self attention ***************************************** + + shift_sa, scale_sa, gate_sa = self.adaLN(timestep_emb) + + # adaLN with scale + shift + pre_spatial_attn_layernorm_output_ada = self.adaLN.modulated_layernorm( + hidden_states, shift=shift_sa, scale=scale_sa + ) + + attention_output, _ = self.spatial_self_attention( + pre_spatial_attn_layernorm_output_ada, + attention_mask=None, + # packed_seq_params=packed_seq_params['self_attention'], + ) + + # ******************************************** full self attention ******************************************** + + shift_full, scale_full, gate_full = self.adaLN(timestep_emb) + + # adaLN with scale + shift + hidden_states, pre_full_attn_layernorm_output_ada = self.adaLN.scaled_modulated_layernorm( + residual=hidden_states, + x=attention_output, + gate=gate_sa, + shift=shift_full, + 
scale=scale_full, + ) + + # import pdb;pdb.set_trace() + + attention_output, _ = self.full_self_attention( + pre_full_attn_layernorm_output_ada, + attention_mask=None, + # packed_seq_params=packed_seq_params['self_attention'], + ) + + # ******************************************** cross attention ************************************************ + + shift_ca, scale_ca, gate_ca = self.adaLN(timestep_emb) + + # adaLN with scale + shift + hidden_states, pre_cross_attn_layernorm_output_ada = self.adaLN.scaled_modulated_layernorm( + residual=hidden_states, + x=attention_output, + gate=gate_full, + shift=shift_ca, + scale=scale_ca, + ) + + #import pdb; pdb.set_trace() + attention_output, _ = self.cross_attention( + pre_cross_attn_layernorm_output_ada, + attention_mask=context_mask, + key_value_states=context, + # packed_seq_params=packed_seq_params['cross_attention'], + ) + + # ******************************************** temporal self attention **************************************** + + shift_ta, scale_ta, gate_ta = self.adaLN(timestep_emb) + + hidden_states, pre_temporal_attn_layernorm_output_ada = self.adaLN.scaled_modulated_layernorm( + residual=hidden_states, + x=attention_output, + gate=gate_ca, + shift=shift_ta, + scale=scale_ta, + ) + + attention_output, _ = self.temporal_self_attention( + pre_temporal_attn_layernorm_output_ada, + attention_mask=None, + # packed_seq_params=packed_seq_params['self_attention'], + ) + + # ******************************************** mlp ************************************************************ + + shift_mlp, scale_mlp, gate_mlp = self.adaLN(timestep_emb) + + hidden_states, pre_mlp_layernorm_output_ada = self.adaLN.scaled_modulated_layernorm( + residual=hidden_states, + x=attention_output, + gate=gate_ta, + shift=shift_mlp, + scale=scale_mlp, + ) + + mlp_output, _ = self.mlp(pre_mlp_layernorm_output_ada) + hidden_states = self.adaLN.scale_add(residual=hidden_states, x=mlp_output, gate=gate_mlp) + + # Jit compiled function creates 
'view' tensor. This tensor + # potentially gets saved in the MPU checkpoint function context, + # which rejects view tensors. While making a viewless tensor here + # won't result in memory savings (like the data loader, or + # p2p_communication), it serves to document the origin of this + # 'view' tensor. + output = make_viewless_tensor(inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True) + + return output, context + + +class DiTLayerWithAdaLN(TransformerLayer): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + DiT with Adaptive Layer Normalization. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + hidden_dropout: Optional[float] = None, + position_embedding_type: Literal["learned_absolute", "rope"] = "learned_absolute", + # NOTE(review): pg_collection and vp_stage are accepted but not forwarded in this __init__ — confirm intended. + pg_collection: Optional[ProcessGroupCollection] = None, + vp_stage: Optional[int] = None, + ): + def _replace_no_cp_submodules(submodules): + modified_submods = copy.deepcopy(submodules) + modified_submods.cross_attention = IdentityOp + # modified_submods.temporal_self_attention = IdentityOp + return modified_submods + + # Replace any submodules that will have CP disabled and build them manually later after TransformerLayer init. + modified_submods = _replace_no_cp_submodules(submodules) + super().__init__( + config=config, submodules=modified_submods, layer_number=layer_number, hidden_dropout=hidden_dropout + ) + + # Override Cross Attention to disable CP. + # Disable TP Comm overlap as well. Not disabling will attempt re-use of buffer size same as Q and lead to + # incorrect tensor shapes.
+ if submodules.cross_attention != IdentityOp: + cp_override_config = copy.deepcopy(config) + cp_override_config.context_parallel_size = 1 + cp_override_config.tp_comm_overlap = False + # import pdb;pdb.set_trace() + self.cross_attention = build_module( + submodules.cross_attention, + config=cp_override_config, + layer_number=layer_number, + ) + else: + self.cross_attention = None + + self.full_self_attention = build_module( + submodules.full_self_attention, + config=self.config, + layer_number=layer_number, + ) + + self.adaLN = AdaLN(config=self.config, n_adaln_chunks=9 if self.cross_attention else 6) + + def forward( + self, + hidden_states, + attention_mask, + context=None, + context_mask=None, + rotary_pos_emb=None, + rotary_pos_cos=None, + rotary_pos_sin=None, + attention_bias=None, + inference_params=None, + packed_seq_params=None, + sequence_len_offset=None, + inference_context=None + ): + # timestep embedding + timestep_emb = attention_mask + + # ******************************************** full self attention ******************************************** + if self.cross_attention: + shift_full, scale_full, gate_full, shift_ca, scale_ca, gate_ca, shift_mlp, scale_mlp, gate_mlp = ( + self.adaLN(timestep_emb) + ) + else: + shift_full, scale_full, gate_full, shift_mlp, scale_mlp, gate_mlp = self.adaLN(timestep_emb) + + # import pdb; pdb.set_trace() + + # adaLN with scale + shift + pre_full_attn_layernorm_output_ada = self.adaLN.modulated_layernorm( + hidden_states, shift=shift_full, scale=scale_full + ) + # import pdb;pdb.set_trace() + attention_output, _ = self.full_self_attention( + pre_full_attn_layernorm_output_ada, + attention_mask=None, + packed_seq_params=None if packed_seq_params is None else packed_seq_params["self_attention"], + ) + + if self.cross_attention: + # ******************************************** cross attention ******************************************** + # adaLN with scale + shift + hidden_states, pre_cross_attn_layernorm_output_ada = 
self.adaLN.scaled_modulated_layernorm( + residual=hidden_states, + x=attention_output, + gate=gate_full, + shift=shift_ca, + scale=scale_ca, + ) + #import pdb; pdb.set_trace() + attention_output, _ = self.cross_attention( + pre_cross_attn_layernorm_output_ada, + attention_mask=context_mask, + key_value_states=context, + packed_seq_params=None if packed_seq_params is None else packed_seq_params["cross_attention"], + ) + + # ******************************************** mlp ****************************************************** + hidden_states, pre_mlp_layernorm_output_ada = self.adaLN.scaled_modulated_layernorm( + residual=hidden_states, + x=attention_output, + gate=gate_ca if self.cross_attention else gate_full, + shift=shift_mlp, + scale=scale_mlp, + ) + + mlp_output, _ = self.mlp(pre_mlp_layernorm_output_ada) + hidden_states = self.adaLN.scale_add(residual=hidden_states, x=mlp_output, gate=gate_mlp) + + # Jit compiled function creates 'view' tensor. This tensor + # potentially gets saved in the MPU checkpoint function context, + # which rejects view tensors. While making a viewless tensor here + # won't result in memory savings (like the data loader, or + # p2p_communication), it serves to document the origin of this + # 'view' tensor. + output = make_viewless_tensor(inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True) + + return output, context + + +class DiTLayer(TransformerLayer): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + Original DiT layer implementation from [https://arxiv.org/pdf/2212.09748]. 
+ """ + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + mlp_ratio: int = 4, + n_adaln_chunks: int = 6, + modulation_bias: bool = True, + ): + # Modify the mlp layer hidden_size of a dit layer according to mlp_ratio + config.ffn_hidden_size = int(mlp_ratio * config.hidden_size) + super().__init__(config=config, submodules=submodules, layer_number=layer_number) + + self.adaLN = AdaLN( + config=config, n_adaln_chunks=n_adaln_chunks, modulation_bias=modulation_bias, use_second_norm=True + ) + + def forward( + self, + hidden_states, + attention_mask, + context=None, + context_mask=None, + rotary_pos_emb=None, + inference_params=None, + packed_seq_params=None, + ): + # passing in conditioning information via attention mask here + c = attention_mask + + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN(c) + + shifted_input_layernorm_output = self.adaLN.modulated_layernorm( + hidden_states, shift=shift_msa, scale=scale_msa, layernorm_idx=0 + ) + + x, bias = self.self_attention(shifted_input_layernorm_output, attention_mask=None) + + hidden_states = self.adaLN.scale_add(hidden_states, x=(x + bias), gate=gate_msa) + + residual = hidden_states + + shited_pre_mlp_layernorm_output = self.adaLN.modulated_layernorm( + hidden_states, shift=shift_mlp, scale=scale_mlp, layernorm_idx=1 + ) + + x, bias = self.mlp(shited_pre_mlp_layernorm_output) + + hidden_states = self.adaLN.scale_add(residual, x=(x + bias), gate=gate_mlp) + + return hidden_states, context + + +class MMDiTLayer(TransformerLayer): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + MMDiT layer implementation from [https://arxiv.org/pdf/2403.03206]. 
+ """ + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + context_pre_only: bool = False, + ): + hidden_size = config.hidden_size + super().__init__(config=config, submodules=submodules, layer_number=layer_number) + + if config.enable_cuda_graph: + self.cudagraph_manager = CudaGraphManager(config, share_cudagraph_io_buffers=False) + + self.adaln = AdaLN(config, modulation_bias=True, n_adaln_chunks=6, use_second_norm=True) + + self.context_pre_only = context_pre_only + context_norm_type = "ada_norm_continuous" if context_pre_only else "ada_norm_zero" + + if context_norm_type == "ada_norm_continuous": + self.adaln_context = AdaLNContinuous(config, hidden_size, modulation_bias=True, norm_type="layer_norm") + elif context_norm_type == "ada_norm_zero": + self.adaln_context = AdaLN(config, modulation_bias=True, n_adaln_chunks=6, use_second_norm=True) + else: + raise ValueError( + f"Unknown context_norm_type: {context_norm_type}, " + f"currently only support `ada_norm_continous`, `ada_norm_zero`" + ) + # Override Cross Attention to disable CP. + # Disable TP Comm overlap as well. Not disabling will attempt re-use of buffer size same as Q and lead to + # incorrect tensor shapes. 
+ cp_override_config = copy.deepcopy(config) + cp_override_config.context_parallel_size = 1 + cp_override_config.tp_comm_overlap = False + + if not context_pre_only: + self.context_mlp = build_module( + submodules.mlp, + config=cp_override_config, + ) + else: + self.context_mlp = None + + def forward( + self, + hidden_states, + encoder_hidden_states, + attention_mask=None, + context=None, + context_mask=None, + rotary_pos_emb=None, + inference_params=None, + packed_seq_params=None, + emb=None, + ): + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaln(emb) + + norm_hidden_states = self.adaln.modulated_layernorm( + hidden_states, shift=shift_msa, scale=scale_msa, layernorm_idx=0 + ) + if self.context_pre_only: + norm_encoder_hidden_states = self.adaln_context(encoder_hidden_states, emb) + else: + c_shift_msa, c_scale_msa, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.adaln_context(emb) + norm_encoder_hidden_states = self.adaln_context.modulated_layernorm( + encoder_hidden_states, shift=c_shift_msa, scale=c_scale_msa, layernorm_idx=0 + ) + + attention_output, encoder_attention_output = self.self_attention( + norm_hidden_states, + attention_mask=attention_mask, + key_value_states=None, + additional_hidden_states=norm_encoder_hidden_states, + rotary_pos_emb=rotary_pos_emb, + ) + hidden_states = self.adaln.scale_add(hidden_states, x=attention_output, gate=gate_msa) + norm_hidden_states = self.adaln.modulated_layernorm( + hidden_states, shift=shift_mlp, scale=scale_mlp, layernorm_idx=1 + ) + + mlp_output, mlp_output_bias = self.mlp(norm_hidden_states) + hidden_states = self.adaln.scale_add(hidden_states, x=(mlp_output + mlp_output_bias), gate=gate_mlp) + + if self.context_pre_only: + encoder_hidden_states = None + else: + encoder_hidden_states = self.adaln_context.scale_add( + encoder_hidden_states, x=encoder_attention_output, gate=c_gate_msa + ) + norm_encoder_hidden_states = self.adaln_context.modulated_layernorm( + 
encoder_hidden_states, shift=c_shift_mlp, scale=c_scale_mlp, layernorm_idx=1 + ) + + context_mlp_output, context_mlp_output_bias = self.context_mlp(norm_encoder_hidden_states) + encoder_hidden_states = self.adaln.scale_add( + encoder_hidden_states, x=(context_mlp_output + context_mlp_output_bias), gate=c_gate_mlp + ) + + return hidden_states, encoder_hidden_states + + def __call__(self, *args, **kwargs): + if hasattr(self, "cudagraph_manager"): + return self.cudagraph_manager(self, args, kwargs) + return super(MegatronModule, self).__call__(*args, **kwargs) + + +class FluxSingleTransformerBlock(TransformerLayer): + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + mlp_ratio: int = 4, + n_adaln_chunks: int = 3, + modulation_bias: bool = True, + ): + super().__init__(config=config, submodules=submodules, layer_number=layer_number) + + if config.enable_cuda_graph: + self.cudagraph_manager = CudaGraphManager(config, share_cudagraph_io_buffers=False) + self.adaln = AdaLN( + config=config, n_adaln_chunks=n_adaln_chunks, modulation_bias=modulation_bias, use_second_norm=False + ) + + def forward( + self, + hidden_states, + attention_mask=None, + context=None, + context_mask=None, + rotary_pos_emb=None, + inference_params=None, + packed_seq_params=None, + emb=None, + ): + residual = hidden_states + + shift, scale, gate = self.adaln(emb) + + norm_hidden_states = self.adaln.modulated_layernorm(hidden_states, shift=shift, scale=scale) + + mlp_hidden_states, mlp_bias = self.mlp(norm_hidden_states) + + attention_output = self.self_attention( + norm_hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb + ) + + hidden_states = mlp_hidden_states + mlp_bias + attention_output + + hidden_states = self.adaln.scale_add(residual, x=hidden_states, gate=gate) + + return hidden_states, None + + def __call__(self, *args, **kwargs): + if hasattr(self, "cudagraph_manager"): + return 
self.cudagraph_manager(self, args, kwargs) + return super(MegatronModule, self).__call__(*args, **kwargs) + + +def get_stdit_adaln_block_with_transformer_engine_spec() -> ModuleSpec: + params = {"attn_mask_type": AttnMaskType.padding} + return ModuleSpec( + module=STDiTLayerWithAdaLN, + submodules=STDiTWithAdaLNSubmodules( + spatial_self_attention=ModuleSpec( + module=SelfAttention, + params=params, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=TENorm, + k_layernorm=TENorm, + ), + ), + temporal_self_attention=ModuleSpec( + module=SelfAttention, + params=params, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=TENorm, + k_layernorm=TENorm, + ), + ), + full_self_attention=ModuleSpec( + module=SelfAttention, + params=params, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=TENorm, + k_layernorm=TENorm, + ), + ), + cross_attention=ModuleSpec( + module=CrossAttention, + params=params, + submodules=CrossAttentionSubmodules( + linear_q=TEColumnParallelLinear, + linear_kv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=TENorm, + k_layernorm=TENorm, + ), + ), + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + ), + ) + + +def get_dit_adaln_block_with_transformer_engine_spec() -> ModuleSpec: + params = {"attn_mask_type": AttnMaskType.padding} + return ModuleSpec( + module=DiTLayerWithAdaLN, + submodules=DiTWithAdaLNSubmodules( + full_self_attention=ModuleSpec( + module=SelfAttention, + params=params, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + 
core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=RMSNorm, + k_layernorm=RMSNorm, + ), + ), + cross_attention=ModuleSpec( + module=CrossAttention, + params=params, + submodules=CrossAttentionSubmodules( + linear_q=TEColumnParallelLinear, + linear_kv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + # Cross attention no longer is supports q and k layernorms + # q_layernorm=RMSNorm, + # k_layernorm=RMSNorm, + ), + ), + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + ), + ) + + +def get_official_dit_adaln_block_with_transformer_engine_spec() -> ModuleSpec: + params = {"attn_mask_type": AttnMaskType.no_mask} + return ModuleSpec( + module=DiTLayerWithAdaLN, + submodules=DiTWithAdaLNSubmodules( + full_self_attention=ModuleSpec( + module=SelfAttention, + params=params, + submodules=SelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + ), + ) + + +def get_mm_dit_block_with_transformer_engine_spec() -> ModuleSpec: + return ModuleSpec( + module=MMDiTLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=JointSelfAttention, + params={"attn_mask_type": AttnMaskType.no_mask}, + submodules=JointSelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + added_linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + ), + ) + + + +# pylint: disable=C0116 diff --git a/src/megatron/bridge/models/DiTModel/dit_model.py 
b/src/megatron/bridge/models/DiTModel/dit_model.py new file mode 100644 index 0000000000..ff90bedbf0 --- /dev/null +++ b/src/megatron/bridge/models/DiTModel/dit_model.py @@ -0,0 +1,378 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pylint: disable=C0115,C0116,C0301 + +from typing import Dict, Literal, Optional + +import torch +import torch.distributed +import torch.nn as nn +from diffusers.models.embeddings import Timesteps +from einops import rearrange, repeat +from megatron.core import parallel_state, tensor_parallel +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.models.common.vision_module.vision_module import VisionModule +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.transformer.enums import ModelType +from megatron.core.transformer.transformer_block import TransformerBlock +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_sharded_tensor_for_checkpoint + +from megatron.bridge.models.DiTModel.dit_embeddings import ParallelTimestepEmbedding +from megatron.bridge.models.DiTModel import dit_embeddings +from megatron.bridge.models.DiTModel.dit_layer_spec import ( + get_dit_adaln_block_with_transformer_engine_spec as DiTLayerWithAdaLNspec, +) +from torch import Tensor + + +def modulate(x, shift, scale): + return x * (1 + scale.unsqueeze(1)) + 
shift.unsqueeze(1) + + +class RMSNorm(nn.Module): + def __init__(self, channel: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(channel)) + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + output = self._norm(x.float()).type_as(x) + return output * self.weight + + +class FinalLayer(nn.Module): + """ + The final layer of DiT. + """ + + def __init__(self, hidden_size, spatial_patch_size, temporal_patch_size, out_channels): + super().__init__() + self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.linear = nn.Linear( + hidden_size, spatial_patch_size * spatial_patch_size * temporal_patch_size * out_channels, bias=False + ) + self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=False)) + + def forward(self, x_BT_HW_D, emb_B_D): + shift_B_D, scale_B_D = self.adaLN_modulation(emb_B_D).chunk(2, dim=1) + T = x_BT_HW_D.shape[0] // emb_B_D.shape[0] + shift_BT_D, scale_BT_D = repeat(shift_B_D, "b d -> (b t) d", t=T), repeat(scale_B_D, "b d -> (b t) d", t=T) + x_BT_HW_D = modulate(self.norm_final(x_BT_HW_D), shift_BT_D, scale_BT_D) + x_BT_HW_D = self.linear(x_BT_HW_D) + return x_BT_HW_D + + +class DiTCrossAttentionModel(VisionModule): + """ + DiTCrossAttentionModel is a VisionModule that implements a DiT model with a cross-attention block. + Attributes: + config (TransformerConfig): Configuration for the transformer. + pre_process (bool): Whether to apply pre-processing steps. + post_process (bool): Whether to apply post-processing steps. + fp16_lm_cross_entropy (bool): Whether to use fp16 for cross-entropy loss. + parallel_output (bool): Whether to use parallel output. + position_embedding_type (Literal["learned_absolute", "rope"]): Type of position embedding. + max_img_h (int): Maximum image height. + max_img_w (int): Maximum image width. 
+ max_frames (int): Maximum number of frames. + patch_spatial (int): Spatial patch size. + patch_temporal (int): Temporal patch size. + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + transformer_decoder_layer_spec (DiTLayerWithAdaLNspec): Specification for the transformer decoder layer. + add_encoder (bool): Whether to add an encoder. + add_decoder (bool): Whether to add a decoder. + share_embeddings_and_output_weights (bool): Whether to share embeddings and output weights. + concat_padding_mask (bool): Whether to concatenate padding mask. + pos_emb_cls (str): Class of position embedding. + model_type (ModelType): Type of the model. + decoder (TransformerBlock): Transformer decoder block. + t_embedder (torch.nn.Sequential): Time embedding layer. + x_embedder (nn.Conv3d): Convolutional layer for input embedding. + pos_embedder (dit_embeddings.SinCosPosEmb3D): Position embedding layer. + final_layer_linear (torch.nn.Linear): Final linear layer. + affline_norm (RMSNorm): Affine normalization layer. + Methods: + forward(x: Tensor, timesteps: Tensor, crossattn_emb: Tensor, packed_seq_params: PackedSeqParams = None, pos_ids: Tensor = None, **kwargs) -> Tensor: + Forward pass of the model. + set_input_tensor(input_tensor: Tensor) -> None: + Sets input tensor to the model. + sharded_state_dict(prefix: str = 'module.', sharded_offsets: tuple = (), metadata: Optional[Dict] = None) -> ShardedStateDict: + Sharded state dict implementation for backward-compatibility. + tie_embeddings_weights_state_dict(tensor, sharded_state_dict: ShardedStateDict, output_layer_weight_key: str, first_stage_word_emb_key: str) -> None: + Ties the embedding and output weights in a given sharded state dict. 
+ """ + + def __init__( + self, + config: TransformerConfig, + pre_process: bool = True, + post_process: bool = True, + fp16_lm_cross_entropy: bool = False, + parallel_output: bool = True, + position_embedding_type: Literal["learned_absolute", "rope"] = "rope", + max_img_h: int = 80, + max_img_w: int = 80, + max_frames: int = 34, + patch_spatial: int = 1, + patch_temporal: int = 1, + in_channels: int = 16, + out_channels: int = 16, + transformer_decoder_layer_spec=DiTLayerWithAdaLNspec, + pos_embedder=dit_embeddings.SinCosPosEmb3D, + **kwargs, + ): + super(DiTCrossAttentionModel, self).__init__(config=config) + + self.config: TransformerConfig = config + + self.transformer_decoder_layer_spec = transformer_decoder_layer_spec() + self.pre_process = pre_process + self.post_process = post_process + self.add_encoder = True + self.add_decoder = True + self.fp16_lm_cross_entropy = fp16_lm_cross_entropy + self.parallel_output = parallel_output + self.position_embedding_type = position_embedding_type + self.share_embeddings_and_output_weights = False + self.concat_padding_mask = True + self.pos_emb_cls = "sincos" + self.patch_spatial = patch_spatial + self.patch_temporal = patch_temporal + + # megatron core pipelining currently depends on model type + # TODO: remove this dependency ? 
+ self.model_type = ModelType.encoder_or_decoder + + # Transformer decoder + self.decoder = TransformerBlock( + config=self.config, + spec=self.transformer_decoder_layer_spec, + pre_process=self.pre_process, + post_process=False, + post_layer_norm=False, + ) + + self.t_embedder = torch.nn.Sequential( + Timesteps(self.config.hidden_size, flip_sin_to_cos=False, downscale_freq_shift=0), + dit_embeddings.ParallelTimestepEmbedding(self.config.hidden_size, self.config.hidden_size, seed=1234), + ) + + self.fps_embedder = nn.Sequential( + Timesteps(num_channels=256, flip_sin_to_cos=False, downscale_freq_shift=1), + ParallelTimestepEmbedding(256, 256, seed=1234), + ) + + if self.pre_process: + self.x_embedder = torch.nn.Linear(in_channels * patch_spatial**2, self.config.hidden_size) + + if pos_embedder is dit_embeddings.SinCosPosEmb3D: + if self.pre_process: + self.pos_embedder = pos_embedder( + config, + t=max_frames // patch_temporal, + h=max_img_h // patch_spatial, + w=max_img_w // patch_spatial, + ) + else: + # here I just follow the original logic, that except with SinCosPosEmb3D, the pos_emb would be feeded to transformer blocks, + # so the other embedders should be replicated across pp ranks. 
+ self.pos_embedder = pos_embedder( + config, + t=max_frames // patch_temporal, + h=max_img_h // patch_spatial, + w=max_img_w // patch_spatial, + seed=1234, + ) + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + for p in self.pos_embedder.parameters(): + setattr(p, "pipeline_parallel", True) + + if self.post_process: + self.final_layer_linear = torch.nn.Linear( + self.config.hidden_size, + patch_spatial**2 * patch_temporal * out_channels, + ) + + self.affline_norm = RMSNorm(self.config.hidden_size) + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + setattr(self.affline_norm.weight, "pipeline_parallel", True) + + def forward( + self, + x: Tensor, + timesteps: Tensor, + crossattn_emb: Tensor, + packed_seq_params: PackedSeqParams = None, + pos_ids: Tensor = None, + **kwargs, + ) -> Tensor: + """Forward pass. + + Args: + x (Tensor): vae encoded data (b s c) + encoder_decoder_attn_mask (Tensor): cross-attention mask between encoder and decoder + inference_params (InferenceParams): relevant arguments for inferencing + + Returns: + Tensor: loss tensor + """ + B = x.shape[0] + fps = kwargs.get( + "fps", + torch.tensor( + [ + 30, + ] + * B, + dtype=torch.bfloat16, + ), + ).view(-1) + if self.pre_process: + # transpose to match + x_B_S_D = self.x_embedder(x) + if isinstance(self.pos_embedder, dit_embeddings.SinCosPosEmb3D): + pos_emb = None + x_B_S_D += self.pos_embedder(pos_ids) + else: + pos_emb = self.pos_embedder(pos_ids) + pos_emb = rearrange(pos_emb, "B S D -> S B D") + x_S_B_D = rearrange(x_B_S_D, "B S D -> S B D") + else: + # intermediate stage of pipeline + x_S_B_D = None ### should it take encoder_hidden_states + if (not hasattr(self, "pos_embedder")) or isinstance(self.pos_embedder, dit_embeddings.SinCosPosEmb3D): + pos_emb = None + else: + # if transformer blocks need pos_emb, then pos_embedder should + # be replicated across pp ranks. 
+ pos_emb = rearrange(self.pos_embedder(pos_ids), "B S D -> S B D") + + timesteps_B_D = self.t_embedder(timesteps.flatten()).to(torch.bfloat16) # (b d_text_embedding) + + affline_emb_B_D = timesteps_B_D + fps_B_D = self.fps_embedder(fps) + fps_B_D = nn.functional.pad(fps_B_D, (0, self.config.hidden_size - fps_B_D.shape[1])) + affline_emb_B_D += fps_B_D + + crossattn_emb = rearrange(crossattn_emb, "B S D -> S B D") + + + #import pdb; pdb.set_trace() + if self.config.sequence_parallel: + if self.pre_process: + x_S_B_D = tensor_parallel.scatter_to_sequence_parallel_region(x_S_B_D) + if isinstance(self.pos_embedder, dit_embeddings.FactorizedLearnable3DEmbedding): + pos_emb = tensor_parallel.scatter_to_sequence_parallel_region(pos_emb) + + crossattn_emb = tensor_parallel.scatter_to_sequence_parallel_region(crossattn_emb) + # `scatter_to_sequence_parallel_region` returns a view, which prevents + # the original tensor from being garbage collected. Clone to facilitate GC. + # Has a small runtime cost (~0.5%). + if self.config.clone_scatter_output_in_embedding: + if self.pre_process: + x_S_B_D = x_S_B_D.clone() + crossattn_emb = crossattn_emb.clone() + + x_S_B_D = self.decoder( + hidden_states=x_S_B_D, + attention_mask=affline_emb_B_D, + context=crossattn_emb, + context_mask=None, + rotary_pos_emb=pos_emb, + packed_seq_params=packed_seq_params, + ) + + if not self.post_process: + return x_S_B_D + + if self.config.sequence_parallel: + x_S_B_D = tensor_parallel.gather_from_sequence_parallel_region(x_S_B_D) + + x_S_B_D = self.final_layer_linear(x_S_B_D) + return rearrange(x_S_B_D, "S B D -> B S D") + + def set_input_tensor(self, input_tensor: Tensor) -> None: + """Sets input tensor to the model. + + See megatron.model.transformer.set_input_tensor() + + Args: + input_tensor (Tensor): Sets the input tensor for the model. 
+ """ + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + + assert len(input_tensor) == 1, "input_tensor should only be length 1 for gpt/bert" + self.decoder.set_input_tensor(input_tensor[0]) + + def sharded_state_dict( + self, prefix: str = "module.", sharded_offsets: tuple = (), metadata: Optional[Dict] = None + ) -> ShardedStateDict: + """Sharded state dict implementation for GPTModel backward-compatibility (removing extra state). + + Args: + prefix (str): Module name prefix. + sharded_offsets (tuple): PP related offsets, expected to be empty at this module level. + metadata (Optional[Dict]): metadata controlling sharded state dict creation. + + Returns: + ShardedStateDict: sharded state dict for the GPTModel + """ + sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata) + + for module in ["t_embedder"]: + for param_name, param in getattr(self, module).named_parameters(): + weight_key = f"{prefix}{module}.{param_name}" + self._set_embedder_weights_replica_id(param, sharded_state_dict, weight_key) + return sharded_state_dict + + def _set_embedder_weights_replica_id( + self, tensor: Tensor, sharded_state_dict: ShardedStateDict, embedder_weight_key: str + ) -> None: + """set replica ids of the weights in t_embedder for sharded state dict. + + Args: + sharded_state_dict (ShardedStateDict): state dict with the weight to tie + weight_key (str): key of the weight in the state dict. 
+ This entry will be replaced with a tied version + + Returns: None, acts in-place + """ + tp_rank = parallel_state.get_tensor_model_parallel_rank() + vpp_rank = parallel_state.get_virtual_pipeline_model_parallel_rank() + vpp_rank = vpp_rank if vpp_rank else 0 + vpp_world = parallel_state.get_virtual_pipeline_model_parallel_world_size() + vpp_world = vpp_world if vpp_world else 1 + pp_rank = parallel_state.get_pipeline_model_parallel_rank() + del sharded_state_dict[embedder_weight_key] + replica_id = ( + tp_rank, + (vpp_rank + pp_rank * vpp_world), + parallel_state.get_data_parallel_rank(with_context_parallel=True), + ) + + sharded_state_dict[embedder_weight_key] = make_sharded_tensor_for_checkpoint( + tensor=tensor, + key=embedder_weight_key, + replica_id=replica_id, + allow_shape_mismatch=False, + ) diff --git a/src/megatron/bridge/models/DiTModel/dit_provider.py b/src/megatron/bridge/models/DiTModel/dit_provider.py new file mode 100644 index 0000000000..6df225154b --- /dev/null +++ b/src/megatron/bridge/models/DiTModel/dit_provider.py @@ -0,0 +1,296 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import contextlib
+import inspect
+import logging
+from dataclasses import dataclass, field
+from functools import partial
+from typing import Any, Callable, Dict, Literal, Optional, Union
+
+from megatron.bridge.models.DiTModel.dit_layer_spec import get_dit_adaln_block_with_transformer_engine_spec
+from megatron.bridge.models.DiTModel.dit_model import DiTCrossAttentionModel
+import torch
+from megatron.core import parallel_state
+from megatron.core.models.gpt import GPTModel as MCoreGPTModel
+from megatron.core.models.gpt.gpt_layer_specs import (
+    get_gpt_layer_local_spec,
+    get_gpt_layer_with_transformer_engine_spec,
+)
+from megatron.core.packed_seq_params import PackedSeqParams  # used by dit_data_step below
+from megatron.core.transformer import ModuleSpec
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.bridge.models.DiTModel.dit_utils import dynamic_import
+from megatron.bridge.models.model_provider import ModelProviderMixin
+from megatron.bridge.utils import fusions
+from megatron.bridge.utils.vocab_utils import calculate_padded_vocab_size
+from megatron.core.models.common.vision_module.vision_module import VisionModule
+
+logger = logging.getLogger(__name__)
+
+
+def dit_transformer_engine_layer_spec() -> ModuleSpec:
+    """Create a Transformer Engine layer specification based on the provided config."""
+    return get_dit_adaln_block_with_transformer_engine_spec()
+
+
+def dit_forward_step(model, batch) -> torch.Tensor:
+    return model(**batch)
+
+
+def dit_data_step(module, dataloader_iter):
+    batch = next(dataloader_iter)[0]
+    batch = get_batch_on_this_cp_rank(batch)
+    batch = {k: v.to(device="cuda", non_blocking=True) if torch.is_tensor(v) else v for k, v in batch.items()}
+    batch["is_preprocessed"] = True  # assume data is preprocessed
+
+    if ("seq_len_q" in batch) and ("seq_len_kv" in batch):
+        cu_seqlens = batch["seq_len_q"].cumsum(dim=0).to(torch.int32)
+        zero = torch.zeros(1, dtype=torch.int32, device="cuda")
+        cu_seqlens = torch.cat((zero, cu_seqlens))
+
+        cu_seqlens_kv = 
batch["seq_len_kv"].cumsum(dim=0).to(torch.int32) + cu_seqlens_kv = torch.cat((zero, cu_seqlens_kv)) + + batch["packed_seq_params"] = { + "self_attention": PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens, + qkv_format=module.qkv_format, + ), + "cross_attention": PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens_kv, + qkv_format=module.qkv_format, + ), + } + + return batch + + +def get_batch_on_this_cp_rank(data: Dict): + """Split the data for context parallelism.""" + from megatron.core import mpu + + cp_size = mpu.get_context_parallel_world_size() + cp_rank = mpu.get_context_parallel_rank() + + t = 16 + if cp_size > 1: + # cp split on seq_length, for video_latent, noise_latent and pos_ids + assert t % cp_size == 0, "t must divisibly by cp_size" + num_valid_tokens_in_ub = None + if "loss_mask" in data and data["loss_mask"] is not None: + num_valid_tokens_in_ub = data["loss_mask"].sum() + + for key, value in data.items(): + if (value is not None) and (key in ["video", "video_latent", "noise_latent", "pos_ids"]): + if len(value.shape) > 5: + value = value.squeeze(0) + B, C, T, H, W = value.shape + if T % cp_size == 0: + # FIXME packed sequencing + data[key] = value.view(B, C, cp_size, T // cp_size, H, W)[:, :, cp_rank, ...].contiguous() + else: + # FIXME packed sequencing + data[key] = value.view(B, C, T, cp_size, H // cp_size, W)[:, :, :, cp_rank, ...].contiguous() + loss_mask = data["loss_mask"] + data["loss_mask"] = loss_mask.view(loss_mask.shape[0], cp_size, loss_mask.shape[1] // cp_size)[ + :, cp_rank, ... 
+            ].contiguous()
+        data["num_valid_tokens_in_ub"] = num_valid_tokens_in_ub
+
+    return data
+
+
+@dataclass
+class DiTModelProvider(TransformerConfig, ModelProviderMixin[VisionModule]):
+    """
+    Config for the DiT-S model (provider that builds DiTCrossAttentionModel).
+    """
+
+    crossattn_emb_size: int = 1024
+    add_bias_linear: bool = False
+    gated_linear_unit: bool = False
+
+    num_layers: int = 12
+    hidden_size: int = 1024
+    max_img_h: int = 80
+    max_img_w: int = 80
+    max_frames: int = 34
+    patch_spatial: int = 2
+    num_attention_heads: int = 6
+    layernorm_epsilon: float = 1e-6  # annotation required: a bare assignment would not override the inherited field default
+    normalization: str = "RMSNorm"  # annotation required, else TransformerConfig's default normalization is silently used
+    add_bias_linear: bool = False  # NOTE(review): duplicate of the field above; kept for layout stability
+    qk_layernorm_per_head: bool = True
+    layernorm_zero_centered_gamma: bool = False
+
+    fp16_lm_cross_entropy: bool = False
+    parallel_output: bool = True
+    share_embeddings_and_output_weights: bool = True
+
+    # max_position_embeddings: int = 5400
+    hidden_dropout: float = 0
+    attention_dropout: float = 0
+
+    bf16: bool = True
+    params_dtype: torch.dtype = torch.bfloat16
+    vae_module: str = "megatron.bridge.models.DiTModel.diffusers_vae.AutoencoderKLVAE"
+    vae_path: Optional[str] = None
+    sigma_data: float = 0.5
+
+    in_channels: int = 16
+
+    # remove these 2 parameters
+    data_step_fn: Callable = dit_data_step
+    forward_step_fn: Callable = dit_forward_step
+
+    replicated_t_embedder: bool = True
+    qkv_format: str = 'sbhd'
+    seq_length: int = 1024
+    vocab_size: Optional[int] = None
+    make_vocab_size_divisible_by: int = 128
+
+
+    def provide(self, pre_process=None, post_process=None, vp_stage=None) -> DiTCrossAttentionModel:
+        vp_size = self.virtual_pipeline_model_parallel_size
+        if vp_size:
+            p_size = self.pipeline_model_parallel_size
+            assert (self.num_layers // p_size) % vp_size == 0, (
+                "Make sure the number of model chunks is the same across all pipeline stages."
+ ) + + model = DiTCrossAttentionModel + + return model( + self, + fp16_lm_cross_entropy=self.fp16_lm_cross_entropy, + parallel_output=self.parallel_output, + pre_process=parallel_state.is_pipeline_first_stage(), + post_process=parallel_state.is_pipeline_last_stage(), + max_img_h=self.max_img_h, + max_img_w=self.max_img_w, + max_frames=self.max_frames, + patch_spatial=self.patch_spatial, + ) + + def configure_vae(self): + return dynamic_import(self.vae_module)(self.vae_path) + + +# Add all the DIT configs here like DIT7B, 14B, cosmos, etc, etc, +# @dataclass +# class GPTProvider126M(GPTModelProvider): +# """Configuration for a 126M parameter GPT model. + +# Predefined configuration for a small GPT model with 12 layers, +# 768 hidden size, and 12 attention heads. +# """ + +# seq_length: int = 2048 +# num_layers: int = 12 +# hidden_size: int = 768 +# ffn_hidden_size: int = 3072 +# num_attention_heads: int = 12 +# bias_activation_fusion: bool = True +# bias_dropout_add_fusion: bool = True + + +# @dataclass +# class GPTProvider5B(GPTModelProvider): +# """Configuration for a 5B parameter GPT model. + +# Predefined configuration for a medium-sized GPT model with 24 layers, +# 4096 hidden size, and 32 attention heads. +# """ + +# seq_length: int = 2048 +# num_layers: int = 24 +# hidden_size: int = 4096 +# ffn_hidden_size: int = 16384 +# num_attention_heads: int = 32 +# bias_activation_fusion: bool = True +# bias_dropout_add_fusion: bool = True + + +# @dataclass +# class GPTProvider7B(GPTModelProvider): +# """Configuration for a 7B parameter GPT model. + +# Predefined configuration for a medium-sized GPT model with 32 layers, +# 4096 hidden size, and 32 attention heads. 
+# """ + +# seq_length: int = 2048 +# num_layers: int = 32 +# hidden_size: int = 4096 +# ffn_hidden_size: int = 10880 +# num_attention_heads: int = 32 +# bias_activation_fusion: bool = True +# bias_dropout_add_fusion: bool = True + + +# @dataclass +# class GPTProvider20B(GPTModelProvider): +# """Configuration for a 20B parameter GPT model. + +# Predefined configuration for a large GPT model with 44 layers, +# 6144 hidden size, and 48 attention heads. +# """ + +# seq_length: int = 2048 +# num_layers: int = 44 +# hidden_size: int = 6144 +# ffn_hidden_size: int = 24576 +# num_attention_heads: int = 48 +# bias_activation_fusion: bool = True +# bias_dropout_add_fusion: bool = True + + +# @dataclass +# class GPTProvider40B(GPTModelProvider): +# """Configuration for a 40B parameter GPT model. + +# Predefined configuration for a large GPT model with 48 layers, +# 8192 hidden size, and 64 attention heads. +# """ + +# seq_length: int = 2048 +# num_layers: int = 48 +# hidden_size: int = 8192 +# ffn_hidden_size: int = 32768 +# num_attention_heads: int = 64 +# bias_activation_fusion: bool = True +# bias_dropout_add_fusion: bool = True + + +# @dataclass +# class GPTProvider175B(GPTModelProvider): +# """Configuration for a 175B parameter GPT model. + +# Predefined configuration for a massive GPT model with 96 layers, +# 12288 hidden size, and 96 attention heads. 
+# """ + +# seq_length: int = 2048 +# num_layers: int = 96 +# hidden_size: int = 12288 +# ffn_hidden_size: int = 49152 +# num_attention_heads: int = 96 +# hidden_dropout: float = 0.0 +# attention_dropout: float = 0.0 +# bias_activation_fusion: bool = True +# bias_dropout_add_fusion: bool = True +# layernorm_zero_centered_gamma: bool = True \ No newline at end of file diff --git a/src/megatron/bridge/models/DiTModel/dit_step.py b/src/megatron/bridge/models/DiTModel/dit_step.py new file mode 100644 index 0000000000..f152f0000f --- /dev/null +++ b/src/megatron/bridge/models/DiTModel/dit_step.py @@ -0,0 +1,169 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +from functools import partial +from typing import Iterable + +import torch +from megatron.core import parallel_state +from megatron.core.models.gpt import GPTModel +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.utils import get_batch_on_this_cp_rank, get_model_config +from megatron.bridge.models.DiTModel.edm.edm_pipeline import EDMPipeline + +from megatron.bridge.training.config import ConfigContainer, FinetuningDatasetConfig +from megatron.bridge.training.losses import masked_next_token_loss +from megatron.bridge.training.state import GlobalState + + +logger = logging.getLogger(__name__) + +def dit_data_step(qkv_format, dataloader_iter): + # import pdb;pdb.set_trace() + batch = next(iter(dataloader_iter.iterable)) + batch = get_batch_on_this_cp_rank(batch) + batch = {k: v.to(device="cuda", non_blocking=True) if torch.is_tensor(v) else v for k, v in batch.items()} + batch["is_preprocessed"] = True # assume data is preprocessed + + if ("seq_len_q" in batch) and ("seq_len_kv" in batch): + cu_seqlens = batch["seq_len_q"].cumsum(dim=0).to(torch.int32) + zero = torch.zeros(1, dtype=torch.int32, device="cuda") + cu_seqlens = torch.cat((zero, cu_seqlens)) + + cu_seqlens_kv = batch["seq_len_kv"].cumsum(dim=0).to(torch.int32) + cu_seqlens_kv = torch.cat((zero, cu_seqlens_kv)) + + batch["packed_seq_params"] = { + "self_attention": PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens, + qkv_format=qkv_format, + ), + "cross_attention": PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens_kv, + qkv_format=qkv_format, + ), + } + + return batch + + +def get_batch_on_this_cp_rank(data): + """Split the data for context parallelism.""" + from megatron.core import mpu + + cp_size = mpu.get_context_parallel_world_size() + cp_rank = mpu.get_context_parallel_rank() + + t = 16 + if cp_size > 1: + # cp split on seq_length, for video_latent, noise_latent and pos_ids + assert t % cp_size == 0, "t must 
divisibly by cp_size" + num_valid_tokens_in_ub = None + if "loss_mask" in data and data["loss_mask"] is not None: + num_valid_tokens_in_ub = data["loss_mask"].sum() + + for key, value in data.items(): + if (value is not None) and (key in ["video", "video_latent", "noise_latent", "pos_ids"]): + if len(value.shape) > 5: + value = value.squeeze(0) + B, C, T, H, W = value.shape + if T % cp_size == 0: + # FIXME packed sequencing + data[key] = value.view(B, C, cp_size, T // cp_size, H, W)[:, :, cp_rank, ...].contiguous() + else: + # FIXME packed sequencing + data[key] = value.view(B, C, T, cp_size, H // cp_size, W)[:, :, :, cp_rank, ...].contiguous() + loss_mask = data["loss_mask"] + data["loss_mask"] = loss_mask.view(loss_mask.shape[0], cp_size, loss_mask.shape[1] // cp_size)[ + :, cp_rank, ... + ].contiguous() + data["num_valid_tokens_in_ub"] = num_valid_tokens_in_ub + + return data + +class DITForwardStep: + def __init__(self): + self.diffusion_pipeline = EDMPipeline(sigma_data=0.5) + + + def __call__( + self, state: GlobalState, data_iterator: Iterable, model: GPTModel, return_schedule_plan: bool = False + ) -> tuple[torch.Tensor, partial]: + """Forward training step. 
+
+        Args:
+            state: Global state for the run
+            data_iterator: Input data iterator
+            model: The GPT Model
+            return_schedule_plan (bool): Whether to return the schedule plan instead of the output tensor
+
+        Returns:
+            tuple containing the output tensor and the loss function
+        """
+        timers = state.timers
+        straggler_timer = state.straggler_timer
+
+        config = get_model_config(model)
+
+        timers("batch-generator", log_level=2).start()
+        # use_mtp = (getattr(config, "mtp_num_layers", None) or 0) > 0
+        qkv_format = getattr(config, "qkv_format", "sbhd")
+        with straggler_timer(bdata=True):
+            batch = dit_data_step(
+                qkv_format, data_iterator
+            )
+        timers("batch-generator").stop()
+
+        check_for_nan_in_loss = state.cfg.rerun_state_machine.check_for_nan_in_loss
+        check_for_spiky_loss = state.cfg.rerun_state_machine.check_for_spiky_loss
+
+        with straggler_timer:
+            if parallel_state.is_pipeline_last_stage():
+                output_batch, loss = self.diffusion_pipeline.training_step(model, batch, 0)
+                output_tensor = torch.mean(loss, dim=-1)
+            else:
+                output_tensor = self.diffusion_pipeline.training_step(model, batch, 0)
+
+        loss = output_tensor
+        # Fall back to an all-ones mask only when the batch carries no mask;
+        # previously the fallback was unconditionally overwritten by batch["loss_mask"].
+        loss_mask = batch.get("loss_mask")
+        if loss_mask is None:
+            loss_mask = torch.ones_like(loss)
+
+
+        loss_function = self._create_loss_function(loss_mask, check_for_nan_in_loss, check_for_spiky_loss)
+
+
+        return output_tensor, loss_function
+
+
+    def _create_loss_function(self, loss_mask: torch.Tensor, check_for_nan_in_loss: bool, check_for_spiky_loss: bool) -> partial:
+        """Create a partial loss function with the specified configuration.
+
+        Args:
+            loss_mask: Used to mask out some portions of the loss
+            check_for_nan_in_loss: Whether to check for NaN values in the loss
+            check_for_spiky_loss: Whether to check for spiky loss values
+
+        Returns:
+            A partial function that can be called with output_tensor to compute the loss
+        """
+        return partial(
+            masked_next_token_loss,
+            loss_mask,
+            check_for_nan_in_loss=check_for_nan_in_loss,
+            check_for_spiky_loss=check_for_spiky_loss,
+        )
diff --git a/src/megatron/bridge/models/DiTModel/dit_utils.py b/src/megatron/bridge/models/DiTModel/dit_utils.py
new file mode 100644
index 0000000000..22bde8ba7b
--- /dev/null
+++ b/src/megatron/bridge/models/DiTModel/dit_utils.py
@@ -0,0 +1,33 @@
+import importlib
+
+
+def dynamic_import(full_path):
+    """
+    Dynamically import a class or function from a given full path.
+
+    :param full_path: The full path to the class or function (e.g., "package.module.ClassName")
+    :return: The imported class or function
+    :raises ImportError: If the module or attribute cannot be imported
+    :raises AttributeError: If the attribute does not exist in the module
+    """
+    try:
+        # Split the full path into module path and attribute name
+        module_path, attribute_name = full_path.rsplit(".", 1)
+    except ValueError as e:
+        raise ImportError(
+            f"Invalid full path '{full_path}'. It should contain both module and attribute names."
+ ) from e + + # Import the module + try: + module = importlib.import_module(module_path) + except ImportError as e: + raise ImportError(f"Cannot import module '{module_path}'.") from e + + # Retrieve the attribute from the module + try: + attribute = getattr(module, attribute_name) + except AttributeError as e: + raise AttributeError(f"Module '{module_path}' does not have an attribute '{attribute_name}'.") from e + + return attribute diff --git a/src/megatron/bridge/models/DiTModel/edm/__init__.py b/src/megatron/bridge/models/DiTModel/edm/__init__.py new file mode 100644 index 0000000000..d9155f923f --- /dev/null +++ b/src/megatron/bridge/models/DiTModel/edm/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/src/megatron/bridge/models/DiTModel/edm/edm.py b/src/megatron/bridge/models/DiTModel/edm/edm.py new file mode 100644 index 0000000000..698acbb128 --- /dev/null +++ b/src/megatron/bridge/models/DiTModel/edm/edm.py @@ -0,0 +1,137 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pylint: disable=C0115,C0116,C0301 + +from statistics import NormalDist +from typing import Callable, Tuple + +import numpy as np +import torch +from torch import nn +from tqdm import tqdm + + +class EDMScaling: + def __init__(self, sigma_data: float = 0.5): + self.sigma_data = sigma_data + + def __call__(self, sigma: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + c_skip = self.sigma_data**2 / (sigma**2 + self.sigma_data**2) + c_out = sigma * self.sigma_data / (sigma**2 + self.sigma_data**2) ** 0.5 + c_in = 1 / (sigma**2 + self.sigma_data**2) ** 0.5 + c_noise = 0.25 * sigma.log() + return c_skip, c_out, c_in, c_noise + + +class EDMSDE: + def __init__( + self, + p_mean: float = -1.2, + p_std: float = 1.2, + sigma_max: float = 80.0, + sigma_min: float = 0.002, + ): + self.gaussian_dist = NormalDist(mu=p_mean, sigma=p_std) + self.sigma_max = sigma_max + self.sigma_min = sigma_min + self._generator = np.random + + def sample_t(self, batch_size: int) -> torch.Tensor: + cdf_vals = self._generator.uniform(size=(batch_size)) + samples_interval_gaussian = [self.gaussian_dist.inv_cdf(cdf_val) for cdf_val in cdf_vals] + log_sigma = torch.tensor(samples_interval_gaussian, device="cuda") + return torch.exp(log_sigma) + + def marginal_prob(self, x0: torch.Tensor, sigma: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + return x0, sigma + + +class EDMSampler(nn.Module): + """ + Elucidating the Design Space of Diffusion-Based Generative Models (EDM) + # 
https://github.com/NVlabs/edm/blob/62072d2612c7da05165d6233d13d17d71f213fee/generate.py#L25 + + Attributes: + None + + Methods: + forward(x0_fn: Callable, x_sigma_max: torch.Tensor, num_steps: int = 35, sigma_min: float = 0.002, + sigma_max: float = 80, rho: float = 7, S_churn: float = 0, S_min: float = 0, + S_max: float = float("inf"), S_noise: float = 1) -> torch.Tensor: + Performs the forward pass for the EDM sampling process. + + Parameters: + x0_fn (Callable): A function that takes in a tensor and returns a denoised tensor. + x_sigma_max (torch.Tensor): The initial noise level tensor. + num_steps (int, optional): The number of sampling steps. Default is 35. + sigma_min (float, optional): The minimum noise level. Default is 0.002. + sigma_max (float, optional): The maximum noise level. Default is 80. + rho (float, optional): The rho parameter for time step discretization. Default is 7. + S_churn (float, optional): The churn parameter for noise increase. Default is 0. + S_min (float, optional): The minimum value for the churn parameter. Default is 0. + S_max (float, optional): The maximum value for the churn parameter. Default is float("inf"). + S_noise (float, optional): The noise scale for the churn parameter. Default is 1. + + Returns: + torch.Tensor: The sampled tensor after the EDM process. + """ + + @torch.no_grad() + def forward( + self, + x0_fn: Callable, + x_sigma_max: torch.Tensor, + num_steps: int = 35, + sigma_min: float = 0.002, + sigma_max: float = 80, + rho: float = 7, + S_churn: float = 0, + S_min: float = 0, + S_max: float = float("inf"), + S_noise: float = 1, + ) -> torch.Tensor: + # Time step discretization. 
+ in_dtype = x_sigma_max.dtype + _ones = torch.ones(x_sigma_max.shape[0], dtype=in_dtype, device=x_sigma_max.device) + step_indices = torch.arange(num_steps, dtype=torch.float64, device=x_sigma_max.device) + t_steps = ( + sigma_max ** (1 / rho) + step_indices / (num_steps - 1) * (sigma_min ** (1 / rho) - sigma_max ** (1 / rho)) + ) ** rho + t_steps = torch.cat([t_steps, torch.zeros_like(t_steps[:1])]) # t_N = 0 + + # Main sampling loop. + x_next = x_sigma_max.to(torch.float64) + for i, (t_cur, t_next) in enumerate( + tqdm(zip(t_steps[:-1], t_steps[1:], strict=False), total=len(t_steps) - 1) + ): # 0, ..., N-1 + x_cur = x_next + + # Increase noise temporarily. + gamma = min(S_churn / num_steps, np.sqrt(2) - 1) if S_min <= t_cur <= S_max else 0 + t_hat = t_cur + gamma * t_cur + x_hat = x_cur + (t_hat**2 - t_cur**2).sqrt() * S_noise * torch.randn_like(x_cur) + + # Euler step. + denoised = x0_fn(x_hat.to(in_dtype), t_hat.to(in_dtype) * _ones).to(torch.float64) + d_cur = (x_hat - denoised) / t_hat + x_next = x_hat + (t_next - t_hat) * d_cur + + # Apply 2nd order correction. + if i < num_steps - 1: + denoised = x0_fn(x_hat.to(in_dtype), t_hat.to(in_dtype) * _ones).to(torch.float64) + d_prime = (x_next - denoised) / t_next + x_next = x_hat + (t_next - t_hat) * (0.5 * d_cur + 0.5 * d_prime) + + return x_next.to(in_dtype) diff --git a/src/megatron/bridge/models/DiTModel/edm/edm_pipeline.py b/src/megatron/bridge/models/DiTModel/edm/edm_pipeline.py new file mode 100644 index 0000000000..1d0b4d502c --- /dev/null +++ b/src/megatron/bridge/models/DiTModel/edm/edm_pipeline.py @@ -0,0 +1,432 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Callable, Dict, Optional, Tuple + +import numpy as np +import torch +import torch.distributed +from megatron.core import parallel_state +from megatron.bridge.models.DiTModel.sampler.batch_ops import batch_mul +from megatron.bridge.models.DiTModel.sampler.context_parallel import cat_outputs_cp +from megatron.bridge.models.DiTModel.edm.edm import EDMSDE, EDMSampler, EDMScaling +from torch import Tensor + + +class EDMPipeline: + """ + EDMPipeline is a class that implements a diffusion model pipeline for video generation. It includes methods for + initializing the pipeline, encoding and decoding video data, performing training steps, denoising, and generating + samples. + Attributes: + p_mean: Mean for SDE process. + p_std: Standard deviation for SDE process. + sigma_max: Maximum noise level. + sigma_min: Minimum noise level. + _noise_generator: Generator for noise. + _noise_level_generator: Generator for noise levels. + sde: SDE process. + sampler: Sampler for the diffusion model. + scaling: Scaling for EDM. + input_data_key: Key for input video data. + input_image_key: Key for input image data. + tensor_kwargs: Tensor keyword arguments. + loss_reduce: Method for reducing loss. + loss_scale: Scale factor for loss. + aesthetic_finetuning: Aesthetic finetuning parameter. + camera_sample_weight: Camera sample weight parameter. + loss_mask_enabled: Flag for enabling loss mask. + Methods: + noise_level_generator: Returns the noise level generator. + _initialize_generators: Initializes noise and noise-level generators. 
+ encode: Encodes input tensor using the video tokenizer. + decode: Decodes latent tensor using video tokenizer. + training_step: Performs a single training step for the diffusion model. + denoise: Performs denoising on the input noise data, noise level, and condition. + compute_loss_with_epsilon_and_sigma: Computes the loss for training. + get_per_sigma_loss_weights: Returns loss weights per sigma noise level. + get_condition_uncondition: Returns conditioning and unconditioning for classifier-free guidance. + get_x0_fn_from_batch: Creates a function to generate denoised predictions with the sampler. + generate_samples_from_batch: Generates samples based on input data batch. + _normalize_video_databatch_inplace: Normalizes video data in-place on a CUDA device to [-1, 1]. + draw_training_sigma_and_epsilon: Draws training noise (epsilon) and noise levels (sigma). + random_dropout_input: Applies random dropout to the input tensor. + get_data_and_condition: Retrieves data and conditioning for model input. + """ + + def __init__( + self, + vae=None, + p_mean=0.0, + p_std=1.0, + sigma_max=80, + sigma_min=0.0002, + sigma_data=0.5, + seed=1234, + ): + """ + Initializes the EDM pipeline with the given parameters. + + Args: + net: The DiT model. + vae: The Video Tokenizer (optional). + p_mean (float): Mean for the SDE. + p_std (float): Standard deviation for the SDE. + sigma_max (float): Maximum sigma value for the SDE. + sigma_min (float): Minimum sigma value for the SDE. + sigma_data (float): Sigma value for EDM scaling. + seed (int): Random seed for reproducibility. + + Attributes: + vae: The Video Tokenizer. + net: The DiT model. + p_mean (float): Mean for the SDE. + p_std (float): Standard deviation for the SDE. + sigma_max (float): Maximum sigma value for the SDE. + sigma_min (float): Minimum sigma value for the SDE. + sigma_data (float): Sigma value for EDM scaling. + seed (int): Random seed for reproducibility. + _noise_generator: Placeholder for noise generator. 
+ _noise_level_generator: Placeholder for noise level generator. + sde: Instance of EDMSDE initialized with p_mean, p_std, sigma_max, and sigma_min. + sampler: Instance of EDMSampler. + scaling: Instance of EDMScaling initialized with sigma_data. + input_data_key (str): Key for input data. + input_image_key (str): Key for input images. + tensor_kwargs (dict): Tensor keyword arguments for device and dtype. + loss_reduce (str): Method to reduce loss ('mean' or other). + loss_scale (float): Scale factor for loss. + """ + self.vae = vae + + self.p_mean = p_mean + self.p_std = p_std + self.sigma_max = sigma_max + self.sigma_min = sigma_min + self.sigma_data = sigma_data + + self.seed = seed + self._noise_generator = None + self._noise_level_generator = None + + self.sde = EDMSDE(p_mean, p_std, sigma_max, sigma_min) + self.sampler = EDMSampler() + self.scaling = EDMScaling(sigma_data) + + self.input_data_key = "video" + self.input_image_key = "images_1024" + self.tensor_kwargs = {"device": "cuda", "dtype": torch.bfloat16} + self.loss_reduce = "mean" + self.loss_scale = 1.0 + + @property + def noise_level_generator(self): + """ + Generates noise levels for the EDM pipeline. + + Returns: + Callable: A function or generator that produces noise levels. + """ + return self._noise_level_generator + + def _initialize_generators(self): + """ + Initializes the random number generators for noise and noise level. + + This method sets up two generators: + 1. A PyTorch generator for noise, seeded with a combination of the base seed and the data parallel rank. + 2. A NumPy generator for noise levels, seeded similarly but without considering context parallel rank. 
+ + Returns: + None + """ + noise_seed = self.seed + 100 * parallel_state.get_data_parallel_rank(with_context_parallel=True) + noise_level_seed = self.seed + 100 * parallel_state.get_data_parallel_rank(with_context_parallel=False) + self._noise_generator = torch.Generator(device="cuda") + self._noise_generator.manual_seed(noise_seed) + self._noise_level_generator = np.random.default_rng(noise_level_seed) + self.sde._generator = self._noise_level_generator + + def training_step( + self, model, data_batch: dict[str, torch.Tensor], iteration: int + ) -> tuple[dict[str, torch.Tensor], torch.Tensor]: + """ + Performs a single training step for the diffusion model. + + This method is responsible for executing one iteration of the model's training. It involves: + 1. Adding noise to the input data using the SDE process. + 2. Passing the noisy data through the network to generate predictions. + 3. Computing the loss based on the difference between the predictions and the original data. + + Args: + data_batch (dict): raw data batch draw from the training data loader. + iteration (int): Current iteration number. + + Returns: + A tuple with the output batch and the computed loss. + """ + # import pdb; pdb.set_trace() + # Get the input data to noise and denoise~(image, video) and the corresponding conditioner. 
+ self.net = model + x0_from_data_batch, x0, condition = self.get_data_and_condition(data_batch) + + # Sample pertubation noise levels and N(0, 1) noises + sigma, epsilon = self.draw_training_sigma_and_epsilon(x0.size(), condition) + + if parallel_state.is_pipeline_last_stage(): + output_batch, pred_mse, edm_loss = self.compute_loss_with_epsilon_and_sigma( + data_batch, x0_from_data_batch, x0, condition, epsilon, sigma + ) + + return output_batch, edm_loss + else: + net_output = self.compute_loss_with_epsilon_and_sigma( + data_batch, x0_from_data_batch, x0, condition, epsilon, sigma + ) + return net_output + + def denoise(self, xt: torch.Tensor, sigma: torch.Tensor, condition: dict[str, torch.Tensor]): + """ + Performs denoising on the input noise data, noise level, and condition + + Args: + xt (torch.Tensor): The input noise data. + sigma (torch.Tensor): The noise level. + condition (dict[str, torch.Tensor]): conditional information + + Returns: + Predicted clean data (x0) and noise (eps_pred). + """ + + xt = xt.to(**self.tensor_kwargs) + sigma = sigma.to(**self.tensor_kwargs) + # get precondition for the network + c_skip, c_out, c_in, c_noise = self.scaling(sigma=sigma) + + net_output = self.net( + x=batch_mul(c_in, xt), # Eq. 7 of https://arxiv.org/pdf/2206.00364.pdf + timesteps=c_noise, # Eq. 7 of https://arxiv.org/pdf/2206.00364.pdf + **condition, + ) + + if not parallel_state.is_pipeline_last_stage(): + return net_output + + x0_pred = batch_mul(c_skip, xt) + batch_mul(c_out, net_output) + + # get noise prediction based on sde + eps_pred = batch_mul(xt - x0_pred, 1.0 / sigma) + + return x0_pred, eps_pred + + def compute_loss_with_epsilon_and_sigma( + self, + data_batch: dict[str, torch.Tensor], + x0_from_data_batch: torch.Tensor, + x0: torch.Tensor, + condition: dict[str, torch.Tensor], + epsilon: torch.Tensor, + sigma: torch.Tensor, + ): + """ + Computes the loss for training. + + Args: + data_batch: Batch of input data. 
+ x0_from_data_batch: Raw input tensor. + x0: Latent tensor. + condition: Conditional input data. + epsilon: Noise tensor. + sigma: Noise level tensor. + + Returns: + The computed loss. + """ + # Get the mean and stand deviation of the marginal probability distribution. + mean, std = self.sde.marginal_prob(x0, sigma) + # Generate noisy observations + xt = mean + batch_mul(std, epsilon) # corrupted data + + if parallel_state.is_pipeline_last_stage(): + # make prediction + x0_pred, eps_pred = self.denoise(xt, sigma, condition) + # loss weights for different noise levels + weights_per_sigma = self.get_per_sigma_loss_weights(sigma=sigma) + pred_mse = (xt - x0_pred) ** 2 + edm_loss = batch_mul(pred_mse, weights_per_sigma) + + output_batch = { + "x0": x0, + "xt": xt, + "sigma": sigma, + "weights_per_sigma": weights_per_sigma, + "condition": condition, + "model_pred": {"x0_pred": x0_pred, "eps_pred": eps_pred}, + "mse_loss": pred_mse.mean(), + "edm_loss": edm_loss.mean(), + } + return output_batch, pred_mse, edm_loss + else: + # make prediction + x0_pred = self.denoise(xt, sigma, condition) + return x0_pred.contiguous() + + def get_per_sigma_loss_weights(self, sigma: torch.Tensor): + """ + Args: + sigma (tensor): noise level + + Returns: + loss weights per sigma noise level + """ + return (sigma**2 + self.sigma_data**2) / (sigma * self.sigma_data) ** 2 + + def get_condition_uncondition(self, data_batch: Dict): + """Returns conditioning and unconditioning for classifier-free guidance.""" + _, _, condition = self.get_data_and_condition(data_batch, dropout_rate=0.0) + + if "neg_t5_text_embeddings" in data_batch: + data_batch["t5_text_embeddings"] = data_batch["neg_t5_text_embeddings"] + data_batch["t5_text_mask"] = data_batch["neg_t5_text_mask"] + _, _, uncondition = self.get_data_and_condition(data_batch, dropout_rate=1.0) + else: + _, _, uncondition = self.get_data_and_condition(data_batch, dropout_rate=1.0) + + return condition, uncondition + + def get_x0_fn_from_batch( + 
self, + data_batch: Dict, + guidance: float = 1.5, + is_negative_prompt: bool = False, + ) -> Callable: + """ + Creates a function to generate denoised predictions with the sampler. + + Args: + data_batch: Batch of input data. + guidance: Guidance scale factor. + is_negative_prompt: Whether to use negative prompts. + + Returns: + A callable to predict clean data (x0). + """ + condition, uncondition = self.get_condition_uncondition(data_batch) + + def x0_fn(noise_x: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor: + cond_x0, _ = self.denoise(noise_x, sigma, condition) + uncond_x0, _ = self.denoise(noise_x, sigma, uncondition) + return cond_x0 + guidance * (cond_x0 - uncond_x0) + + return x0_fn + + def generate_samples_from_batch( + self, + data_batch: Dict, + guidance: float = 1.5, + state_shape: Tuple | None = None, + is_negative_prompt: bool = False, + num_steps: int = 35, + ) -> Tensor: + """ + Generates samples based on input data batch. + + Args: + data_batch: Batch of input data. + guidance: Guidance scale factor. + state_shape: Shape of the state. + is_negative_prompt: Whether to use negative prompts. + num_steps: Number of steps for sampling. + solver_option: SDE Solver option. + + Returns: + Generated samples from diffusion model. 
+ """ + cp_enabled = parallel_state.get_context_parallel_world_size() > 1 + + if self._noise_generator is None: + self._initialize_generators() + x0_fn = self.get_x0_fn_from_batch(data_batch, guidance, is_negative_prompt=is_negative_prompt) + + state_shape = list(state_shape) + state_shape[1] //= parallel_state.get_context_parallel_world_size() + x_sigma_max = ( + torch.randn(state_shape, **self.tensor_kwargs, generator=self._noise_generator) * self.sde.sigma_max + ) + + samples = self.sampler(x0_fn, x_sigma_max, num_steps=num_steps, sigma_max=self.sde.sigma_max) + + if cp_enabled: + cp_group = parallel_state.get_context_parallel_group() + samples = cat_outputs_cp(samples, seq_dim=2, cp_group=cp_group) + + return samples + + def draw_training_sigma_and_epsilon(self, x0_size: int, condition: Any) -> torch.Tensor: + """ + Draws training noise (epsilon) and noise levels (sigma). + + Args: + x0_size: Shape of the input tensor. + condition: Conditional input (unused). + + Returns: + Noise level (sigma) and noise (epsilon). + """ + del condition + batch_size = x0_size[0] + if self._noise_generator is None: + self._initialize_generators() + epsilon = torch.randn(x0_size, **self.tensor_kwargs, generator=self._noise_generator) + return self.sde.sample_t(batch_size).to(**self.tensor_kwargs), epsilon + + def random_dropout_input(self, in_tensor: torch.Tensor, dropout_rate: Optional[float] = None) -> torch.Tensor: + """ + Applies random dropout to the input tensor. + + Args: + in_tensor: Input tensor. + dropout_rate: Dropout probability (optional). + + Returns: + Conditioning with random dropout applied. + """ + dropout_rate = dropout_rate if dropout_rate is not None else self.dropout_rate + return batch_mul( + torch.bernoulli((1.0 - dropout_rate) * torch.ones(in_tensor.shape[0])).type_as(in_tensor), + in_tensor, + ) + + def get_data_and_condition(self, data_batch: dict[str, Tensor], dropout_rate=0.2) -> Tuple[Tensor]: + """ + Retrieves data and conditioning for model input. 
+ + Args: + data_batch: Batch of input data. + dropout_rate: Dropout probability for conditioning. + + Returns: + Raw data, latent data, and conditioning information. + """ + # Latent state + raw_state = data_batch["video"] * self.sigma_data + # assume data is already encoded + latent_state = raw_state + + # Condition + data_batch["crossattn_emb"] = self.random_dropout_input( + data_batch["t5_text_embeddings"], dropout_rate=dropout_rate + ) + + return raw_state, latent_state, data_batch diff --git a/src/megatron/bridge/models/DiTModel/sampler/__init__.py b/src/megatron/bridge/models/DiTModel/sampler/__init__.py new file mode 100644 index 0000000000..d9155f923f --- /dev/null +++ b/src/megatron/bridge/models/DiTModel/sampler/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/src/megatron/bridge/models/DiTModel/sampler/batch_ops.py b/src/megatron/bridge/models/DiTModel/sampler/batch_ops.py new file mode 100644 index 0000000000..956dfbee36 --- /dev/null +++ b/src/megatron/bridge/models/DiTModel/sampler/batch_ops.py @@ -0,0 +1,104 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from torch import Tensor + + +def common_broadcast(x: Tensor, y: Tensor) -> tuple[Tensor, Tensor]: + """ + Broadcasts two tensors to have the same shape by adding singleton dimensions where necessary. + + Args: + x (Tensor): The first input tensor. + y (Tensor): The second input tensor. + + Returns: + tuple[Tensor, Tensor]: A tuple containing the two tensors with broadcasted shapes. + + Raises: + AssertionError: If the dimensions of the tensors do not match at any axis within their common dimensions. + """ + ndims1 = x.ndim + ndims2 = y.ndim + + common_ndims = min(ndims1, ndims2) + for axis in range(common_ndims): + assert x.shape[axis] == y.shape[axis], "Dimensions not equal at axis {}".format(axis) + + if ndims1 < ndims2: + x = x.reshape(x.shape + (1,) * (ndims2 - ndims1)) + elif ndims2 < ndims1: + y = y.reshape(y.shape + (1,) * (ndims1 - ndims2)) + + return x, y + + +def batch_add(x: Tensor, y: Tensor) -> Tensor: + """ + Adds two tensors element-wise after broadcasting them to a common shape. + + Args: + x (Tensor): The first input tensor. + y (Tensor): The second input tensor. + + Returns: + Tensor: The element-wise sum of the input tensors after broadcasting. + """ + x, y = common_broadcast(x, y) + return x + y + + +def batch_mul(x: Tensor, y: Tensor) -> Tensor: + """ + Multiplies two tensors element-wise after broadcasting them to a common shape. + + Args: + x (Tensor): The first input tensor. + y (Tensor): The second input tensor. + + Returns: + Tensor: The element-wise product of the input tensors after broadcasting. 
+ """ + x, y = common_broadcast(x, y) + return x * y + + +def batch_sub(x: Tensor, y: Tensor) -> Tensor: + """ + Subtracts two tensors element-wise after broadcasting them to a common shape. + + Args: + x (Tensor): The first input tensor. + y (Tensor): The second input tensor. + + Returns: + Tensor: The result of element-wise subtraction of the input tensors. + """ + x, y = common_broadcast(x, y) + return x - y + + +def batch_div(x: Tensor, y: Tensor) -> Tensor: + """ + Divides two tensors element-wise after broadcasting them to a common shape. + + Args: + x (Tensor): The first input tensor. + y (Tensor): The second input tensor. + + Returns: + Tensor: The result of element-wise division of `x` by `y` after broadcasting. + """ + x, y = common_broadcast(x, y) + return x / y diff --git a/src/megatron/bridge/models/DiTModel/sampler/context_parallel.py b/src/megatron/bridge/models/DiTModel/sampler/context_parallel.py new file mode 100644 index 0000000000..71906fc4eb --- /dev/null +++ b/src/megatron/bridge/models/DiTModel/sampler/context_parallel.py @@ -0,0 +1,82 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +from torch import Tensor +from torch.distributed import ProcessGroup, all_gather, get_process_group_ranks, get_world_size + + +def split_inputs_cp(x: Tensor, seq_dim: int, cp_group: ProcessGroup) -> Tensor: + """ + Split input tensor along the sequence dimension for checkpoint parallelism. + + This function divides the input tensor into equal parts along the specified + sequence dimension, based on the number of ranks in the checkpoint parallelism group. + It then selects the part corresponding to the current rank. + + Args: + x: Input tensor to be split. + seq_dim: The dimension along which to split the input (sequence dimension). + cp_group: The process group for checkpoint parallelism. + + Returns: + A slice of the input tensor corresponding to the current rank. + + Raises: + AssertionError: If the sequence dimension is not divisible by the number of ranks. + """ + cp_ranks = get_process_group_ranks(cp_group) + cp_size = len(cp_ranks) + + assert x.shape[seq_dim] % cp_size == 0, f"{x.shape[seq_dim]} cannot divide cp_size {cp_size}" + x = x.view(*x.shape[:seq_dim], cp_size, x.shape[seq_dim] // cp_size, *x.shape[(seq_dim + 1) :]) + seq_idx = torch.tensor([cp_group.rank()], device=x.device) + x = x.index_select(seq_dim, seq_idx) + # Note that the new sequence length is the original sequence length / cp_size + x = x.view(*x.shape[:seq_dim], -1, *x.shape[(seq_dim + 2) :]) + return x + + +def cat_outputs_cp(x: Tensor, seq_dim: int, cp_group: ProcessGroup) -> Tensor: + """ + Concatenates tensors from multiple processes along a specified dimension. + + This function gathers tensors from all processes in the given process group + and concatenates them along the specified dimension. + + Args: + x (Tensor): The input tensor to be gathered and concatenated. + seq_dim (int): The dimension along which to concatenate the gathered tensors. + cp_group (ProcessGroup): The process group containing all the processes involved in the gathering. 
+ + Returns: + Tensor: A tensor resulting from the concatenation of tensors from all processes. + + Raises: + RuntimeError: If the gathering of tensors fails. + """ + # Number of processes in the group + world_size = get_world_size(cp_group) + + # List to hold tensors from each rank + gathered_tensors = [torch.zeros_like(x) for _ in range(world_size)] + + # Attempt to gather tensors from all ranks + try: + all_gather(gathered_tensors, x, group=cp_group) + except RuntimeError as e: + raise RuntimeError(f"Gathering failed: {e}") + + # Concatenate tensors along the specified dimension + return torch.cat(gathered_tensors, dim=seq_dim) diff --git a/src/megatron/bridge/models/__init__.py b/src/megatron/bridge/models/__init__.py index 8c9ea8a597..f7b01d50c5 100644 --- a/src/megatron/bridge/models/__init__.py +++ b/src/megatron/bridge/models/__init__.py @@ -37,6 +37,17 @@ MoonlightModelProvider16B, MoonlightProvider, ) +from megatron.bridge.models.gemma import ( + CodeGemmaModelProvider2B, + CodeGemmaModelProvider7B, + Gemma2ModelProvider, + Gemma2ModelProvider2B, + Gemma2ModelProvider9B, + Gemma2ModelProvider27B, + GemmaModelProvider, + GemmaModelProvider2B, + GemmaModelProvider7B, +) from megatron.bridge.models.gpt_provider import GPTModelProvider from megatron.bridge.models.llama import ( CodeLlamaModelProvider7B, @@ -145,6 +156,15 @@ "ReplicatedMapping", "RowParallelMapping", "AutoMapping", + "CodeGemmaModelProvider2B", + "CodeGemmaModelProvider7B", + "GemmaModelProvider", + "GemmaModelProvider2B", + "GemmaModelProvider7B", + "Gemma2ModelProvider", + "Gemma2ModelProvider2B", + "Gemma2ModelProvider9B", + "Gemma2ModelProvider27B", "GPTModelProvider", "T5ModelProvider", "LlamaModelProvider", diff --git a/src/megatron/bridge/models/conversion/auto_bridge.py b/src/megatron/bridge/models/conversion/auto_bridge.py index e710c5d78a..4c0214b0eb 100644 --- a/src/megatron/bridge/models/conversion/auto_bridge.py +++ b/src/megatron/bridge/models/conversion/auto_bridge.py @@ -17,7 
+17,7 @@ from pathlib import Path from typing import Any, Generic, Iterable, List, Optional, Type, TypeVar, Union -import torch.distributed +import torch.distributed as dist import transformers from megatron.core.transformer.module import MegatronModule from megatron.core.transformer.transformer_config import MLATransformerConfig, TransformerConfig @@ -35,7 +35,7 @@ from megatron.bridge.models.hf_pretrained.causal_lm import PreTrainedCausalLM from megatron.bridge.models.hf_pretrained.safe_config_loader import safe_load_config_with_retry from megatron.bridge.models.hf_pretrained.state import SafeTensorsStateSource -from megatron.bridge.models.model_provider import GetModelKwargs, ModelProviderMixin +from megatron.bridge.models.model_provider import GetModelKwargs, ModelParallelKwargs, ModelProviderMixin MegatronModelT = TypeVar("MegatronModelT", bound=MegatronModule) @@ -63,7 +63,7 @@ class AutoBridge(Generic[MegatronModelT]): Example: >>> # Load and convert a model to Megatron format - >>> bridge = AutoBridge.from_hf_pretrained("meta-llama/Llama-3-8B") + >>> bridge = AutoBridge.from_hf_pretrained("meta-llama/Meta-Llama-3-8B") >>> provider = bridge.to_megatron_provider() >>> megatron_model = provider.provide_distributed_model(wrap_with_ddp=False) @@ -159,7 +159,7 @@ def from_hf_config(cls, config: PretrainedConfig) -> "AutoBridge": >>> from transformers import AutoConfig >>> >>> # Load just the configuration - >>> config = AutoConfig.from_pretrained("meta-llama/Llama-3-8B") + >>> config = AutoConfig.from_pretrained("meta-llama/Meta-Llama-3-8B") >>> >>> # Create bridge from config (no weights) >>> bridge = AutoBridge.from_hf_config(config) @@ -191,7 +191,7 @@ def from_hf_pretrained(cls, path: Union[str, Path], **kwargs) -> "AutoBridge": Args: path: HuggingFace model ID or path to model directory - Examples: "meta-llama/Llama-3-8B", "./my_model" + Examples: "meta-llama/Meta-Llama-3-8B", "./my_model" **kwargs: Additional arguments passed to HuggingFace 
from_hf_pretrained Common options include: - torch_dtype: Model precision (torch.float16, torch.bfloat16) @@ -211,7 +211,7 @@ def from_hf_pretrained(cls, path: Union[str, Path], **kwargs) -> "AutoBridge": >>> # Load with specific settings >>> bridge = AutoBridge.from_hf_pretrained( - ... "meta-llama/Llama-3-8B", + ... "meta-llama/Meta-Llama-3-8B", ... torch_dtype=torch.float16, ... device_map="auto" ... ) @@ -240,7 +240,7 @@ def can_handle(cls, path: Union[str, Path], trust_remote_code: bool = False) -> Args: path: Path to model directory or HuggingFace model ID - Examples: "meta-llama/Llama-3-8B", "/models/my_model" + Examples: "meta-llama/Meta-Llama-3-8B", "/models/my_model" trust_remote_code: Whether to trust remote code when loading config. Set to True for models that use custom modeling code. @@ -249,7 +249,7 @@ def can_handle(cls, path: Union[str, Path], trust_remote_code: bool = False) -> Example: >>> # Check if a model is supported - >>> if AutoBridge.can_handle("meta-llama/Llama-3-8B"): + >>> if AutoBridge.can_handle("meta-llama/Meta-Llama-3-8B"): ... print("Model is supported!") ... else: ... print("Model requires a custom bridge implementation") @@ -373,9 +373,9 @@ def save_hf_pretrained(self, model: list[MegatronModelT], path: str | Path, show saves the configuration files, while weight saving is coordinated across all ranks. 
""" - if torch.distributed.is_available() and torch.distributed.is_initialized(): + if dist.is_available() and dist.is_initialized(): # Distributed training, only rank 0 saves artifacts - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: self.hf_pretrained.save_artifacts(path) else: # No distributed training, save artifacts @@ -416,8 +416,8 @@ def save_hf_weights(self, model: list[MegatronModelT], path: str | Path, show_pr - Automatically handles model sharding for large models - The saved weights can be loaded with HuggingFace's from_pretrained """ - if torch.distributed.is_available() and torch.distributed.is_initialized(): - torch.distributed.barrier() + if dist.is_available() and dist.is_initialized(): + dist.barrier() dispatch_instance = (self._causal_lm_architecture, self._get_model_instance(model)) generator = model_bridge.stream_weights_megatron_to_hf( dispatch_instance, model, self.hf_pretrained, cpu=True, show_progress=show_progress @@ -433,8 +433,8 @@ def save_hf_weights(self, model: list[MegatronModelT], path: str | Path, show_pr else: raise ValueError("The state source is not a SafeTensorsStateSource, cannot save in streaming mode.") - if torch.distributed.is_available() and torch.distributed.is_initialized(): - torch.distributed.barrier() + if dist.is_available() and dist.is_initialized(): + dist.barrier() def save_megatron_model( self, model: list[MegatronModule], path: str | Path, hf_tokenizer_path: Optional[str | Path] = None @@ -462,7 +462,7 @@ def save_megatron_model( >>> bridge.save_megatron_model( ... megatron_model, ... "./megatron_checkpoint", - ... hf_tokenizer_path="meta-llama/Llama-3-8B" + ... hf_tokenizer_path="meta-llama/Meta-Llama-3-8B" ... 
) Note: @@ -476,7 +476,9 @@ def save_megatron_model( raise ImportError("megatron.bridge.training is not available.") save_megatron_model(model, path, hf_tokenizer_path=hf_tokenizer_path) - def load_megatron_model(self, path: str | Path, **kwargs: Unpack[GetModelKwargs]) -> list[MegatronModelT]: + def load_megatron_model( + self, path: str | Path, *, mp_overrides: ModelParallelKwargs | None = None, **kwargs: Unpack[GetModelKwargs] + ) -> list[MegatronModelT]: """ Load a Megatron model from a native Megatron checkpoint. @@ -486,6 +488,7 @@ def load_megatron_model(self, path: str | Path, **kwargs: Unpack[GetModelKwargs] Args: path: Directory path where the Megatron checkpoint is stored + mp_overrides: Optional model-parallel overrides to apply to the loaded config. **kwargs: Additional arguments passed to the model provider Returns: @@ -529,10 +532,13 @@ def get_iter_number(folder_name): checkpoint_path = checkpoint_path / latest_iter.name # else: checkpoint_path remains as the input path (no iter folders found) + skip_temp_dist_context = dist.is_available() and dist.is_initialized() # Load the state dict model = load_megatron_model( str(checkpoint_path), - use_cpu_init=True, + use_cpu_init=(skip_temp_dist_context and dist.get_backend() == "gloo"), + skip_temp_dist_context=skip_temp_dist_context, + mp_overrides=mp_overrides, ) return model if isinstance(model, list) else [model] @@ -553,7 +559,7 @@ def import_ckpt( Args: hf_model_id: HuggingFace model ID or path to model directory - Examples: "meta-llama/Llama-3-8B", "./my_model" + Examples: "meta-llama/Meta-Llama-3-8B", "./my_model" megatron_path: Directory path where the Megatron checkpoint will be saved **kwargs: Additional arguments passed to from_hf_pretrained Common options include: @@ -565,13 +571,13 @@ def import_ckpt( Example: >>> # Basic import >>> AutoBridge.import_ckpt( - ... "meta-llama/Llama-3-8B", + ... "meta-llama/Meta-Llama-3-8B", ... "./megatron_checkpoints/llama3_8b" ... 
) >>> # Import with specific settings >>> AutoBridge.import_ckpt( - ... "meta-llama/Llama-3-8B", + ... "meta-llama/Meta-Llama-3-8B", ... "./megatron_checkpoints/llama3_8b", ... torch_dtype=torch.float16, ... device_map="auto" @@ -674,7 +680,7 @@ def to_megatron_provider(self, load_weights: bool = True, hf_path: str | Path | Example: >>> # Create provider and model with loaded weights - >>> bridge = AutoBridge.from_hf_pretrained("meta-llama/Llama-3-8B") + >>> bridge = AutoBridge.from_hf_pretrained("meta-llama/Meta-Llama-3-8B") >>> provider = bridge.to_megatron_provider() >>> model = provider.get_model() diff --git a/src/megatron/bridge/models/conversion/model_bridge.py b/src/megatron/bridge/models/conversion/model_bridge.py index 7c511c3a17..13ca8d03d3 100644 --- a/src/megatron/bridge/models/conversion/model_bridge.py +++ b/src/megatron/bridge/models/conversion/model_bridge.py @@ -213,7 +213,7 @@ def mapping_registry(self) -> MegatronMappingRegistry: # The bridge is typically not instantiated directly # Instead, use AutoBridge or AutoBridge which handle this - bridge = AutoBridge.from_hf_pretrained("meta-llama/Llama-3-8B") + bridge = AutoBridge.from_hf_pretrained("meta-llama/Meta-Llama-3-8B") provider = bridge.to_megatron_provider() Note: diff --git a/src/megatron/bridge/models/conversion/param_mapping.py b/src/megatron/bridge/models/conversion/param_mapping.py index 70e33b3734..e3014dcb49 100644 --- a/src/megatron/bridge/models/conversion/param_mapping.py +++ b/src/megatron/bridge/models/conversion/param_mapping.py @@ -1339,6 +1339,90 @@ def resolve(self, captures: Tuple[str, ...]) -> "MegatronParamMapping": ) +class KVMapping(MegatronParamMapping[Dict[str, torch.Tensor]]): + """ + Mapping for interleaved Key/Value projection weights. + + This mapping converts between separate K and V tensors used in external + checkpoints and Megatron's interleaved KV format following grouped-query + attention semantics. 
+ + External format (HF) + - Separate tensors: k_proj, v_proj + - Shapes mirror QKV mappings but without Q + + Megatron format + - Single interleaved tensor with order: [k1, v1, k2, v2, ...] + where index corresponds to query-group id + + Tensor-parallel distribution is delegated to AutoMapping. + """ + + def __init__(self, megatron_param: str, k: str, v: str): + super().__init__(megatron_param, {"k": k, "v": v}) + # Delegate TP sharding/broadcasting + self._tp_mapping = AutoMapping(megatron_param, megatron_param) + + def hf_to_megatron( + self, + hf_weights: Dict[str, torch.Tensor], + megatron_module: nn.Module, + ) -> torch.Tensor: + """Merge K and V into interleaved format and distribute across TP.""" + if self.tp_rank == 0: + config = self._get_config(megatron_module) + + if hf_weights["k"].ndim == 1: + merged = merge_kv_biases(config, hf_weights["k"], hf_weights["v"]) + else: + merged = merge_kv_weights(config, hf_weights["k"], hf_weights["v"]) + else: + merged = None + + return self._tp_mapping.hf_to_megatron(merged, megatron_module) + + def megatron_to_hf( + self, + megatron_weights: Optional[torch.Tensor], + megatron_module: Optional[nn.Module], + ) -> Dict[str, torch.Tensor]: + """Gather KV shards and split into separate K and V tensors.""" + if megatron_weights is not None: + megatron_weights = self.maybe_dequantize(megatron_weights) + + # Ensure all PP ranks participate in config broadcast + if megatron_module is None: + config = self.broadcast_obj_from_pp_rank(None, "kv_config") + else: + config = self._get_config(megatron_module) + config = remove_non_pickleables(config, max_depth=2) + config = self.broadcast_obj_from_pp_rank(config, "kv_config") + + packed_dict = self._tp_mapping.megatron_to_hf(megatron_weights, megatron_module) + if not packed_dict: + return {} + + packed_kv = next(iter(packed_dict.values())) + + if packed_kv.ndim == 1: + k, v = split_kv_biases(config, packed_kv) + else: + k, v = split_kv_weights(config, packed_kv) + + return { + 
self.hf_param["k"]: k, + self.hf_param["v"]: v, + } + + def resolve(self, captures: Tuple[str, ...]) -> "MegatronParamMapping": + resolved_megatron_param, resolved_hf_param = self._resolve_names(captures) + return type(self)( + resolved_megatron_param, + resolved_hf_param["k"], + resolved_hf_param["v"], + ) + + class GatedMLPMapping(MegatronParamMapping[Dict[str, torch.Tensor]]): r"""Mapping for **gated-MLP** projection weights (SwiGLU / GeGLU). @@ -1652,3 +1736,71 @@ def split_qkv_weights( v = v.reshape(-1, hidden_size) return q, k, v + + +def merge_kv_biases(config: TransformerConfig, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor: + """Merge separate K, V bias vectors into Megatron's interleaved KV format (1D).""" + num_query_groups = config.num_query_groups + head_size = config.kv_channels or (config.hidden_size // config.num_attention_heads) + + k = k.view(num_query_groups, head_size) + v = v.view(num_query_groups, head_size) + + pieces: List[torch.Tensor] = [] + for i in range(num_query_groups): + pieces.append(k[i : i + 1, :]) + pieces.append(v[i : i + 1, :]) + + kv = torch.cat(pieces, dim=0) + return kv.reshape(-1) + + +def split_kv_biases(config: TransformerConfig, kv: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Split Megatron's interleaved KV bias (1D) into separate K and V biases.""" + num_query_groups = config.num_query_groups + head_size = config.kv_channels or (config.hidden_size // config.num_attention_heads) + kv_total_dim = 2 * num_query_groups + + kv_reshaped = kv.view(kv_total_dim, head_size) + + k_slice = torch.arange(0, kv_total_dim, 2) + v_slice = torch.arange(1, kv_total_dim, 2) + + k = kv_reshaped[k_slice].reshape(-1) + v = kv_reshaped[v_slice].reshape(-1) + return k, v + + +def merge_kv_weights(provider: TransformerConfig, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor: + """Merge separate K, V weights into Megatron's interleaved KV format (2D).""" + num_query_groups = provider.num_query_groups + head_size = 
provider.kv_channels or (provider.hidden_size // provider.num_attention_heads) + hidden_size = provider.hidden_size + + k_reshaped = k.view(num_query_groups, head_size, hidden_size) + v_reshaped = v.view(num_query_groups, head_size, hidden_size) + + pieces: List[torch.Tensor] = [] + for i in range(num_query_groups): + pieces.append(k_reshaped[i : i + 1]) + pieces.append(v_reshaped[i : i + 1]) + + kv = torch.cat(pieces, dim=0) + return kv.view(-1, hidden_size) + + +def split_kv_weights(provider: TransformerConfig, kv: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Split Megatron's interleaved KV weights (2D) into separate K and V matrices.""" + num_query_groups = provider.num_query_groups + head_size = provider.kv_channels or (provider.hidden_size // provider.num_attention_heads) + hidden_size = kv.shape[-1] + kv_total_dim = 2 * num_query_groups + + kv_reshaped = kv.view(kv_total_dim, head_size, hidden_size) + + k_slice = torch.arange(0, kv_total_dim, 2) + v_slice = torch.arange(1, kv_total_dim, 2) + + k = kv_reshaped[k_slice].reshape(-1, hidden_size) + v = kv_reshaped[v_slice].reshape(-1, hidden_size) + return k, v diff --git a/src/megatron/bridge/models/gemma/__init__.py b/src/megatron/bridge/models/gemma/__init__.py new file mode 100644 index 0000000000..d803166ff2 --- /dev/null +++ b/src/megatron/bridge/models/gemma/__init__.py @@ -0,0 +1,42 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from megatron.bridge.models.gemma.gemma2_bridge import Gemma2Bridge # noqa: F401 +from megatron.bridge.models.gemma.gemma2_provider import ( + Gemma2ModelProvider, + Gemma2ModelProvider2B, + Gemma2ModelProvider9B, + Gemma2ModelProvider27B, +) +from megatron.bridge.models.gemma.gemma_bridge import GemmaBridge # noqa: F401 +from megatron.bridge.models.gemma.gemma_provider import ( + CodeGemmaModelProvider2B, + CodeGemmaModelProvider7B, + GemmaModelProvider, + GemmaModelProvider2B, + GemmaModelProvider7B, +) + + +__all__ = [ + "GemmaModelProvider", + "GemmaModelProvider2B", + "GemmaModelProvider7B", + "CodeGemmaModelProvider2B", + "CodeGemmaModelProvider7B", + "Gemma2ModelProvider", + "Gemma2ModelProvider2B", + "Gemma2ModelProvider9B", + "Gemma2ModelProvider27B", +] diff --git a/src/megatron/bridge/models/gemma/gemma2_bridge.py b/src/megatron/bridge/models/gemma/gemma2_bridge.py new file mode 100644 index 0000000000..8d2ad02243 --- /dev/null +++ b/src/megatron/bridge/models/gemma/gemma2_bridge.py @@ -0,0 +1,129 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +from megatron.core.models.gpt.gpt_model import GPTModel +from transformers import Gemma2ForCausalLM + +from megatron.bridge.models.conversion.mapping_registry import MegatronMappingRegistry +from megatron.bridge.models.conversion.model_bridge import MegatronModelBridge +from megatron.bridge.models.conversion.param_mapping import ( + AutoMapping, + GatedMLPMapping, + QKVMapping, +) +from megatron.bridge.models.gemma.gemma2_provider import Gemma2ModelProvider +from megatron.bridge.models.hf_pretrained.causal_lm import PreTrainedCausalLM + + +# Register custom Gemma2 modules for AutoMapping +AutoMapping.register_module_type("TERowParallelLinearLayerNorm", "row") +AutoMapping.register_module_type("Gemma2OutputLayer", "column") + + +@MegatronModelBridge.register_bridge(source=Gemma2ForCausalLM, target=GPTModel) +class Gemma2Bridge(MegatronModelBridge): + """ + Megatron Bridge for Gemma2 Causal LM. + This bridge handles the conversion between HuggingFace Gemma2ForCausalLM + and Megatron-Core GPTModel formats, including weight mappings and + configuration translation. Gemma2 includes specific features like + attention logit softcapping, sliding window attention, and additional + layer normalization compared to the original Gemma model. + As a user you would not use this bridge directly, but through `AutoBridge`. + Example: + >>> from megatron.bridge import AutoBridge + >>> bridge = AutoBridge.from_hf_pretrained("google/gemma-2-2b") + >>> provider = bridge.to_megatron_provider() + """ + + def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> Gemma2ModelProvider: + """Convert HuggingFace config to Gemma2ModelProvider. 
+ Args: + hf_pretrained: HuggingFace pretrained model wrapper + Returns: + Gemma2ModelProvider: Configured provider for Megatron model + """ + hf_config = hf_pretrained.config + + provider = Gemma2ModelProvider( + num_layers=hf_config.num_hidden_layers, + hidden_size=hf_config.hidden_size, + ffn_hidden_size=hf_config.intermediate_size, + num_attention_heads=hf_config.num_attention_heads, + init_method_std=hf_config.initializer_range, + layernorm_epsilon=hf_config.rms_norm_eps, + num_query_groups=hf_config.num_key_value_heads, + kv_channels=hf_config.head_dim, + rotary_base=hf_config.rope_theta, + query_pre_attn_scalar=hf_config.query_pre_attn_scalar, + attn_logit_softcapping=hf_config.attn_logit_softcapping, + final_logit_softcapping=hf_config.final_logit_softcapping, + window_size=(hf_config.sliding_window, 0), + gated_linear_unit=True, + make_vocab_size_divisible_by=self.make_vocab_size_divisible_by(hf_config.vocab_size), + vocab_size=hf_config.vocab_size, + share_embeddings_and_output_weights=True, + seq_length=hf_config.max_position_embeddings, + fp16=(self.dtype_from_hf(hf_config, default=torch.float32) == torch.float16), + bf16=(self.dtype_from_hf(hf_config, default=torch.float32) == torch.bfloat16), + params_dtype=self.dtype_from_hf(hf_config, default=torch.float32), + generation_config=hf_pretrained.generation_config, + ) + + return provider + + def mapping_registry(self) -> MegatronMappingRegistry: + """Return MegatronMappingRegistry containing parameter mappings from HF to Megatron format. 
+ Returns: + MegatronMappingRegistry: Registry of parameter mappings + """ + # Dictionary maps HF parameter names -> Megatron parameter names + # Supports wildcard (*) patterns for layer-specific parameters + param_mappings = { + "model.embed_tokens.weight": "embedding.word_embeddings.weight", + "model.layers.*.self_attn.o_proj.weight": "decoder.layers.*.self_attention.linear_proj.weight", + "model.layers.*.mlp.down_proj.weight": "decoder.layers.*.mlp.linear_fc2.weight", + "model.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight", + "model.layers.*.pre_feedforward_layernorm.weight": "decoder.layers.*.mlp.linear_fc1.layer_norm_weight", + "model.layers.*.post_feedforward_layernorm.weight": "decoder.layers.*.mlp.linear_fc2.post_layernorm.weight", + "model.layers.*.post_attention_layernorm.weight": "decoder.layers.*.self_attention.linear_proj.post_layernorm.weight", + "model.norm.weight": "decoder.final_layernorm.weight", + } + + mapping_list = [] + # Convert each dictionary entry to AutoMapping(hf_param, megatron_param) + for hf_param, megatron_param in param_mappings.items(): + mapping_list.append(AutoMapping(hf_param=hf_param, megatron_param=megatron_param)) + + # Add special mappings that require parameter concatenation/transformation + mapping_list.extend( + [ + # QKV: Combine separate Q, K, V matrices into single QKV matrix + QKVMapping( + q="model.layers.*.self_attn.q_proj.weight", + k="model.layers.*.self_attn.k_proj.weight", + v="model.layers.*.self_attn.v_proj.weight", + megatron_param="decoder.layers.*.self_attention.linear_qkv.weight", + ), + # Gated MLP: Combine gate and up projection matrices into single FC1 matrix + GatedMLPMapping( + gate="model.layers.*.mlp.gate_proj.weight", + up="model.layers.*.mlp.up_proj.weight", + megatron_param="decoder.layers.*.mlp.linear_fc1.weight", + ), + ] + ) + + return MegatronMappingRegistry(*mapping_list) diff --git a/src/megatron/bridge/models/gemma/gemma2_provider.py 
b/src/megatron/bridge/models/gemma/gemma2_provider.py new file mode 100644 index 0000000000..9663b5d4c7 --- /dev/null +++ b/src/megatron/bridge/models/gemma/gemma2_provider.py @@ -0,0 +1,433 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from dataclasses import dataclass +from typing import Callable, Optional, Union + +import torch +from megatron.core import parallel_state, tensor_parallel +from megatron.core.activations import fast_gelu +from megatron.core.extensions.transformer_engine import TELayerNormColumnParallelLinear, TENorm, TERowParallelLinear +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.fusions.fused_softmax import FusedScaleMaskSoftmax +from megatron.core.models.gpt import GPTModel as MCoreGPTModel +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.tensor_parallel import ColumnParallelLinear +from megatron.core.transformer import ( + MegatronModule, + ModuleSpec, + TransformerConfig, + TransformerLayer, + TransformerLayerSubmodules, +) +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.utils import attention_mask_func +from megatron.core.utils import divide +from torch import Tensor + +from 
megatron.bridge.models.gemma.modules import EmbeddingScalingMixin, extend_instance +from megatron.bridge.models.gpt_provider import GPTModelProvider + + +class Gemma2DotProductAttention(MegatronModule): + """ + Region where selective activation recomputation is applied. + This region is memory intensive but less compute intensive which + makes activation checkpointing more efficient for LLMs (20B+). + See Reducing Activation Recomputation in Large Transformer Models: + https://arxiv.org/abs/2205.05198 for more details. + We use the following notation: + h: hidden size + n: number of attention heads + p: number of tensor model parallel partitions + b: batch size + s: sequence length + """ + + def __init__( + self, + config: TransformerConfig, + layer_number: int, + attn_mask_type: AttnMaskType, + attention_type: str, + attention_dropout: float = None, + **kwargs, + ): + super().__init__(config=config) + + self.config: TransformerConfig = config + + assert self.config.context_parallel_size == 1, ( + "Context parallelism is only supported by TEDotProductAttention!" + ) + + self.layer_number = max(1, layer_number) + + self.window_size = None + if self.layer_number % 2 == 0: + self.window_size = config.window_size + + self.attn_mask_type = attn_mask_type + self.attention_type = attention_type # unused for now + + projection_size = self.config.kv_channels * self.config.num_attention_heads + + # Per attention head and per partition values. 
+ world_size = parallel_state.get_tensor_model_parallel_world_size() + self.hidden_size_per_partition = divide(projection_size, world_size) + self.hidden_size_per_attention_head = divide(projection_size, config.num_attention_heads) + self.num_attention_heads_per_partition = divide(self.config.num_attention_heads, world_size) + self.num_query_groups_per_partition = divide(self.config.num_query_groups, world_size) + + coeff = None + self.norm_factor = math.sqrt(config.query_pre_attn_scalar) + + if self.config.apply_query_key_layer_scaling: + coeff = self.layer_number + self.norm_factor *= coeff + + self.scale_mask_softmax = FusedScaleMaskSoftmax( + input_in_fp16=self.config.fp16, + input_in_bf16=self.config.bf16, + attn_mask_type=self.attn_mask_type, + scaled_masked_softmax_fusion=self.config.masked_softmax_fusion, + mask_func=attention_mask_func, + softmax_in_fp32=self.config.attention_softmax_in_fp32, + scale=coeff, + ) + + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = torch.nn.Dropout( + self.config.attention_dropout if attention_dropout is None else attention_dropout + ) + + def forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + attention_mask: Tensor, + attn_mask_type: AttnMaskType = None, + packed_seq_params: PackedSeqParams = None, + **kwargs, + ): + """Forward. + Modified from mcore.transformer.dot_product_attention to support Gemma2-specific + final_logit_softcapping. + """ + assert packed_seq_params is None, ( + "Packed sequence is not supported by DotProductAttention.Please use TEDotProductAttention instead." + ) + + # =================================== + # Raw attention scores. [b, n/p, s, s] + # =================================== + + # expand the key and value [sk, b, ng, hn] -> [sk, b, np, hn] + # This is a noop for normal attention where ng == np. 
When using group query attention this
+ creates a view that has the keys and values virtually repeated along their dimension to
+ match the number of queries.
+
+ # attn_mask_type is not used.
+ if self.num_attention_heads_per_partition // self.num_query_groups_per_partition > 1:
+ key = key.repeat_interleave(
+ self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2
+ )
+ value = value.repeat_interleave(
+ self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2
+ )
+
+ # [b, np, sq, sk]
+ output_size = (
+ query.size(1),
+ query.size(2),
+ query.size(0),
+ key.size(0),
+ )
+
+ # [sq, b, np, hn] -> [sq, b * np, hn]
+ # This will be a simple view when doing normal attention, but in group query attention
+ # the key and value tensors are repeated to match the queries so you can't use simple strides
+ # to extract the queries.
+ query = query.reshape(output_size[2], output_size[0] * output_size[1], -1)
+ # [sk, b, np, hn] -> [sk, b * np, hn]
+ key = key.view(output_size[3], output_size[0] * output_size[1], -1)
+
+ # preallocating input tensor: [b * np, sq, sk]
+ matmul_input_buffer = parallel_state.get_global_memory_buffer().get_tensor(
+ (output_size[0] * output_size[1], output_size[2], output_size[3]),
+ query.dtype,
+ "mpu",
+ )
+
+ # Raw attention scores. 
[b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query.transpose(0, 1), # [b * np, sq, hn] + key.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, + alpha=(1.0 / self.norm_factor), + ) + # Gemma 2 specific: + matmul_result = logit_softcapping(matmul_result, self.config.attn_logit_softcapping) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # sliding window attention + if attention_mask is not None and self.window_size is not None: + attention_mask = get_swa(query.size(0), key.size(0), self.window_size) + + # attention scores and attention mask [b, np, sq, sk] + attention_probs: Tensor = self.scale_mask_softmax(attention_scores, attention_mask) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + + if not self.config.sequence_parallel: + with tensor_parallel.get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + else: + attention_probs = self.attention_dropout(attention_probs) + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value -> context layer. 
+ # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = ( + value.size(1), + value.size(2), + query.size(0), + value.size(3), + ) + + # change view [sk, b * np, hn] + value = value.view(value.size(0), output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) + + # matmul: [b * np, sq, hn] + context = torch.bmm(attention_probs, value.transpose(0, 1)) + + # change view [b, np, sq, hn] + context = context.view(*output_size) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context = context.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_shape = context.size()[:-2] + (self.hidden_size_per_partition,) + context = context.view(*new_context_shape) + return context + + +class TERowParallelLinearLayerNorm(TERowParallelLinear): + """Modified From TERowParallelLinear with an additional Post-LN.""" + + def __init__( + self, + input_size: int, + output_size: int, + *, + config: TransformerConfig, + **kwargs, + ): + super().__init__( + input_size, + output_size, + config=config, + **kwargs, + ) + self.post_layernorm = TENorm(config, output_size) + + def forward(self, x): + """Forward with additional Post LN on output""" + output, bias = super().forward(x) + return self.post_layernorm(output), bias + + +class Gemma2OutputLayer(ColumnParallelLinear): + """Extends from ColumnParallelLinear with logit soft capping.""" + + def forward(self, *args, **kwargs): + """Forward with logit soft capping.""" + output, bias = super().forward(*args, **kwargs) + output = logit_softcapping(output, self.config.final_logit_softcapping) + return output, bias + + +def logit_softcapping(logits: torch.Tensor, scale: Optional[float]) -> torch.Tensor: + """Prevents logits from growing excessively by scaling them to a fixed range""" + if not scale: + return logits + + return scale * torch.tanh(logits / scale) + + +def 
get_swa(seq_q: int, seq_kv: int, window_size: tuple[int, int]) -> torch.Tensor: + """Create the equivalent attention mask for SWA in [seq_q, seq_kv] shape""" + m = torch.ones(seq_q, seq_kv, dtype=torch.bool, device="cuda") + mu = torch.triu(m, diagonal=seq_kv - seq_q - window_size[0]) + ml = torch.tril(mu, diagonal=seq_kv - seq_q + window_size[1]) + ml = ~ml + + return ml + + +def gemma2_layer_spec(config: "GPTModelProvider") -> ModuleSpec: + """Gemma2-specific layer specification.""" + + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=Gemma2DotProductAttention, # use unfused SDPA for attn logit softcapping + linear_proj=TERowParallelLinearLayerNorm, # post attn RMSNorm + ), + ), + self_attn_bda=get_bias_dropout_add, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear, + linear_fc2=TERowParallelLinearLayerNorm, # post mlp RMSNorm + ), + ), + mlp_bda=get_bias_dropout_add, + ), + ) + + +@dataclass +class Gemma2ModelProvider(GPTModelProvider): + """Configuration class for Gemma2 models. + Extends GPTModelProvider with specific settings optimized for Gemma2 architectures. + Includes configurations for normalization, activation functions, and various + Gemma2-specific options like attention logit softcapping and sliding window attention. 
+ """ + + # configs that are common across model sizes + normalization: str = "RMSNorm" + activation_func: Callable = fast_gelu + gated_linear_unit: bool = True + position_embedding_type: str = "rope" + add_bias_linear: bool = False + seq_length: int = 8192 + kv_channels: int = 256 + attention_dropout: float = 0.0 + hidden_dropout: float = 0.0 + share_embeddings_and_output_weights: bool = True + # Note: different behavior compared to NeMo 1.0 + # NeMo 1.0 does not set layernorm_zero_centered_gamma and instead adds 1 in the HF -> NeMo conversion script + # The present implementation is more in line with the official implementation + layernorm_zero_centered_gamma: bool = True + layernorm_epsilon: float = 1e-6 + rotary_base: float = 10000 + + window_size: tuple[int, int] = (4096, 0) + vocab_size: int = 256000 + gradient_accumulation_fusion: bool = False + + transformer_layer_spec: Union[ModuleSpec, Callable[["GPTModelProvider"], ModuleSpec]] = gemma2_layer_spec + + query_pre_attn_scalar: int = 224 + attn_logit_softcapping: float = 50.0 + final_logit_softcapping: float = 30.0 + + def provide(self, pre_process=None, post_process=None, vp_stage=None) -> "MCoreGPTModel": + """Configure and instantiate a Megatron Core Gemma2 model. + Extends the base configuration with Gemma2-specific embedding scaling and output layer modifications. 
+ Args: + pre_process: Whether to include pre-processing in the model + post_process: Whether to include post-processing in the model + vp_stage: Virtual pipeline stage + tokenizer: Tokenizer used with the model + Returns: + MCoreGPTModel: Configured Megatron Core GPT model instance + """ + model = super().provide(pre_process=pre_process, post_process=post_process, vp_stage=vp_stage) + + # Apply Embedding Scaling for Gemma2: sqrt(hidden_size) + if parallel_state.is_pipeline_first_stage(ignore_virtual=False, vp_stage=vp_stage): + extend_instance(model.embedding, EmbeddingScalingMixin) + + # Prevents final logits from growing excessively by scaling them to a fixed range + if parallel_state.is_pipeline_last_stage(ignore_virtual=False, vp_stage=vp_stage): + extend_instance(model.output_layer, Gemma2OutputLayer) + + return model + + +@dataclass +class Gemma2ModelProvider2B(Gemma2ModelProvider): + """Configuration for a 2B parameter Gemma2 model. + Specific configuration for the 2B Gemma2 model with 26 layers, + 2304 hidden size, and 8 attention heads. + """ + + num_layers: int = 26 + hidden_size: int = 2304 + num_attention_heads: int = 8 + num_query_groups: int = 4 + ffn_hidden_size: int = 9216 + query_pre_attn_scalar: int = 256 + + +@dataclass +class Gemma2ModelProvider9B(Gemma2ModelProvider): + """Configuration for a 9B parameter Gemma2 model. + Specific configuration for the 9B Gemma2 model with 42 layers, + 3584 hidden size, and 16 attention heads. + """ + + num_layers: int = 42 + hidden_size: int = 3584 + num_attention_heads: int = 16 + num_query_groups: int = 8 + ffn_hidden_size: int = 14336 + query_pre_attn_scalar: int = 256 + + +@dataclass +class Gemma2ModelProvider27B(Gemma2ModelProvider): + """Configuration for a 27B parameter Gemma2 model. + Specific configuration for the 27B Gemma2 model with 46 layers, + 4608 hidden size, and 32 attention heads. 
+ """ + + num_layers: int = 46 + hidden_size: int = 4608 + num_attention_heads: int = 32 + num_query_groups: int = 16 + kv_channels: int = 128 + ffn_hidden_size: int = 36864 + query_pre_attn_scalar: int = 144 diff --git a/src/megatron/bridge/models/gemma/gemma_bridge.py b/src/megatron/bridge/models/gemma/gemma_bridge.py new file mode 100644 index 0000000000..2e205ec643 --- /dev/null +++ b/src/megatron/bridge/models/gemma/gemma_bridge.py @@ -0,0 +1,122 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from megatron.core.models.gpt.gpt_model import GPTModel +from transformers import GemmaForCausalLM + +from megatron.bridge.models.conversion.mapping_registry import MegatronMappingRegistry +from megatron.bridge.models.conversion.model_bridge import MegatronModelBridge +from megatron.bridge.models.conversion.param_mapping import ( + AutoMapping, + GatedMLPMapping, + QKVMapping, +) +from megatron.bridge.models.gemma.gemma_provider import GemmaModelProvider +from megatron.bridge.models.hf_pretrained.causal_lm import PreTrainedCausalLM + + +@MegatronModelBridge.register_bridge(source=GemmaForCausalLM, target=GPTModel) +class GemmaBridge(MegatronModelBridge): + """ + Megatron Bridge for Gemma Causal LM. + + This bridge handles the conversion between HuggingFace GemmaForCausalLM + and Megatron-Core GPTModel formats, including weight mappings and + configuration translation. 
+ + As a user you would not use this bridge directly, but through `AutoBridge`. + + Example: + >>> from megatron.bridge import AutoBridge + >>> bridge = AutoBridge.from_hf_pretrained("google/gemma-2b") + >>> provider = bridge.to_megatron_provider() + """ + + def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> GemmaModelProvider: + """Convert HuggingFace config to GemmaModelProvider. + + Args: + hf_pretrained: HuggingFace pretrained model wrapper + + Returns: + GemmaModelProvider: Configured provider for Megatron model + """ + hf_config = hf_pretrained.config + + provider = GemmaModelProvider( + num_layers=hf_config.num_hidden_layers, + hidden_size=hf_config.hidden_size, + ffn_hidden_size=hf_config.intermediate_size, + num_attention_heads=hf_config.num_attention_heads, + num_query_groups=hf_config.num_key_value_heads, + init_method_std=hf_config.initializer_range, + layernorm_epsilon=hf_config.rms_norm_eps, + gated_linear_unit=True, + make_vocab_size_divisible_by=self.make_vocab_size_divisible_by(hf_config.vocab_size), + rotary_base=hf_config.rope_theta, + share_embeddings_and_output_weights=getattr(hf_config, "tie_word_embeddings", True), + vocab_size=hf_config.vocab_size, + seq_length=hf_config.max_position_embeddings, + fp16=(self.dtype_from_hf(hf_config, default=torch.float32) == torch.float16), + bf16=(self.dtype_from_hf(hf_config, default=torch.float32) == torch.bfloat16), + params_dtype=self.dtype_from_hf(hf_config, default=torch.float32), + generation_config=hf_pretrained.generation_config, + kv_channels=hf_config.head_dim, + ) + + return provider + + def mapping_registry(self) -> MegatronMappingRegistry: + """Return MegatronMappingRegistry containing parameter mappings from HF to Megatron format. 
+ + Returns: + MegatronMappingRegistry: Registry of parameter mappings + """ + # Dictionary maps HF parameter names -> Megatron parameter names + # Supports wildcard (*) patterns for layer-specific parameters + param_mappings = { + "model.embed_tokens.weight": "embedding.word_embeddings.weight", + "model.layers.*.self_attn.o_proj.weight": "decoder.layers.*.self_attention.linear_proj.weight", + "model.layers.*.mlp.down_proj.weight": "decoder.layers.*.mlp.linear_fc2.weight", + "model.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight", + "model.layers.*.post_attention_layernorm.weight": "decoder.layers.*.mlp.linear_fc1.layer_norm_weight", + "model.norm.weight": "decoder.final_layernorm.weight", + } + + mapping_list = [] + # Convert each dictionary entry to AutoMapping(hf_param, megatron_param) + for hf_param, megatron_param in param_mappings.items(): + mapping_list.append(AutoMapping(hf_param=hf_param, megatron_param=megatron_param)) + + # Add special mappings that require parameter concatenation/transformation + mapping_list.extend( + [ + # QKV: Combine separate Q, K, V matrices into single QKV matrix + QKVMapping( + q="model.layers.*.self_attn.q_proj.weight", + k="model.layers.*.self_attn.k_proj.weight", + v="model.layers.*.self_attn.v_proj.weight", + megatron_param="decoder.layers.*.self_attention.linear_qkv.weight", + ), + # Gated MLP: Combine gate and up projection matrices into single FC1 matrix + GatedMLPMapping( + gate="model.layers.*.mlp.gate_proj.weight", + up="model.layers.*.mlp.up_proj.weight", + megatron_param="decoder.layers.*.mlp.linear_fc1.weight", + ), + ] + ) + + return MegatronMappingRegistry(*mapping_list) diff --git a/src/megatron/bridge/models/gemma/gemma_provider.py b/src/megatron/bridge/models/gemma/gemma_provider.py new file mode 100644 index 0000000000..2d0d9e038d --- /dev/null +++ b/src/megatron/bridge/models/gemma/gemma_provider.py @@ -0,0 +1,129 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Callable + +import torch +from megatron.core import parallel_state +from megatron.core.activations import fast_gelu +from megatron.core.models.gpt import GPTModel as MCoreGPTModel +from megatron.core.transformer.enums import AttnBackend + +from megatron.bridge.models.gpt_provider import GPTModelProvider + + +@dataclass +class GemmaModelProvider(GPTModelProvider): + """Configuration class for Gemma models.""" + + # configs that are common across model sizes + normalization: str = "RMSNorm" + activation_func: Callable = fast_gelu + gated_linear_unit: bool = True + position_embedding_type: str = "rope" + add_bias_linear: bool = False + seq_length: int = 8192 + kv_channels: int = 256 + attention_dropout: float = 0.0 + hidden_dropout: float = 0.0 + share_embeddings_and_output_weights: bool = True + # Note: different behavior compared to NeMo 1.0 + # NeMo 1.0 does not set layernorm_zero_centered_gamma and instead adds 1 in the HF -> NeMo conversion script + # The present implementation is more in line with the official implementation + layernorm_zero_centered_gamma: bool = True + # Disable cuDNN attention since TE 1.8 does not support head dim > 128 + attention_backend: AttnBackend = AttnBackend.flash + + # Gemma defaults from HuggingFace + layernorm_epsilon: float = 1e-06 + vocab_size: int = 256000 + bf16: bool = True + params_dtype: torch.dtype = 
torch.bfloat16 + autocast_dtype: torch.dtype = torch.bfloat16 + + def provide(self, pre_process=None, post_process=None, vp_stage=None) -> "MCoreGPTModel": + """Configure and instantiate a Megatron Core Gemma model. + + Extends the base configuration with Gemma-specific embedding scaling. + + Args: + pre_process: Whether to include pre-processing in the model + post_process: Whether to include post-processing in the model + vp_stage: Virtual pipeline stage + tokenizer: Tokenizer used with the model + + Returns: + MCoreGPTModel: Configured Megatron Core GPT model instance + """ + model = super().provide(pre_process=pre_process, post_process=post_process, vp_stage=vp_stage) + + # Apply Embedding Scaling for Gemma: sqrt(hidden_size) + if parallel_state.is_pipeline_first_stage( + ignore_virtual=False, + vp_stage=vp_stage, + ): + from megatron.bridge.models.gemma.modules import EmbeddingScalingMixin, extend_instance + + extend_instance(model.embedding, EmbeddingScalingMixin) + + return model + + +@dataclass +class GemmaModelProvider2B(GemmaModelProvider): + """Configuration for a 2B parameter Gemma model. + + Specific configuration for the 2B Gemma model with 18 layers, + 2048 hidden size, and 8 attention heads. + """ + + num_layers: int = 18 + hidden_size: int = 2048 + num_attention_heads: int = 8 + num_query_groups: int = 1 + ffn_hidden_size: int = 16384 + + +@dataclass +class GemmaModelProvider7B(GemmaModelProvider): + """Configuration for a 7B parameter Gemma model. + + Specific configuration for the 7B Gemma model with 28 layers, + 3072 hidden size, and 16 attention heads. + """ + + num_layers: int = 28 + hidden_size: int = 3072 + num_attention_heads: int = 16 + num_query_groups: int = 16 + ffn_hidden_size: int = 24576 + + +@dataclass +class CodeGemmaModelProvider2B(GemmaModelProvider2B): + """Configuration for a 2B parameter Code Gemma model. + + Extends GemmaModelProvider with specific settings for code generation. 
+ This model has an identical configuration to GemmaModelProvider2B. + """ + + +@dataclass +class CodeGemmaModelProvider7B(GemmaModelProvider7B): + """Configuration for a 7B parameter Code Gemma model. + + Extends GemmaModelProvider with specific settings for code generation. + This model has an identical configuration to GemmaModelProvider7B. + """ diff --git a/src/megatron/bridge/models/gemma/modules.py b/src/megatron/bridge/models/gemma/modules.py new file mode 100644 index 0000000000..5d97a1be7a --- /dev/null +++ b/src/megatron/bridge/models/gemma/modules.py @@ -0,0 +1,40 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + + +def extend_instance(obj, mixin): + """Apply mixins to a class instance after creation""" + base_cls = obj.__class__ + base_cls_name = obj.__class__.__name__ + obj.__class__ = type( + base_cls_name, (mixin, base_cls), {} + ) # mixin needs to go first for our forward() logic to work + + +class EmbeddingScalingMixin(torch.nn.Module): + """ + A mixin class for scaling embeddings in Megatron GPT. + The scaling is applied only if the configuration (accessible via `self.config`) + includes `apply_embedding_scaling` set to True. + """ + + def forward(self, **kwargs): + """ + Forward pass that scales the output embeddings from the `forward` method of + the superclass by the square root of the hidden size specified in the configuration. 
+ """ + embeddings = super().forward(**kwargs) + return embeddings * torch.tensor(self.config.hidden_size**0.5, dtype=embeddings.dtype) diff --git a/src/megatron/bridge/models/hf_pretrained/__init__.py b/src/megatron/bridge/models/hf_pretrained/__init__.py index de1604f253..9bfb9fd83f 100644 --- a/src/megatron/bridge/models/hf_pretrained/__init__.py +++ b/src/megatron/bridge/models/hf_pretrained/__init__.py @@ -14,6 +14,7 @@ from megatron.bridge.models.hf_pretrained.causal_lm import PreTrainedCausalLM from megatron.bridge.models.hf_pretrained.vlm import PreTrainedVLM +from megatron.bridge.models.hf_pretrained.wan import PreTrainedWAN -__all__ = ["PreTrainedCausalLM", "PreTrainedVLM"] +__all__ = ["PreTrainedCausalLM", "PreTrainedVLM", "PreTrainedWAN"] diff --git a/src/megatron/bridge/models/hf_pretrained/safe_config_loader.py b/src/megatron/bridge/models/hf_pretrained/safe_config_loader.py index 2bd9fad2ed..c5ae0a7452 100644 --- a/src/megatron/bridge/models/hf_pretrained/safe_config_loader.py +++ b/src/megatron/bridge/models/hf_pretrained/safe_config_loader.py @@ -61,7 +61,7 @@ def safe_load_config_with_retry( Useful for multi-node setups where a shared lock directory is needed. 
Example: - >>> config = safe_load_config_with_retry("meta-llama/Llama-3-8B") + >>> config = safe_load_config_with_retry("meta-llama/Meta-Llama-3-8B") >>> print(config.model_type) >>> # With custom retry settings @@ -75,7 +75,7 @@ def safe_load_config_with_retry( >>> # Multi-node setup with shared lock directory >>> import os >>> os.environ["MEGATRON_CONFIG_LOCK_DIR"] = "/shared/locks" - >>> config = safe_load_config_with_retry("meta-llama/Llama-3-8B") + >>> config = safe_load_config_with_retry("meta-llama/Meta-Llama-3-8B") """ last_exception = None diff --git a/src/megatron/bridge/models/hf_pretrained/state.py b/src/megatron/bridge/models/hf_pretrained/state.py index a47a22771d..b35f2c05f9 100644 --- a/src/megatron/bridge/models/hf_pretrained/state.py +++ b/src/megatron/bridge/models/hf_pretrained/state.py @@ -496,7 +496,8 @@ def key_to_filename_map(self) -> Dict[str, str]: from safetensors import safe_open key_map = {} - safetensor_files = file_glob(str(self.path / "*.safetensors")) + # DEBUGGING + safetensor_files = file_glob(str(self.path / "transformer" / "*.safetensors")) for file_path in safetensor_files: filename = os.path.basename(file_path) try: @@ -564,7 +565,8 @@ def get_all_keys(self) -> List[str]: all_keys.update(key_to_filename_map.keys()) if not all_keys: - safetensor_files = file_glob(str(self.path / "*.safetensors")) + # DEBUGGING + safetensor_files = file_glob(str(self.path / "transformer" / "*.safetensors")) if not safetensor_files and not key_to_filename_map: raise FileNotFoundError(f"No .safetensors files or index found in {self.model_name_or_path}") for safetensor_file in safetensor_files: @@ -603,7 +605,8 @@ def load_tensors(self, keys_to_load: List[str]) -> Dict[str, torch.Tensor]: remaining_keys.discard(key) if remaining_keys: - safetensor_files = file_glob(str(self.path / "*.safetensors")) + # DEBUGGING + safetensor_files = file_glob(str(self.path / "transformer" / "*.safetensors")) if not safetensor_files and not key_to_filename_map and 
not loaded_tensors: raise FileNotFoundError( f"No .safetensors files found in {self.model_name_or_path} to load keys: {remaining_keys}" @@ -650,7 +653,8 @@ def has_glob(self, pattern: str) -> bool: return False # If no index map, scan the files directly. - safetensor_files = file_glob(str(self.path / "*.safetensors")) + # DEBUGGING + safetensor_files = file_glob(str(self.path / "transformer" / "*.safetensors")) if not safetensor_files: return False diff --git a/src/megatron/bridge/models/hf_pretrained/wan.py b/src/megatron/bridge/models/hf_pretrained/wan.py new file mode 100644 index 0000000000..d682c5cf07 --- /dev/null +++ b/src/megatron/bridge/models/hf_pretrained/wan.py @@ -0,0 +1,81 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +from typing import Optional, Union + +from diffusers import WanTransformer3DModel, WanVACETransformer3DModel +from transformers import AutoConfig + +from megatron.bridge.models.hf_pretrained.base import PreTrainedBase + + +class PreTrainedWAN(PreTrainedBase): + """ + Lightweight pretrained wrapper for Diffusers WAN models. + + Provides access to WAN config and state through the common PreTrainedBase API + so bridges can consume `.config` and `.state` uniformly. 
+ """ + + def __init__(self, model_name_or_path: Union[str, Path], **kwargs): + self._model_name_or_path = str(model_name_or_path) + super().__init__(**kwargs) + + @property + def model_name_or_path(self) -> str: + return self._model_name_or_path + + # Model loading is optional for conversion; implemented for completeness + def _load_model(self) -> WanTransformer3DModel: + return WanTransformer3DModel.from_pretrained(self.model_name_or_path, subfolder="transformer") + + # Config is required by the WAN bridge + def _load_config(self) -> AutoConfig: + # WanTransformer3DModel returns a config-like object with required fields + + print(f"Loading config from {self.model_name_or_path}") + + return WanTransformer3DModel.from_pretrained(self.model_name_or_path, subfolder="transformer").config + + +class PreTrainedVACE(PreTrainedBase): + """ + Lightweight pretrained wrapper for Diffusers WAN models. + + Provides access to WAN config and state through the common PreTrainedBase API + so bridges can consume `.config` and `.state` uniformly. 
+ """ + + def __init__(self, model_name_or_path: Union[str, Path], **kwargs): + self._model_name_or_path = str(model_name_or_path) + super().__init__(**kwargs) + + @property + def model_name_or_path(self) -> str: + return self._model_name_or_path + + # Model loading is optional for conversion; implemented for completeness + def _load_model(self) -> WanVACETransformer3DModel: + return WanVACETransformer3DModel.from_pretrained(self.model_name_or_path, subfolder="transformer") + + # Config is required by the WAN bridge + def _load_config(self) -> AutoConfig: + # WanTransformer3DModel returns a config-like object with required fields + + print(f"Loading config from {self.model_name_or_path}") + + return WanVACETransformer3DModel.from_pretrained(self.model_name_or_path, subfolder="transformer").config + + diff --git a/src/megatron/bridge/models/llama/llama_provider.py b/src/megatron/bridge/models/llama/llama_provider.py index 298b8756a0..1d2ab21bb3 100644 --- a/src/megatron/bridge/models/llama/llama_provider.py +++ b/src/megatron/bridge/models/llama/llama_provider.py @@ -180,7 +180,7 @@ class Llama3ModelProvider8B(Llama3ModelProvider): rotary_base: int = 500_000 seq_length: int = 8192 - num_layers: int = 32 + num_layers: int = 2 hidden_size: int = 4096 ffn_hidden_size: int = 14336 num_attention_heads: int = 32 diff --git a/src/megatron/bridge/models/model_provider.py b/src/megatron/bridge/models/model_provider.py index d79866a5db..5454194e8b 100644 --- a/src/megatron/bridge/models/model_provider.py +++ b/src/megatron/bridge/models/model_provider.py @@ -209,7 +209,14 @@ def initialize_model_parallel( seed_kwargs: Additional arguments for `model_parallel_cuda_manual_seed`. **model_parallel_kwargs: Additional arguments for `parallel_state.initialize_model_parallel`. """ + # Initialize torch.distributed only if not already initialized. + # Provide safe defaults for single-process runs where env vars like RANK/WORLD_SIZE + # may not be set (e.g., when not using torchrun). 
if not torch.distributed.is_initialized(): + os.environ["RANK"] = os.environ.get("RANK", "0") + os.environ["WORLD_SIZE"] = os.environ.get("WORLD_SIZE", "1") + os.environ["MASTER_ADDR"] = os.environ.get("MASTER_ADDR", "localhost") + os.environ["MASTER_PORT"] = os.environ.get("MASTER_PORT", "12355") torch.cuda.set_device(get_local_rank_preinit()) torch.distributed.init_process_group("nccl") @@ -429,6 +436,24 @@ class GetModelKwargs(TypedDict, total=False): post_wrap_hook: Callable[[list[MegatronModule]], list[MegatronModule]] | None +class ModelParallelKwargs(TypedDict, total=False): + """Model-parallel override kwargs. + + Attributes map to `TransformerConfig`/provider fields that control parallelism. + Only provided values are applied as overrides. + """ + + tensor_model_parallel_size: int + pipeline_model_parallel_size: int + context_parallel_size: int + expert_model_parallel_size: int + expert_tensor_parallel_size: int + moe_extended_tp: bool + sequence_parallel: bool + virtual_pipeline_model_parallel_size: int | None + hierarchical_context_parallel_sizes: list[int] | None + + def get_model( model_provider: ModelProviderMixin, ddp_config: DistributedDataParallelConfig, diff --git a/src/megatron/bridge/models/wan/flow_matching/__init__.py b/src/megatron/bridge/models/wan/flow_matching/__init__.py new file mode 100644 index 0000000000..d9155f923f --- /dev/null +++ b/src/megatron/bridge/models/wan/flow_matching/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/src/megatron/bridge/models/wan/flow_matching/flow_inference_pipeline.py b/src/megatron/bridge/models/wan/flow_matching/flow_inference_pipeline.py new file mode 100644 index 0000000000..385bf6d741 --- /dev/null +++ b/src/megatron/bridge/models/wan/flow_matching/flow_inference_pipeline.py @@ -0,0 +1,1330 @@ +import gc +import logging +import math +import os +import random +import sys +import types +import re +from contextlib import contextmanager +from functools import partial + +from PIL import Image +import torchvision.transforms.functional as TF +import torch +import torch.cuda.amp as amp +import torch.distributed as dist +from tqdm import tqdm + +from megatron.bridge.models.wan.wan_model import WanModel, VACEModel +from megatron.bridge.models.wan.wan_provider import WanModelProvider, VACEModelProvider +from megatron.bridge.models.wan.modules.t5 import T5EncoderModel +from megatron.bridge.models.wan.modules import WanVAE +from megatron.bridge.models.wan.inference.utils.fm_solvers import ( + FlowDPMSolverMultistepScheduler, + get_sampling_sigmas, + retrieve_timesteps, +) +from megatron.bridge.models.wan.inference.utils.fm_solvers_unipc import FlowUniPCMultistepScheduler +from megatron.bridge.models.wan.utils.utils import grid_sizes_calculation, patchify +from megatron.core import parallel_state +from torch.nn import functional as F +from megatron.bridge.models.wan.utils.utils import split_inputs_cp, cat_outputs_cp, thd_split_inputs_cp, thd_cat_outputs_cp + +import math +from typing import Tuple, Union + +from ..utils.preprocessor import VaceVideoProcessor + +class FlowInferencePipeline: + + def __init__( + self, + config, + checkpoint_dir, + checkpoint_step=None, + t5_checkpoint_dir=None, + vae_checkpoint_dir=None, + device_id=0, + rank=0, + t5_cpu=False, + + tensor_parallel_size=1, + context_parallel_size=1, + pipeline_parallel_size=1, + 
sequence_parallel=False, + pipeline_dtype=torch.float32, + ): + r""" + Initializes the FlowInferencePipeline with the given parameters. + + Args: + config (EasyDict): + Object containing model parameters initialized from config.py + checkpoint_dir (`str`): + Path to directory containing model checkpoints + t5_checkpoint_dir (`str`, *optional*, defaults to None): + Optional directory containing T5 checkpoint and tokenizer; falls back to `checkpoint_dir` if None. + vae_checkpoint_dir (`str`, *optional*, defaults to None): + Optional directory containing VAE checkpoint; falls back to `checkpoint_dir` if None. + device_id (`int`, *optional*, defaults to 0): + Id of target GPU device + rank (`int`, *optional*, defaults to 0): + Process rank for distributed training + t5_cpu (`bool`, *optional*, defaults to False): + Whether to place T5 model on CPU. Only works without t5_fsdp. + """ + self.device = torch.device(f"cuda:{device_id}") + self.config = config + self.rank = rank + self.t5_cpu = t5_cpu + self.tensor_parallel_size = tensor_parallel_size + self.context_parallel_size = context_parallel_size + self.pipeline_parallel_size = pipeline_parallel_size + self.sequence_parallel = sequence_parallel + self.pipeline_dtype = pipeline_dtype + self.num_train_timesteps = config.num_train_timesteps + self.param_dtype = config.param_dtype + + self.text_encoder = T5EncoderModel( + text_len=config.text_len, + dtype=config.t5_dtype, + device=torch.device('cpu'), + checkpoint_path=os.path.join(t5_checkpoint_dir, config.t5_checkpoint), + tokenizer_path=os.path.join(t5_checkpoint_dir, config.t5_tokenizer), + shard_fn=None) + + log_checkpoint("before vae") + + self.vae_stride = config.vae_stride + self.patch_size = config.patch_size + self.vae = WanVAE( + vae_pth=os.path.join(vae_checkpoint_dir, config.vae_checkpoint), + device=self.device) + + wan_checkpoint_dir = self._select_checkpoint_dir(checkpoint_dir, checkpoint_step) + self.model = 
self.setup_model_from_checkpoint(wan_checkpoint_dir) + + # if we use context parallelism, we need to set qkv_format to "thd" for context parallelism + self.model.config.qkv_format = "thd" # "sbhd" + + # set self.sp_size=1 for later use, just to respect the original Wan inference code + self.sp_size = 1 + + if dist.is_initialized(): + dist.barrier() + self.model.to(self.device) + + log_checkpoint("after transformer") + + self.sample_neg_prompt = config.sample_neg_prompt + + + def unpatchify(self, x: torch.Tensor, grid_sizes: torch.Tensor, out_dim: int) -> list[torch.Tensor]: + r""" + Reconstruct video tensors from patch embeddings into a list of videotensors. + + Args: + x (torch.Tensor): + Tensor of patchified features, with shape [seq_len, c * pF * pH * pW] + grid_sizes (Tensor): + Original spatial-temporal grid dimensions before patching, + shape [B, 3] (3 dimensions correspond to F_patches, H_patches, W_patches) + + Returns: + list[torch.Tensor]: list of tensors, each with shape [c, F_latents, H_latents, W_latents] + """ + + c = out_dim + out = [] + for u, v in zip(x, grid_sizes.tolist()): + u = u[:math.prod(v)].view(*v, *self.patch_size, c) + u = torch.einsum('fhwpqrc->cfphqwr', u) + u = u.reshape(c, *[i * j for i, j in zip(v, self.patch_size)]) + out.append(u) + return out + + + def setup_model_from_checkpoint(self, checkpoint_dir): + provider = WanModelProvider() + provider.tensor_model_parallel_size = self.tensor_parallel_size + provider.pipeline_model_parallel_size = self.pipeline_parallel_size + provider.context_parallel_size = self.context_parallel_size + provider.sequence_parallel = self.sequence_parallel + provider.pipeline_dtype = self.pipeline_dtype + # Once all overrides are set, finalize the model provider to ensure the post initialization logic is run + provider.finalize() + provider.initialize_model_parallel(seed=0) + + ## Read from megatron checkpoint + from megatron.bridge.training.model_load_save import load_megatron_model as 
_load_megatron_model + model = _load_megatron_model( + checkpoint_dir, + mp_overrides={ + "tensor_model_parallel_size": self.tensor_parallel_size, + "pipeline_model_parallel_size": self.pipeline_parallel_size, + "context_parallel_size": self.context_parallel_size, + "sequence_parallel": self.sequence_parallel, + "pipeline_dtype": self.pipeline_dtype, + }, + ) + if isinstance(model, list): + model = model[0] + # for i in list(model.state_dict().keys()): + # print(i) + if hasattr(model, "module"): + model = model.module + # for ly in model.decoder.layers: + # print(ly.idx) + return model + + def _select_checkpoint_dir(self, base_dir: str, checkpoint_step) -> str: + """ + Resolve checkpoint directory: + - If checkpoint_step is provided, use base_dir/iter_{step:07d} + - Otherwise, pick the largest iter_######## subdirectory under base_dir + """ + if checkpoint_step is not None: + path = os.path.join(base_dir, f"iter_{int(checkpoint_step):07d}") + if os.path.isdir(path): + logging.info(f"Using specified checkpoint: {path}") + return path + raise FileNotFoundError(f"Specified checkpoint step {checkpoint_step} not found at {path}") + + if not os.path.isdir(base_dir): + raise FileNotFoundError(f"Checkpoint base directory does not exist: {base_dir}") + + pattern = re.compile(r"^iter_(\d+)$") + try: + _, latest_path = max( + ((int(pattern.match(e.name).group(1)), e.path) + for e in os.scandir(base_dir) + if e.is_dir() and pattern.match(e.name)), + key=lambda x: x[0], + ) + except ValueError: + raise FileNotFoundError( + f"No checkpoints found under {base_dir}. Expected subdirectories named like 'iter_0001800'.") + + logging.info(f"Auto-selected latest checkpoint: {latest_path}") + return latest_path + + + def forward_pp_step( + self, + latent_model_input: torch.Tensor, + grid_sizes: list[Tuple[int, int, int]], + max_video_seq_len: int, + timestep: torch.Tensor, + arg_c: dict, + ) -> torch.Tensor: + """ + Forward pass supporting pipeline parallelism. 
+ """ + + from megatron.core import parallel_state + from megatron.core.inference.communication_utils import broadcast_from_last_pipeline_stage, recv_from_prev_pipeline_rank_, send_to_next_pipeline_rank + + pp_world_size = parallel_state.get_pipeline_model_parallel_world_size() + is_pp_first = parallel_state.is_pipeline_first_stage(ignore_virtual=True) + is_pp_last = parallel_state.is_pipeline_last_stage(ignore_virtual=True) + + # PP=1: no pipeline parallelism + if pp_world_size == 1: + noise_pred_pp = self.model( + latent_model_input, + grid_sizes=grid_sizes, + t=timestep, + **arg_c) + return noise_pred_pp + + # PP>1: pipeline parallelism + hidden_size = self.model.config.hidden_size + batch_size = latent_model_input.shape[1] + # noise prediction shape for communication between first and last pipeline stages + noise_pred_pp_shape = list(latent_model_input.shape) + + if is_pp_first: + # First stage: compute multimodal + first PP slice, send activations, then receive sampled token + hidden_states = self.model( + latent_model_input, + grid_sizes=grid_sizes, + t=timestep, + **arg_c) + send_to_next_pipeline_rank(hidden_states) + + noise_pred_pp = broadcast_from_last_pipeline_stage(noise_pred_pp_shape, dtype=torch.float32) + return noise_pred_pp + + if is_pp_last: + # Last stage: recv activations, run final slice + output, sample, broadcast + recv_buffer = torch.empty( + (max_video_seq_len, batch_size, hidden_size), + dtype=next(self.model.parameters()).dtype, + device=latent_model_input[0].device, + ) + recv_from_prev_pipeline_rank_(recv_buffer) + recv_buffer = recv_buffer.to(torch.bfloat16) # ???? 
+ self.model.set_input_tensor(recv_buffer) + noise_pred_pp = self.model( + latent_model_input, + grid_sizes=grid_sizes, + t=timestep, + **arg_c) + + noise_pred_pp = broadcast_from_last_pipeline_stage(noise_pred_pp_shape, dtype=noise_pred_pp.dtype, tensor=noise_pred_pp.contiguous()) + return noise_pred_pp + + # Intermediate stages: recv -> run local slice -> send -> receive broadcast token + recv_buffer = torch.empty( + (max_video_seq_len, batch_size, hidden_size), + dtype=next(self.model.parameters()).dtype, + device=latent_model_input[0].device, + ) + recv_from_prev_pipeline_rank_(recv_buffer) + recv_buffer = recv_buffer.to(torch.bfloat16) # ???? + self.model.set_input_tensor(recv_buffer) + hidden_states = self.model( + latent_model_input, + grid_sizes=grid_sizes, + t=timestep, + **arg_c) + send_to_next_pipeline_rank(hidden_states) + + noise_pred_pp = broadcast_from_last_pipeline_stage(noise_pred_pp_shape, dtype=torch.float32) + return noise_pred_pp + + + def generate(self, + prompts, + sizes, + frame_nums, + shift=5.0, + sample_solver='unipc', + sampling_steps=50, + guide_scale=5.0, + n_prompt="", + seed=-1, + offload_model=True): + r""" + Generates video frames from text prompt using diffusion process. + + Args: + prompts (`list[str]`): + Text prompt for content generation + sizes (list[tuple[int, int]]): + Controls video resolution, (width,height). + frame_nums (`list[int]`): + How many frames to sample from a video. The number should be 4n+1 + shift (`float`, *optional*, defaults to 5.0): + Noise schedule shift parameter. Affects temporal dynamics + sample_solver (`str`, *optional*, defaults to 'unipc'): + Solver used to sample the video. + sampling_steps (`int`, *optional*, defaults to 40): + Number of diffusion sampling steps. Higher values improve quality but slow generation + guide_scale (`float`, *optional*, defaults 5.0): + Classifier-free guidance scale. Controls prompt adherence vs. 
creativity + n_prompt (`str`, *optional*, defaults to ""): + Negative prompt for content exclusion. If not given, use `config.sample_neg_prompt` + seed (`int`, *optional*, defaults to -1): + Random seed for noise generation. If -1, use random seed. + offload_model (`bool`, *optional*, defaults to True): + If True, offloads models to CPU during generation to save VRAM + + Returns: + torch.Tensor: + Generated video frames tensor. Dimensions: (C, N H, W) where: + - C: Color channels (3 for RGB) + - N: Number of frames (81) + - H: Frame height (from size) + - W: Frame width from size) + """ + + # preprocess + target_shapes = [] + for size, frame_num in zip(sizes, frame_nums): + target_shapes.append((self.vae.model.z_dim, (frame_num - 1) // self.vae_stride[0] + 1, + size[1] // self.vae_stride[1], + size[0] // self.vae_stride[2])) + + max_video_seq_len = 0 + seq_lens = [] + for target_shape in target_shapes: + seq_len = math.ceil((target_shape[2] * target_shape[3]) / + (self.patch_size[1] * self.patch_size[2]) * + target_shape[1] / self.sp_size) * self.sp_size + seq_lens.append(seq_len) + max_video_seq_len = max(seq_lens) + + if n_prompt == "": + n_prompt = self.sample_neg_prompt + seed = seed if seed >= 0 else random.randint(0, sys.maxsize) + seed_g = torch.Generator(device=self.device) + seed_g.manual_seed(seed) + + + ## process context + context_max_len = 512 + context_lens = [] + contexts = [] + contexts_null = [] + for prompt in prompts: + if not self.t5_cpu: + self.text_encoder.model.to(self.device) + context = self.text_encoder([prompt], self.device)[0] + context_null = self.text_encoder([n_prompt], self.device)[0] + if offload_model: + self.text_encoder.model.cpu() + else: + context = self.text_encoder([prompt], torch.device('cpu'))[0].to(self.device) + context_null = self.text_encoder([n_prompt], torch.device('cpu'))[0].to(self.device) + context_lens.append(context_max_len) # all samples have the same context_max_len + contexts.append(context) + 
contexts_null.append(context_null) + # pad to context_max_len tokens, and stack to a tensor of shape [s, b, hidden] + contexts = [F.pad(context, (0, 0, 0, context_max_len - context.shape[0])) for context in contexts] + contexts_null = [F.pad(context_null, (0, 0, 0, context_max_len - context_null.shape[0])) for context_null in contexts_null] + contexts = torch.stack(contexts, dim=1) + contexts_null = torch.stack(contexts_null, dim=1) + + + ## setup noise + noises = [] + for target_shape in target_shapes: + noises.append( + torch.randn( + target_shape[0], + target_shape[1], + target_shape[2], + target_shape[3], + dtype=torch.float32, + device=self.device, + generator=seed_g) + ) + + + # calculate grid_sizes + grid_sizes = [grid_sizes_calculation( + input_shape =u.shape[1:], + patch_size=self.model.patch_size, + ) for u in noises] + grid_sizes = torch.tensor(grid_sizes, dtype=torch.long) + + + @contextmanager + def noop_no_sync(): + yield + + no_sync = getattr(self.model, 'no_sync', noop_no_sync) + + # evaluation mode + with amp.autocast(dtype=self.param_dtype), torch.no_grad(), no_sync(): + + if sample_solver == 'unipc': + # Create a prototype scheduler to compute shared timesteps + sample_scheduler = FlowUniPCMultistepScheduler( + num_train_timesteps=self.num_train_timesteps, + shift=1, + use_dynamic_shifting=False) + sample_scheduler.set_timesteps( + sampling_steps, device=self.device, shift=shift) + timesteps = sample_scheduler.timesteps + + # Instantiate per-sample schedulers so each sample maintains its own state + batch_size_for_schedulers = len(noises) + schedulers = [] + for _ in range(batch_size_for_schedulers): + s = FlowUniPCMultistepScheduler( + num_train_timesteps=self.num_train_timesteps, + shift=1, + use_dynamic_shifting=False) + s.set_timesteps(sampling_steps, device=self.device, shift=shift) + schedulers.append(s) + elif sample_solver == 'dpm++': + sample_scheduler = FlowDPMSolverMultistepScheduler( + num_train_timesteps=self.num_train_timesteps, + 
shift=1, + use_dynamic_shifting=False) + sampling_sigmas = get_sampling_sigmas(sampling_steps, shift) + timesteps, _ = retrieve_timesteps( + sample_scheduler, + device=self.device, + sigmas=sampling_sigmas) + else: + raise NotImplementedError("Unsupported solver.") + + # sample videos + latents = noises + + from megatron.core.packed_seq_params import PackedSeqParams + cu_q = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(seq_lens), dim=0)]) + cu_q = cu_q.to(torch.int32).to(self.device) + cu_kv_self = cu_q + cu_kv_cross = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(context_lens), dim=0)]) + cu_kv_cross = cu_kv_cross.to(torch.int32).to(self.device) + packed_seq_params = { + "self_attention": PackedSeqParams( + cu_seqlens_q=cu_q, + cu_seqlens_kv=cu_kv_self, + qkv_format=self.model.config.qkv_format, + ), + "cross_attention": PackedSeqParams( + cu_seqlens_q=cu_q, + cu_seqlens_kv=cu_kv_cross, + qkv_format=self.model.config.qkv_format, + ), + } + + + # context parallel + if parallel_state.get_context_parallel_world_size() > 1: + contexts = thd_split_inputs_cp(contexts, packed_seq_params['cross_attention'].cu_seqlens_kv, parallel_state.get_context_parallel_group()) + contexts_null = thd_split_inputs_cp(contexts_null, packed_seq_params['cross_attention'].cu_seqlens_kv, parallel_state.get_context_parallel_group()) + + + arg_c = {'context': contexts, 'max_seq_len': max_video_seq_len, 'packed_seq_params': packed_seq_params} + arg_null = {'context': contexts_null, 'max_seq_len': max_video_seq_len, 'packed_seq_params': packed_seq_params} + + for _, t in enumerate(tqdm(timesteps)): + + batch_size = len(latents) + + # patchify latents + unpatchified_latents = latents + latents = patchify(latents, self.patch_size) + # pad to have same length + for i in range(batch_size): + latents[i] = F.pad(latents[i], (0, 0, 0, max_video_seq_len - latents[i].shape[0])) + latents = torch.stack(latents, dim=1) + + + # context parallel + if 
parallel_state.get_context_parallel_world_size() > 1: + latents = thd_split_inputs_cp(latents, packed_seq_params['self_attention'].cu_seqlens_q, parallel_state.get_context_parallel_group()) + + + latent_model_input = latents + timestep = [t] * batch_size + timestep = torch.stack(timestep) + + self.model.to(self.device) + noise_pred_cond = self.forward_pp_step( + latent_model_input, grid_sizes=grid_sizes, max_video_seq_len=max_video_seq_len, timestep=timestep, arg_c=arg_c) + + noise_pred_uncond = self.forward_pp_step( + latent_model_input, grid_sizes=grid_sizes, max_video_seq_len=max_video_seq_len, timestep=timestep, arg_c=arg_null) + + + # context parallel + if parallel_state.get_context_parallel_world_size() > 1: + noise_pred_cond = thd_cat_outputs_cp(noise_pred_cond, packed_seq_params['self_attention'].cu_seqlens_q, parallel_state.get_context_parallel_group()) + noise_pred_uncond = thd_cat_outputs_cp(noise_pred_uncond, packed_seq_params['self_attention'].cu_seqlens_q, parallel_state.get_context_parallel_group()) + + + # run unpatchify + unpatchified_noise_pred_cond = noise_pred_cond + unpatchified_noise_pred_cond = unpatchified_noise_pred_cond.transpose(0, 1) # bring sbhd -> bshd + # when unpatchifying, the code will truncate the padded videos into the original video shape, based on the grid_sizes. + unpatchified_noise_pred_cond = self.unpatchify(unpatchified_noise_pred_cond, grid_sizes, self.vae.model.z_dim) + unpatchified_noise_pred_uncond = noise_pred_uncond + unpatchified_noise_pred_uncond = unpatchified_noise_pred_uncond.transpose(0, 1) # bring sbhd -> bshd + # when unpatchifying, the code will truncate the padded videos into the original video shape, based on the grid_sizes. 
+ unpatchified_noise_pred_uncond = self.unpatchify(unpatchified_noise_pred_uncond, grid_sizes, self.vae.model.z_dim) + + noise_preds = [] + for i in range(batch_size): + noise_pred = unpatchified_noise_pred_uncond[i] + guide_scale * ( + unpatchified_noise_pred_cond[i] - unpatchified_noise_pred_uncond[i]) + noise_preds.append(noise_pred) + + # step and update latents + latents = [] + for i in range(batch_size): + + if sample_solver == 'unipc': + temp_x0 = schedulers[i].step( + noise_preds[i].unsqueeze(0), + t, + unpatchified_latents[i].unsqueeze(0), + return_dict=False, + generator=seed_g)[0] + else: + temp_x0 = sample_scheduler.step( + noise_preds[i].unsqueeze(0), + t, + unpatchified_latents[i].unsqueeze(0), + return_dict=False, + generator=seed_g)[0] + latents.append(temp_x0.squeeze(0)) + + x0 = latents + if offload_model: + self.model.cpu() + torch.cuda.empty_cache() + if self.rank == 0: + videos = self.vae.decode(x0) + else: + videos = None + + del noises, latents + if sample_solver == 'unipc': + del schedulers + else: + del sample_scheduler + if offload_model: + gc.collect() + torch.cuda.synchronize() + if dist.is_initialized(): + dist.barrier() + + return videos if self.rank == 0 else None + + +def log_checkpoint(tag): + torch.cuda.synchronize() + alloc = torch.cuda.memory_allocated() / 1024**3 + reserved = torch.cuda.memory_reserved() / 1024**3 + print(f"[{tag}] alloc={alloc:.2f} GB reserved={reserved:.2f} GB") + + +class VACEFlowInferencePipeline: + + def __init__( + self, + config, + checkpoint_dir, + checkpoint_step=None, + t5_checkpoint_dir=None, + vae_checkpoint_dir=None, + device_id=0, + rank=0, + t5_cpu=False, + + tensor_parallel_size=1, + context_parallel_size=1, + pipeline_parallel_size=1, + sequence_parallel=False, + pipeline_dtype=torch.float32, + ): + r""" + Initializes the FlowInferencePipeline with the given parameters. 
+ + Args: + config (EasyDict): + Object containing model parameters initialized from config.py + checkpoint_dir (`str`): + Path to directory containing model checkpoints + t5_checkpoint_dir (`str`, *optional*, defaults to None): + Optional directory containing T5 checkpoint and tokenizer; falls back to `checkpoint_dir` if None. + vae_checkpoint_dir (`str`, *optional*, defaults to None): + Optional directory containing VAE checkpoint; falls back to `checkpoint_dir` if None. + device_id (`int`, *optional*, defaults to 0): + Id of target GPU device + rank (`int`, *optional*, defaults to 0): + Process rank for distributed training + t5_cpu (`bool`, *optional*, defaults to False): + Whether to place T5 model on CPU. Only works without t5_fsdp. + """ + self.device = torch.device(f"cuda:{device_id}") + self.config = config + self.rank = rank + self.t5_cpu = t5_cpu + self.tensor_parallel_size = tensor_parallel_size + self.context_parallel_size = context_parallel_size + self.pipeline_parallel_size = pipeline_parallel_size + self.sequence_parallel = sequence_parallel + self.pipeline_dtype = pipeline_dtype + self.num_train_timesteps = config.num_train_timesteps + self.param_dtype = config.param_dtype + + self.text_encoder = T5EncoderModel( + text_len=config.text_len, + dtype=config.t5_dtype, + device=torch.device('cpu'), + checkpoint_path=os.path.join(t5_checkpoint_dir, config.t5_checkpoint), + tokenizer_path=os.path.join(t5_checkpoint_dir, config.t5_tokenizer), + shard_fn=None) + + log_checkpoint("before vae") + + self.vae_stride = config.vae_stride + self.patch_size = config.patch_size + self.vae = WanVAE( + vae_pth=os.path.join(vae_checkpoint_dir, config.vae_checkpoint), + device=self.device) + + wan_checkpoint_dir = self._select_checkpoint_dir(checkpoint_dir, checkpoint_step) + self.model = self.setup_model_from_checkpoint(wan_checkpoint_dir) + + # if we use context parallelism, we need to set qkv_format to "thd" for context parallelism + self.model.config.qkv_format = 
"thd" # "sbhd" + + # set self.sp_size=1 for later use, just to respect the original Wan inference code + self.sp_size = 1 + + if dist.is_initialized(): + dist.barrier() + self.model.to(self.device) + + log_checkpoint("after transformer") + + self.sample_neg_prompt = config.sample_neg_prompt + + self.vid_proc = VaceVideoProcessor(downsample=tuple([x * y for x, y in zip(self.vae_stride, self.patch_size)]), + min_area=832 *480, + max_area=832 *480, + min_fps=self.config.sample_fps, + max_fps=self.config.sample_fps, + zero_start=True, + seq_len=32760, + keep_last=True) + + + def unpatchify(self, x: torch.Tensor, grid_sizes: torch.Tensor, out_dim: int) -> list[torch.Tensor]: + r""" + Reconstruct video tensors from patch embeddings into a list of videotensors. + + Args: + x (torch.Tensor): + Tensor of patchified features, with shape [seq_len, c * pF * pH * pW] + grid_sizes (Tensor): + Original spatial-temporal grid dimensions before patching, + shape [B, 3] (3 dimensions correspond to F_patches, H_patches, W_patches) + + Returns: + list[torch.Tensor]: list of tensors, each with shape [c, F_latents, H_latents, W_latents] + """ + + c = out_dim + out = [] + for u, v in zip(x, grid_sizes.tolist()): + u = u[:math.prod(v)].view(*v, *self.patch_size, c) + u = torch.einsum('fhwpqrc->cfphqwr', u) + u = u.reshape(c, *[i * j for i, j in zip(v, self.patch_size)]) + out.append(u) + return out + + + def setup_model_from_checkpoint(self, checkpoint_dir): + provider = VACEModelProvider() + provider.tensor_model_parallel_size = self.tensor_parallel_size + provider.pipeline_model_parallel_size = self.pipeline_parallel_size + provider.context_parallel_size = self.context_parallel_size + provider.sequence_parallel = self.sequence_parallel + provider.pipeline_dtype = self.pipeline_dtype + # Once all overrides are set, finalize the model provider to ensure the post initialization logic is run + provider.finalize() + provider.initialize_model_parallel(seed=0) + + ## Read from megatron 
checkpoint + from megatron.bridge.training.model_load_save import load_megatron_model as _load_megatron_model + model = _load_megatron_model( + checkpoint_dir, + mp_overrides={ + "tensor_model_parallel_size": self.tensor_parallel_size, + "pipeline_model_parallel_size": self.pipeline_parallel_size, + "context_parallel_size": self.context_parallel_size, + "sequence_parallel": self.sequence_parallel, + "pipeline_dtype": self.pipeline_dtype, + }, + ) + if isinstance(model, list): + model = model[0] + if hasattr(model, "module"): + model = model.module + return model + + def _select_checkpoint_dir(self, base_dir: str, checkpoint_step) -> str: + """ + Resolve checkpoint directory: + - If checkpoint_step is provided, use base_dir/iter_{step:07d} + - Otherwise, pick the largest iter_######## subdirectory under base_dir + """ + if checkpoint_step is not None: + path = os.path.join(base_dir, f"iter_{int(checkpoint_step):07d}") + if os.path.isdir(path): + logging.info(f"Using specified checkpoint: {path}") + return path + raise FileNotFoundError(f"Specified checkpoint step {checkpoint_step} not found at {path}") + + if not os.path.isdir(base_dir): + raise FileNotFoundError(f"Checkpoint base directory does not exist: {base_dir}") + + pattern = re.compile(r"^iter_(\d+)$") + try: + _, latest_path = max( + ((int(pattern.match(e.name).group(1)), e.path) + for e in os.scandir(base_dir) + if e.is_dir() and pattern.match(e.name)), + key=lambda x: x[0], + ) + except ValueError: + raise FileNotFoundError( + f"No checkpoints found under {base_dir}. 
Expected subdirectories named like 'iter_0001800'.") + + logging.info(f"Auto-selected latest checkpoint: {latest_path}") + return latest_path + + + def vace_encode_frames(self, frames, ref_images, masks=None): + vae = self.vae + if ref_images is None: + ref_images = [None] * len(frames) + else: + assert len(frames) == len(ref_images) + + if masks is None: + latents = vae.encode(frames) + else: + masks = [torch.where(m > 0.5, 1.0, 0.0) for m in masks] + inactive = [i * (1 - m) + 0 * m for i, m in zip(frames, masks)] + reactive = [i * m + 0 * (1 - m) for i, m in zip(frames, masks)] + inactive = vae.encode(inactive) + reactive = vae.encode(reactive) + latents = [torch.cat((u, c), dim=0) for u, c in zip(inactive, reactive)] + + cat_latents = [] + for latent, refs in zip(latents, ref_images): + if refs is not None: + if masks is None: + ref_latent = vae.encode(refs) + else: + ref_latent = vae.encode(refs) + ref_latent = [torch.cat((u, torch.zeros_like(u)), dim=0) for u in ref_latent] + assert all([x.shape[1] == 1 for x in ref_latent]) + latent = torch.cat([*ref_latent, latent], dim=1) + cat_latents.append(latent) + return cat_latents + + + def vace_encode_masks(self, masks, ref_images=None): + vae_stride = self.vae_stride + if ref_images is None: + ref_images = [None] * len(masks) + else: + assert len(masks) == len(ref_images) + + result_masks = [] + for mask, refs in zip(masks, ref_images): + c, depth, height, width = mask.shape + new_depth = int((depth + 3) // vae_stride[0]) + height = 2 * (int(height) // (vae_stride[1] * 2)) + width = 2 * (int(width) // (vae_stride[2] * 2)) + + # reshape + mask = mask[0, :, :, :] + mask = mask.view( + depth, height, vae_stride[1], width, vae_stride[1] + ) # depth, height, 8, width, 8 + mask = mask.permute(2, 4, 0, 1, 3) # 8, 8, depth, height, width + mask = mask.reshape( + vae_stride[1] * vae_stride[2], depth, height, width + ) # 8*8, depth, height, width + + # interpolation + mask = F.interpolate(mask.unsqueeze(0), size=(new_depth, 
height, width), mode='nearest-exact').squeeze(0) + + if refs is not None: + length = len(refs) + mask_pad = torch.zeros_like(mask[:, :length, :, :]) + mask = torch.cat((mask_pad, mask), dim=1) + result_masks.append(mask) + return result_masks + + + def vace_latent(self, z, m): + return [torch.cat([zz, mm], dim=0) for zz, mm in zip(z, m)] + + + def prepare_source(self, src_video, src_mask, src_ref_images, num_frames, image_size, device): + area = image_size[0] * image_size[1] + self.vid_proc.set_area(area) + if area == 1280*720: + self.vid_proc.set_seq_len(75600) + elif area == 832*480: + self.vid_proc.set_seq_len(32760) + else: + raise NotImplementedError(f'image_size {image_size} is not supported') + + image_size = (image_size[1], image_size[0]) + image_sizes = [] + for i, (sub_src_video, sub_src_mask) in enumerate(zip(src_video, src_mask)): + if sub_src_mask is not None and sub_src_video is not None: + src_video[i], src_mask[i], _, _, _ = self.vid_proc.load_video_pair(sub_src_video, sub_src_mask) + src_video[i] = src_video[i].to(device) + src_mask[i] = src_mask[i].to(device) + src_mask[i] = torch.clamp((src_mask[i][:1, :, :, :] + 1) / 2, min=0, max=1) + image_sizes.append(src_video[i].shape[2:]) + elif sub_src_video is None: + src_video[i] = torch.zeros((3, num_frames, image_size[0], image_size[1]), device=device) + src_mask[i] = torch.ones_like(src_video[i], device=device) + image_sizes.append(image_size) + else: + src_video[i], _, _, _ = self.vid_proc.load_video(sub_src_video) + src_video[i] = src_video[i].to(device) + src_mask[i] = torch.ones_like(src_video[i], device=device) + image_sizes.append(src_video[i].shape[2:]) + + for i, ref_images in enumerate(src_ref_images): + if ref_images is not None: + image_size = image_sizes[i] + for j, ref_img in enumerate(ref_images): + if ref_img is not None: + ref_img = Image.open(ref_img).convert("RGB") + ref_img = TF.to_tensor(ref_img).sub_(0.5).div_(0.5).unsqueeze(1) + if ref_img.shape[-2:] != image_size: + 
canvas_height, canvas_width = image_size + ref_height, ref_width = ref_img.shape[-2:] + white_canvas = torch.ones((3, 1, canvas_height, canvas_width), device=device) # [-1, 1] + scale = min(canvas_height / ref_height, canvas_width / ref_width) + new_height = int(ref_height * scale) + new_width = int(ref_width * scale) + resized_image = F.interpolate(ref_img.squeeze(1).unsqueeze(0), size=(new_height, new_width), mode='bilinear', align_corners=False).squeeze(0).unsqueeze(1) + top = (canvas_height - new_height) // 2 + left = (canvas_width - new_width) // 2 + white_canvas[:, :, top:top + new_height, left:left + new_width] = resized_image + ref_img = white_canvas + src_ref_images[i][j] = ref_img.to(device) + return src_video, src_mask, src_ref_images + + + def decode_latent(self, latent, ref_images=None): + vae = self.vae + if ref_images is None: + ref_images = [None] * len(latent) + else: + assert len(latent) == len(ref_images) + + trimed_latent = [] + for lat, refs in zip(latent, ref_images): + if refs is not None: + lat = lat[:, len(refs):, :, :] + trimed_latent.append(lat) + + return vae.decode(trimed_latent) + + + def forward_pp_step( + self, + latent_model_input: torch.Tensor, + grid_sizes: list[Tuple[int, int, int]], + max_video_seq_len: int, + timestep: torch.Tensor, + vace_context: torch.Tensor, + arg_c: dict, + ) -> torch.Tensor: + """ + Forward pass supporting pipeline parallelism. 
+ """ + + from megatron.core import parallel_state + from megatron.core.inference.communication_utils import broadcast_from_last_pipeline_stage, recv_from_prev_pipeline_rank_, send_to_next_pipeline_rank + + pp_world_size = parallel_state.get_pipeline_model_parallel_world_size() + is_pp_first = parallel_state.is_pipeline_first_stage(ignore_virtual=True) + is_pp_last = parallel_state.is_pipeline_last_stage(ignore_virtual=True) + + # PP=1: no pipeline parallelism + if pp_world_size == 1: + noise_pred_pp = self.model( + latent_model_input, + grid_sizes=grid_sizes, + t=timestep, + vace_context=vace_context, + **arg_c) + return noise_pred_pp + + # # PP>1: pipeline parallelism + # hidden_size = self.model.config.hidden_size + # batch_size = latent_model_input.shape[1] + # # noise prediction shape for communication between first and last pipeline stages + # noise_pred_pp_shape = list(latent_model_input.shape) + + # if is_pp_first: + # # First stage: compute multimodal + first PP slice, send activations, then receive sampled token + # hidden_states = self.model( + # latent_model_input, + # grid_sizes=grid_sizes, + # t=timestep, + # **arg_c) + # send_to_next_pipeline_rank(hidden_states) + + # noise_pred_pp = broadcast_from_last_pipeline_stage(noise_pred_pp_shape, dtype=torch.float32) + # return noise_pred_pp + + # if is_pp_last: + # # Last stage: recv activations, run final slice + output, sample, broadcast + # recv_buffer = torch.empty( + # (max_video_seq_len, batch_size, hidden_size), + # dtype=next(self.model.parameters()).dtype, + # device=latent_model_input[0].device, + # ) + # recv_from_prev_pipeline_rank_(recv_buffer) + # recv_buffer = recv_buffer.to(torch.bfloat16) # ???? 
+ # self.model.set_input_tensor(recv_buffer) + # noise_pred_pp = self.model( + # latent_model_input, + # grid_sizes=grid_sizes, + # t=timestep, + # **arg_c) + + # noise_pred_pp = broadcast_from_last_pipeline_stage(noise_pred_pp_shape, dtype=noise_pred_pp.dtype, tensor=noise_pred_pp.contiguous()) + # return noise_pred_pp + + # # Intermediate stages: recv -> run local slice -> send -> receive broadcast token + # recv_buffer = torch.empty( + # (max_video_seq_len, batch_size, hidden_size), + # dtype=next(self.model.parameters()).dtype, + # device=latent_model_input[0].device, + # ) + # recv_from_prev_pipeline_rank_(recv_buffer) + # recv_buffer = recv_buffer.to(torch.bfloat16) # ???? + # self.model.set_input_tensor(recv_buffer) + # hidden_states = self.model( + # latent_model_input, + # grid_sizes=grid_sizes, + # t=timestep, + # **arg_c) + # send_to_next_pipeline_rank(hidden_states) + + # noise_pred_pp = broadcast_from_last_pipeline_stage(noise_pred_pp_shape, dtype=torch.float32) + # return noise_pred_pp + + + def generate(self, + prompts, + input_frames, + input_masks, + input_ref_images, + sizes, + frame_nums, + shift=5.0, + sample_solver='unipc', + sampling_steps=50, + guide_scale=5.0, + n_prompt="", + seed=-1, + offload_model=True): + r""" + Generates video frames from text prompt using diffusion process. + + Args: + prompts (`list[str]`): + Text prompt for content generation + Input_frames (`list[Tensor]`): + Input frames for content generation + Input_masks (`list[Tensor]`): + Input masks for content generation + Input_ref_images (`list[Tensor]`): + Input reference images for content generation + sizes (list[tuple[int, int]]): + Controls video resolution, (width,height). + frame_nums (`list[int]`): + How many frames to sample from a video. The number should be 4n+1 + shift (`float`, *optional*, defaults to 5.0): + Noise schedule shift parameter. Affects temporal dynamics + sample_solver (`str`, *optional*, defaults to 'unipc'): + Solver used to sample the video. 
+ sampling_steps (`int`, *optional*, defaults to 40): + Number of diffusion sampling steps. Higher values improve quality but slow generation + guide_scale (`float`, *optional*, defaults 5.0): + Classifier-free guidance scale. Controls prompt adherence vs. creativity + n_prompt (`str`, *optional*, defaults to ""): + Negative prompt for content exclusion. If not given, use `config.sample_neg_prompt` + seed (`int`, *optional*, defaults to -1): + Random seed for noise generation. If -1, use random seed. + offload_model (`bool`, *optional*, defaults to True): + If True, offloads models to CPU during generation to save VRAM + + Returns: + torch.Tensor: + Generated video frames tensor. Dimensions: (C, N, H, W) where: + - C: Color channels (3 for RGB) + - N: Number of frames (81) + - H: Frame height (from size) + - W: Frame width from size) + """ + + + # process source video, mask, reference image + vace_context0 = self.vace_encode_frames(input_frames, input_ref_images, masks=input_masks) + mask0 = self.vace_encode_masks(input_masks, input_ref_images) + vace_context = self.vace_latent(vace_context0, mask0) + + # # for huggingface inference, latent shape: B, C_latent, N/4, H/8, W/8 + # vace_context_hf = torch.stack(vace_context) + + max_video_seq_len = 0 + seq_lens = [] + target_shapes = [] + for item in vace_context0: + target_shape = list(item.shape) + target_shape[0] = int(target_shape[0] / 2) + seq_len = math.ceil((target_shape[2] * target_shape[3]) / + (self.patch_size[1] * self.patch_size[2]) * + target_shape[1] / self.sp_size) * self.sp_size + seq_lens.append(seq_len) + target_shapes.append(target_shape) + max_video_seq_len = max(seq_lens) + + vace_context = patchify(vace_context, self.patch_size) + # pad to have same length + for i in range(len(vace_context)): + vace_context[i] = F.pad(vace_context[i], (0, 0, 0, max_video_seq_len - vace_context[i].shape[0])) + vace_context = torch.stack(vace_context, dim=1) + + s, b, h = vace_context.shape + vace_context = 
vace_context.transpose(0, 1).reshape(s*b, 1, h) + + if n_prompt == "": + n_prompt = self.sample_neg_prompt + seed = seed if seed >= 0 else random.randint(0, sys.maxsize) + seed_g = torch.Generator(device=self.device) + seed_g.manual_seed(seed) + + + ## process context + context_max_len = 512 + context_lens = [] + contexts = [] + contexts_null = [] + for prompt in prompts: + if not self.t5_cpu: + self.text_encoder.model.to(self.device) + context = self.text_encoder([prompt], self.device)[0] + context_null = self.text_encoder([n_prompt], self.device)[0] + if offload_model: + self.text_encoder.model.cpu() + else: + context = self.text_encoder([prompt], torch.device('cpu'))[0].to(self.device) + context_null = self.text_encoder([n_prompt], torch.device('cpu'))[0].to(self.device) + context_lens.append(context_max_len) # all samples have the same context_max_len + contexts.append(context) + contexts_null.append(context_null) + # pad to context_max_len tokens, and stack to a tensor of shape [s, b, hidden] + contexts = [F.pad(context, (0, 0, 0, context_max_len - context.shape[0])) for context in contexts] + contexts_null = [F.pad(context_null, (0, 0, 0, context_max_len - context_null.shape[0])) for context_null in contexts_null] + contexts = torch.stack(contexts, dim=1) + contexts_null = torch.stack(contexts_null, dim=1) + + s, b, h = contexts.shape + contexts = contexts.transpose(0, 1).reshape(s*b, 1, h) + contexts_null = contexts_null.transpose(0, 1).reshape(s*b, 1, h) + + ## setup noise + noises = [] + for target_shape in target_shapes: + noises.append( + torch.randn( + target_shape[0], + target_shape[1], + target_shape[2], + target_shape[3], + dtype=torch.float32, + device=self.device, + generator=seed_g) + ) + # noises = noises[:1] * len(noises) + + # calculate grid_sizes + grid_sizes = [grid_sizes_calculation( + input_shape =u.shape[1:], + patch_size=self.model.patch_size, + ) for u in noises] + grid_sizes = torch.tensor(grid_sizes, dtype=torch.long) + + + 
@contextmanager + def noop_no_sync(): + yield + + no_sync = getattr(self.model, 'no_sync', noop_no_sync) + + # evaluation mode + with amp.autocast(dtype=self.param_dtype), torch.no_grad(), no_sync(): + + if sample_solver == 'unipc': + # Create a prototype scheduler to compute shared timesteps + sample_scheduler = FlowUniPCMultistepScheduler( + num_train_timesteps=self.num_train_timesteps, + shift=1, + use_dynamic_shifting=False) + sample_scheduler.set_timesteps( + sampling_steps, device=self.device, shift=shift) + timesteps = sample_scheduler.timesteps + + # Instantiate per-sample schedulers so each sample maintains its own state + batch_size_for_schedulers = len(noises) + schedulers = [] + for _ in range(batch_size_for_schedulers): + s = FlowUniPCMultistepScheduler( + num_train_timesteps=self.num_train_timesteps, + shift=1, + use_dynamic_shifting=False) + s.set_timesteps(sampling_steps, device=self.device, shift=shift) + schedulers.append(s) + elif sample_solver == 'dpm++': + sample_scheduler = FlowDPMSolverMultistepScheduler( + num_train_timesteps=self.num_train_timesteps, + shift=1, + use_dynamic_shifting=False) + sampling_sigmas = get_sampling_sigmas(sampling_steps, shift) + timesteps, _ = retrieve_timesteps( + sample_scheduler, + device=self.device, + sigmas=sampling_sigmas) + else: + raise NotImplementedError("Unsupported solver.") + + # sample videos + latents = noises + + from megatron.core.packed_seq_params import PackedSeqParams + cu_q = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(seq_lens), dim=0)]) + cu_q = cu_q.to(torch.int32).to(self.device) + cu_kv_self = cu_q + cu_kv_cross = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(context_lens), dim=0)]) + cu_kv_cross = cu_kv_cross.to(torch.int32).to(self.device) + packed_seq_params = { + "self_attention": PackedSeqParams( + cu_seqlens_q=cu_q, + cu_seqlens_kv=cu_kv_self, + qkv_format=self.model.config.qkv_format, + ), + "cross_attention": PackedSeqParams( + cu_seqlens_q=cu_q, + 
cu_seqlens_kv=cu_kv_cross, + qkv_format=self.model.config.qkv_format, + ), + } + + + # context parallel + if parallel_state.get_context_parallel_world_size() > 1: + vace_context = thd_split_inputs_cp(vace_context, packed_seq_params['self_attention'].cu_seqlens_q, parallel_state.get_context_parallel_group()) + contexts = thd_split_inputs_cp(contexts, packed_seq_params['cross_attention'].cu_seqlens_kv, parallel_state.get_context_parallel_group()) + contexts_null = thd_split_inputs_cp(contexts_null, packed_seq_params['cross_attention'].cu_seqlens_kv, parallel_state.get_context_parallel_group()) + + + arg_c = {'context': contexts, 'max_seq_len': max_video_seq_len, 'packed_seq_params': packed_seq_params} + arg_null = {'context': contexts_null, 'max_seq_len': max_video_seq_len, 'packed_seq_params': packed_seq_params} + + + from megatron.bridge.models.hf_pretrained.wan import PreTrainedVACE + hf = PreTrainedVACE("Wan-AI/Wan2.1-VACE-1.3B-Diffusers")._load_model().to(self.device) + + + for _, t in enumerate(tqdm(timesteps)): + + batch_size = len(latents) + + # patchify latents + unpatchified_latents = latents + latents = patchify(latents, self.patch_size) + # pad to have same length + for i in range(batch_size): + latents[i] = F.pad(latents[i], (0, 0, 0, max_video_seq_len - latents[i].shape[0])) + latents = torch.stack(latents, dim=1) + + s, b, h = latents.shape + latents = latents.transpose(0, 1).reshape(s*b, 1, h) + + # context parallel + if parallel_state.get_context_parallel_world_size() > 1: + latents = thd_split_inputs_cp(latents, packed_seq_params['self_attention'].cu_seqlens_q, parallel_state.get_context_parallel_group()) + + + latent_model_input = latents + timestep = [t] * 1 + timestep = torch.stack(timestep) + + self.model.to(self.device) + noise_pred_cond = self.forward_pp_step( + latent_model_input, grid_sizes=grid_sizes, max_video_seq_len=max_video_seq_len, timestep=timestep, vace_context=vace_context, arg_c=arg_c) + + noise_pred_uncond = self.forward_pp_step( 
+ latent_model_input, grid_sizes=grid_sizes, max_video_seq_len=max_video_seq_len, timestep=timestep, vace_context=vace_context, arg_c=arg_null) + + + # context parallel + if parallel_state.get_context_parallel_world_size() > 1: + noise_pred_cond = thd_cat_outputs_cp(noise_pred_cond, packed_seq_params['self_attention'].cu_seqlens_q, parallel_state.get_context_parallel_group()) + noise_pred_uncond = thd_cat_outputs_cp(noise_pred_uncond, packed_seq_params['self_attention'].cu_seqlens_q, parallel_state.get_context_parallel_group()) + + noise_pred_cond = noise_pred_cond.reshape(b, s, h).transpose(0, 1) + noise_pred_uncond = noise_pred_uncond.reshape(b, s, h).transpose(0, 1) + + # run unpatchify + unpatchified_noise_pred_cond = noise_pred_cond + unpatchified_noise_pred_cond = unpatchified_noise_pred_cond.transpose(0, 1) # bring sbhd -> bshd + # when unpatchifying, the code will truncate the padded videos into the original video shape, based on the grid_sizes. + unpatchified_noise_pred_cond = self.unpatchify(unpatchified_noise_pred_cond, grid_sizes, self.vae.model.z_dim) + unpatchified_noise_pred_uncond = noise_pred_uncond + unpatchified_noise_pred_uncond = unpatchified_noise_pred_uncond.transpose(0, 1) # bring sbhd -> bshd + # when unpatchifying, the code will truncate the padded videos into the original video shape, based on the grid_sizes. 
+ unpatchified_noise_pred_uncond = self.unpatchify(unpatchified_noise_pred_uncond, grid_sizes, self.vae.model.z_dim) + + + # # for huggingface inference + # unpatchified_latents = torch.stack(latents) + # timestep = [t] * batch_size + # timestep = torch.stack(timestep) + # unpatchified_noise_pred_cond=hf(hidden_states=unpatchified_latents, + # timestep=timestep, + # encoder_hidden_states=contexts.transpose(0,1), + # control_hidden_states=vace_context_hf, + # return_dict=False)[0] + # unpatchified_noise_pred_uncond=hf(hidden_states=unpatchified_latents, + # timestep=timestep, + # encoder_hidden_states=contexts_null.transpose(0,1), + # control_hidden_states=vace_context_hf, + # return_dict=False)[0] + + + noise_preds = [] + for i in range(batch_size): + noise_pred = unpatchified_noise_pred_uncond[i] + guide_scale * ( + unpatchified_noise_pred_cond[i] - unpatchified_noise_pred_uncond[i]) + noise_preds.append(noise_pred) + + # step and update latents + latents = [] + for i in range(batch_size): + + if sample_solver == 'unipc': + temp_x0 = schedulers[i].step( + noise_preds[i].unsqueeze(0), + t, + unpatchified_latents[i].unsqueeze(0), + return_dict=False, + generator=seed_g)[0] + else: + temp_x0 = sample_scheduler.step( + noise_preds[i].unsqueeze(0), + t, + unpatchified_latents[i].unsqueeze(0), + return_dict=False, + generator=seed_g)[0] + latents.append(temp_x0.squeeze(0)) + + x0 = latents + if offload_model: + self.model.cpu() + torch.cuda.empty_cache() + if self.rank == 0: + videos = self.decode_latent(x0, input_ref_images) + else: + videos = None + + del noises, latents + if sample_solver == 'unipc': + del schedulers + else: + del sample_scheduler + if offload_model: + gc.collect() + torch.cuda.synchronize() + if dist.is_initialized(): + dist.barrier() + + return videos if self.rank == 0 else None diff --git a/src/megatron/bridge/models/wan/flow_matching/flow_pipeline.py b/src/megatron/bridge/models/wan/flow_matching/flow_pipeline.py new file mode 100644 index 
0000000000..f14db07728 --- /dev/null +++ b/src/megatron/bridge/models/wan/flow_matching/flow_pipeline.py @@ -0,0 +1,425 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Callable, Dict, Optional, Tuple, List +import logging + +import numpy as np +import torch +from megatron.core import parallel_state +from torch import Tensor +from diffusers import WanPipeline +from megatron.bridge.models.wan.flow_matching.time_shift_utils import compute_density_for_timestep_sampling +from megatron.bridge.models.wan.utils.utils import patchify, thd_split_inputs_cp + +logger = logging.getLogger(__name__) + +class FlowPipeline: + + def __init__( + self, + model_id="Wan-AI/Wan2.1-T2V-1.3B-Diffusers", + seed=1234, + ): + """ + Initializes the FlowPipeline with the given parameters. + """ + self.pipe = WanPipeline.from_pretrained(model_id, vae=None, torch_dtype=torch.float32, text_encoder=None) + + + def training_step( + self, + model, + data_batch: dict[str, torch.Tensor], + # Flow matching parameters + use_sigma_noise: bool = True, + timestep_sampling: str = "uniform", + logit_mean: float = 0.0, + logit_std: float = 1.0, + flow_shift: float = 3.0, + mix_uniform_ratio: float = 0.1, + ) -> tuple[dict[str, torch.Tensor], torch.Tensor]: + """ + Performs a single training step using flow matching algorithm. + + This method is responsible for executing one iteration of the model's training. 
It involves: + 1. Generate noise and add it to the input data. + 2. Pass the noisy data through the network to generate predictions. + 3. Compute the loss based on the difference between the predictions and target. + """ + + video_latents = data_batch['video_latents'] + max_video_seq_len = data_batch['max_video_seq_len'] + context_embeddings = data_batch['context_embeddings'] + loss_mask = data_batch['loss_mask'] + grid_sizes = data_batch['grid_sizes'] + packed_seq_params = data_batch['packed_seq_params'] + video_metadata = data_batch['video_metadata'] + + self.model = model + + batch_size = video_latents.shape[1] + device = video_latents.device + + # # # DEBUGGING precision + # # import torch.cuda.amp as amp + # # with amp.autocast(dtype=torch.bfloat16): + # # # Pass through model + # # ... + + # ======================================================================== + # Flow Matching Timestep Sampling + # ======================================================================== + + num_train_timesteps = self.pipe.scheduler.config.num_train_timesteps + + if use_sigma_noise: + use_uniform = torch.rand(1).item() < mix_uniform_ratio + + if use_uniform or timestep_sampling == "uniform": + # Pure uniform: u ~ U(0, 1) + u = torch.rand(size=(batch_size,), device=device) + sampling_method = "uniform" + else: + # Density-based sampling + u = compute_density_for_timestep_sampling( + weighting_scheme=timestep_sampling, + batch_size=batch_size, + logit_mean=logit_mean, + logit_std=logit_std, + ).to(device) + sampling_method = timestep_sampling + + # Apply flow shift: σ = shift/(shift + (1/u - 1)) + u_clamped = torch.clamp(u, min=1e-5) # Avoid division by zero + sigma = flow_shift / (flow_shift + (1.0 / u_clamped - 1.0)) + sigma = torch.clamp(sigma, 0.0, 1.0) + + else: + # Simple uniform without shift + u = torch.rand(size=(batch_size,), device=device) + sigma = u + sampling_method = "uniform_no_shift" + + # 
======================================================================== + # Manual Flow Matching Noise Addition + # ======================================================================== + + # Generate noise + noise = torch.randn_like(torch.ones([1, 16, grid_sizes[0][0], grid_sizes[0][1]*2, grid_sizes[0][2]*2], device=video_latents.device), dtype=torch.float32) + noise = patchify(noise, (1, 2, 2))[0].unsqueeze(1) + # DEBUGGING + # because video_latents might be padded, we need to make sure noise is also padded to the same shape + seq_noise = noise.shape[0] + seq_video = video_latents.shape[0] + if seq_noise < seq_video: + pad_len = seq_video - seq_noise + pad = torch.zeros((pad_len, noise.shape[1], noise.shape[2]), device=noise.device, dtype=noise.dtype) + noise = torch.cat([noise, pad], dim=0) + + # CRITICAL: Manual flow matching (NOT scheduler.add_noise!) + # x_t = (1 - σ) * x_0 + σ * ε + sigma_reshaped = sigma.view(1, batch_size, 1) + noisy_latents = ( + (1.0 - sigma_reshaped) * video_latents.float() + + sigma_reshaped * noise + ) + + # Timesteps for model [0, 1000] + timesteps = sigma * num_train_timesteps + + # ======================================================================== + # Cast model inputs to bf16 + # ======================================================================== + + video_latents = video_latents.to(torch.bfloat16) + noisy_latents = noisy_latents.to(torch.bfloat16) + context_embeddings = context_embeddings.to(torch.bfloat16) + timesteps = timesteps.to(torch.bfloat16) + + # ======================================================================== + # Split across context parallelism + # ======================================================================== + + if parallel_state.get_context_parallel_world_size() > 1: + video_latents = thd_split_inputs_cp(video_latents, packed_seq_params['self_attention'].cu_seqlens_q, parallel_state.get_context_parallel_group()) + noisy_latents = thd_split_inputs_cp(noisy_latents,
packed_seq_params['self_attention'].cu_seqlens_q, parallel_state.get_context_parallel_group()) + noise = thd_split_inputs_cp(noise, packed_seq_params['self_attention'].cu_seqlens_q, parallel_state.get_context_parallel_group()) + context_embeddings = thd_split_inputs_cp(context_embeddings, packed_seq_params['cross_attention'].cu_seqlens_kv, parallel_state.get_context_parallel_group()) + split_loss_mask = thd_split_inputs_cp(loss_mask, packed_seq_params['self_attention'].cu_seqlens_q, parallel_state.get_context_parallel_group()) + else: + video_latents = video_latents + noisy_latents = noisy_latents + noise = noise + context_embeddings = context_embeddings + split_loss_mask = loss_mask + + + # ======================================================================== + # Forward Pass + # ======================================================================== + + if parallel_state.is_pipeline_last_stage(): + + model_pred = self.model( + x = noisy_latents, + grid_sizes = grid_sizes, + t = timesteps, + context = context_embeddings, + max_seq_len = max_video_seq_len, + packed_seq_params=packed_seq_params, + ) + + # ======================================================================== + # Target: Flow Matching Velocity + # ======================================================================== + + # Flow matching target: v = ε - x_0 + target = noise - video_latents.float() + + # ======================================================================== + # Loss with Flow Weighting + # ======================================================================== + + loss = torch.nn.functional.mse_loss( + model_pred.float(), + target.float(), + reduction="none" + ) + + # Flow weight: w = 1 + shift * σ + loss_weight = 1.0 + flow_shift * sigma # shape [batch_size] + loss_weight = loss_weight.view(1, batch_size, 1).to(device) # shape [1, batch_size, 1] + unweighted_loss = loss + weighted_loss = (loss * loss_weight) # shape [seq_length / cp_size, batch_size, -1] + + # Safety check 
+ mean_weighted_loss = weighted_loss.mean() + if torch.isnan(mean_weighted_loss) or mean_weighted_loss > 100: + print(f"[ERROR] Loss explosion! Loss={mean_weighted_loss.item():.3f}") + print(f"[DEBUG] Stopping training - check hyperparameters") + raise ValueError(f"Loss exploded: {mean_weighted_loss.item()}") + + return model_pred, weighted_loss, split_loss_mask + + else: + hidden_states = self.model( + x = noisy_latents, + grid_sizes = grid_sizes, + t = timesteps, + context = context_embeddings, + max_seq_len = max_video_seq_len, + packed_seq_params=packed_seq_params, + ) + + return hidden_states + + +class VACEFlowPipeline(FlowPipeline): + """ + Flow pipeline for VACE (Video Editing) models. + + Extends FlowPipeline to handle the additional vace_context input required by VACEModel. + """ + + def training_step( + self, + model, + data_batch: dict[str, torch.Tensor], + # Flow matching parameters + use_sigma_noise: bool = True, + timestep_sampling: str = "uniform", + logit_mean: float = 0.0, + logit_std: float = 1.0, + flow_shift: float = 3.0, + mix_uniform_ratio: float = 0.1, + ) -> tuple[dict[str, torch.Tensor], torch.Tensor]: + """ + Performs a single training step using flow matching algorithm for VACE models. + + This method extends the base FlowPipeline training_step to include vace_context. 
+ """ + + video_latents = data_batch['video_latents'] + max_video_seq_len = data_batch['max_video_seq_len'] + context_embeddings = data_batch['context_embeddings'] + loss_mask = data_batch['loss_mask'] + grid_sizes = data_batch['grid_sizes'] + packed_seq_params = data_batch['packed_seq_params'] + video_metadata = data_batch['video_metadata'] + + # VACE-specific: extract vace_context from data_batch + # If not provided, initialize a zero tensor with same shape as video_latents + vace_context = data_batch.get('vace_context', None) + if vace_context is None: + raise NotImplementedError("vace_context is required for VACEFlowPipeline but not found in data_batch.") + # logger.warning("vace_context not found in data_batch; initializing zeros with shape of video_latents") + # vace_context = torch.zeros_like(video_latents) + + self.model = model + + batch_size = video_latents.shape[1] + device = video_latents.device + + # ======================================================================== + # Flow Matching Timestep Sampling + # ======================================================================== + + num_train_timesteps = self.pipe.scheduler.config.num_train_timesteps + + if use_sigma_noise: + use_uniform = torch.rand(1).item() < mix_uniform_ratio + + if use_uniform or timestep_sampling == "uniform": + # Pure uniform: u ~ U(0, 1) + u = torch.rand(size=(batch_size,), device=device) + sampling_method = "uniform" + else: + # Density-based sampling + u = compute_density_for_timestep_sampling( + weighting_scheme=timestep_sampling, + batch_size=batch_size, + logit_mean=logit_mean, + logit_std=logit_std, + ).to(device) + sampling_method = timestep_sampling + + # Apply flow shift: σ = shift/(shift + (1/u - 1)) + u_clamped = torch.clamp(u, min=1e-5) # Avoid division by zero + sigma = flow_shift / (flow_shift + (1.0 / u_clamped - 1.0)) + sigma = torch.clamp(sigma, 0.0, 1.0) + + else: + # Simple uniform without shift + u = torch.rand(size=(batch_size,), device=device) + sigma 
= u + sampling_method = "uniform_no_shift" + + # ======================================================================== + # Manual Flow Matching Noise Addition + # ======================================================================== + + # Generate noise + noise = torch.randn_like(torch.ones([1, 16, grid_sizes[0][0], grid_sizes[0][1]*2, grid_sizes[0][2]*2], device=video_latents.device), dtype=torch.float32) + noise = patchify(noise, (1, 2, 2))[0].unsqueeze(1) + # DEBUGGING + # because video_latents might be padded, we need to make sure noise is also padded to the same shape + seq_noise = noise.shape[0] + seq_video = video_latents.shape[0] + if seq_noise < seq_video: + pad_len = seq_video - seq_noise + pad = torch.zeros((pad_len, noise.shape[1], noise.shape[2]), device=noise.device, dtype=noise.dtype) + noise = torch.cat([noise, pad], dim=0) + + # CRITICAL: Manual flow matching (NOT scheduler.add_noise!) + # x_t = (1 - σ) * x_0 + σ * ε + sigma_reshaped = sigma.view(1, batch_size, 1) + noisy_latents = ( + (1.0 - sigma_reshaped) * video_latents.float() + + sigma_reshaped * noise + ) + + # Timesteps for model [0, 1000] + timesteps = sigma * num_train_timesteps + + # ======================================================================== + # Cast model inputs to bf16 + # ======================================================================== + + video_latents = video_latents.to(torch.bfloat16) + noisy_latents = noisy_latents.to(torch.bfloat16) + context_embeddings = context_embeddings.to(torch.bfloat16) + vace_context = vace_context.to(torch.bfloat16) + timesteps = timesteps.to(torch.bfloat16) + + # ======================================================================== + # Split across context parallelism + # ======================================================================== + + if parallel_state.get_context_parallel_world_size() > 1: + video_latents = thd_split_inputs_cp(video_latents, packed_seq_params['self_attention'].cu_seqlens_q,
parallel_state.get_context_parallel_group()) + noisy_latents = thd_split_inputs_cp(noisy_latents, packed_seq_params['self_attention'].cu_seqlens_q, parallel_state.get_context_parallel_group()) + noise = thd_split_inputs_cp(noise, packed_seq_params['self_attention'].cu_seqlens_q, parallel_state.get_context_parallel_group()) + context_embeddings = thd_split_inputs_cp(context_embeddings, packed_seq_params['cross_attention'].cu_seqlens_kv, parallel_state.get_context_parallel_group()) + vace_context = thd_split_inputs_cp(vace_context, packed_seq_params['self_attention'].cu_seqlens_q, parallel_state.get_context_parallel_group()) + split_loss_mask = thd_split_inputs_cp(loss_mask, packed_seq_params['self_attention'].cu_seqlens_q, parallel_state.get_context_parallel_group()) + else: + video_latents = video_latents + noisy_latents = noisy_latents + noise = noise + context_embeddings = context_embeddings + vace_context = vace_context + split_loss_mask = loss_mask + + + # ======================================================================== + # Forward Pass (VACE-specific: includes vace_context) + # ======================================================================== + + if parallel_state.is_pipeline_last_stage(): + + model_pred = self.model( + x = noisy_latents, + grid_sizes = grid_sizes, + t = timesteps, + context = context_embeddings, + vace_context = vace_context, # VACE-specific argument + max_seq_len = max_video_seq_len, + packed_seq_params=packed_seq_params, + ) + + # ======================================================================== + # Target: Flow Matching Velocity + # ======================================================================== + + # Flow matching target: v = ε - x_0 + target = noise - video_latents.float() + + # ======================================================================== + # Loss with Flow Weighting + # ======================================================================== + + loss = torch.nn.functional.mse_loss( + 
model_pred.float(), + target.float(), + reduction="none" + ) + + # Flow weight: w = 1 + shift * σ + loss_weight = 1.0 + flow_shift * sigma # shape [batch_size] + loss_weight = loss_weight.view(1, batch_size, 1).to(device) # shape [1, batch_size, 1] + unweighted_loss = loss + weighted_loss = (loss * loss_weight) # shape [seq_length / cp_size, batch_size, -1] + + # Safety check + mean_weighted_loss = weighted_loss.mean() + if torch.isnan(mean_weighted_loss) or mean_weighted_loss > 100: + print(f"[ERROR] Loss explosion! Loss={mean_weighted_loss.item():.3f}") + print(f"[DEBUG] Stopping training - check hyperparameters") + raise ValueError(f"Loss exploded: {mean_weighted_loss.item()}") + + return model_pred, weighted_loss, split_loss_mask + + else: + hidden_states = self.model( + x = noisy_latents, + grid_sizes = grid_sizes, + t = timesteps, + context = context_embeddings, + vace_context = vace_context, # VACE-specific argument + max_seq_len = max_video_seq_len, + packed_seq_params=packed_seq_params, + ) + + return hidden_states \ No newline at end of file diff --git a/src/megatron/bridge/models/wan/flow_matching/time_shift_utils.py b/src/megatron/bridge/models/wan/flow_matching/time_shift_utils.py new file mode 100644 index 0000000000..56faee4b20 --- /dev/null +++ b/src/megatron/bridge/models/wan/flow_matching/time_shift_utils.py @@ -0,0 +1,108 @@ +# time_shift_utils.py - Timestep sampling and sigma computation utilities + +import math +import numpy as np +import torch + + +def time_shift( + t: torch.Tensor, + image_seq_len: int, + shift_type: str = "constant", + base_shift: float = 0.5, + max_shift: float = 1.15, + constant: float = 3.0, +): + """ + Convert timesteps to sigmas with sequence-length-aware shifting. 
+ + Args: + t: timesteps in range [0, 1] + image_seq_len: number of tokens (frames * height * width / patch_size^2) + shift_type: "linear", "sqrt", or "constant" + base_shift: base shift for linear mode + max_shift: max shift for linear mode + constant: shift value for constant mode (default 3.0 matches Pika) + + Returns: + sigma values for noise scheduling + """ + if shift_type == "linear": + # Linear interpolation based on sequence length + mu = base_shift + (max_shift - base_shift) * (image_seq_len / 4096) + return math.exp(mu) / (math.exp(mu) + (1 / t - 1)) + + elif shift_type == "sqrt": + # Square root scaling (Flux-style) + # Assuming 128x128 latent space (1024x1024 image) gives mu=3 + mu = np.maximum(1.0, np.sqrt(image_seq_len / (128.0 * 128.0)) * 3.0) + return mu / (mu + (1 / t - 1)) + + elif shift_type == "constant": + # Constant shift (Pika default) + return constant / (constant + (1 / t - 1)) + + else: + # No shift, return original t + return t + + +def compute_density_for_timestep_sampling( + weighting_scheme: str, + batch_size: int, + logit_mean: float = 0.0, + logit_std: float = 1.0, + mode_scale: float = 1.29, +): + """ + Sample timesteps from different distributions for better training coverage. 
+ + Args: + weighting_scheme: "uniform", "logit_normal", or "mode" + batch_size: number of samples to generate + logit_mean: mean for logit-normal distribution + logit_std: std for logit-normal distribution + mode_scale: scale for mode-based sampling + + Returns: + Tensor of shape (batch_size,) with values in [0, 1] + """ + if weighting_scheme == "logit_normal": + # SD3-style logit-normal sampling + u = torch.normal( + mean=logit_mean, + std=logit_std, + size=(batch_size,), + device="cpu" + ) + u = torch.nn.functional.sigmoid(u) + + elif weighting_scheme == "mode": + # Mode-based sampling (concentrates around certain timesteps) + u = torch.rand(size=(batch_size,), device="cpu") + u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u) + + else: + # Uniform sampling (default) + u = torch.rand(size=(batch_size,), device="cpu") + + return u + + +def get_flow_match_loss_weight(sigma: torch.Tensor, shift: float = 3.0): + """ + Compute loss weights for flow matching based on sigma values. + + Higher sigma (more noise) typically gets higher weight. 
+ + Args: + sigma: sigma values in range [0, 1] + shift: weight scaling factor + + Returns: + Loss weights with same shape as sigma + """ + # Flow matching weight: weight = 1 + shift * sigma + # This gives more weight to noisier timesteps + weight = 1.0 + shift * sigma + return weight \ No newline at end of file diff --git a/src/megatron/bridge/models/wan/inference/configs/__init__.py b/src/megatron/bridge/models/wan/inference/configs/__init__.py new file mode 100644 index 0000000000..a28c03c5fd --- /dev/null +++ b/src/megatron/bridge/models/wan/inference/configs/__init__.py @@ -0,0 +1,52 @@ +import copy +import os + +os.environ['TOKENIZERS_PARALLELISM'] = 'false' + +from .wan_i2v_14B import i2v_14B +from .wan_t2v_1_3B import t2v_1_3B +from .wan_t2v_14B import t2v_14B + +# the config of t2i_14B is the same as t2v_14B +t2i_14B = copy.deepcopy(t2v_14B) +t2i_14B.__name__ = 'Config: Wan T2I 14B' + +# the config of flf2v_14B is the same as i2v_14B +flf2v_14B = copy.deepcopy(i2v_14B) +flf2v_14B.__name__ = 'Config: Wan FLF2V 14B' +flf2v_14B.sample_neg_prompt = "镜头切换," + flf2v_14B.sample_neg_prompt + +WAN_CONFIGS = { + 't2v-14B': t2v_14B, + 't2v-1.3B': t2v_1_3B, + 'i2v-14B': i2v_14B, + 't2i-14B': t2i_14B, + 'flf2v-14B': flf2v_14B, + 'vace-1.3B': t2v_1_3B, + 'vace-14B': t2v_14B, +} + +SIZE_CONFIGS = { + '720*1280': (720, 1280), + '1280*720': (1280, 720), + '480*832': (480, 832), + '832*480': (832, 480), + '1024*1024': (1024, 1024), +} + +MAX_AREA_CONFIGS = { + '720*1280': 720 * 1280, + '1280*720': 1280 * 720, + '480*832': 480 * 832, + '832*480': 832 * 480, +} + +SUPPORTED_SIZES = { + 't2v-14B': ('720*1280', '1280*720', '480*832', '832*480'), + 't2v-1.3B': ('480*832', '832*480'), + 'i2v-14B': ('720*1280', '1280*720', '480*832', '832*480'), + 'flf2v-14B': ('720*1280', '1280*720', '480*832', '832*480'), + 't2i-14B': tuple(SIZE_CONFIGS.keys()), + 'vace-1.3B': ('480*832', '832*480'), + 'vace-14B': ('720*1280', '1280*720', '480*832', '832*480') +} diff --git 
a/src/megatron/bridge/models/wan/inference/configs/shared_config.py b/src/megatron/bridge/models/wan/inference/configs/shared_config.py new file mode 100644 index 0000000000..37d3ae0c43 --- /dev/null +++ b/src/megatron/bridge/models/wan/inference/configs/shared_config.py @@ -0,0 +1,18 @@ +import torch +from easydict import EasyDict + +#------------------------ Wan shared config ------------------------# +wan_shared_cfg = EasyDict() + +# t5 +wan_shared_cfg.t5_model = 'umt5_xxl' +wan_shared_cfg.t5_dtype = torch.bfloat16 +wan_shared_cfg.text_len = 512 + +# transformer +wan_shared_cfg.param_dtype = torch.bfloat16 + +# inference +wan_shared_cfg.num_train_timesteps = 1000 +wan_shared_cfg.sample_fps = 16 +wan_shared_cfg.sample_neg_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走' diff --git a/src/megatron/bridge/models/wan/inference/configs/wan_i2v_14B.py b/src/megatron/bridge/models/wan/inference/configs/wan_i2v_14B.py new file mode 100644 index 0000000000..764d2ed8c3 --- /dev/null +++ b/src/megatron/bridge/models/wan/inference/configs/wan_i2v_14B.py @@ -0,0 +1,35 @@ +import torch +from easydict import EasyDict + +from .shared_config import wan_shared_cfg + +#------------------------ Wan I2V 14B ------------------------# + +i2v_14B = EasyDict(__name__='Config: Wan I2V 14B') +i2v_14B.update(wan_shared_cfg) +i2v_14B.sample_neg_prompt = "镜头晃动," + i2v_14B.sample_neg_prompt + +i2v_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth' +i2v_14B.t5_tokenizer = 'google/umt5-xxl' + +# clip +i2v_14B.clip_model = 'clip_xlm_roberta_vit_h_14' +i2v_14B.clip_dtype = torch.float16 +i2v_14B.clip_checkpoint = 'models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth' +i2v_14B.clip_tokenizer = 'xlm-roberta-large' + +# vae +i2v_14B.vae_checkpoint = 'Wan2.1_VAE.pth' +i2v_14B.vae_stride = (4, 8, 8) + +# transformer +i2v_14B.patch_size = (1, 2, 2) +i2v_14B.dim = 5120 +i2v_14B.ffn_dim = 13824 
+i2v_14B.freq_dim = 256 +i2v_14B.num_heads = 40 +i2v_14B.num_layers = 40 +i2v_14B.window_size = (-1, -1) +i2v_14B.qk_norm = True +i2v_14B.cross_attn_norm = True +i2v_14B.eps = 1e-6 diff --git a/src/megatron/bridge/models/wan/inference/configs/wan_t2v_14B.py b/src/megatron/bridge/models/wan/inference/configs/wan_t2v_14B.py new file mode 100644 index 0000000000..c793f7f6c3 --- /dev/null +++ b/src/megatron/bridge/models/wan/inference/configs/wan_t2v_14B.py @@ -0,0 +1,28 @@ +from easydict import EasyDict + +from .shared_config import wan_shared_cfg + +#------------------------ Wan T2V 14B ------------------------# + +t2v_14B = EasyDict(__name__='Config: Wan T2V 14B') +t2v_14B.update(wan_shared_cfg) + +# t5 +t2v_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth' +t2v_14B.t5_tokenizer = 'google/umt5-xxl' + +# vae +t2v_14B.vae_checkpoint = 'Wan2.1_VAE.pth' +t2v_14B.vae_stride = (4, 8, 8) + +# transformer +t2v_14B.patch_size = (1, 2, 2) +t2v_14B.dim = 5120 +t2v_14B.ffn_dim = 13824 +t2v_14B.freq_dim = 256 +t2v_14B.num_heads = 40 +t2v_14B.num_layers = 40 +t2v_14B.window_size = (-1, -1) +t2v_14B.qk_norm = True +t2v_14B.cross_attn_norm = True +t2v_14B.eps = 1e-6 diff --git a/src/megatron/bridge/models/wan/inference/configs/wan_t2v_1_3B.py b/src/megatron/bridge/models/wan/inference/configs/wan_t2v_1_3B.py new file mode 100644 index 0000000000..c8458ce804 --- /dev/null +++ b/src/megatron/bridge/models/wan/inference/configs/wan_t2v_1_3B.py @@ -0,0 +1,28 @@ +from easydict import EasyDict + +from .shared_config import wan_shared_cfg + +#------------------------ Wan T2V 1.3B ------------------------# + +t2v_1_3B = EasyDict(__name__='Config: Wan T2V 1.3B') +t2v_1_3B.update(wan_shared_cfg) + +# t5 +t2v_1_3B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth' +t2v_1_3B.t5_tokenizer = 'google/umt5-xxl' + +# vae +t2v_1_3B.vae_checkpoint = 'Wan2.1_VAE.pth' +t2v_1_3B.vae_stride = (4, 8, 8) + +# transformer +t2v_1_3B.patch_size = (1, 2, 2) +t2v_1_3B.dim = 1536 +t2v_1_3B.ffn_dim = 8960 
+t2v_1_3B.freq_dim = 256 +t2v_1_3B.num_heads = 12 +t2v_1_3B.num_layers = 30 +t2v_1_3B.window_size = (-1, -1) +t2v_1_3B.qk_norm = True +t2v_1_3B.cross_attn_norm = True +t2v_1_3B.eps = 1e-6 diff --git a/src/megatron/bridge/models/wan/inference/utils/fm_solvers.py b/src/megatron/bridge/models/wan/inference/utils/fm_solvers.py new file mode 100644 index 0000000000..a38b755c40 --- /dev/null +++ b/src/megatron/bridge/models/wan/inference/utils/fm_solvers.py @@ -0,0 +1,858 @@ +# Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +# Convert dpm solver for flow matching + +import inspect +import math +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.schedulers.scheduling_utils import ( + KarrasDiffusionSchedulers, + SchedulerMixin, + SchedulerOutput, +) +from diffusers.utils import deprecate, is_scipy_available +from diffusers.utils.torch_utils import randn_tensor + +if is_scipy_available(): + pass + + +def get_sampling_sigmas(sampling_steps, shift): + sigma = np.linspace(1, 0, sampling_steps + 1)[:sampling_steps] + sigma = (shift * sigma / (1 + (shift - 1) * sigma)) + + return sigma + + +def retrieve_timesteps( + scheduler, + num_inference_steps=None, + device=None, + timesteps=None, + sigmas=None, + **kwargs, +): + if timesteps is not None and sigmas is not None: + raise ValueError( + "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values" + ) + if timesteps is not None: + accepts_timesteps = "timesteps" in set( + inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set( + inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class FlowDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): + """ + `FlowDPMSolverMultistepScheduler` is a fast dedicated high-order solver for diffusion ODEs. + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. This determines the resolution of the diffusion process. + solver_order (`int`, defaults to 2): + The DPMSolver order which can be `1`, `2`, or `3`. It is recommended to use `solver_order=2` for guided + sampling, and `solver_order=3` for unconditional sampling. This affects the number of model outputs stored + and used in multistep updates. + prediction_type (`str`, defaults to "flow_prediction"): + Prediction type of the scheduler function; must be `flow_prediction` for this scheduler, which predicts + the flow of the diffusion process. + shift (`float`, *optional*, defaults to 1.0): + A factor used to adjust the sigmas in the noise schedule. It modifies the step sizes during the sampling + process. 
+ use_dynamic_shifting (`bool`, defaults to `False`): + Whether to apply dynamic shifting to the timesteps based on image resolution. If `True`, the shifting is + applied on the fly. + thresholding (`bool`, defaults to `False`): + Whether to use the "dynamic thresholding" method. This method adjusts the predicted sample to prevent + saturation and improve photorealism. + dynamic_thresholding_ratio (`float`, defaults to 0.995): + The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. + sample_max_value (`float`, defaults to 1.0): + The threshold value for dynamic thresholding. Valid only when `thresholding=True` and + `algorithm_type="dpmsolver++"`. + algorithm_type (`str`, defaults to `dpmsolver++`): + Algorithm type for the solver; can be `dpmsolver`, `dpmsolver++`, `sde-dpmsolver` or `sde-dpmsolver++`. The + `dpmsolver` type implements the algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927) + paper, and the `dpmsolver++` type implements the algorithms in the + [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to use `dpmsolver++` or + `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion. + solver_type (`str`, defaults to `midpoint`): + Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the + sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers. + lower_order_final (`bool`, defaults to `True`): + Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can + stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10. + euler_at_final (`bool`, defaults to `False`): + Whether to use Euler's method in the final step. It is a trade-off between numerical stability and detail + richness. 
This can stabilize the sampling of the SDE variant of DPMSolver for small number of inference + steps, but sometimes may result in blurring. + final_sigmas_type (`str`, *optional*, defaults to "zero"): + The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final + sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. + lambda_min_clipped (`float`, defaults to `-inf`): + Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the + cosine (`squaredcos_cap_v2`) noise schedule. + variance_type (`str`, *optional*): + Set to "learned" or "learned_range" for diffusion models that predict variance. If set, the model's output + contains the predicted Gaussian variance. + """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + solver_order: int = 2, + prediction_type: str = "flow_prediction", + shift: Optional[float] = 1.0, + use_dynamic_shifting=False, + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + algorithm_type: str = "dpmsolver++", + solver_type: str = "midpoint", + lower_order_final: bool = True, + euler_at_final: bool = False, + final_sigmas_type: Optional[str] = "zero", # "zero", "sigma_min" + lambda_min_clipped: float = -float("inf"), + variance_type: Optional[str] = None, + invert_sigmas: bool = False, + ): + if algorithm_type in ["dpmsolver", "sde-dpmsolver"]: + deprecation_message = f"algorithm_type {algorithm_type} is deprecated and will be removed in a future version. 
Choose from `dpmsolver++` or `sde-dpmsolver++` instead" + deprecate("algorithm_types dpmsolver and sde-dpmsolver", "1.0.0", + deprecation_message) + + # settings for DPM-Solver + if algorithm_type not in [ + "dpmsolver", "dpmsolver++", "sde-dpmsolver", "sde-dpmsolver++" + ]: + if algorithm_type == "deis": + self.register_to_config(algorithm_type="dpmsolver++") + else: + raise NotImplementedError( + f"{algorithm_type} is not implemented for {self.__class__}") + + if solver_type not in ["midpoint", "heun"]: + if solver_type in ["logrho", "bh1", "bh2"]: + self.register_to_config(solver_type="midpoint") + else: + raise NotImplementedError( + f"{solver_type} is not implemented for {self.__class__}") + + if algorithm_type not in ["dpmsolver++", "sde-dpmsolver++" + ] and final_sigmas_type == "zero": + raise ValueError( + f"`final_sigmas_type` {final_sigmas_type} is not supported for `algorithm_type` {algorithm_type}. Please choose `sigma_min` instead." + ) + + # setable values + self.num_inference_steps = None + alphas = np.linspace(1, 1 / num_train_timesteps, + num_train_timesteps)[::-1].copy() + sigmas = 1.0 - alphas + sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32) + + if not use_dynamic_shifting: + # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution + sigmas = shift * sigmas / (1 + + (shift - 1) * sigmas) # pyright: ignore + + self.sigmas = sigmas + self.timesteps = sigmas * num_train_timesteps + + self.model_outputs = [None] * solver_order + self.lower_order_nums = 0 + self._step_index = None + self._begin_index = None + + # self.sigmas = self.sigmas.to( + # "cpu") # to avoid too much CPU/GPU communication + self.sigma_min = self.sigmas[-1].item() + self.sigma_max = self.sigmas[0].item() + + @property + def step_index(self): + """ + The index counter for current timestep. It will increase 1 after each scheduler step. 
+ """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + # Modified from diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler.set_timesteps + def set_timesteps( + self, + num_inference_steps: Union[int, None] = None, + device: Union[str, torch.device] = None, + sigmas: Optional[List[float]] = None, + mu: Optional[Union[float, None]] = None, + shift: Optional[Union[float, None]] = None, + ): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + Args: + num_inference_steps (`int`): + Total number of the spacing of the time steps. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. 
+ """ + + if self.config.use_dynamic_shifting and mu is None: + raise ValueError( + " you have to pass a value for `mu` when `use_dynamic_shifting` is set to be `True`" + ) + + if sigmas is None: + sigmas = np.linspace(self.sigma_max, self.sigma_min, + num_inference_steps + + 1).copy()[:-1] # pyright: ignore + + if self.config.use_dynamic_shifting: + sigmas = self.time_shift(mu, 1.0, sigmas) # pyright: ignore + else: + if shift is None: + shift = self.config.shift + sigmas = shift * sigmas / (1 + + (shift - 1) * sigmas) # pyright: ignore + + if self.config.final_sigmas_type == "sigma_min": + sigma_last = ((1 - self.alphas_cumprod[0]) / + self.alphas_cumprod[0])**0.5 + elif self.config.final_sigmas_type == "zero": + sigma_last = 0 + else: + raise ValueError( + f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}" + ) + + timesteps = sigmas * self.config.num_train_timesteps + sigmas = np.concatenate([sigmas, [sigma_last] + ]).astype(np.float32) # pyright: ignore + + self.sigmas = torch.from_numpy(sigmas) + self.timesteps = torch.from_numpy(timesteps).to( + device=device, dtype=torch.int64) + + self.num_inference_steps = len(timesteps) + + self.model_outputs = [ + None, + ] * self.config.solver_order + self.lower_order_nums = 0 + + self._step_index = None + self._begin_index = None + # self.sigmas = self.sigmas.to( + # "cpu") # to avoid too much CPU/GPU communication + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: + """ + "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the + prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by + s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing + pixels from saturation at each step. 
We find that dynamic thresholding results in significantly better + photorealism as well as better image-text alignment, especially when using very large guidance weights." + https://arxiv.org/abs/2205.11487 + """ + dtype = sample.dtype + batch_size, channels, *remaining_dims = sample.shape + + if dtype not in (torch.float32, torch.float64): + sample = sample.float( + ) # upcast for quantile calculation, and clamp not implemented for cpu half + + # Flatten sample for doing quantile calculation along each image + sample = sample.reshape(batch_size, channels * np.prod(remaining_dims)) + + abs_sample = sample.abs() # "a certain percentile absolute pixel value" + + s = torch.quantile( + abs_sample, self.config.dynamic_thresholding_ratio, dim=1) + s = torch.clamp( + s, min=1, max=self.config.sample_max_value + ) # When clamped to min=1, equivalent to standard clipping to [-1, 1] + s = s.unsqueeze( + 1) # (batch_size, 1) because clamp will broadcast along dim=0 + sample = torch.clamp( + sample, -s, s + ) / s # "we threshold xt0 to the range [-s, s] and then divide by s" + + sample = sample.reshape(batch_size, channels, *remaining_dims) + sample = sample.to(dtype) + + return sample + + # Copied from diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler._sigma_to_t + def _sigma_to_t(self, sigma): + return sigma * self.config.num_train_timesteps + + def _sigma_to_alpha_sigma_t(self, sigma): + return 1 - sigma, sigma + + # Copied from diffusers.schedulers.scheduling_flow_match_euler_discrete.set_timesteps + def time_shift(self, mu: float, sigma: float, t: torch.Tensor): + return math.exp(mu) / (math.exp(mu) + (1 / t - 1)**sigma) + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.convert_model_output + def convert_model_output( + self, + model_output: torch.Tensor, + *args, + sample: torch.Tensor = None, + **kwargs, + ) -> torch.Tensor: + """ + Convert the model output to the corresponding type 
the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is + designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an + integral of the data prediction model. + + The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both noise + prediction and data prediction models. + + Args: + model_output (`torch.Tensor`): + The direct output from the learned diffusion model. + sample (`torch.Tensor`): + A current instance of a sample created by the diffusion process. + Returns: + `torch.Tensor`: + The converted model output. + """ + timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) + if sample is None: + if len(args) > 1: + sample = args[1] + else: + raise ValueError( + "missing `sample` as a required keyward argument") + if timestep is not None: + deprecate( + "timesteps", + "1.0.0", + "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + # DPM-Solver++ needs to solve an integral of the data prediction model. + if self.config.algorithm_type in ["dpmsolver++", "sde-dpmsolver++"]: + if self.config.prediction_type == "flow_prediction": + sigma_t = self.sigmas[self.step_index] + x0_pred = sample - sigma_t * model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`," + " `v_prediction`, or `flow_prediction` for the FlowDPMSolverMultistepScheduler." + ) + + if self.config.thresholding: + x0_pred = self._threshold_sample(x0_pred) + + return x0_pred + + # DPM-Solver needs to solve an integral of the noise prediction model. 
+ elif self.config.algorithm_type in ["dpmsolver", "sde-dpmsolver"]: + if self.config.prediction_type == "flow_prediction": + sigma_t = self.sigmas[self.step_index] + epsilon = sample - (1 - sigma_t) * model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`," + " `v_prediction` or `flow_prediction` for the FlowDPMSolverMultistepScheduler." + ) + + if self.config.thresholding: + sigma_t = self.sigmas[self.step_index] + x0_pred = sample - sigma_t * model_output + x0_pred = self._threshold_sample(x0_pred) + epsilon = model_output + x0_pred + + return epsilon + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.dpm_solver_first_order_update + def dpm_solver_first_order_update( + self, + model_output: torch.Tensor, + *args, + sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, + **kwargs, + ) -> torch.Tensor: + """ + One step for the first-order DPMSolver (equivalent to DDIM). + Args: + model_output (`torch.Tensor`): + The direct output from the learned diffusion model. + sample (`torch.Tensor`): + A current instance of a sample created by the diffusion process. + Returns: + `torch.Tensor`: + The sample tensor at the previous timestep. 
+ """ + timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) + prev_timestep = args[1] if len(args) > 1 else kwargs.pop( + "prev_timestep", None) + if sample is None: + if len(args) > 2: + sample = args[2] + else: + raise ValueError( + " missing `sample` as a required keyward argument") + if timestep is not None: + deprecate( + "timesteps", + "1.0.0", + "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + if prev_timestep is not None: + deprecate( + "prev_timestep", + "1.0.0", + "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[ + self.step_index] # pyright: ignore + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s, sigma_s = self._sigma_to_alpha_sigma_t(sigma_s) + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s = torch.log(alpha_s) - torch.log(sigma_s) + + h = lambda_t - lambda_s + if self.config.algorithm_type == "dpmsolver++": + x_t = (sigma_t / + sigma_s) * sample - (alpha_t * + (torch.exp(-h) - 1.0)) * model_output + elif self.config.algorithm_type == "dpmsolver": + x_t = (alpha_t / + alpha_s) * sample - (sigma_t * + (torch.exp(h) - 1.0)) * model_output + elif self.config.algorithm_type == "sde-dpmsolver++": + assert noise is not None + x_t = ((sigma_t / sigma_s * torch.exp(-h)) * sample + + (alpha_t * (1 - torch.exp(-2.0 * h))) * model_output + + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise) + elif self.config.algorithm_type == "sde-dpmsolver": + assert noise is not None + x_t = ((alpha_t / alpha_s) * sample - 2.0 * + (sigma_t * (torch.exp(h) - 1.0)) * model_output + + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise) + return x_t # pyright: ignore + + # Copied from 
diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.multistep_dpm_solver_second_order_update + def multistep_dpm_solver_second_order_update( + self, + model_output_list: List[torch.Tensor], + *args, + sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, + **kwargs, + ) -> torch.Tensor: + """ + One step for the second-order multistep DPMSolver. + Args: + model_output_list (`List[torch.Tensor]`): + The direct outputs from learned diffusion model at current and latter timesteps. + sample (`torch.Tensor`): + A current instance of a sample created by the diffusion process. + Returns: + `torch.Tensor`: + The sample tensor at the previous timestep. + """ + timestep_list = args[0] if len(args) > 0 else kwargs.pop( + "timestep_list", None) + prev_timestep = args[1] if len(args) > 1 else kwargs.pop( + "prev_timestep", None) + if sample is None: + if len(args) > 2: + sample = args[2] + else: + raise ValueError( + " missing `sample` as a required keyward argument") + if timestep_list is not None: + deprecate( + "timestep_list", + "1.0.0", + "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + if prev_timestep is not None: + deprecate( + "prev_timestep", + "1.0.0", + "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + sigma_t, sigma_s0, sigma_s1 = ( + self.sigmas[self.step_index + 1], # pyright: ignore + self.sigmas[self.step_index], + self.sigmas[self.step_index - 1], # pyright: ignore + ) + + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0) + alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1) + + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0) + lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1) + + 
m0, m1 = model_output_list[-1], model_output_list[-2] + + h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1 + r0 = h_0 / h + D0, D1 = m0, (1.0 / r0) * (m0 - m1) + if self.config.algorithm_type == "dpmsolver++": + # See https://arxiv.org/abs/2211.01095 for detailed derivations + if self.config.solver_type == "midpoint": + x_t = ((sigma_t / sigma_s0) * sample - + (alpha_t * (torch.exp(-h) - 1.0)) * D0 - 0.5 * + (alpha_t * (torch.exp(-h) - 1.0)) * D1) + elif self.config.solver_type == "heun": + x_t = ((sigma_t / sigma_s0) * sample - + (alpha_t * (torch.exp(-h) - 1.0)) * D0 + + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1) + elif self.config.algorithm_type == "dpmsolver": + # See https://arxiv.org/abs/2206.00927 for detailed derivations + if self.config.solver_type == "midpoint": + x_t = ((alpha_t / alpha_s0) * sample - + (sigma_t * (torch.exp(h) - 1.0)) * D0 - 0.5 * + (sigma_t * (torch.exp(h) - 1.0)) * D1) + elif self.config.solver_type == "heun": + x_t = ((alpha_t / alpha_s0) * sample - + (sigma_t * (torch.exp(h) - 1.0)) * D0 - + (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1) + elif self.config.algorithm_type == "sde-dpmsolver++": + assert noise is not None + if self.config.solver_type == "midpoint": + x_t = ((sigma_t / sigma_s0 * torch.exp(-h)) * sample + + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0 + 0.5 * + (alpha_t * (1 - torch.exp(-2.0 * h))) * D1 + + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise) + elif self.config.solver_type == "heun": + x_t = ((sigma_t / sigma_s0 * torch.exp(-h)) * sample + + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0 + + (alpha_t * ((1.0 - torch.exp(-2.0 * h)) / + (-2.0 * h) + 1.0)) * D1 + + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise) + elif self.config.algorithm_type == "sde-dpmsolver": + assert noise is not None + if self.config.solver_type == "midpoint": + x_t = ((alpha_t / alpha_s0) * sample - 2.0 * + (sigma_t * (torch.exp(h) - 1.0)) * D0 - + (sigma_t * (torch.exp(h) - 1.0)) * D1 + + sigma_t * 
torch.sqrt(torch.exp(2 * h) - 1.0) * noise) + elif self.config.solver_type == "heun": + x_t = ((alpha_t / alpha_s0) * sample - 2.0 * + (sigma_t * (torch.exp(h) - 1.0)) * D0 - 2.0 * + (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1 + + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise) + return x_t # pyright: ignore + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.multistep_dpm_solver_third_order_update + def multistep_dpm_solver_third_order_update( + self, + model_output_list: List[torch.Tensor], + *args, + sample: torch.Tensor = None, + **kwargs, + ) -> torch.Tensor: + """ + One step for the third-order multistep DPMSolver. + Args: + model_output_list (`List[torch.Tensor]`): + The direct outputs from learned diffusion model at current and latter timesteps. + sample (`torch.Tensor`): + A current instance of a sample created by diffusion process. + Returns: + `torch.Tensor`: + The sample tensor at the previous timestep. + """ + + timestep_list = args[0] if len(args) > 0 else kwargs.pop( + "timestep_list", None) + prev_timestep = args[1] if len(args) > 1 else kwargs.pop( + "prev_timestep", None) + if sample is None: + if len(args) > 2: + sample = args[2] + else: + raise ValueError( + " missing`sample` as a required keyward argument") + if timestep_list is not None: + deprecate( + "timestep_list", + "1.0.0", + "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + if prev_timestep is not None: + deprecate( + "prev_timestep", + "1.0.0", + "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + sigma_t, sigma_s0, sigma_s1, sigma_s2 = ( + self.sigmas[self.step_index + 1], # pyright: ignore + self.sigmas[self.step_index], + self.sigmas[self.step_index - 1], # pyright: ignore + self.sigmas[self.step_index - 2], # pyright: ignore + ) 
+ + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0) + alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1) + alpha_s2, sigma_s2 = self._sigma_to_alpha_sigma_t(sigma_s2) + + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0) + lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1) + lambda_s2 = torch.log(alpha_s2) - torch.log(sigma_s2) + + m0, m1, m2 = model_output_list[-1], model_output_list[ + -2], model_output_list[-3] + + h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2 + r0, r1 = h_0 / h, h_1 / h + D0 = m0 + D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2) + D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1) + D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1) + if self.config.algorithm_type == "dpmsolver++": + # See https://arxiv.org/abs/2206.00927 for detailed derivations + x_t = ((sigma_t / sigma_s0) * sample - + (alpha_t * (torch.exp(-h) - 1.0)) * D0 + + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1 - + (alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2) + elif self.config.algorithm_type == "dpmsolver": + # See https://arxiv.org/abs/2206.00927 for detailed derivations + x_t = ((alpha_t / alpha_s0) * sample - (sigma_t * + (torch.exp(h) - 1.0)) * D0 - + (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1 - + (sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2) + return x_t # pyright: ignore + + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + def _init_step_index(self, timestep): + """ + Initialize the step_index counter for the scheduler. + """ + + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + # Modified from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.step + def step( + self, + model_output: torch.Tensor, + timestep: Union[int, torch.Tensor], + sample: torch.Tensor, + generator=None, + variance_noise: Optional[torch.Tensor] = None, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with + the multistep DPMSolver. + Args: + model_output (`torch.Tensor`): + The direct output from learned diffusion model. + timestep (`int`): + The current discrete timestep in the diffusion chain. + sample (`torch.Tensor`): + A current instance of a sample created by the diffusion process. + generator (`torch.Generator`, *optional*): + A random number generator. + variance_noise (`torch.Tensor`): + Alternative to generating noise with `generator` by directly providing the noise for the variance + itself. Useful for methods such as [`LEdits++`]. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. + Returns: + [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. 
+ """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + if self.step_index is None: + self._init_step_index(timestep) + + # Improve numerical stability for small number of steps + lower_order_final = (self.step_index == len(self.timesteps) - 1) and ( + self.config.euler_at_final or + (self.config.lower_order_final and len(self.timesteps) < 15) or + self.config.final_sigmas_type == "zero") + lower_order_second = ((self.step_index == len(self.timesteps) - 2) and + self.config.lower_order_final and + len(self.timesteps) < 15) + + model_output = self.convert_model_output(model_output, sample=sample) + for i in range(self.config.solver_order - 1): + self.model_outputs[i] = self.model_outputs[i + 1] + self.model_outputs[-1] = model_output + + # Upcast to avoid precision issues when computing prev_sample + sample = sample.to(torch.float32) + if self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++" + ] and variance_noise is None: + noise = randn_tensor( + model_output.shape, + generator=generator, + device=model_output.device, + dtype=torch.float32) + elif self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"]: + noise = variance_noise.to( + device=model_output.device, + dtype=torch.float32) # pyright: ignore + else: + noise = None + + if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final: + prev_sample = self.dpm_solver_first_order_update( + model_output, sample=sample, noise=noise) + elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second: + prev_sample = self.multistep_dpm_solver_second_order_update( + self.model_outputs, sample=sample, noise=noise) + else: + prev_sample = self.multistep_dpm_solver_third_order_update( + self.model_outputs, sample=sample) + + if self.lower_order_nums < self.config.solver_order: + self.lower_order_nums += 1 + + # Cast sample back to 
expected dtype + prev_sample = prev_sample.to(model_output.dtype) + + # upon completion increase step index by one + self._step_index += 1 # pyright: ignore + + if not return_dict: + return (prev_sample,) + + return SchedulerOutput(prev_sample=prev_sample) + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.scale_model_input + def scale_model_input(self, sample: torch.Tensor, *args, + **kwargs) -> torch.Tensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + Args: + sample (`torch.Tensor`): + The input sample. + Returns: + `torch.Tensor`: + A scaled input sample. + """ + return sample + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.scale_model_input + def add_noise( + self, + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.IntTensor, + ) -> torch.Tensor: + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to( + device=original_samples.device, dtype=original_samples.dtype) + if original_samples.device.type == "mps" and torch.is_floating_point( + timesteps): + # mps does not support float64 + schedule_timesteps = self.timesteps.to( + original_samples.device, dtype=torch.float32) + timesteps = timesteps.to( + original_samples.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(original_samples.device) + timesteps = timesteps.to(original_samples.device) + + # begin_index is None when the scheduler is used for training or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [ + self.index_for_timestep(t, schedule_timesteps) + for t in timesteps + ] + elif self.step_index is not None: + # add_noise is called after first denoising step (for inpainting) + step_indices = [self.step_index] * timesteps.shape[0] + else: + # add noise is called before 
first denoising step to create initial latent(img2img) + step_indices = [self.begin_index] * timesteps.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + noisy_samples = alpha_t * original_samples + sigma_t * noise + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/src/megatron/bridge/models/wan/inference/utils/fm_solvers_unipc.py b/src/megatron/bridge/models/wan/inference/utils/fm_solvers_unipc.py new file mode 100644 index 0000000000..8d96058394 --- /dev/null +++ b/src/megatron/bridge/models/wan/inference/utils/fm_solvers_unipc.py @@ -0,0 +1,801 @@ +# Copied from https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/schedulers/scheduling_unipc_multistep.py +# Convert unipc for flow matching + +import math +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.schedulers.scheduling_utils import ( + KarrasDiffusionSchedulers, + SchedulerMixin, + SchedulerOutput, +) +from diffusers.utils import deprecate, is_scipy_available + +if is_scipy_available(): + import scipy.stats + + +class FlowUniPCMultistepScheduler(SchedulerMixin, ConfigMixin): + """ + `UniPCMultistepScheduler` is a training-free framework designed for the fast sampling of diffusion models. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + solver_order (`int`, default `2`): + The UniPC order which can be any positive integer. The effective order of accuracy is `solver_order + 1` + due to the UniC. 
It is recommended to use `solver_order=2` for guided sampling, and `solver_order=3` for + unconditional sampling. + prediction_type (`str`, defaults to "flow_prediction"): + Prediction type of the scheduler function; must be `flow_prediction` for this scheduler, which predicts + the flow of the diffusion process. + thresholding (`bool`, defaults to `False`): + Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such + as Stable Diffusion. + dynamic_thresholding_ratio (`float`, defaults to 0.995): + The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. + sample_max_value (`float`, defaults to 1.0): + The threshold value for dynamic thresholding. Valid only when `thresholding=True` and `predict_x0=True`. + predict_x0 (`bool`, defaults to `True`): + Whether to use the updating algorithm on the predicted x0. + solver_type (`str`, default `bh2`): + Solver type for UniPC. It is recommended to use `bh1` for unconditional sampling when steps < 10, and `bh2` + otherwise. + lower_order_final (`bool`, default `True`): + Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can + stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10. + disable_corrector (`list`, default `[]`): + Decides which step to disable the corrector to mitigate the misalignment between `epsilon_theta(x_t, c)` + and `epsilon_theta(x_t^c, c)` which can influence convergence for a large guidance scale. Corrector is + usually disabled during the first few steps. + solver_p (`SchedulerMixin`, default `None`): + Any other scheduler that if specified, the algorithm becomes `solver_p + UniC`. + use_karras_sigmas (`bool`, *optional*, defaults to `False`): + Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`, + the sigmas are determined according to a sequence of noise levels {σi}. 
+ use_exponential_sigmas (`bool`, *optional*, defaults to `False`): + Whether to use exponential sigmas for step sizes in the noise schedule during the sampling process. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. + final_sigmas_type (`str`, defaults to `"zero"`): + The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final + sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. + """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + solver_order: int = 2, + prediction_type: str = "flow_prediction", + shift: Optional[float] = 1.0, + use_dynamic_shifting=False, + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + predict_x0: bool = True, + solver_type: str = "bh2", + lower_order_final: bool = True, + disable_corrector: List[int] = [], + solver_p: SchedulerMixin = None, + timestep_spacing: str = "linspace", + steps_offset: int = 0, + final_sigmas_type: Optional[str] = "zero", # "zero", "sigma_min" + ): + + if solver_type not in ["bh1", "bh2"]: + if solver_type in ["midpoint", "heun", "logrho"]: + self.register_to_config(solver_type="bh2") + else: + raise NotImplementedError( + f"{solver_type} is not implemented for {self.__class__}") + + self.predict_x0 = predict_x0 + # setable values + self.num_inference_steps = None + alphas = np.linspace(1, 1 / num_train_timesteps, + num_train_timesteps)[::-1].copy() + sigmas = 1.0 - alphas + sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32) + + if not 
use_dynamic_shifting: + # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution + sigmas = shift * sigmas / (1 + + (shift - 1) * sigmas) # pyright: ignore + + self.sigmas = sigmas + self.timesteps = sigmas * num_train_timesteps + + self.model_outputs = [None] * solver_order + self.timestep_list = [None] * solver_order + self.lower_order_nums = 0 + self.disable_corrector = disable_corrector + self.solver_p = solver_p + self.last_sample = None + self._step_index = None + self._begin_index = None + + self.sigmas = self.sigmas.to( + "cpu") # to avoid too much CPU/GPU communication + self.sigma_min = self.sigmas[-1].item() + self.sigma_max = self.sigmas[0].item() + + @property + def step_index(self): + """ + The index counter for current timestep. It will increase 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + # Modified from diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler.set_timesteps + def set_timesteps( + self, + num_inference_steps: Union[int, None] = None, + device: Union[str, torch.device] = None, + sigmas: Optional[List[float]] = None, + mu: Optional[Union[float, None]] = None, + shift: Optional[Union[float, None]] = None, + ): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). 
+ Args: + num_inference_steps (`int`): + Total number of the spacing of the time steps. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + + if self.config.use_dynamic_shifting and mu is None: + raise ValueError( + " you have to pass a value for `mu` when `use_dynamic_shifting` is set to be `True`" + ) + + if sigmas is None: + sigmas = np.linspace(self.sigma_max, self.sigma_min, + num_inference_steps + + 1).copy()[:-1] # pyright: ignore + + if self.config.use_dynamic_shifting: + sigmas = self.time_shift(mu, 1.0, sigmas) # pyright: ignore + else: + if shift is None: + shift = self.config.shift + sigmas = shift * sigmas / (1 + + (shift - 1) * sigmas) # pyright: ignore + + if self.config.final_sigmas_type == "sigma_min": + sigma_last = ((1 - self.alphas_cumprod[0]) / + self.alphas_cumprod[0])**0.5 + elif self.config.final_sigmas_type == "zero": + sigma_last = 0 + else: + raise ValueError( + f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}" + ) + + timesteps = sigmas * self.config.num_train_timesteps + sigmas = np.concatenate([sigmas, [sigma_last] + ]).astype(np.float32) # pyright: ignore + + self.sigmas = torch.from_numpy(sigmas) + self.timesteps = torch.from_numpy(timesteps).to( + device=device, dtype=torch.int64) + + self.num_inference_steps = len(timesteps) + + self.model_outputs = [ + None, + ] * self.config.solver_order + self.lower_order_nums = 0 + self.last_sample = None + if self.solver_p: + self.solver_p.set_timesteps(self.num_inference_steps, device=device) + + # add an index counter for schedulers that allow duplicated timesteps + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to( + "cpu") # to avoid too much CPU/GPU communication + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample + def _threshold_sample(self, sample: torch.Tensor) -> 
torch.Tensor:
        """
        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
        photorealism as well as better image-text alignment, especially when using very large guidance weights."

        https://arxiv.org/abs/2205.11487
        """
        dtype = sample.dtype
        batch_size, channels, *remaining_dims = sample.shape

        if dtype not in (torch.float32, torch.float64):
            sample = sample.float(
            )  # upcast for quantile calculation, and clamp not implemented for cpu half

        # Flatten sample for doing quantile calculation along each image
        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))

        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"

        s = torch.quantile(
            abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
        s = torch.clamp(
            s, min=1, max=self.config.sample_max_value
        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
        s = s.unsqueeze(
            1)  # (batch_size, 1) because clamp will broadcast along dim=0
        sample = torch.clamp(
            sample, -s, s
        ) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"

        sample = sample.reshape(batch_size, channels, *remaining_dims)
        sample = sample.to(dtype)

        return sample

    # Copied from diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler._sigma_to_t
    def _sigma_to_t(self, sigma):
        # Flow matching: the timestep is simply sigma scaled by the training horizon.
        return sigma * self.config.num_train_timesteps

    def _sigma_to_alpha_sigma_t(self, sigma):
        # Rectified-flow parameterization: alpha_t = 1 - sigma, sigma_t = sigma.
        return 1 - sigma, sigma

    # Copied from diffusers.schedulers.scheduling_flow_match_euler_discrete.set_timesteps
    def time_shift(self, mu: float, sigma: float, t: torch.Tensor):
        # Resolution-dependent shifting of the sigma schedule controlled by `mu`.
        return math.exp(mu) / (math.exp(mu) + (1 / t - 1)**sigma)

    def convert_model_output(
        self,
        model_output: torch.Tensor,
        *args,
        sample: torch.Tensor = None,
        **kwargs,
    ) -> torch.Tensor:
        r"""
        Convert the model output to the corresponding type the UniPC algorithm needs.

        Args:
            model_output (`torch.Tensor`):
                The direct output from the learned diffusion model.
            timestep (`int`):
                The current discrete timestep in the diffusion chain.
            sample (`torch.Tensor`):
                A current instance of a sample created by the diffusion process.

        Returns:
            `torch.Tensor`:
                The converted model output.
        """
        timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
        if sample is None:
            if len(args) > 1:
                sample = args[1]
            else:
                raise ValueError(
                    "missing `sample` as a required keyward argument")
        if timestep is not None:
            deprecate(
                "timesteps",
                "1.0.0",
                "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
            )

        sigma = self.sigmas[self.step_index]
        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)

        if self.predict_x0:
            # Data-prediction mode: recover x0 from the flow/velocity output.
            if self.config.prediction_type == "flow_prediction":
                sigma_t = self.sigmas[self.step_index]
                x0_pred = sample - sigma_t * model_output
            else:
                raise ValueError(
                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`,"
                    " `v_prediction` or `flow_prediction` for the UniPCMultistepScheduler."
                )

            if self.config.thresholding:
                x0_pred = self._threshold_sample(x0_pred)

            return x0_pred
        else:
            # Noise-prediction mode: recover epsilon instead of x0.
            if self.config.prediction_type == "flow_prediction":
                sigma_t = self.sigmas[self.step_index]
                epsilon = sample - (1 - sigma_t) * model_output
            else:
                raise ValueError(
                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`,"
                    " `v_prediction` or `flow_prediction` for the UniPCMultistepScheduler."
                )

            if self.config.thresholding:
                sigma_t = self.sigmas[self.step_index]
                x0_pred = sample - sigma_t * model_output
                x0_pred = self._threshold_sample(x0_pred)
                epsilon = model_output + x0_pred

            return epsilon

    def multistep_uni_p_bh_update(
        self,
        model_output: torch.Tensor,
        *args,
        sample: torch.Tensor = None,
        order: int = None,  # pyright: ignore
        **kwargs,
    ) -> torch.Tensor:
        """
        One step for the UniP (B(h) version). Alternatively, `self.solver_p` is used if is specified.

        Args:
            model_output (`torch.Tensor`):
                The direct output from the learned diffusion model at the current timestep.
            prev_timestep (`int`):
                The previous discrete timestep in the diffusion chain.
            sample (`torch.Tensor`):
                A current instance of a sample created by the diffusion process.
            order (`int`):
                The order of UniP at this timestep (corresponds to the *p* in UniPC-p).

        Returns:
            `torch.Tensor`:
                The sample tensor at the previous timestep.
        """
        prev_timestep = args[0] if len(args) > 0 else kwargs.pop(
            "prev_timestep", None)
        if sample is None:
            if len(args) > 1:
                sample = args[1]
            else:
                raise ValueError(
                    " missing `sample` as a required keyward argument")
        if order is None:
            if len(args) > 2:
                order = args[2]
            else:
                raise ValueError(
                    " missing `order` as a required keyward argument")
        if prev_timestep is not None:
            deprecate(
                "prev_timestep",
                "1.0.0",
                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
            )
        model_output_list = self.model_outputs

        s0 = self.timestep_list[-1]
        m0 = model_output_list[-1]
        x = sample

        if self.solver_p:
            # Delegate the predictor step to the auxiliary solver when one is configured.
            x_t = self.solver_p.step(model_output, s0, x).prev_sample
            return x_t

        sigma_t, sigma_s0 = self.sigmas[self.step_index + 1], self.sigmas[
            self.step_index]  # pyright: ignore
        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
        alpha_s0, sigma_s0 = 
self._sigma_to_alpha_sigma_t(sigma_s0)

        # log-SNR style quantities for the current and previous sigma.
        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)

        h = lambda_t - lambda_s0
        device = sample.device

        # Build the normalized step ratios r_k and first differences D1 from history.
        rks = []
        D1s = []
        for i in range(1, order):
            si = self.step_index - i  # pyright: ignore
            mi = model_output_list[-(i + 1)]
            alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
            lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
            rk = (lambda_si - lambda_s0) / h
            rks.append(rk)
            D1s.append((mi - m0) / rk)  # pyright: ignore

        rks.append(1.0)
        rks = torch.tensor(rks, device=device)

        R = []
        b = []

        hh = -h if self.predict_x0 else h
        h_phi_1 = torch.expm1(hh)  # h\phi_1(h) = e^h - 1
        h_phi_k = h_phi_1 / hh - 1

        factorial_i = 1

        if self.config.solver_type == "bh1":
            B_h = hh
        elif self.config.solver_type == "bh2":
            B_h = torch.expm1(hh)
        else:
            raise NotImplementedError()

        # Assemble the Vandermonde-like system R * rhos = b of the UniPC update.
        for i in range(1, order + 1):
            R.append(torch.pow(rks, i - 1))
            b.append(h_phi_k * factorial_i / B_h)
            factorial_i *= i + 1
            h_phi_k = h_phi_k / hh - 1 / factorial_i

        R = torch.stack(R)
        b = torch.tensor(b, device=device)

        if len(D1s) > 0:
            D1s = torch.stack(D1s, dim=1)  # (B, K)
            # for order 2, we use a simplified version
            if order == 2:
                rhos_p = torch.tensor([0.5], dtype=x.dtype, device=device)
            else:
                rhos_p = torch.linalg.solve(R[:-1, :-1],
                                            b[:-1]).to(device).to(x.dtype)
        else:
            D1s = None

        if self.predict_x0:
            x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
            if D1s is not None:
                pred_res = torch.einsum("k,bkc...->bc...", rhos_p,
                                        D1s)  # pyright: ignore
            else:
                pred_res = 0
            x_t = x_t_ - alpha_t * B_h * pred_res
        else:
            x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
            if D1s is not None:
                pred_res = torch.einsum("k,bkc...->bc...", rhos_p,
                                        D1s)  # pyright: ignore
            else:
                pred_res = 0
            x_t = x_t_ - sigma_t * B_h * pred_res

        x_t = x_t.to(x.dtype)
        return x_t

    def multistep_uni_c_bh_update(
        self,
        this_model_output: torch.Tensor,
        *args,
        last_sample: torch.Tensor = None,
        this_sample: torch.Tensor = None,
        order: int = None,  # pyright: ignore
        **kwargs,
    ) -> torch.Tensor:
        """
        One step for the UniC (B(h) version).

        Args:
            this_model_output (`torch.Tensor`):
                The model outputs at `x_t`.
            this_timestep (`int`):
                The current timestep `t`.
            last_sample (`torch.Tensor`):
                The generated sample before the last predictor `x_{t-1}`.
            this_sample (`torch.Tensor`):
                The generated sample after the last predictor `x_{t}`.
            order (`int`):
                The `p` of UniC-p at this step. The effective order of accuracy should be `order + 1`.

        Returns:
            `torch.Tensor`:
                The corrected sample tensor at the current timestep.
        """
        this_timestep = args[0] if len(args) > 0 else kwargs.pop(
            "this_timestep", None)
        if last_sample is None:
            if len(args) > 1:
                last_sample = args[1]
            else:
                raise ValueError(
                    " missing`last_sample` as a required keyward argument")
        if this_sample is None:
            if len(args) > 2:
                this_sample = args[2]
            else:
                raise ValueError(
                    " missing`this_sample` as a required keyward argument")
        if order is None:
            if len(args) > 3:
                order = args[3]
            else:
                raise ValueError(
                    " missing`order` as a required keyward argument")
        if this_timestep is not None:
            deprecate(
                "this_timestep",
                "1.0.0",
                "Passing `this_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
            )

        model_output_list = self.model_outputs

        m0 = model_output_list[-1]
        x = last_sample
        x_t = this_sample
        model_t = this_model_output

        # Corrector looks backwards: current and previous sigma (note -1 vs +1 in UniP).
        sigma_t, sigma_s0 = self.sigmas[self.step_index], self.sigmas[
            self.step_index - 1]  # pyright: ignore
        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)

        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)

        h = lambda_t - lambda_s0
        device = this_sample.device

        rks = []
        D1s = []
        for i in range(1, order):
            si = self.step_index - (i + 1)  # pyright: ignore
            mi = model_output_list[-(i + 1)]
            alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
            lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
            rk = (lambda_si - lambda_s0) / h
            rks.append(rk)
            D1s.append((mi - m0) / rk)  # pyright: ignore

        rks.append(1.0)
        rks = torch.tensor(rks, device=device)

        R = []
        b = []

        hh = -h if self.predict_x0 else h
        h_phi_1 = torch.expm1(hh)  # h\phi_1(h) = e^h - 1
        h_phi_k = h_phi_1 / hh - 1

        factorial_i = 1

        if self.config.solver_type == "bh1":
            B_h = hh
        elif self.config.solver_type == "bh2":
            B_h = torch.expm1(hh)
        else:
            raise NotImplementedError()

        for i in range(1, order + 1):
            R.append(torch.pow(rks, i - 1))
            b.append(h_phi_k * factorial_i / B_h)
            factorial_i *= i + 1
            h_phi_k = h_phi_k / hh - 1 / factorial_i

        R = torch.stack(R)
        b = torch.tensor(b, device=device)

        if len(D1s) > 0:
            D1s = torch.stack(D1s, dim=1)
        else:
            D1s = None

        # for order 1, we use a simplified version
        if order == 1:
            rhos_c = torch.tensor([0.5], dtype=x.dtype, device=device)
        else:
            rhos_c = torch.linalg.solve(R, b).to(device).to(x.dtype)

        if self.predict_x0:
            x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
            if D1s is not None:
                corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s)
            else:
                corr_res = 0
            D1_t = model_t - m0
            x_t = x_t_ - alpha_t * B_h * (corr_res + rhos_c[-1] * D1_t)
        else:
            x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
            if D1s is not None:
                corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s)
            else:
                corr_res = 0
            D1_t = model_t - m0
            x_t = x_t_ - sigma_t * B_h * (corr_res + rhos_c[-1] * D1_t)
        x_t = x_t.to(x.dtype)
        return x_t

    def index_for_timestep(self, timestep, schedule_timesteps=None):
        if schedule_timesteps is None:
            
schedule_timesteps = self.timesteps

        indices = (schedule_timesteps == timestep).nonzero()

        # The sigma index that is taken for the **very** first `step`
        # is always the second index (or the last index if there is only 1)
        # This way we can ensure we don't accidentally skip a sigma in
        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
        pos = 1 if len(indices) > 1 else 0

        return indices[pos].item()

    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index
    def _init_step_index(self, timestep):
        """
        Initialize the step_index counter for the scheduler.
        """

        if self.begin_index is None:
            if isinstance(timestep, torch.Tensor):
                timestep = timestep.to(self.timesteps.device)
            self._step_index = self.index_for_timestep(timestep)
        else:
            self._step_index = self._begin_index

    def step(self,
             model_output: torch.Tensor,
             timestep: Union[int, torch.Tensor],
             sample: torch.Tensor,
             return_dict: bool = True,
             generator=None) -> Union[SchedulerOutput, Tuple]:
        """
        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
        the multistep UniPC.

        Args:
            model_output (`torch.Tensor`):
                The direct output from learned diffusion model.
            timestep (`int`):
                The current discrete timestep in the diffusion chain.
            sample (`torch.Tensor`):
                A current instance of a sample created by the diffusion process.
            return_dict (`bool`):
                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.

        Returns:
            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
                tuple is returned where the first element is the sample tensor.

        """
        if self.num_inference_steps is None:
            raise ValueError(
                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
            )

        if self.step_index is None:
            self._init_step_index(timestep)

        # The corrector refines the previous predictor output; it needs at least one
        # completed step, a step not explicitly disabled, and a cached last sample.
        use_corrector = (
            self.step_index > 0 and
            self.step_index - 1 not in self.disable_corrector and
            self.last_sample is not None  # pyright: ignore
        )

        model_output_convert = self.convert_model_output(
            model_output, sample=sample)
        if use_corrector:
            sample = self.multistep_uni_c_bh_update(
                this_model_output=model_output_convert,
                last_sample=self.last_sample,
                this_sample=sample,
                order=self.this_order,
            )

        # Shift the multistep history window by one.
        for i in range(self.config.solver_order - 1):
            self.model_outputs[i] = self.model_outputs[i + 1]
            self.timestep_list[i] = self.timestep_list[i + 1]

        self.model_outputs[-1] = model_output_convert
        self.timestep_list[-1] = timestep  # pyright: ignore

        if self.config.lower_order_final:
            # Clamp the order near the end of the schedule for stability.
            this_order = min(self.config.solver_order,
                             len(self.timesteps) -
                             self.step_index)  # pyright: ignore
        else:
            this_order = self.config.solver_order

        self.this_order = min(this_order,
                              self.lower_order_nums + 1)  # warmup for multistep
        assert self.this_order > 0

        self.last_sample = sample
        prev_sample = self.multistep_uni_p_bh_update(
            model_output=model_output,  # pass the original non-converted model output, in case solver-p is used
            sample=sample,
            order=self.this_order,
        )

        if self.lower_order_nums < self.config.solver_order:
            self.lower_order_nums += 1

        # upon completion increase step index by one
        self._step_index += 1  # pyright: ignore

        if not return_dict:
            return (prev_sample,)

        return SchedulerOutput(prev_sample=prev_sample)

    def scale_model_input(self, sample: torch.Tensor, *args,
                          **kwargs) -> torch.Tensor:
        """
        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
        current timestep.

        Args:
            sample (`torch.Tensor`):
                The input sample.

        Returns:
            `torch.Tensor`:
                A scaled input sample.
        """
        return sample

    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise
    def add_noise(
        self,
        original_samples: torch.Tensor,
        noise: torch.Tensor,
        timesteps: torch.IntTensor,
    ) -> torch.Tensor:
        # Make sure sigmas and timesteps have the same device and dtype as original_samples
        sigmas = self.sigmas.to(
            device=original_samples.device, dtype=original_samples.dtype)
        if original_samples.device.type == "mps" and torch.is_floating_point(
                timesteps):
            # mps does not support float64
            schedule_timesteps = self.timesteps.to(
                original_samples.device, dtype=torch.float32)
            timesteps = timesteps.to(
                original_samples.device, dtype=torch.float32)
        else:
            schedule_timesteps = self.timesteps.to(original_samples.device)
            timesteps = timesteps.to(original_samples.device)

        # begin_index is None when the scheduler is used for training or pipeline does not implement set_begin_index
        if self.begin_index is None:
            step_indices = [
                self.index_for_timestep(t, schedule_timesteps)
                for t in timesteps
            ]
        elif self.step_index is not None:
            # add_noise is called after first denoising step (for inpainting)
            step_indices = [self.step_index] * timesteps.shape[0]
        else:
            # add noise is called before first denoising step to create initial latent(img2img)
            step_indices = [self.begin_index] * timesteps.shape[0]

        sigma = sigmas[step_indices].flatten()
        while len(sigma.shape) < len(original_samples.shape):
            sigma = sigma.unsqueeze(-1)

        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
        noisy_samples = alpha_t * original_samples + sigma_t * noise
        return noisy_samples

    def __len__(self):
        return self.config.num_train_timesteps
diff --git a/src/megatron/bridge/models/wan/inference/utils/utils.py b/src/megatron/bridge/models/wan/inference/utils/utils.py
new file mode 100644
index 0000000000..a57f9bb993 --- /dev/null +++ b/src/megatron/bridge/models/wan/inference/utils/utils.py @@ -0,0 +1,117 @@ +import argparse +import binascii +import os +import os.path as osp + +import imageio +import torch +import torchvision + +__all__ = ['cache_video', 'cache_image', 'str2bool'] + + +def rand_name(length=8, suffix=''): + name = binascii.b2a_hex(os.urandom(length)).decode('utf-8') + if suffix: + if not suffix.startswith('.'): + suffix = '.' + suffix + name += suffix + return name + + +def cache_video(tensor, + save_file=None, + fps=30, + suffix='.mp4', + nrow=8, + normalize=True, + value_range=(-1, 1), + retry=5): + # cache file + cache_file = osp.join('/tmp', rand_name( + suffix=suffix)) if save_file is None else save_file + + # save to cache + error = None + for _ in range(retry): + try: + # preprocess + tensor = tensor.clamp(min(value_range), max(value_range)) + tensor = torch.stack([ + torchvision.utils.make_grid( + u, nrow=nrow, normalize=normalize, value_range=value_range) + for u in tensor.unbind(2) + ], + dim=1).permute(1, 2, 3, 0) + tensor = (tensor * 255).type(torch.uint8).cpu() + + # write video + writer = imageio.get_writer( + cache_file, fps=fps, codec='libx264', quality=8) + for frame in tensor.numpy(): + writer.append_data(frame) + writer.close() + return cache_file + except Exception as e: + error = e + continue + else: + print(f'cache_video failed, error: {error}', flush=True) + return None + + +def cache_image(tensor, + save_file, + nrow=8, + normalize=True, + value_range=(-1, 1), + retry=5): + # cache file + suffix = osp.splitext(save_file)[1] + if suffix.lower() not in [ + '.jpg', '.jpeg', '.png', '.tiff', '.gif', '.webp' + ]: + suffix = '.png' + + # save to cache + error = None + for _ in range(retry): + try: + tensor = tensor.clamp(min(value_range), max(value_range)) + torchvision.utils.save_image( + tensor, + save_file, + nrow=nrow, + normalize=normalize, + value_range=value_range) + return save_file + except Exception as e: 
+ error = e + continue + + +def str2bool(v): + """ + Convert a string to a boolean. + + Supported true values: 'yes', 'true', 't', 'y', '1' + Supported false values: 'no', 'false', 'f', 'n', '0' + + Args: + v (str): String to convert. + + Returns: + bool: Converted boolean value. + + Raises: + argparse.ArgumentTypeError: If the value cannot be converted to boolean. + """ + if isinstance(v, bool): + return v + v_lower = v.lower() + if v_lower in ('yes', 'true', 't', 'y', '1'): + return True + elif v_lower in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected (True/False)') diff --git a/src/megatron/bridge/models/wan/modules/__init__.py b/src/megatron/bridge/models/wan/modules/__init__.py new file mode 100644 index 0000000000..435f1eef0d --- /dev/null +++ b/src/megatron/bridge/models/wan/modules/__init__.py @@ -0,0 +1,13 @@ +from .t5 import T5Decoder, T5Encoder, T5EncoderModel, T5Model +from .tokenizers import HuggingfaceTokenizer +from .vae import WanVAE + + +__all__ = [ + 'WanVAE', + 'T5Model', + 'T5Encoder', + 'T5Decoder', + 'T5EncoderModel', + 'HuggingfaceTokenizer', +] diff --git a/src/megatron/bridge/models/wan/modules/t5.py b/src/megatron/bridge/models/wan/modules/t5.py new file mode 100644 index 0000000000..fecd989e07 --- /dev/null +++ b/src/megatron/bridge/models/wan/modules/t5.py @@ -0,0 +1,512 @@ +# Modified from transformers.models.t5.modeling_t5 +import logging +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .tokenizers import HuggingfaceTokenizer + +__all__ = [ + 'T5Model', + 'T5Encoder', + 'T5Decoder', + 'T5EncoderModel', +] + + +def fp16_clamp(x): + if x.dtype == torch.float16 and torch.isinf(x).any(): + clamp = torch.finfo(x.dtype).max - 1000 + x = torch.clamp(x, min=-clamp, max=clamp) + return x + + +def init_weights(m): + if isinstance(m, T5LayerNorm): + nn.init.ones_(m.weight) + elif isinstance(m, T5Model): + 
nn.init.normal_(m.token_embedding.weight, std=1.0)
    elif isinstance(m, T5FeedForward):
        nn.init.normal_(m.gate[0].weight, std=m.dim**-0.5)
        nn.init.normal_(m.fc1.weight, std=m.dim**-0.5)
        nn.init.normal_(m.fc2.weight, std=m.dim_ffn**-0.5)
    elif isinstance(m, T5Attention):
        nn.init.normal_(m.q.weight, std=(m.dim * m.dim_attn)**-0.5)
        nn.init.normal_(m.k.weight, std=m.dim**-0.5)
        nn.init.normal_(m.v.weight, std=m.dim**-0.5)
        nn.init.normal_(m.o.weight, std=(m.num_heads * m.dim_attn)**-0.5)
    elif isinstance(m, T5RelativeEmbedding):
        nn.init.normal_(
            m.embedding.weight, std=(2 * m.num_buckets * m.num_heads)**-0.5)


class GELU(nn.Module):
    """Tanh approximation of the Gaussian Error Linear Unit."""

    def forward(self, x):
        return 0.5 * x * (1.0 + torch.tanh(
            math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))


class T5LayerNorm(nn.Module):
    """RMS-style layer norm: scale-only, no mean subtraction and no bias."""

    def __init__(self, dim, eps=1e-6):
        super(T5LayerNorm, self).__init__()
        self.dim = dim
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        # Normalize in float32 for stability, then cast back to the weight dtype.
        x = x * torch.rsqrt(x.float().pow(2).mean(dim=-1, keepdim=True) +
                            self.eps)
        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            x = x.type_as(self.weight)
        return self.weight * x


class T5Attention(nn.Module):
    """Multi-head attention with additive position bias; self- or cross-attention
    depending on whether `context` is passed to `forward`."""

    def __init__(self, dim, dim_attn, num_heads, dropout=0.1):
        assert dim_attn % num_heads == 0
        super(T5Attention, self).__init__()
        self.dim = dim
        self.dim_attn = dim_attn
        self.num_heads = num_heads
        self.head_dim = dim_attn // num_heads

        # layers
        self.q = nn.Linear(dim, dim_attn, bias=False)
        self.k = nn.Linear(dim, dim_attn, bias=False)
        self.v = nn.Linear(dim, dim_attn, bias=False)
        self.o = nn.Linear(dim_attn, dim, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, context=None, mask=None, pos_bias=None):
        """
        x: [B, L1, C].
        context: [B, L2, C] or None.
        mask: [B, L2] or [B, L1, L2] or None.
        """
        # check inputs
        context = x if context is None else context
        b, n, c = x.size(0), self.num_heads, self.head_dim

        # compute query, key, value
        q = self.q(x).view(b, -1, n, c)
        k = self.k(context).view(b, -1, n, c)
        v = self.v(context).view(b, -1, n, c)

        # attention bias: relative-position bias plus a large negative fill
        # wherever the mask is 0 (so those keys get ~zero probability).
        attn_bias = x.new_zeros(b, n, q.size(1), k.size(1))
        if pos_bias is not None:
            attn_bias += pos_bias
        if mask is not None:
            assert mask.ndim in [2, 3]
            mask = mask.view(b, 1, 1,
                             -1) if mask.ndim == 2 else mask.unsqueeze(1)
            attn_bias.masked_fill_(mask == 0, torch.finfo(x.dtype).min)

        # compute attention (T5 does not use scaling)
        attn = torch.einsum('binc,bjnc->bnij', q, k) + attn_bias
        attn = F.softmax(attn.float(), dim=-1).type_as(attn)
        x = torch.einsum('bnij,bjnc->binc', attn, v)

        # output
        x = x.reshape(b, -1, n * c)
        x = self.o(x)
        x = self.dropout(x)
        return x


class T5FeedForward(nn.Module):
    """Gated feed-forward block: fc2(dropout(fc1(x) * gelu_gate(x)))."""

    def __init__(self, dim, dim_ffn, dropout=0.1):
        super(T5FeedForward, self).__init__()
        self.dim = dim
        self.dim_ffn = dim_ffn

        # layers
        self.gate = nn.Sequential(nn.Linear(dim, dim_ffn, bias=False), GELU())
        self.fc1 = nn.Linear(dim, dim_ffn, bias=False)
        self.fc2 = nn.Linear(dim_ffn, dim, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fc1(x) * self.gate(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.dropout(x)
        return x


class T5SelfAttention(nn.Module):
    """Encoder block: pre-norm self-attention + feed-forward, residual around each."""

    def __init__(self,
                 dim,
                 dim_attn,
                 dim_ffn,
                 num_heads,
                 num_buckets,
                 shared_pos=True,
                 dropout=0.1):
        super(T5SelfAttention, self).__init__()
        self.dim = dim
        self.dim_attn = dim_attn
        self.dim_ffn = dim_ffn
        self.num_heads = num_heads
        self.num_buckets = num_buckets
        self.shared_pos = shared_pos

        # layers
        self.norm1 = T5LayerNorm(dim)
        self.attn = T5Attention(dim, dim_attn, num_heads, dropout)
        self.norm2 = T5LayerNorm(dim)
        self.ffn = T5FeedForward(dim, dim_ffn, dropout)
        self.pos_embedding = None if shared_pos 
else T5RelativeEmbedding(
            num_buckets, num_heads, bidirectional=True)

    def forward(self, x, mask=None, pos_bias=None):
        # Use the externally shared position bias, or compute a per-block one.
        e = pos_bias if self.shared_pos else self.pos_embedding(
            x.size(1), x.size(1))
        x = fp16_clamp(x + self.attn(self.norm1(x), mask=mask, pos_bias=e))
        x = fp16_clamp(x + self.ffn(self.norm2(x)))
        return x


class T5CrossAttention(nn.Module):
    """Decoder block: self-attention, cross-attention over encoder states, FFN."""

    def __init__(self,
                 dim,
                 dim_attn,
                 dim_ffn,
                 num_heads,
                 num_buckets,
                 shared_pos=True,
                 dropout=0.1):
        super(T5CrossAttention, self).__init__()
        self.dim = dim
        self.dim_attn = dim_attn
        self.dim_ffn = dim_ffn
        self.num_heads = num_heads
        self.num_buckets = num_buckets
        self.shared_pos = shared_pos

        # layers
        self.norm1 = T5LayerNorm(dim)
        self.self_attn = T5Attention(dim, dim_attn, num_heads, dropout)
        self.norm2 = T5LayerNorm(dim)
        self.cross_attn = T5Attention(dim, dim_attn, num_heads, dropout)
        self.norm3 = T5LayerNorm(dim)
        self.ffn = T5FeedForward(dim, dim_ffn, dropout)
        self.pos_embedding = None if shared_pos else T5RelativeEmbedding(
            num_buckets, num_heads, bidirectional=False)

    def forward(self,
                x,
                mask=None,
                encoder_states=None,
                encoder_mask=None,
                pos_bias=None):
        e = pos_bias if self.shared_pos else self.pos_embedding(
            x.size(1), x.size(1))
        x = fp16_clamp(x + self.self_attn(self.norm1(x), mask=mask, pos_bias=e))
        # NOTE: no position bias on cross-attention, matching T5.
        x = fp16_clamp(x + self.cross_attn(
            self.norm2(x), context=encoder_states, mask=encoder_mask))
        x = fp16_clamp(x + self.ffn(self.norm3(x)))
        return x


class T5RelativeEmbedding(nn.Module):
    """Bucketed relative-position bias table producing a [1, N, Lq, Lk] tensor."""

    def __init__(self, num_buckets, num_heads, bidirectional, max_dist=128):
        super(T5RelativeEmbedding, self).__init__()
        self.num_buckets = num_buckets
        self.num_heads = num_heads
        self.bidirectional = bidirectional
        self.max_dist = max_dist

        # layers
        self.embedding = nn.Embedding(num_buckets, num_heads)

    def forward(self, lq, lk):
        device = self.embedding.weight.device
        # rel_pos = torch.arange(lk).unsqueeze(0).to(device) - \
        #     torch.arange(lq).unsqueeze(1).to(device)
        rel_pos = torch.arange(lk, device=device).unsqueeze(0) - \
            torch.arange(lq, device=device).unsqueeze(1)
        rel_pos = self._relative_position_bucket(rel_pos)
        rel_pos_embeds = self.embedding(rel_pos)
        rel_pos_embeds = rel_pos_embeds.permute(2, 0, 1).unsqueeze(
            0)  # [1, N, Lq, Lk]
        return rel_pos_embeds.contiguous()

    def _relative_position_bucket(self, rel_pos):
        # preprocess: fold direction into the bucket index when bidirectional,
        # otherwise clamp future positions to zero distance.
        if self.bidirectional:
            num_buckets = self.num_buckets // 2
            rel_buckets = (rel_pos > 0).long() * num_buckets
            rel_pos = torch.abs(rel_pos)
        else:
            num_buckets = self.num_buckets
            rel_buckets = 0
            rel_pos = -torch.min(rel_pos, torch.zeros_like(rel_pos))

        # embeddings for small and large positions: exact buckets up to
        # max_exact, then logarithmically spaced buckets up to max_dist.
        max_exact = num_buckets // 2
        rel_pos_large = max_exact + (torch.log(rel_pos.float() / max_exact) /
                                     math.log(self.max_dist / max_exact) *
                                     (num_buckets - max_exact)).long()
        rel_pos_large = torch.min(
            rel_pos_large, torch.full_like(rel_pos_large, num_buckets - 1))
        rel_buckets += torch.where(rel_pos < max_exact, rel_pos, rel_pos_large)
        return rel_buckets


class T5Encoder(nn.Module):
    """Stack of T5SelfAttention blocks over (optionally shared) token embeddings."""

    def __init__(self,
                 vocab,
                 dim,
                 dim_attn,
                 dim_ffn,
                 num_heads,
                 num_layers,
                 num_buckets,
                 shared_pos=True,
                 dropout=0.1):
        super(T5Encoder, self).__init__()
        self.dim = dim
        self.dim_attn = dim_attn
        self.dim_ffn = dim_ffn
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.num_buckets = num_buckets
        self.shared_pos = shared_pos

        # layers: `vocab` may be a pre-built (shared) nn.Embedding or a vocab size.
        self.token_embedding = vocab if isinstance(vocab, nn.Embedding) \
            else nn.Embedding(vocab, dim)
        self.pos_embedding = T5RelativeEmbedding(
            num_buckets, num_heads, bidirectional=True) if shared_pos else None
        self.dropout = nn.Dropout(dropout)
        self.blocks = nn.ModuleList([
            T5SelfAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets,
                            shared_pos, dropout) for _ in range(num_layers)
        ])
        self.norm = 
T5LayerNorm(dim) + + # initialize weights + self.apply(init_weights) + + def forward(self, ids, mask=None): + x = self.token_embedding(ids) + x = self.dropout(x) + e = self.pos_embedding(x.size(1), + x.size(1)) if self.shared_pos else None + for block in self.blocks: + x = block(x, mask, pos_bias=e) + x = self.norm(x) + x = self.dropout(x) + return x + + +class T5Decoder(nn.Module): + + def __init__(self, + vocab, + dim, + dim_attn, + dim_ffn, + num_heads, + num_layers, + num_buckets, + shared_pos=True, + dropout=0.1): + super(T5Decoder, self).__init__() + self.dim = dim + self.dim_attn = dim_attn + self.dim_ffn = dim_ffn + self.num_heads = num_heads + self.num_layers = num_layers + self.num_buckets = num_buckets + self.shared_pos = shared_pos + + # layers + self.token_embedding = vocab if isinstance(vocab, nn.Embedding) \ + else nn.Embedding(vocab, dim) + self.pos_embedding = T5RelativeEmbedding( + num_buckets, num_heads, bidirectional=False) if shared_pos else None + self.dropout = nn.Dropout(dropout) + self.blocks = nn.ModuleList([ + T5CrossAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets, + shared_pos, dropout) for _ in range(num_layers) + ]) + self.norm = T5LayerNorm(dim) + + # initialize weights + self.apply(init_weights) + + def forward(self, ids, mask=None, encoder_states=None, encoder_mask=None): + b, s = ids.size() + + # causal mask + if mask is None: + mask = torch.tril(torch.ones(1, s, s).to(ids.device)) + elif mask.ndim == 2: + mask = torch.tril(mask.unsqueeze(1).expand(-1, s, -1)) + + # layers + x = self.token_embedding(ids) + x = self.dropout(x) + e = self.pos_embedding(x.size(1), + x.size(1)) if self.shared_pos else None + for block in self.blocks: + x = block(x, mask, encoder_states, encoder_mask, pos_bias=e) + x = self.norm(x) + x = self.dropout(x) + return x + + +class T5Model(nn.Module): + + def __init__(self, + vocab_size, + dim, + dim_attn, + dim_ffn, + num_heads, + encoder_layers, + decoder_layers, + num_buckets, + shared_pos=True, + 
dropout=0.1): + super(T5Model, self).__init__() + self.vocab_size = vocab_size + self.dim = dim + self.dim_attn = dim_attn + self.dim_ffn = dim_ffn + self.num_heads = num_heads + self.encoder_layers = encoder_layers + self.decoder_layers = decoder_layers + self.num_buckets = num_buckets + + # layers + self.token_embedding = nn.Embedding(vocab_size, dim) + self.encoder = T5Encoder(self.token_embedding, dim, dim_attn, dim_ffn, + num_heads, encoder_layers, num_buckets, + shared_pos, dropout) + self.decoder = T5Decoder(self.token_embedding, dim, dim_attn, dim_ffn, + num_heads, decoder_layers, num_buckets, + shared_pos, dropout) + self.head = nn.Linear(dim, vocab_size, bias=False) + + # initialize weights + self.apply(init_weights) + + def forward(self, encoder_ids, encoder_mask, decoder_ids, decoder_mask): + x = self.encoder(encoder_ids, encoder_mask) + x = self.decoder(decoder_ids, decoder_mask, x, encoder_mask) + x = self.head(x) + return x + + +def _t5(name, + encoder_only=False, + decoder_only=False, + return_tokenizer=False, + tokenizer_kwargs={}, + dtype=torch.float32, + device='cpu', + **kwargs): + # sanity check + assert not (encoder_only and decoder_only) + + # params + if encoder_only: + model_cls = T5Encoder + kwargs['vocab'] = kwargs.pop('vocab_size') + kwargs['num_layers'] = kwargs.pop('encoder_layers') + _ = kwargs.pop('decoder_layers') + elif decoder_only: + model_cls = T5Decoder + kwargs['vocab'] = kwargs.pop('vocab_size') + kwargs['num_layers'] = kwargs.pop('decoder_layers') + _ = kwargs.pop('encoder_layers') + else: + model_cls = T5Model + + # init model + with torch.device(device): + model = model_cls(**kwargs) + + # set device + model = model.to(dtype=dtype, device=device) + + # init tokenizer + if return_tokenizer: + from .tokenizers import HuggingfaceTokenizer + tokenizer = HuggingfaceTokenizer(f'google/{name}', **tokenizer_kwargs) + return model, tokenizer + else: + return model + + +def umt5_xxl(**kwargs): + cfg = dict( + vocab_size=256384, + 
class T5EncoderModel:
    """Frozen umT5-XXL text encoder plus its tokenizer.

    Loads encoder weights from ``checkpoint_path``, optionally shards the
    model via ``shard_fn``, and embeds batches of texts via ``__call__``,
    returning one variable-length context tensor per input string.
    """

    def __init__(
        self,
        text_len,
        dtype=torch.bfloat16,
        device=None,
        checkpoint_path=None,
        tokenizer_path=None,
        shard_fn=None,
    ):
        # BUGFIX: the original default ``device=torch.cuda.current_device()``
        # was evaluated once at class-definition (import) time, which both
        # initializes CUDA on import and crashes on CPU-only machines.
        # Resolve it lazily; explicit callers are unaffected.
        if device is None:
            device = torch.cuda.current_device()
        self.text_len = text_len
        self.dtype = dtype
        self.device = device
        self.checkpoint_path = checkpoint_path
        self.tokenizer_path = tokenizer_path

        # init model: encoder-only, eval mode, gradients disabled (frozen)
        model = umt5_xxl(
            encoder_only=True,
            return_tokenizer=False,
            dtype=dtype,
            device=device).eval().requires_grad_(False)
        logging.info(f'loading {checkpoint_path}')
        model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
        self.model = model
        if shard_fn is not None:
            self.model = shard_fn(self.model, sync_module_states=False)
        else:
            self.model.to(self.device)
        # init tokenizer
        self.tokenizer = HuggingfaceTokenizer(
            name=tokenizer_path, seq_len=text_len, clean='whitespace')

    def __call__(self, texts, device):
        """Tokenize ``texts`` and return a list of context tensors, each
        trimmed to that sample's true (non-padded) token length."""
        ids, mask = self.tokenizer(
            texts, return_mask=True, add_special_tokens=True)
        ids = ids.to(device)
        mask = mask.to(device)
        seq_lens = mask.gt(0).sum(dim=1).long()
        context = self.model(ids, mask)
        return [u[:v] for u, v in zip(context, seq_lens)]
def canonicalize(text, keep_punctuation_exact_string=None):
    """Lower-case ``text``, strip punctuation and collapse whitespace.

    When ``keep_punctuation_exact_string`` is given, that exact substring is
    preserved while punctuation is removed from the segments around it.
    """
    text = text.replace('_', ' ')
    strip_punct = str.maketrans('', '', string.punctuation)
    if keep_punctuation_exact_string:
        segments = text.split(keep_punctuation_exact_string)
        text = keep_punctuation_exact_string.join(
            segment.translate(strip_punct) for segment in segments)
    else:
        text = text.translate(strip_punct)
    # lower-casing commutes with whitespace collapsing
    return re.sub(r'\s+', ' ', text.lower()).strip()
amp +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + +__all__ = [ + 'WanVAE', +] + +CACHE_T = 2 + + +class CausalConv3d(nn.Conv3d): + """ + Causal 3d convolusion. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._padding = (self.padding[2], self.padding[2], self.padding[1], + self.padding[1], 2 * self.padding[0], 0) + self.padding = (0, 0, 0) + + def forward(self, x, cache_x=None): + padding = list(self._padding) + if cache_x is not None and self._padding[4] > 0: + cache_x = cache_x.to(x.device) + x = torch.cat([cache_x, x], dim=2) + padding[4] -= cache_x.shape[2] + x = F.pad(x, padding) + + return super().forward(x) + + +class RMS_norm(nn.Module): + + def __init__(self, dim, channel_first=True, images=True, bias=False): + super().__init__() + broadcastable_dims = (1, 1, 1) if not images else (1, 1) + shape = (dim, *broadcastable_dims) if channel_first else (dim,) + + self.channel_first = channel_first + self.scale = dim**0.5 + self.gamma = nn.Parameter(torch.ones(shape)) + self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0. + + def forward(self, x): + return F.normalize( + x, dim=(1 if self.channel_first else + -1)) * self.scale * self.gamma + self.bias + + +class Upsample(nn.Upsample): + + def forward(self, x): + """ + Fix bfloat16 support for nearest neighbor interpolation. 
+ """ + return super().forward(x.float()).type_as(x) + + +class Resample(nn.Module): + + def __init__(self, dim, mode): + assert mode in ('none', 'upsample2d', 'upsample3d', 'downsample2d', + 'downsample3d') + super().__init__() + self.dim = dim + self.mode = mode + + # layers + if mode == 'upsample2d': + self.resample = nn.Sequential( + Upsample(scale_factor=(2., 2.), mode='nearest-exact'), + nn.Conv2d(dim, dim // 2, 3, padding=1)) + elif mode == 'upsample3d': + self.resample = nn.Sequential( + Upsample(scale_factor=(2., 2.), mode='nearest-exact'), + nn.Conv2d(dim, dim // 2, 3, padding=1)) + self.time_conv = CausalConv3d( + dim, dim * 2, (3, 1, 1), padding=(1, 0, 0)) + + elif mode == 'downsample2d': + self.resample = nn.Sequential( + nn.ZeroPad2d((0, 1, 0, 1)), + nn.Conv2d(dim, dim, 3, stride=(2, 2))) + elif mode == 'downsample3d': + self.resample = nn.Sequential( + nn.ZeroPad2d((0, 1, 0, 1)), + nn.Conv2d(dim, dim, 3, stride=(2, 2))) + self.time_conv = CausalConv3d( + dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0)) + + else: + self.resample = nn.Identity() + + def forward(self, x, feat_cache=None, feat_idx=[0]): + b, c, t, h, w = x.size() + if self.mode == 'upsample3d': + if feat_cache is not None: + idx = feat_idx[0] + if feat_cache[idx] is None: + feat_cache[idx] = 'Rep' + feat_idx[0] += 1 + else: + + cache_x = x[:, :, -CACHE_T:, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[ + idx] is not None and feat_cache[idx] != 'Rep': + # cache last frame of last two chunk + cache_x = torch.cat([ + feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to( + cache_x.device), cache_x + ], + dim=2) + if cache_x.shape[2] < 2 and feat_cache[ + idx] is not None and feat_cache[idx] == 'Rep': + cache_x = torch.cat([ + torch.zeros_like(cache_x).to(cache_x.device), + cache_x + ], + dim=2) + if feat_cache[idx] == 'Rep': + x = self.time_conv(x) + else: + x = self.time_conv(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + + x = x.reshape(b, 2, c, t, h, w) 
+ x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), + 3) + x = x.reshape(b, c, t * 2, h, w) + t = x.shape[2] + x = rearrange(x, 'b c t h w -> (b t) c h w') + x = self.resample(x) + x = rearrange(x, '(b t) c h w -> b c t h w', t=t) + + if self.mode == 'downsample3d': + if feat_cache is not None: + idx = feat_idx[0] + if feat_cache[idx] is None: + feat_cache[idx] = x.clone() + feat_idx[0] += 1 + else: + + cache_x = x[:, :, -1:, :, :].clone() + # if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx]!='Rep': + # # cache last frame of last two chunk + # cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2) + + x = self.time_conv( + torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2)) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + return x + + def init_weight(self, conv): + conv_weight = conv.weight + nn.init.zeros_(conv_weight) + c1, c2, t, h, w = conv_weight.size() + one_matrix = torch.eye(c1, c2) + init_matrix = one_matrix + nn.init.zeros_(conv_weight) + #conv_weight.data[:,:,-1,1,1] = init_matrix * 0.5 + conv_weight.data[:, :, 1, 0, 0] = init_matrix #* 0.5 + conv.weight.data.copy_(conv_weight) + nn.init.zeros_(conv.bias.data) + + def init_weight2(self, conv): + conv_weight = conv.weight.data + nn.init.zeros_(conv_weight) + c1, c2, t, h, w = conv_weight.size() + init_matrix = torch.eye(c1 // 2, c2) + #init_matrix = repeat(init_matrix, 'o ... 
class ResidualBlock(nn.Module):
    """Pre-norm residual block of two causal 3D convolutions, with a 1x1
    causal conv shortcut when the channel count changes."""

    def __init__(self, in_dim, out_dim, dropout=0.0):
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim

        # layers
        self.residual = nn.Sequential(
            RMS_norm(in_dim, images=False),
            nn.SiLU(),
            CausalConv3d(in_dim, out_dim, 3, padding=1),
            RMS_norm(out_dim, images=False),
            nn.SiLU(),
            nn.Dropout(dropout),
            CausalConv3d(out_dim, out_dim, 3, padding=1),
        )
        self.shortcut = (CausalConv3d(in_dim, out_dim, 1)
                         if in_dim != out_dim else nn.Identity())

    def forward(self, x, feat_cache=None, feat_idx=[0]):
        # NOTE(review): ``feat_idx=[0]`` is a mutable default; all call sites
        # seen pass an explicit list, but confirm before relying on the
        # default in new code.
        h = self.shortcut(x)
        for layer in self.residual:
            if isinstance(layer, CausalConv3d) and feat_cache is not None:
                idx = feat_idx[0]
                # keep the last CACHE_T frames as cache for the next chunk
                cache_x = x[:, :, -CACHE_T:, :, :].clone()
                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                    # chunk shorter than the cache window: borrow the last
                    # frame of the previous cache so the conv sees two frames
                    prev = feat_cache[idx][:, :, -1, :, :].unsqueeze(2)
                    cache_x = torch.cat(
                        [prev.to(cache_x.device), cache_x], dim=2)
                x = layer(x, feat_cache[idx])
                feat_cache[idx] = cache_x
                feat_idx[0] += 1
            else:
                x = layer(x)
        return x + h
+ """ + + def __init__(self, dim): + super().__init__() + self.dim = dim + + # layers + self.norm = RMS_norm(dim) + self.to_qkv = nn.Conv2d(dim, dim * 3, 1) + self.proj = nn.Conv2d(dim, dim, 1) + + # zero out the last layer params + nn.init.zeros_(self.proj.weight) + + def forward(self, x): + identity = x + b, c, t, h, w = x.size() + x = rearrange(x, 'b c t h w -> (b t) c h w') + x = self.norm(x) + # compute query, key, value + q, k, v = self.to_qkv(x).reshape(b * t, 1, c * 3, + -1).permute(0, 1, 3, + 2).contiguous().chunk( + 3, dim=-1) + + # apply attention + x = F.scaled_dot_product_attention( + q, + k, + v, + ) + x = x.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w) + + # output + x = self.proj(x) + x = rearrange(x, '(b t) c h w-> b c t h w', t=t) + return x + identity + + +class Encoder3d(nn.Module): + + def __init__(self, + dim=128, + z_dim=4, + dim_mult=[1, 2, 4, 4], + num_res_blocks=2, + attn_scales=[], + temperal_downsample=[True, True, False], + dropout=0.0): + super().__init__() + self.dim = dim + self.z_dim = z_dim + self.dim_mult = dim_mult + self.num_res_blocks = num_res_blocks + self.attn_scales = attn_scales + self.temperal_downsample = temperal_downsample + + # dimensions + dims = [dim * u for u in [1] + dim_mult] + scale = 1.0 + + # init block + self.conv1 = CausalConv3d(3, dims[0], 3, padding=1) + + # downsample blocks + downsamples = [] + for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])): + # residual (+attention) blocks + for _ in range(num_res_blocks): + downsamples.append(ResidualBlock(in_dim, out_dim, dropout)) + if scale in attn_scales: + downsamples.append(AttentionBlock(out_dim)) + in_dim = out_dim + + # downsample block + if i != len(dim_mult) - 1: + mode = 'downsample3d' if temperal_downsample[ + i] else 'downsample2d' + downsamples.append(Resample(out_dim, mode=mode)) + scale /= 2.0 + self.downsamples = nn.Sequential(*downsamples) + + # middle blocks + self.middle = nn.Sequential( + ResidualBlock(out_dim, out_dim, 
dropout), AttentionBlock(out_dim), + ResidualBlock(out_dim, out_dim, dropout)) + + # output blocks + self.head = nn.Sequential( + RMS_norm(out_dim, images=False), nn.SiLU(), + CausalConv3d(out_dim, z_dim, 3, padding=1)) + + def forward(self, x, feat_cache=None, feat_idx=[0]): + if feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, -CACHE_T:, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + # cache last frame of last two chunk + cache_x = torch.cat([ + feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to( + cache_x.device), cache_x + ], + dim=2) + x = self.conv1(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = self.conv1(x) + + ## downsamples + for layer in self.downsamples: + if feat_cache is not None: + x = layer(x, feat_cache, feat_idx) + else: + x = layer(x) + + ## middle + for layer in self.middle: + if isinstance(layer, ResidualBlock) and feat_cache is not None: + x = layer(x, feat_cache, feat_idx) + else: + x = layer(x) + + ## head + for layer in self.head: + if isinstance(layer, CausalConv3d) and feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, -CACHE_T:, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + # cache last frame of last two chunk + cache_x = torch.cat([ + feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to( + cache_x.device), cache_x + ], + dim=2) + x = layer(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = layer(x) + return x + + +class Decoder3d(nn.Module): + + def __init__(self, + dim=128, + z_dim=4, + dim_mult=[1, 2, 4, 4], + num_res_blocks=2, + attn_scales=[], + temperal_upsample=[False, True, True], + dropout=0.0): + super().__init__() + self.dim = dim + self.z_dim = z_dim + self.dim_mult = dim_mult + self.num_res_blocks = num_res_blocks + self.attn_scales = attn_scales + self.temperal_upsample = temperal_upsample + + # dimensions + dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]] + scale = 1.0 / 
2**(len(dim_mult) - 2) + + # init block + self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1) + + # middle blocks + self.middle = nn.Sequential( + ResidualBlock(dims[0], dims[0], dropout), AttentionBlock(dims[0]), + ResidualBlock(dims[0], dims[0], dropout)) + + # upsample blocks + upsamples = [] + for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])): + # residual (+attention) blocks + if i == 1 or i == 2 or i == 3: + in_dim = in_dim // 2 + for _ in range(num_res_blocks + 1): + upsamples.append(ResidualBlock(in_dim, out_dim, dropout)) + if scale in attn_scales: + upsamples.append(AttentionBlock(out_dim)) + in_dim = out_dim + + # upsample block + if i != len(dim_mult) - 1: + mode = 'upsample3d' if temperal_upsample[i] else 'upsample2d' + upsamples.append(Resample(out_dim, mode=mode)) + scale *= 2.0 + self.upsamples = nn.Sequential(*upsamples) + + # output blocks + self.head = nn.Sequential( + RMS_norm(out_dim, images=False), nn.SiLU(), + CausalConv3d(out_dim, 3, 3, padding=1)) + + def forward(self, x, feat_cache=None, feat_idx=[0]): + ## conv1 + if feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, -CACHE_T:, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + # cache last frame of last two chunk + cache_x = torch.cat([ + feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to( + cache_x.device), cache_x + ], + dim=2) + x = self.conv1(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = self.conv1(x) + + ## middle + for layer in self.middle: + if isinstance(layer, ResidualBlock) and feat_cache is not None: + x = layer(x, feat_cache, feat_idx) + else: + x = layer(x) + + ## upsamples + for layer in self.upsamples: + if feat_cache is not None: + x = layer(x, feat_cache, feat_idx) + else: + x = layer(x) + + ## head + for layer in self.head: + if isinstance(layer, CausalConv3d) and feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, -CACHE_T:, :, :].clone() + if cache_x.shape[2] < 2 and 
def count_conv3d(model):
    """Return how many ``CausalConv3d`` modules ``model`` contains (used to
    size the per-conv feature-cache lists)."""
    return sum(1 for m in model.modules() if isinstance(m, CausalConv3d))
+ for i in range(iter_): + self._enc_conv_idx = [0] + if i == 0: + out = self.encoder( + x[:, :, :1, :, :], + feat_cache=self._enc_feat_map, + feat_idx=self._enc_conv_idx) + else: + out_ = self.encoder( + x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :], + feat_cache=self._enc_feat_map, + feat_idx=self._enc_conv_idx) + out = torch.cat([out, out_], 2) + mu, log_var = self.conv1(out).chunk(2, dim=1) + if isinstance(scale[0], torch.Tensor): + mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view( + 1, self.z_dim, 1, 1, 1) + else: + mu = (mu - scale[0]) * scale[1] + self.clear_cache() + return mu + + def decode(self, z, scale): + self.clear_cache() + # z: [b,c,t,h,w] + if isinstance(scale[0], torch.Tensor): + z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view( + 1, self.z_dim, 1, 1, 1) + else: + z = z / scale[1] + scale[0] + iter_ = z.shape[2] + x = self.conv2(z) + for i in range(iter_): + self._conv_idx = [0] + if i == 0: + out = self.decoder( + x[:, :, i:i + 1, :, :], + feat_cache=self._feat_map, + feat_idx=self._conv_idx) + else: + out_ = self.decoder( + x[:, :, i:i + 1, :, :], + feat_cache=self._feat_map, + feat_idx=self._conv_idx) + out = torch.cat([out, out_], 2) + self.clear_cache() + return out + + def reparameterize(self, mu, log_var): + std = torch.exp(0.5 * log_var) + eps = torch.randn_like(std) + return eps * std + mu + + def sample(self, imgs, deterministic=False): + mu, log_var = self.encode(imgs) + if deterministic: + return mu + std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0)) + return mu + std * torch.randn_like(std) + + def clear_cache(self): + self._conv_num = count_conv3d(self.decoder) + self._conv_idx = [0] + self._feat_map = [None] * self._conv_num + #cache encode + self._enc_conv_num = count_conv3d(self.encoder) + self._enc_conv_idx = [0] + self._enc_feat_map = [None] * self._enc_conv_num + + +def _video_vae(pretrained_path=None, z_dim=None, device='cpu', **kwargs): + """ + Autoencoder3d adapted from Stable Diffusion 1.x, 2.x and 
XL. + """ + # params + cfg = dict( + dim=96, + z_dim=z_dim, + dim_mult=[1, 2, 4, 4], + num_res_blocks=2, + attn_scales=[], + temperal_downsample=[False, True, True], + dropout=0.0) + cfg.update(**kwargs) + + # init model + with torch.device('meta'): + model = WanVAE_(**cfg) + + # load checkpoint + logging.info(f'loading {pretrained_path}') + model.load_state_dict( + torch.load(pretrained_path, map_location=device), assign=True) + + return model + + +class WanVAE: + + def __init__(self, + z_dim=16, + vae_pth='cache/vae_step_411000.pth', + dtype=torch.float, + device="cuda"): + self.dtype = dtype + self.device = device + + mean = [ + -0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508, + 0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921 + ] + std = [ + 2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743, + 3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160 + ] + self.mean = torch.tensor(mean, dtype=dtype, device=device) + self.std = torch.tensor(std, dtype=dtype, device=device) + self.scale = [self.mean, 1.0 / self.std] + + # init model + self.model = _video_vae( + pretrained_path=vae_pth, + z_dim=z_dim, + ).eval().requires_grad_(False).to(device) + + def encode(self, videos): + """ + videos: A list of videos each with shape [C, T, H, W]. 
+ """ + with amp.autocast(dtype=self.dtype): + return [ + self.model.encode(u.unsqueeze(0), self.scale).float().squeeze(0) + for u in videos + ] + + def decode(self, zs): + with amp.autocast(dtype=self.dtype): + return [ + self.model.decode(u.unsqueeze(0), + self.scale).float().clamp_(-1, 1).squeeze(0) + for u in zs + ] diff --git a/src/megatron/bridge/models/wan/rope_utils.py b/src/megatron/bridge/models/wan/rope_utils.py new file mode 100644 index 0000000000..1f79d8bc7c --- /dev/null +++ b/src/megatron/bridge/models/wan/rope_utils.py @@ -0,0 +1,65 @@ +import torch +from torch.cuda import amp +from megatron.bridge.models.wan.utils.utils import split_inputs_cp +from megatron.core import parallel_state + +class Wan3DRopeEmbeddings(torch.nn.Module): + """ + Wan 3D RoPE embeddings implementation. + Implements Wan's 3D RoPE embeddings for Mcore Attention based on Wan's implementation at https://github.com/Wan-Video/Wan2.1/blob/main/wan/modules/model.py. + """ + + def __init__(self, dim_head, max_position_len): + super().__init__() + self.freqs = torch.cat([ + self.rope_params(max_position_len, dim_head - 4 * (dim_head // 6)), + self.rope_params(max_position_len, 2 * (dim_head // 6)), + self.rope_params(max_position_len, 2 * (dim_head // 6)) + ], dim=1) + + def rope_params(self, max_position_len, dim_head, theta=10000): + assert dim_head % 2 == 0 + freqs = torch.outer( + torch.arange(max_position_len), + 1.0 / torch.pow(theta, + torch.arange(0, dim_head, 2).div(dim_head))) + return freqs + + def forward(self, n_head, dim_head, max_seq_len, grid_sizes, device): + self.freqs = self.freqs.to(device) # ??? do we need to put this here, or the when we move WanModel to device, it also move freqs to device? 
+ + n, c = n_head, dim_head // 2 + + # split freqs + freqs = self.freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1) + + freqs_real = [] + for i, (f, h, w) in enumerate(grid_sizes.tolist()): + seq_len = f * h * w + freqs_real_i = torch.cat([ + freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1), + freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1), + freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1) + ], dim=-1).reshape(seq_len, 1, 1, -1) # <-- add 1,1 for batch/head broadcasting + + # Double dimension from c -> 2c with rotating angles as (x0, x0, x1, x1, ...), for interleaving RoPE + freqs_real_i = freqs_real_i.unsqueeze(-1).expand(-1, -1, -1, -1, 2).reshape(seq_len, 1, 1, dim_head) + + # Pad freqs_real_i to (max_seq_len, 1, 1, dim_head) with 0s + if freqs_real_i.shape[0] < max_seq_len: + pad_shape = (max_seq_len - freqs_real_i.shape[0], 1, 1, dim_head) + freqs_real_i = torch.cat( + [freqs_real_i, torch.zeros(pad_shape, dtype=freqs_real_i.dtype, device=freqs_real_i.device)] + ) + freqs_real.append(freqs_real_i) + + # Each freqs_real[i] is (max_seq_len, 1, 1, dim_head) + # We concatenate them along dim=1 to get (max_seq_len, batch_size, 1, dim_head) + freqs_real = torch.cat(freqs_real, dim=1) + + # Note: + # when running context_parallel, which must use "thd" for qkv_format, + # we don't need to scatter the freqs to the context parallel region, + # because mcore rope_utils will automatically retrieve the correct freqs for each context parallel region + + return freqs_real \ No newline at end of file diff --git a/src/megatron/bridge/models/wan/utils/preprocessor.py b/src/megatron/bridge/models/wan/utils/preprocessor.py new file mode 100644 index 0000000000..fc5ea6a740 --- /dev/null +++ b/src/megatron/bridge/models/wan/utils/preprocessor.py @@ -0,0 +1,271 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import numpy as np +from PIL import Image +import torch +import torch.nn.functional as F +import torchvision.transforms.functional as TF + + +class VaceImageProcessor(object): + def __init__(self, downsample=None, seq_len=None): + self.downsample = downsample + self.seq_len = seq_len + + def _pillow_convert(self, image, cvt_type='RGB'): + if image.mode != cvt_type: + if image.mode == 'P': + image = image.convert(f'{cvt_type}A') + if image.mode == f'{cvt_type}A': + bg = Image.new(cvt_type, + size=(image.width, image.height), + color=(255, 255, 255)) + bg.paste(image, (0, 0), mask=image) + image = bg + else: + image = image.convert(cvt_type) + return image + + def _load_image(self, img_path): + if img_path is None or img_path == '': + return None + img = Image.open(img_path) + img = self._pillow_convert(img) + return img + + def _resize_crop(self, img, oh, ow, normalize=True): + """ + Resize, center crop, convert to tensor, and normalize. + """ + # resize and crop + iw, ih = img.size + if iw != ow or ih != oh: + # resize + scale = max(ow / iw, oh / ih) + img = img.resize( + (round(scale * iw), round(scale * ih)), + resample=Image.Resampling.LANCZOS + ) + assert img.width >= ow and img.height >= oh + + # center crop + x1 = (img.width - ow) // 2 + y1 = (img.height - oh) // 2 + img = img.crop((x1, y1, x1 + ow, y1 + oh)) + + # normalize + if normalize: + img = TF.to_tensor(img).sub_(0.5).div_(0.5).unsqueeze(1) + return img + + def _image_preprocess(self, img, oh, ow, normalize=True, **kwargs): + return self._resize_crop(img, oh, ow, normalize) + + def load_image(self, data_key, **kwargs): + return self.load_image_batch(data_key, **kwargs) + + def load_image_pair(self, data_key, data_key2, **kwargs): + return self.load_image_batch(data_key, data_key2, **kwargs) + + def load_image_batch(self, *data_key_batch, normalize=True, seq_len=None, **kwargs): + seq_len = self.seq_len if seq_len is None else seq_len + imgs = [] + for data_key in data_key_batch: + img = 
self._load_image(data_key) + imgs.append(img) + w, h = imgs[0].size + dh, dw = self.downsample[1:] + + # compute output size + scale = min(1., np.sqrt(seq_len / ((h / dh) * (w / dw)))) + oh = int(h * scale) // dh * dh + ow = int(w * scale) // dw * dw + assert (oh // dh) * (ow // dw) <= seq_len + imgs = [self._image_preprocess(img, oh, ow, normalize) for img in imgs] + return *imgs, (oh, ow) + + +class VaceVideoProcessor(object): + def __init__(self, downsample, min_area, max_area, min_fps, max_fps, zero_start, seq_len, keep_last, **kwargs): + self.downsample = downsample + self.min_area = min_area + self.max_area = max_area + self.min_fps = min_fps + self.max_fps = max_fps + self.zero_start = zero_start + self.keep_last = keep_last + self.seq_len = seq_len + assert seq_len >= min_area / (self.downsample[1] * self.downsample[2]) + + def set_area(self, area): + self.min_area = area + self.max_area = area + + def set_seq_len(self, seq_len): + self.seq_len = seq_len + + @staticmethod + def resize_crop(video: torch.Tensor, oh: int, ow: int): + """ + Resize, center crop and normalize for decord loaded video (torch.Tensor type) + + Parameters: + video - video to process (torch.Tensor): Tensor from `reader.get_batch(frame_ids)`, in shape of (T, H, W, C) + oh - target height (int) + ow - target width (int) + + Returns: + The processed video (torch.Tensor): Normalized tensor range [-1, 1], in shape of (C, T, H, W) + + Raises: + """ + # permute ([t, h, w, c] -> [t, c, h, w]) + video = video.permute(0, 3, 1, 2) + + # resize and crop + ih, iw = video.shape[2:] + if ih != oh or iw != ow: + # resize + scale = max(ow / iw, oh / ih) + video = F.interpolate( + video, + size=(round(scale * ih), round(scale * iw)), + mode='bicubic', + antialias=True + ) + assert video.size(3) >= ow and video.size(2) >= oh + + # center crop + x1 = (video.size(3) - ow) // 2 + y1 = (video.size(2) - oh) // 2 + video = video[:, :, y1:y1 + oh, x1:x1 + ow] + + # permute ([t, c, h, w] -> [c, t, h, w]) and 
normalize + video = video.transpose(0, 1).float().div_(127.5).sub_(1.) + return video + + def _video_preprocess(self, video, oh, ow): + return self.resize_crop(video, oh, ow) + + def _get_frameid_bbox_default(self, fps, frame_timestamps, h, w, crop_box, rng): + target_fps = min(fps, self.max_fps) + duration = frame_timestamps[-1].mean() + x1, x2, y1, y2 = [0, w, 0, h] if crop_box is None else crop_box + h, w = y2 - y1, x2 - x1 + ratio = h / w + df, dh, dw = self.downsample + + area_z = min(self.seq_len, self.max_area / (dh * dw), (h // dh) * (w // dw)) + of = min( + (int(duration * target_fps) - 1) // df + 1, + int(self.seq_len / area_z) + ) + + # deduce target shape of the [latent video] + target_area_z = min(area_z, int(self.seq_len / of)) + oh = round(np.sqrt(target_area_z * ratio)) + ow = int(target_area_z / oh) + of = (of - 1) * df + 1 + oh *= dh + ow *= dw + + # sample frame ids + target_duration = of / target_fps + begin = 0. if self.zero_start else rng.uniform(0, duration - target_duration) + timestamps = np.linspace(begin, begin + target_duration, of) + frame_ids = np.argmax(np.logical_and( + timestamps[:, None] >= frame_timestamps[None, :, 0], + timestamps[:, None] < frame_timestamps[None, :, 1] + ), axis=1).tolist() + return frame_ids, (x1, x2, y1, y2), (oh, ow), target_fps + + def _get_frameid_bbox_adjust_last(self, fps, frame_timestamps, h, w, crop_box, rng): + duration = frame_timestamps[-1].mean() + x1, x2, y1, y2 = [0, w, 0, h] if crop_box is None else crop_box + h, w = y2 - y1, x2 - x1 + ratio = h / w + df, dh, dw = self.downsample + + area_z = min(self.seq_len, self.max_area / (dh * dw), (h // dh) * (w // dw)) + of = min( + (len(frame_timestamps) - 1) // df + 1, + int(self.seq_len / area_z) + ) + + # deduce target shape of the [latent video] + target_area_z = min(area_z, int(self.seq_len / of)) + oh = round(np.sqrt(target_area_z * ratio)) + ow = int(target_area_z / oh) + of = (of - 1) * df + 1 + oh *= dh + ow *= dw + + # sample frame ids + 
target_duration = duration + target_fps = of / target_duration + timestamps = np.linspace(0., target_duration, of) + frame_ids = np.argmax(np.logical_and( + timestamps[:, None] >= frame_timestamps[None, :, 0], + timestamps[:, None] <= frame_timestamps[None, :, 1] + ), axis=1).tolist() + # print(oh, ow, of, target_duration, target_fps, len(frame_timestamps), len(frame_ids)) + return frame_ids, (x1, x2, y1, y2), (oh, ow), target_fps + + + def _get_frameid_bbox(self, fps, frame_timestamps, h, w, crop_box, rng): + if self.keep_last: + return self._get_frameid_bbox_adjust_last(fps, frame_timestamps, h, w, crop_box, rng) + else: + return self._get_frameid_bbox_default(fps, frame_timestamps, h, w, crop_box, rng) + + def load_video(self, data_key, crop_box=None, seed=2024, **kwargs): + return self.load_video_batch(data_key, crop_box=crop_box, seed=seed, **kwargs) + + def load_video_pair(self, data_key, data_key2, crop_box=None, seed=2024, **kwargs): + return self.load_video_batch(data_key, data_key2, crop_box=crop_box, seed=seed, **kwargs) + + def load_video_batch(self, *data_key_batch, crop_box=None, seed=2024, **kwargs): + rng = np.random.default_rng(seed + hash(data_key_batch[0]) % 10000) + # read video + import decord + decord.bridge.set_bridge('torch') + readers = [] + for data_k in data_key_batch: + reader = decord.VideoReader(data_k) + readers.append(reader) + + fps = readers[0].get_avg_fps() + length = min([len(r) for r in readers]) + frame_timestamps = [readers[0].get_frame_timestamp(i) for i in range(length)] + frame_timestamps = np.array(frame_timestamps, dtype=np.float32) + h, w = readers[0].next().shape[:2] + frame_ids, (x1, x2, y1, y2), (oh, ow), fps = self._get_frameid_bbox(fps, frame_timestamps, h, w, crop_box, rng) + + # preprocess video + videos = [reader.get_batch(frame_ids)[:, y1:y2, x1:x2, :] for reader in readers] + videos = [self._video_preprocess(video, oh, ow) for video in videos] + return *videos, frame_ids, (oh, ow), fps + # return videos if 
len(videos) > 1 else videos[0]


def prepare_source(src_video, src_mask, src_ref_images, num_frames, image_size, device):
    """Normalize VACE conditioning inputs to a uniform shape on `device`.

    Missing (None) video/mask pairs are replaced with zero video + all-ones mask;
    reference images that do not match `image_size` are aspect-preserving resized
    and letterboxed onto a value-1.0 canvas (1.0 == white in the [-1, 1] range).

    NOTE(review): src_video, src_mask and src_ref_images are mutated *in place*
    and also returned — callers should not assume the inputs are left untouched.
    """
    for i, (sub_src_video, sub_src_mask) in enumerate(zip(src_video, src_mask)):
        if sub_src_video is None and sub_src_mask is None:
            # Placeholder conditioning: black video, fully "active" mask.
            src_video[i] = torch.zeros((3, num_frames, image_size[0], image_size[1]), device=device)
            src_mask[i] = torch.ones((1, num_frames, image_size[0], image_size[1]), device=device)
    for i, ref_images in enumerate(src_ref_images):
        if ref_images is not None:
            for j, ref_img in enumerate(ref_images):
                if ref_img is not None and ref_img.shape[-2:] != image_size:
                    canvas_height, canvas_width = image_size
                    ref_height, ref_width = ref_img.shape[-2:]
                    white_canvas = torch.ones((3, 1, canvas_height, canvas_width), device=device)  # [-1, 1]
                    # Scale so the image fits inside the canvas without distortion.
                    scale = min(canvas_height / ref_height, canvas_width / ref_width)
                    new_height = int(ref_height * scale)
                    new_width = int(ref_width * scale)
                    # squeeze/unsqueeze juggling: [C,1,H,W] -> [1,C,H,W] for interpolate -> back to [C,1,h,w].
                    resized_image = F.interpolate(ref_img.squeeze(1).unsqueeze(0), size=(new_height, new_width), mode='bilinear', align_corners=False).squeeze(0).unsqueeze(1)
                    # Center the resized image on the canvas.
                    top = (canvas_height - new_height) // 2
                    left = (canvas_width - new_width) // 2
                    white_canvas[:, :, top:top + new_height, left:left + new_width] = resized_image
                    src_ref_images[i][j] = white_canvas
    return src_video, src_mask, src_ref_images
\ No newline at end of file
diff --git a/src/megatron/bridge/models/wan/utils/utils.py b/src/megatron/bridge/models/wan/utils/utils.py
new file mode 100644
index 0000000000..0f93526632
--- /dev/null
+++ b/src/megatron/bridge/models/wan/utils/utils.py
@@ -0,0 +1,221 @@
import torch
from typing import Tuple
from torch.distributed import all_gather
import megatron.core.parallel_state as parallel_state
import math
import torch.distributed as dist
import transformer_engine_torch as tex

def grid_sizes_calculation(
    input_shape: Tuple[int, int, int],  # (F_latents, H_latents, W_latents)
    patch_size: Tuple[int, int, int],  # (pF, pH, pW)
)
-> list[int]:  # NOTE(review): annotation corrected — the function returns a list, not a tuple
    """
    Compute the (f, h, w) output spatial/temporal dimensions of a Conv3d patch embedder.

    Uses floor division, so any remainder of a latent dimension that does not fill a
    whole patch is dropped.
    """

    F_latents, H_latents, W_latents = input_shape
    pF, pH, pW = patch_size
    F_patches = F_latents // pF
    H_patches = H_latents // pH
    W_patches = W_latents // pW

    return [F_patches, H_patches, W_patches]


def patchify(x, patch_size):
    """
    Convert a list of reconstructed video tensors into patch embeddings.
    This method is the inverse of `unpatchify`.

    Args:
        x (list[torch.Tensor]): list of tensors, each with shape [c, F_patches * pF, H_patches * pH, W_patches * pW]
        patch_size (tuple): (pF, pH, pW)

    Returns:
        list[torch.Tensor]: one tensor per input, each with shape
        [(F_patches * H_patches * W_patches), (c * pF * pH * pW)]
    """
    out = []
    for u in x:
        c, F_pF, H_pH, W_pW = u.shape
        pF, pH, pW = patch_size
        assert F_pF % pF == 0 and H_pH % pH == 0 and W_pW % pW == 0, \
            "Spatial dimensions must be divisible by patch size."

        F_patches, H_patches, W_patches = F_pF // pF, H_pH // pH, W_pW // pW

        # split spatial dims into (grid, patch) and reorder to match original patch layout:
        # start:   (c, F_patches * pF, H_patches * pH, W_patches * pW)
        # reshape -> (c, F_patches, pF, H_patches, pH, W_patches, pW)
        # permute -> (F_patches, H_patches, W_patches, pF, pH, pW, c)
        t = u.reshape(c, F_patches, pF, H_patches, pH, W_patches, pW)
        t = t.permute(1, 3, 5, 2, 4, 6, 0)

        num_patches = F_patches * H_patches * W_patches
        out.append(t.reshape(num_patches, c * (pF * pH * pW)))
    return out


def unpatchify(x: list[torch.Tensor], grid_sizes: list[Tuple[int, int, int]], out_dim: int, patch_size: Tuple[int, int, int]) -> list[torch.Tensor]:
    """
    Reconstruct video tensors from patch embeddings into a list of video tensors.

    Args:
        x (list[torch.Tensor]):
            list of tensors, each with shape [seq_len, c * pF * pH * pW]
        grid_sizes (list[Tuple[int, int, int]]):
            list of tensors, each with original spatial-temporal grid dimensions before patching,
            (3 dimensions correspond to F_patches, H_patches, W_patches)
        out_dim (int): number of output channels c per patch element.
        patch_size (Tuple[int, int, int]): (pF, pH, pW) patch dimensions.

    Returns:
        list[torch.Tensor]: list of tensors, each with shape [c, F_latents, H_latents, W_latents]
    """

    c = out_dim
    out = []
    for u, v in zip(x, grid_sizes):
        # Keep only the real (non-padding) patches, then expand to the 7-D grid layout.
        u = u[:math.prod(v)].view(*v, *patch_size, c)
        # Inverse of the patchify permute: interleave grid and patch axes back together.
        u = torch.einsum('fhwpqrc->cfphqwr', u)
        u = u.reshape(c, *[i * j for i, j in zip(v, patch_size)])
        out.append(u)
    return out


def split_inputs_cp(x: torch.Tensor, seq_dim: int = 0) -> torch.Tensor:
    """
    Split input tensor along the sequence dimension for context parallelism.

    Args:
        x: Input tensor to be split. (e.g. shape [seq_len, batch_size, ...])
        seq_dim: The dimension along which to split the input (sequence dimension).

    Returns:
        A slice of the input tensor corresponding to the current rank. (e.g. shape [seq_len/cp_size, batch_size, ...])
    """

    cp_size = parallel_state.get_context_parallel_world_size()
    if cp_size > 1:
        cp_rank = parallel_state.get_context_parallel_rank()
        assert x.shape[seq_dim] % cp_size == 0, f"{x.shape[seq_dim]} cannot divide cp_size {cp_size}"
        # Reshape so rank chunks sit on their own axis, then select this rank's chunk.
        x = x.view(*x.shape[:seq_dim], cp_size, x.shape[seq_dim] // cp_size, *x.shape[(seq_dim + 1) :])
        seq_idx = torch.tensor([cp_rank], device=x.device)
        x = x.index_select(seq_dim, seq_idx)
        # Note that the new sequence length is the original sequence length / cp_size
        x = x.view(*x.shape[:seq_dim], -1, *x.shape[(seq_dim + 2) :])
    return x


def cat_outputs_cp(x: torch.Tensor, seq_dim: int) -> torch.Tensor:
    """
    Concatenate tensors from multiple processes along a specified dimension.

    Args:
        x: Input tensor to be concatenated. (e.g. shape [seq_len/cp_size, batch_size, ...])
        seq_dim: The dimension along which to concatenate the input tensors.

    Returns:
        A tensor with the concatenated tensors. (e.g. shape [seq_len, batch_size, ...])
    """

    cp_group = parallel_state.get_context_parallel_group()
    cp_size = parallel_state.get_context_parallel_world_size()
    if cp_size > 1:
        gathered_tensors = [torch.zeros_like(x) for _ in range(cp_size)]
        # Attempt to gather tensors from all ranks
        # PyTorch’s all_gather orders outputs by rank within the group, which matches how chunks were selected by cp_rank
        all_gather(gathered_tensors, x, group=cp_group)
        gathered_tensors = torch.cat(gathered_tensors, dim=seq_dim)
        return gathered_tensors
    else:
        return x


def thd_split_inputs_cp(x: torch.Tensor,
                        cu_seqlens_q_padded: torch.Tensor,
                        cp_group: dist.ProcessGroup) -> torch.Tensor:
    """
    Split a THD-packed tensor across CP ranks for inputs shaped [S, B, ...].

    Args:
        x: [S, B, ...] tensor (sequence first).
        cu_seqlens_q_padded: 1D int32 THD cu_seqlens (padded) used for packing.
        cp_group: context-parallel process group.

    Returns:
        x_local: [S_local, B, ...] shard for this CP rank.
    """
    # Move to [B, S, ...] to use THD partitioning along S
    x_bs = x.transpose(0, 1).contiguous()  # [B, S, ...]

    total_S = x_bs.size(1)
    cp_size = dist.get_world_size(cp_group)
    cp_rank = dist.get_rank(cp_group)

    # Compute this rank's THD partition indices (same API as during gather)
    idx = tex.thd_get_partitioned_indices(
        cu_seqlens_q_padded,  # int32 offsets
        total_S,
        cp_size,
        cp_rank,
    ).to(device=x_bs.device, dtype=torch.long)  # [S_local]

    # Take the shard along sequence dim
    x_local_bs = x_bs.index_select(dim=1, index=idx).contiguous()  # [B, S_local, ...]

    # Return to [S, B, ...]
    x_local = x_local_bs.transpose(0, 1).contiguous()  # [S_local, B, ...]
    return x_local


def thd_cat_outputs_cp(x_local: torch.Tensor,
                       cu_seqlens_q_padded: torch.Tensor,
                       cp_group: dist.ProcessGroup) -> torch.Tensor:
    """
    Reverse of thd_split_inputs_cp: gather THD-partitioned local shards back to global.

    Args:
        x_local: [S_local, B, ...] tensor (this rank's shard, sequence first).
        cu_seqlens_q_padded: 1D int32 THD cu_seqlens (padded) used for packing.
        cp_group: context-parallel process group.

    Returns:
        x_global: [S, B, ...] tensor reassembled across CP ranks.
    """
    # Work in [B, S_local, ...] for easy indexing along S
    x_local_bs = x_local.transpose(0, 1).contiguous()  # [B, S_local, ...]

    cp_size = dist.get_world_size(cp_group)
    # NOTE(review): cp_rank is computed but never used in this function.
    cp_rank = dist.get_rank(cp_group)

    # Discover total S from cu_seqlens (last value)
    # (Matches 'total_S' used during split.)
    # NOTE(review): the split path derives total_S from x.size(1) instead — these
    # only agree when the packed sequence exactly fills the tensor; confirm.
    total_S = int(cu_seqlens_q_padded[-1].item())

    # All-gather local shards across CP group
    # NOTE(review): all_gather assumes every rank's shard has identical shape
    # (true for the balanced THD partitioning used at split time — verify).
    gather_list = [torch.empty_like(x_local_bs) for _ in range(cp_size)]
    dist.all_gather(gather_list, x_local_bs, group=cp_group)  # each is [B, S_r, ...]

    # Compute per-rank indices once (same device/dtype as input)
    # NOTE: tex.thd_get_partitioned_indices returns indices along S for that rank.
    idx_list = []
    for r in range(cp_size):
        idx_r = tex.thd_get_partitioned_indices(
            cu_seqlens_q_padded,  # int32 offsets
            total_S,
            cp_size,
            r,
        ).to(device=x_local_bs.device, dtype=torch.long)  # [S_r]
        idx_list.append(idx_r)

    # Allocate output [B, S, ...] and place each rank's slice back
    out_shape = list(x_local_bs.shape)
    out_shape[1] = total_S  # replace S_local with S
    x_global_bs = x_local_bs.new_zeros(out_shape)  # [B, S, ...]

    # index_copy_ along S dimension
    for shard, idx in zip(gather_list, idx_list):
        x_global_bs.index_copy_(dim=1, index=idx, source=shard)

    # Return to [S, B, ...]
    x_global = x_global_bs.transpose(0, 1).contiguous()  # [S, B, ...]
    return x_global
\ No newline at end of file
diff --git a/src/megatron/bridge/models/wan/wan_bridge.py b/src/megatron/bridge/models/wan/wan_bridge.py
new file mode 100644
index 0000000000..ebcbf8e1c4
--- /dev/null
+++ b/src/megatron/bridge/models/wan/wan_bridge.py
@@ -0,0 +1,411 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import partial

import torch
from megatron.bridge.models.wan.wan_model import WanModel, VACEModel
from diffusers import WanTransformer3DModel, WanVACETransformer3DModel

from megatron.bridge.models.conversion.mapping_registry import MegatronMappingRegistry
from megatron.bridge.models.conversion.model_bridge import MegatronModelBridge
from megatron.bridge.models.conversion.param_mapping import (
    AutoMapping,
    GatedMLPMapping,
    QKVMapping,
    KVMapping,
    ReplicatedMapping,
)
from megatron.bridge.models.hf_pretrained.wan import PreTrainedWAN, PreTrainedVACE
from megatron.bridge.models.wan.wan_provider import WanModelProvider, VACEModelProvider
from megatron.core.transformer.utils import openai_gelu
from megatron.bridge.models.conversion.utils import get_module_and_param_from_name


@MegatronModelBridge.register_bridge(source=WanTransformer3DModel, target=WanModel)
class WanBridge(MegatronModelBridge):
    """
    Megatron Bridge for WAN model.

    As a user you would not use this bridge directly, but through `AutoBridge`.

    Example:
        >>> from megatron.bridge import AutoBridge
        >>> bridge = AutoBridge.from_hf_pretrained("WAN-3D-1.3B-v1")
        >>> provider = bridge.to_megatron_provider()
    """

    def provider_bridge(self, hf_pretrained: PreTrainedWAN) -> WanModelProvider:
        """Translate a HuggingFace WAN config into a WanModelProvider.

        hidden_size and crossattn_emb_size are both derived as
        num_attention_heads * attention_head_dim; patch_size is a (t, h, w)
        tuple in the HF config, split into patch_temporal / patch_spatial here.
        """
        hf_config = hf_pretrained.config

        cls = WanModelProvider

        provider = cls(
            num_layers=hf_config.num_layers,
            hidden_size=hf_config.num_attention_heads * hf_config.attention_head_dim,
            kv_channels=hf_config.attention_head_dim,
            num_query_groups=hf_config.num_attention_heads,
            crossattn_emb_size=hf_config.num_attention_heads * hf_config.attention_head_dim,
            ffn_hidden_size=hf_config.ffn_dim,
            num_attention_heads=hf_config.num_attention_heads,
            activation_func=openai_gelu,
            in_channels=hf_config.in_channels,
            out_channels=hf_config.out_channels,
            text_dim=hf_config.text_dim,
            patch_spatial=hf_config.patch_size[1],
            patch_temporal=hf_config.patch_size[0],
            layernorm_epsilon=hf_config.eps,
            hidden_dropout=0,
            attention_dropout=0,
            use_cpu_initialization=True,
            freq_dim=hf_config.freq_dim,
            bf16=False,
            params_dtype=torch.float32,
        )

        return provider


    def mapping_registry(self) -> MegatronMappingRegistry:
        """Return MegatronMappingRegistry containing parameter mappings from HF to Megatron format.

        Returns:
            MegatronMappingRegistry: Registry of parameter mappings
        """
        # Dictionary maps HF parameter names -> Megatron parameter names
        # Supports wildcard (*) patterns for layer-specific parameters
        param_mappings = {
            "scale_shift_table": "head.modulation",
            "patch_embedding.weight": "patch_embedding.weight",
            "patch_embedding.bias": "patch_embedding.bias",
            "condition_embedder.time_embedder.linear_1.weight": "time_embedding.0.weight",
            "condition_embedder.time_embedder.linear_1.bias": "time_embedding.0.bias",
            "condition_embedder.time_embedder.linear_2.weight": "time_embedding.2.weight",
            "condition_embedder.time_embedder.linear_2.bias": "time_embedding.2.bias",
            "condition_embedder.time_proj.weight": "time_projection.1.weight",
            "condition_embedder.time_proj.bias": "time_projection.1.bias",
            "condition_embedder.text_embedder.linear_1.weight": "text_embedding.0.weight",
            "condition_embedder.text_embedder.linear_1.bias": "text_embedding.0.bias",
            "condition_embedder.text_embedder.linear_2.weight": "text_embedding.2.weight",
            "condition_embedder.text_embedder.linear_2.bias": "text_embedding.2.bias",
            "blocks.*.scale_shift_table": "decoder.layers.*.adaLN.modulation",
            "blocks.*.attn1.to_out.0.weight": "decoder.layers.*.full_self_attention.linear_proj.weight",
            "blocks.*.attn1.to_out.0.bias": "decoder.layers.*.full_self_attention.linear_proj.bias",
            "blocks.*.attn1.norm_q.weight": "decoder.layers.*.full_self_attention.q_layernorm.weight",
            "blocks.*.attn1.norm_k.weight": "decoder.layers.*.full_self_attention.k_layernorm.weight",
            "blocks.*.attn2.to_q.weight": "decoder.layers.*.cross_attention.linear_q.weight",
            "blocks.*.attn2.to_q.bias": "decoder.layers.*.cross_attention.linear_q.bias",
            "blocks.*.attn2.to_out.0.weight": "decoder.layers.*.cross_attention.linear_proj.weight",
            "blocks.*.attn2.to_out.0.bias": "decoder.layers.*.cross_attention.linear_proj.bias",
            "blocks.*.attn2.norm_q.weight": "decoder.layers.*.cross_attention.q_layernorm.weight",
            "blocks.*.attn2.norm_k.weight": "decoder.layers.*.cross_attention.k_layernorm.weight",
            "blocks.*.norm2.weight": "decoder.layers.*.norm3.weight",
            "blocks.*.norm2.bias": "decoder.layers.*.norm3.bias",
            "blocks.*.ffn.net.0.proj.weight": "decoder.layers.*.mlp.linear_fc1.weight",
            "blocks.*.ffn.net.0.proj.bias": "decoder.layers.*.mlp.linear_fc1.bias",
            "blocks.*.ffn.net.2.weight": "decoder.layers.*.mlp.linear_fc2.weight",
            "blocks.*.ffn.net.2.bias": "decoder.layers.*.mlp.linear_fc2.bias",
            "proj_out.weight": "head.head.weight",
            "proj_out.bias": "head.head.bias",
        }


        # Custom WAN mapping to safely handle replicated params whose owning module
        # does not expose a top-level `.weight` (e.g., Head.modulation)
        class _ReplicatedByParamNameMapping(ReplicatedMapping):
            def hf_to_megatron(self, hf_weights, megatron_module):
                # Resolve the exact target parameter by name to get its device/dtype.
                normalized_param = self._normalize_expert_param_name(self.megatron_param)
                _, target_param = get_module_and_param_from_name(megatron_module, normalized_param)

                target_device = target_param.device
                target_dtype = target_param.dtype

                hf_weights = hf_weights.to(device=target_device, dtype=target_dtype)
                if self.tp_size == 1:
                    return hf_weights

                if target_device.type == "cuda" and torch.cuda.is_available():
                    if target_device.index != torch.cuda.current_device():
                        hf_weights = hf_weights.to(torch.cuda.current_device())

                # Only TP rank 0 holds real data; others allocate a same-shape buffer
                # that the broadcast below fills in.
                if self.tp_rank > 0:
                    hf_weights = torch.empty_like(hf_weights)

                return self.broadcast_tensor_to_tp_ranks(hf_weights, src_rank=0)


        mapping_list = []
        # Convert each dictionary entry to AutoMapping(hf_param, megatron_param)
        for hf_param, megatron_param in param_mappings.items():
            if hf_param in {"scale_shift_table", "blocks.*.scale_shift_table", "proj_out.weight", "proj_out.bias"}:
                # Use WAN-specific replicated mapping that resolves the exact param
                mapping_list.append(_ReplicatedByParamNameMapping(hf_param=hf_param, megatron_param=megatron_param))
            else:
                mapping_list.append(AutoMapping(hf_param=hf_param, megatron_param=megatron_param))

        # Adding custom module types for AutoMapping
        AutoMapping.register_module_type("Linear", "replicated")
        AutoMapping.register_module_type("Conv3d", "replicated")
        AutoMapping.register_module_type("WanAdaLN", "replicated")
        AutoMapping.register_module_type("Head", "replicated")

        # Add special mappings that require parameter concatenation/transformation
        mapping_list.extend(
            [
                # QKV: Combine separate Q, K, V matrices into single QKV matrix
                QKVMapping(
                    q="blocks.*.attn1.to_q.weight",
                    k="blocks.*.attn1.to_k.weight",
                    v="blocks.*.attn1.to_v.weight",
                    megatron_param="decoder.layers.*.full_self_attention.linear_qkv.weight",
                ),
                # QKV bias: Combine separate Q, K, V bias into single QKV bias
                QKVMapping(
                    q="blocks.*.attn1.to_q.bias",
                    k="blocks.*.attn1.to_k.bias",
                    v="blocks.*.attn1.to_v.bias",
                    megatron_param="decoder.layers.*.full_self_attention.linear_qkv.bias",
                ),
                # K, V: Combine separate K, V matrices into single KV matrix
                KVMapping(
                    k="blocks.*.attn2.to_k.weight",
                    v="blocks.*.attn2.to_v.weight",
                    megatron_param="decoder.layers.*.cross_attention.linear_kv.weight",
                ),
                # K, V bias: Combine separate K, V bias into single KV bias
                KVMapping(
                    k="blocks.*.attn2.to_k.bias",
                    v="blocks.*.attn2.to_v.bias",
                    megatron_param="decoder.layers.*.cross_attention.linear_kv.bias",
                ),
            ]
        )

        return MegatronMappingRegistry(*mapping_list)


@MegatronModelBridge.register_bridge(source=WanVACETransformer3DModel, target=VACEModel)
class VACEBridge(MegatronModelBridge):
    """
    Megatron Bridge for VACE model.

    As a user you would not use this bridge directly, but through `AutoBridge`.

    Example:
        >>> from megatron.bridge import AutoBridge
        >>> bridge = AutoBridge.from_hf_pretrained("WAN-3D-1.3B-v1")
        >>> provider = bridge.to_megatron_provider()
    """

    def provider_bridge(self, hf_pretrained: PreTrainedVACE) -> VACEModelProvider:
        """Translate a HuggingFace VACE config into a VACEModelProvider.

        Same derivations as WanBridge.provider_bridge, plus the VACE-specific
        fields (vace_in_channels, vace_layers, base_num_layers).
        NOTE(review): this duplicates WanBridge.provider_bridge almost entirely —
        consider factoring the shared kwargs into a helper.
        """
        hf_config = hf_pretrained.config

        cls = VACEModelProvider

        provider = cls(
            num_layers=hf_config.num_layers,
            hidden_size=hf_config.num_attention_heads * hf_config.attention_head_dim,
            kv_channels=hf_config.attention_head_dim,
            num_query_groups=hf_config.num_attention_heads,
            crossattn_emb_size=hf_config.num_attention_heads * hf_config.attention_head_dim,
            ffn_hidden_size=hf_config.ffn_dim,
            num_attention_heads=hf_config.num_attention_heads,
            activation_func=openai_gelu,
            in_channels=hf_config.in_channels,
            out_channels=hf_config.out_channels,
            text_dim=hf_config.text_dim,
            patch_spatial=hf_config.patch_size[1],
            patch_temporal=hf_config.patch_size[0],
            layernorm_epsilon=hf_config.eps,
            hidden_dropout=0,
            attention_dropout=0,
            use_cpu_initialization=True,
            freq_dim=hf_config.freq_dim,
            bf16=False,
            params_dtype=torch.float32,
            vace_in_channels=hf_config.vace_in_channels,
            vace_layers=hf_config.vace_layers,
            base_num_layers=hf_config.num_layers,
        )

        return provider


    def mapping_registry(self) -> MegatronMappingRegistry:
        """Return MegatronMappingRegistry containing parameter mappings from HF to Megatron format.

        Returns:
            MegatronMappingRegistry: Registry of parameter mappings
        """
        # Dictionary maps HF parameter names -> Megatron parameter names
        # Supports wildcard (*) patterns for layer-specific parameters
        # NOTE(review): the base (non-vace) entries duplicate WanBridge.mapping_registry
        # verbatim — keep the two in sync or factor them into a shared constant.
        param_mappings = {
            "scale_shift_table": "head.modulation",
            "patch_embedding.weight": "patch_embedding.weight",
            "patch_embedding.bias": "patch_embedding.bias",
            "condition_embedder.time_embedder.linear_1.weight": "time_embedding.0.weight",
            "condition_embedder.time_embedder.linear_1.bias": "time_embedding.0.bias",
            "condition_embedder.time_embedder.linear_2.weight": "time_embedding.2.weight",
            "condition_embedder.time_embedder.linear_2.bias": "time_embedding.2.bias",
            "condition_embedder.time_proj.weight": "time_projection.1.weight",
            "condition_embedder.time_proj.bias": "time_projection.1.bias",
            "condition_embedder.text_embedder.linear_1.weight": "text_embedding.0.weight",
            "condition_embedder.text_embedder.linear_1.bias": "text_embedding.0.bias",
            "condition_embedder.text_embedder.linear_2.weight": "text_embedding.2.weight",
            "condition_embedder.text_embedder.linear_2.bias": "text_embedding.2.bias",
            "blocks.*.scale_shift_table": "decoder.layers.*.adaLN.modulation",
            "blocks.*.attn1.to_out.0.weight": "decoder.layers.*.full_self_attention.linear_proj.weight",
            "blocks.*.attn1.to_out.0.bias": "decoder.layers.*.full_self_attention.linear_proj.bias",
            "blocks.*.attn1.norm_q.weight": "decoder.layers.*.full_self_attention.q_layernorm.weight",
            "blocks.*.attn1.norm_k.weight": "decoder.layers.*.full_self_attention.k_layernorm.weight",
            "blocks.*.attn2.to_q.weight": "decoder.layers.*.cross_attention.linear_q.weight",
            "blocks.*.attn2.to_q.bias": "decoder.layers.*.cross_attention.linear_q.bias",
            "blocks.*.attn2.to_out.0.weight": "decoder.layers.*.cross_attention.linear_proj.weight",
            "blocks.*.attn2.to_out.0.bias": "decoder.layers.*.cross_attention.linear_proj.bias",
            "blocks.*.attn2.norm_q.weight": "decoder.layers.*.cross_attention.q_layernorm.weight",
            "blocks.*.attn2.norm_k.weight": "decoder.layers.*.cross_attention.k_layernorm.weight",
            "blocks.*.norm2.weight": "decoder.layers.*.norm3.weight",
            "blocks.*.norm2.bias": "decoder.layers.*.norm3.bias",
            "blocks.*.ffn.net.0.proj.weight": "decoder.layers.*.mlp.linear_fc1.weight",
            "blocks.*.ffn.net.0.proj.bias": "decoder.layers.*.mlp.linear_fc1.bias",
            "blocks.*.ffn.net.2.weight": "decoder.layers.*.mlp.linear_fc2.weight",
            "blocks.*.ffn.net.2.bias": "decoder.layers.*.mlp.linear_fc2.bias",
            "proj_out.weight": "head.head.weight",
            "proj_out.bias": "head.head.bias",

            # VACE-specific parameters (control branch).
            "vace_patch_embedding.weight": "vace_patch_embedding.weight",
            "vace_patch_embedding.bias": "vace_patch_embedding.bias",
            "vace_blocks.0.proj_in.weight": "vace_init_proj.weight",
            "vace_blocks.0.proj_in.bias": "vace_init_proj.bias",
            "vace_blocks.*.scale_shift_table": "vace_decoder.layers.*.adaLN.modulation",
            "vace_blocks.*.attn1.to_out.0.weight": "vace_decoder.layers.*.full_self_attention.linear_proj.weight",
            "vace_blocks.*.attn1.to_out.0.bias": "vace_decoder.layers.*.full_self_attention.linear_proj.bias",
            "vace_blocks.*.attn1.norm_q.weight": "vace_decoder.layers.*.full_self_attention.q_layernorm.weight",
            "vace_blocks.*.attn1.norm_k.weight": "vace_decoder.layers.*.full_self_attention.k_layernorm.weight",
            "vace_blocks.*.attn2.to_q.weight": "vace_decoder.layers.*.cross_attention.linear_q.weight",
            "vace_blocks.*.attn2.to_q.bias": "vace_decoder.layers.*.cross_attention.linear_q.bias",
            "vace_blocks.*.attn2.to_out.0.weight": "vace_decoder.layers.*.cross_attention.linear_proj.weight",
            "vace_blocks.*.attn2.to_out.0.bias": "vace_decoder.layers.*.cross_attention.linear_proj.bias",
            "vace_blocks.*.attn2.norm_q.weight": "vace_decoder.layers.*.cross_attention.q_layernorm.weight",
            "vace_blocks.*.attn2.norm_k.weight": "vace_decoder.layers.*.cross_attention.k_layernorm.weight",
            "vace_blocks.*.norm2.weight": "vace_decoder.layers.*.norm3.weight",
            "vace_blocks.*.norm2.bias": "vace_decoder.layers.*.norm3.bias",
            "vace_blocks.*.ffn.net.0.proj.weight": "vace_decoder.layers.*.mlp.linear_fc1.weight",
            "vace_blocks.*.ffn.net.0.proj.bias": "vace_decoder.layers.*.mlp.linear_fc1.bias",
            "vace_blocks.*.ffn.net.2.weight": "vace_decoder.layers.*.mlp.linear_fc2.weight",
            "vace_blocks.*.ffn.net.2.bias": "vace_decoder.layers.*.mlp.linear_fc2.bias",
            "vace_blocks.*.proj_out.weight":"vace_decoder.layers.*.context_proj.weight",
            "vace_blocks.*.proj_out.bias":"vace_decoder.layers.*.context_proj.bias",
        }


        # Custom WAN mapping to safely handle replicated params whose owning module
        # does not expose a top-level `.weight` (e.g., Head.modulation)
        # NOTE(review): identical to the inner class in WanBridge — candidate for a
        # shared module-level definition.
        class _ReplicatedByParamNameMapping(ReplicatedMapping):
            def hf_to_megatron(self, hf_weights, megatron_module):
                normalized_param = self._normalize_expert_param_name(self.megatron_param)
                _, target_param = get_module_and_param_from_name(megatron_module, normalized_param)

                target_device = target_param.device
                target_dtype = target_param.dtype

                hf_weights = hf_weights.to(device=target_device, dtype=target_dtype)
                if self.tp_size == 1:
                    return hf_weights

                if target_device.type == "cuda" and torch.cuda.is_available():
                    if target_device.index != torch.cuda.current_device():
                        hf_weights = hf_weights.to(torch.cuda.current_device())

                if self.tp_rank > 0:
                    hf_weights = torch.empty_like(hf_weights)

                return self.broadcast_tensor_to_tp_ranks(hf_weights, src_rank=0)


        mapping_list = []
        # Convert each dictionary entry to AutoMapping(hf_param, megatron_param)
        for hf_param, megatron_param in param_mappings.items():
            if hf_param in {"scale_shift_table", "blocks.*.scale_shift_table", "vace_blocks.*.scale_shift_table", "proj_out.weight", "proj_out.bias"}:
                # Use WAN-specific replicated mapping that resolves the exact param
                mapping_list.append(_ReplicatedByParamNameMapping(hf_param=hf_param, megatron_param=megatron_param))
            else:
                mapping_list.append(AutoMapping(hf_param=hf_param, megatron_param=megatron_param))

        # Adding custom module types for AutoMapping
        AutoMapping.register_module_type("Linear", "replicated")
        AutoMapping.register_module_type("Conv3d", "replicated")
        AutoMapping.register_module_type("WanAdaLN", "replicated")
        AutoMapping.register_module_type("Head", "replicated")

        # Add special mappings that require parameter concatenation/transformation
        mapping_list.extend(
            [
                # QKV: Combine separate Q, K, V matrices into single QKV matrix
                QKVMapping(
                    q="blocks.*.attn1.to_q.weight",
                    k="blocks.*.attn1.to_k.weight",
                    v="blocks.*.attn1.to_v.weight",
                    megatron_param="decoder.layers.*.full_self_attention.linear_qkv.weight",
                ),
                # QKV bias: Combine separate Q, K, V bias into single QKV bias
                QKVMapping(
                    q="blocks.*.attn1.to_q.bias",
                    k="blocks.*.attn1.to_k.bias",
                    v="blocks.*.attn1.to_v.bias",
                    megatron_param="decoder.layers.*.full_self_attention.linear_qkv.bias",
                ),
                # K, V: Combine separate K, V matrices into single KV matrix
                KVMapping(
                    k="blocks.*.attn2.to_k.weight",
                    v="blocks.*.attn2.to_v.weight",
                    megatron_param="decoder.layers.*.cross_attention.linear_kv.weight",
                ),
                # K, V bias: Combine separate K, V bias into single KV bias
                KVMapping(
                    k="blocks.*.attn2.to_k.bias",
                    v="blocks.*.attn2.to_v.bias",
                    megatron_param="decoder.layers.*.cross_attention.linear_kv.bias",
                ),

                # QKV: Combine separate Q, K, V matrices into single QKV matrix
                QKVMapping(
                    q="vace_blocks.*.attn1.to_q.weight",
                    k="vace_blocks.*.attn1.to_k.weight",
                    v="vace_blocks.*.attn1.to_v.weight",
                    megatron_param="vace_decoder.layers.*.full_self_attention.linear_qkv.weight",
                ),
                # QKV bias: Combine separate Q, K, V bias into single QKV bias
                QKVMapping(
                    q="vace_blocks.*.attn1.to_q.bias",
                    k="vace_blocks.*.attn1.to_k.bias",
                    v="vace_blocks.*.attn1.to_v.bias",
                    megatron_param="vace_decoder.layers.*.full_self_attention.linear_qkv.bias",
                ),
                # K, V: Combine separate K, V matrices into single KV matrix
                KVMapping(
                    k="vace_blocks.*.attn2.to_k.weight",
                    v="vace_blocks.*.attn2.to_v.weight",
                    megatron_param="vace_decoder.layers.*.cross_attention.linear_kv.weight",
                ),
                # K, V bias: Combine separate K, V bias into single KV bias
                KVMapping(
                    k="vace_blocks.*.attn2.to_k.bias",
                    v="vace_blocks.*.attn2.to_v.bias",
                    megatron_param="vace_decoder.layers.*.cross_attention.linear_kv.bias",
                ),
            ]
        )

        return MegatronMappingRegistry(*mapping_list)
\ No newline at end of file
diff --git a/src/megatron/bridge/models/wan/wan_layer_spec.py b/src/megatron/bridge/models/wan/wan_layer_spec.py
new file mode 100644
index 0000000000..8a839bcd89
--- /dev/null
+++ b/src/megatron/bridge/models/wan/wan_layer_spec.py
@@ -0,0 +1,862 @@

# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint: disable=C0115,C0116,C0301

import copy
from dataclasses import dataclass
from typing import Union, Optional

import torch
import torch.cuda.amp as amp
import torch.nn as nn
from megatron.core import parallel_state, tensor_parallel
from megatron.core.transformer.attention import (
    CrossAttention,
    CrossAttentionSubmodules,
    SelfAttention,
    SelfAttentionSubmodules,
)
from megatron.core.transformer.custom_layers.transformer_engine import (
    TEColumnParallelLinear,
    TEDotProductAttention,
    TERowParallelLinear,
    TELinear,
)
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.spec_utils import ModuleSpec, build_module
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
from megatron.core.utils import make_viewless_tensor
from megatron.core.process_groups_config import ProcessGroupCollection
from megatron.core.extensions.transformer_engine import TENorm

try:
    import transformer_engine  # pylint: disable=unused-import

    HAVE_TE = True
    from megatron.core.extensions.transformer_engine import SplitAlongDim

except ImportError:
    HAVE_TE = False
    SplitAlongDim = None


class WanLayerNorm(nn.LayerNorm):
    # LayerNorm without learnable affine params by default; casts the fp32
    # normalization result back to the input dtype via type_as.

    def __init__(self, dim, eps=1e-6, elementwise_affine=False):
        super().__init__(dim, elementwise_affine=elementwise_affine, eps=eps)

    def forward(self, x):
        r"""
        Args:
            x(Tensor): Shape [B, L, C]
        """
        return super().forward(x).type_as(x)


@dataclass
class WanSelfAttentionSubmodules:
    """
    Configuration class for specifying the submodules of a self-attention.
    """

    linear_qkv: Union[ModuleSpec, type] = None
    core_attention: Union[ModuleSpec, type] = None
    linear_proj: Union[ModuleSpec, type] = None
    # When True, q/k layernorms span all heads (projection size) instead of per-head.
    layernorm_across_head: bool = False
    q_layernorm: Union[ModuleSpec, type] = None
    k_layernorm: Union[ModuleSpec, type] = None


@dataclass
class WanCrossAttentionSubmodules:
    """
    Configuration class for specifying the submodules of a cross-attention.
    """
    linear_q: Union[ModuleSpec, type] = None
    linear_kv: Union[ModuleSpec, type] = None
    core_attention: Union[ModuleSpec, type] = None
    linear_proj: Union[ModuleSpec, type] = None
    # When True, q/k layernorms span all heads (projection size) instead of per-head.
    layernorm_across_head: bool = False
    q_layernorm: Union[ModuleSpec, type] = None
    k_layernorm: Union[ModuleSpec, type] = None


class WanSelfAttention(SelfAttention):
    """SelfAttention variant that rebuilds q/k layernorms as RMSNorm, optionally
    normalizing across all heads (gathered over TP) instead of per head."""

    def __init__(
        self,
        config: TransformerConfig,
        submodules: WanSelfAttentionSubmodules,
        layer_number: int,
        attn_mask_type: AttnMaskType,
        cp_comm_type: str = None,
        pg_collection: Optional[ProcessGroupCollection] = None,
    ):
        super().__init__(
            config,
            submodules,
            layer_number,
            attn_mask_type,
            cp_comm_type,
            pg_collection,
        )

        self.layernorm_across_head = submodules.layernorm_across_head

        # override q_layernorm
        if submodules.q_layernorm is not None:
            if self.layernorm_across_head:
                # Normalize over the full (all-heads) query projection.
                q_layernorm_size = self.query_projection_size
            else:
                q_layernorm_size = self.hidden_size_per_attention_head
            import transformer_engine as te  # NOTE(review): `te` is imported but unused here
            # Force RMSNorm regardless of the global normalization setting.
            norm_config = copy.deepcopy(self.config)
            norm_config.normalization = "RMSNorm"
            self.q_layernorm = build_module(
                submodules.q_layernorm,
                eps=norm_config.layernorm_epsilon,
                hidden_size=q_layernorm_size,
                config=norm_config,
            )
        else:
            self.q_layernorm = None

        # override k_layernorm
        if submodules.k_layernorm is not None:
            if self.layernorm_across_head:
                k_layernorm_size = self.kv_projection_size
            else:
                k_layernorm_size = self.hidden_size_per_attention_head
            import transformer_engine as te  # NOTE(review): `te` is imported but unused here
            norm_config = copy.deepcopy(self.config)
            norm_config.normalization = "RMSNorm"
            self.k_layernorm = build_module(
                submodules.k_layernorm,
                eps=norm_config.layernorm_epsilon,
                hidden_size=k_layernorm_size,
                config=norm_config,
            )
        else:
            self.k_layernorm = None

    def get_query_key_value_tensors(self, hidden_states, key_value_states=None):
        """
        Derives `query`, `key` and `value` tensors from `hidden_states`.
        """
        # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)]
        mixed_qkv, _ = self.linear_qkv(hidden_states)

        # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn]
        new_tensor_shape = mixed_qkv.size()[:-1] + (
            self.num_query_groups_per_partition,
            (
                (self.num_attention_heads_per_partition // self.num_query_groups_per_partition + 2)
                * self.hidden_size_per_attention_head
            ),
        )
        mixed_qkv = mixed_qkv.view(*new_tensor_shape)

        split_arg_list = [
            (
                self.num_attention_heads_per_partition
                // self.num_query_groups_per_partition
                * self.hidden_size_per_attention_head
            ),
            self.hidden_size_per_attention_head,
            self.hidden_size_per_attention_head,
        ]

        if SplitAlongDim is not None:

            # [sq, b, ng, (np/ng + 2) * hn]
            # --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn]
            (query, key, value) = SplitAlongDim(mixed_qkv, 3, split_arg_list)
        else:

            # [sq, b, ng, (np/ng + 2) * hn]
            # --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn]
            (query, key, value) = torch.split(mixed_qkv, split_arg_list, dim=3)

        # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn]
        query = query.reshape(query.size(0), query.size(1), -1, self.hidden_size_per_attention_head)

        # gather query and key heads across TP ranks if self.layernorm_across_head is True
        if self.layernorm_across_head and parallel_state.get_tensor_model_parallel_world_size() > 1:
            # Transpose so the head dim is last before gathering (gather acts on the last dim).
            query = query.transpose(-2, -1)
            key = key.transpose(-2, -1)
            query = tensor_parallel.gather_from_tensor_model_parallel_region(query)
            key = tensor_parallel.gather_from_tensor_model_parallel_region(key)
            query = query.transpose(-2, -1)
            key = key.transpose(-2, -1)

        if self.q_layernorm is not None:
            if self.layernorm_across_head:
                # Flatten heads so the norm spans the whole projection, then restore shape.
                q_flat = query.reshape(query.size(0), query.size(1), -1).contiguous()  # [sq, b, np*hn]
                q_flat = self.q_layernorm(q_flat)
                query = q_flat.view(query.size(0), query.size(1), -1, self.hidden_size_per_attention_head)  # [sq, b, np, hn]
            else:
                query = self.q_layernorm(query.contiguous())

        if self.k_layernorm is not None:
            if self.layernorm_across_head:
                k_flat = key.reshape(key.size(0), key.size(1), -1).contiguous()
                k_flat = self.k_layernorm(k_flat)
                key = k_flat.view(key.size(0), key.size(1), -1, self.hidden_size_per_attention_head)
            else:
                key = self.k_layernorm(key.contiguous())

        # scatter query and key heads across TP ranks if self.layernorm_across_head is True
        if self.layernorm_across_head and parallel_state.get_tensor_model_parallel_world_size() > 1:
            query = query.transpose(-2, -1)
            key = key.transpose(-2, -1)
            query = tensor_parallel.scatter_to_tensor_model_parallel_region(query)
            key = tensor_parallel.scatter_to_tensor_model_parallel_region(key)
            query = query.transpose(-2, -1)
            key = key.transpose(-2, -1)
            query = query.contiguous()  # important because TE attention expects contiguous tensors
            key = key.contiguous()  # important because TE attention expects contiguous tensors

        if self.config.test_mode:
            self.run_realtime_tests()

        return query, key, value


class WanCrossAttention(CrossAttention):
    """CrossAttention variant mirroring WanSelfAttention's RMSNorm q/k-layernorm overrides."""

    def __init__(
        self,
        config: TransformerConfig,
        submodules: WanCrossAttentionSubmodules,
        layer_number: int,
        attn_mask_type: AttnMaskType,
        cp_comm_type: str = None,
        pg_collection: Optional[ProcessGroupCollection] = None,
    ):
        super().__init__(
            config,
            submodules,
            layer_number,
            attn_mask_type,
            cp_comm_type,
            pg_collection,
        )

        self.layernorm_across_head = submodules.layernorm_across_head

        # override
q_layernorm + if submodules.q_layernorm is not None: + if self.layernorm_across_head: + q_layernorm_size = self.query_projection_size + else: + q_layernorm_size = self.hidden_size_per_attention_head + import transformer_engine as te + norm_config = copy.deepcopy(self.config) + norm_config.normalization = "RMSNorm" + self.q_layernorm = build_module( + submodules.q_layernorm, + eps=norm_config.layernorm_epsilon, + hidden_size=q_layernorm_size, + config=norm_config, + ) + else: + self.q_layernorm = None + + # override k_layernorm + if submodules.k_layernorm is not None: + if self.layernorm_across_head: + k_layernorm_size = self.kv_projection_size + else: + k_layernorm_size = self.hidden_size_per_attention_head + import transformer_engine as te + norm_config = copy.deepcopy(self.config) + norm_config.normalization = "RMSNorm" + self.k_layernorm = build_module( + submodules.k_layernorm, + eps=norm_config.layernorm_epsilon, + hidden_size=k_layernorm_size, + config=norm_config, + ) + else: + self.k_layernorm = None + + def get_query_key_value_tensors(self, hidden_states, key_value_states): + """ + Derives `query` tensor from `hidden_states`, and `key`/`value` tensors + from `key_value_states`. 
+ """ + # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] + mixed_kv, _ = self.linear_kv(key_value_states) + + # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn] + new_tensor_shape = mixed_kv.size()[:-1] + ( + self.num_attention_heads_per_partition, + 2 * self.hidden_size_per_attention_head, + ) + mixed_kv = mixed_kv.view(*new_tensor_shape) + + # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn] + (key, value) = tensor_parallel.split_tensor_along_last_dim(mixed_kv, 2) + + # Attention head [sq, b, h] --> [sq, b, hp] + query, _ = self.linear_q(hidden_states) + + # [sq, b, hp] --> [sq, b, np, hn] + new_tensor_shape = query.size()[:-1] + ( + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ) + query = query.view(*new_tensor_shape) + + # gather query and key heads across TP ranks if self.layernorm_across_head is True + if self.layernorm_across_head and parallel_state.get_tensor_model_parallel_world_size() > 1: + query = query.transpose(-2, -1) + key = key.transpose(-2, -1) + query = tensor_parallel.gather_from_tensor_model_parallel_region(query) + key = tensor_parallel.gather_from_tensor_model_parallel_region(key) + query = query.transpose(-2, -1) + key = key.transpose(-2, -1) + + if self.q_layernorm is not None: + if self.layernorm_across_head: + q_flat = query.reshape(query.size(0), query.size(1), -1).contiguous() # [sq, b, np*hn] + q_flat = self.q_layernorm(q_flat) + query = q_flat.view(query.size(0), query.size(1), -1, self.hidden_size_per_attention_head) # [sq, b, np, hn] + else: + query = self.q_layernorm(query.contiguous()) + + if self.k_layernorm is not None: + if self.layernorm_across_head: + k_flat = key.reshape(key.size(0), key.size(1), -1).contiguous() + k_flat = self.k_layernorm(k_flat) + key = k_flat.view(key.size(0), key.size(1), -1, self.hidden_size_per_attention_head) + else: + key = self.k_layernorm(key.contiguous()) + + # scatter query and key heads across TP ranks if self.layernorm_across_head is True + if 
self.layernorm_across_head and parallel_state.get_tensor_model_parallel_world_size() > 1: + query = query.transpose(-2, -1) + key = key.transpose(-2, -1) + query = tensor_parallel.scatter_to_tensor_model_parallel_region(query) + key = tensor_parallel.scatter_to_tensor_model_parallel_region(key) + query = query.transpose(-2, -1) + key = key.transpose(-2, -1) + query = query.contiguous() # important becuase TE attention expects contiguous tensors + key = key.contiguous() # important becuase TE attention expects contiguous tensors + + return query, key, value + + +@dataclass +class WanWithAdaLNSubmodules(TransformerLayerSubmodules): + temporal_self_attention: Union[ModuleSpec, type] = IdentityOp + full_self_attention: Union[ModuleSpec, type] = IdentityOp + norm1: Union[ModuleSpec, type] = None + norm3: Union[ModuleSpec, type] = None + norm2: Union[ModuleSpec, type] = None + context_proj: Union[ModuleSpec, type] = IdentityOp + + +# @dataclass +# class VACEContextLayerSubmodules(WanWithAdaLNSubmodules): + + + +class WanAdaLN(MegatronModule): + """ + Adaptive Layer Normalization Module for DiT. + """ + + def __init__( + self, config: TransformerConfig + ): + super().__init__(config) + # modulation + self.modulation = nn.Parameter(torch.randn(1, 6, config.hidden_size) / config.hidden_size**0.5) + + setattr(self.modulation, "sequence_parallel", config.sequence_parallel) + + def forward(self, timestep_emb): + e = (self.modulation + timestep_emb).chunk(6, dim=1) + return e + + # @jit_fuser + def modulate(self, x, shift, scale): + return x * (1 + scale) + shift + + # @jit_fuser + def scale_add(self, residual, x, gate): + return residual + gate * x + + +class WanLayerWithAdaLN(TransformerLayer): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + DiT with Adapative Layer Normalization. 
+ """ + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + hidden_dropout: float = None, + pg_collection: Optional[ProcessGroupCollection] = None, + vp_stage: Optional[int] = None, + ): + super().__init__( + config=config, submodules=submodules, layer_number=layer_number, hidden_dropout=hidden_dropout, pg_collection=pg_collection, vp_stage=vp_stage + ) + + # # TODO: Override Cross Attention to disable TP Comm overlap as well. ??? + # # Not disabling will attempt re-use of buffer size same as Q and lead to incorrect tensor shapes. + # cp_override_config = copy.deepcopy(config) + # cp_override_config.tp_comm_overlap = False + # self.cross_attention = build_module( + # submodules.cross_attention, + # config=cp_override_config, + # layer_number=layer_number, + # ) + + self.full_self_attention = build_module( + submodules.full_self_attention, + config=self.config, + layer_number=layer_number, + cp_comm_type=config.cp_comm_type, + pg_collection=pg_collection, + ) + + self.adaLN = WanAdaLN(config=self.config) + self.norm1 = build_module( + submodules.norm1, + dim=config.hidden_size, + eps=config.layernorm_epsilon, + elementwise_affine=False + ) + self.norm3 = build_module( + submodules.norm3, + dim=config.hidden_size, + eps=config.layernorm_epsilon, + elementwise_affine=True, + ) + self.norm2 = build_module( + submodules.norm2, + dim=config.hidden_size, + eps=config.layernorm_epsilon, + elementwise_affine=False, + ) + + + def forward( + self, + hidden_states, + attention_mask=None, + context=None, + context_mask=None, + rotary_pos_emb=None, + rotary_pos_cos=None, + rotary_pos_sin=None, + attention_bias=None, + inference_params=None, + packed_seq_params=None, + sequence_len_offset=None, + inference_context=None, + ): + + # log_checkpoint("before layer") + + # the timestep embedding is stored in attention_mask argument + timestep_emb = attention_mask + rope_emb = rotary_pos_emb + + shift_full, 
scale_full, gate_full, shift_mlp, scale_mlp, gate_mlp = self.adaLN(timestep_emb) + # transpose to bring it to [1, b, ...] format + shift_full = shift_full.transpose(0, 1) + scale_full = scale_full.transpose(0, 1) + gate_full = gate_full.transpose(0, 1) + shift_mlp = shift_mlp.transpose(0, 1) + scale_mlp = scale_mlp.transpose(0, 1) + gate_mlp = gate_mlp.transpose(0, 1) + + # ******************************************** full self attention ******************************************* + + # adaLN with scale + shift + gate + pre_full_attn_layernorm_output_ada = self.adaLN.modulate( + self.norm1(hidden_states), + shift=shift_full, + scale=scale_full, + ) + + attention_output, bias = self.full_self_attention( + pre_full_attn_layernorm_output_ada, + attention_mask=None, + rotary_pos_emb=rope_emb, + rotary_pos_cos=rotary_pos_cos, + rotary_pos_sin=rotary_pos_sin, + packed_seq_params=packed_seq_params['self_attention'], + ) + if bias is not None: + attention_output = attention_output + bias + + hidden_states = self.adaLN.scale_add(residual=hidden_states, x=attention_output, gate=gate_full) + + # ******************************************** cross attention ****************************************************** + + attention_output, bias = self.cross_attention( + self.norm3(hidden_states), + attention_mask=context_mask, + key_value_states=context, + packed_seq_params=packed_seq_params['cross_attention'], + ) + if bias is not None: + attention_output = attention_output + bias + + hidden_states = hidden_states + attention_output + + # ******************************************** mlp ****************************************************** + + pre_mlp_layernorm_output_ada = self.adaLN.modulate( + self.norm2(hidden_states), + shift=shift_mlp, + scale=scale_mlp, + ) + + mlp_output, bias = self.mlp(pre_mlp_layernorm_output_ada) + if bias is not None: + mlp_output = mlp_output + bias + + hidden_states = self.adaLN.scale_add(residual=hidden_states, x=mlp_output, gate=gate_mlp) + + # 
TODO: Jit compiled function creates 'view' tensor. This tensor + # potentially gets saved in the MPU checkpoint function context, + # which rejects view tensors. While making a viewless tensor here + # won't result in memory savings (like the data loader, or + # p2p_communication), it serves to document the origin of this + # 'view' tensor. ??? + output = make_viewless_tensor(inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True) + # output = hidden_states + + # log_checkpoint("after layer") + + return output, context + +def log_checkpoint(tag): + torch.cuda.synchronize() + alloc = torch.cuda.memory_allocated() / 1024**3 + reserved = torch.cuda.memory_reserved() / 1024**3 + print(f"[{tag}] alloc={alloc:.2f} GB reserved={reserved:.2f} GB") + +class VACEBaseLayer(WanLayerWithAdaLN): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + DiT with Adapative Layer Normalization. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + hidden_dropout: float = None, + pg_collection: Optional[ProcessGroupCollection] = None, + vp_stage: Optional[int] = None, + ): + super().__init__( + config=config, submodules=submodules, layer_number=layer_number, hidden_dropout=hidden_dropout, pg_collection=pg_collection, vp_stage=vp_stage + ) + + + def forward( + self, + hidden_states, + attention_mask=None, + context=None, + context_mask=None, + context_signal=None, + rotary_pos_emb=None, + rotary_pos_cos=None, + rotary_pos_sin=None, + attention_bias=None, + inference_params=None, + packed_seq_params=None, + sequence_len_offset=None, + inference_context=None, + ): + + # log_checkpoint("before base") + + hidden_states, context = super().forward( + hidden_states, + attention_mask=attention_mask, + context=context, + context_mask=None, + rotary_pos_emb=rotary_pos_emb, + rotary_pos_cos=rotary_pos_cos, + 
rotary_pos_sin=rotary_pos_sin, + attention_bias=attention_bias, + inference_params=inference_params, + packed_seq_params=packed_seq_params, + sequence_len_offset=sequence_len_offset, + inference_context=inference_context, + ) + # consider how to pass block id and context_scale + # the context_tokens from context branch is stored in context_signal argument + if self.idx is not None: + hidden_states = hidden_states + context_signal[self.idx] * self.context_scale + # hidden_states = hidden_states + context_signal[self.idx] * 2.0 + # hidden_states = hidden_states + torch.rand_like(context_signal[self.idx]) * 0.05 + + # log_checkpoint(f"after base {self.idx}") + + return hidden_states, context + + +class VACEContextLayer(WanLayerWithAdaLN): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + + DiT with Adapative Layer Normalization. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + hidden_dropout: float = None, + pg_collection: Optional[ProcessGroupCollection] = None, + vp_stage: Optional[int] = None, + ): + super().__init__( + config=config, submodules=submodules, layer_number=layer_number, hidden_dropout=hidden_dropout, pg_collection=pg_collection, vp_stage=vp_stage + ) + + # self.context_proj = build_module( + # submodules.context_proj, + # self.config.hidden_size, + # self.config.hidden_size, + # config=self.config, + # init_method=self.config.output_layer_init_method, + # bias=self.config.add_bias_linear, + # input_is_parallel=False, + # skip_bias_add=True, + # is_expert=False, + # tp_comm_buffer_name='proj', + # tp_group=self.pg_collection.tp, + # ) + self.context_proj = build_module( + submodules.context_proj, + self.config.hidden_size, + self.config.hidden_size, + parallel_mode="duplicated", + config=self.config, + init_method=self.config.output_layer_init_method, + bias=self.config.add_bias_linear, + 
skip_bias_add=False, + skip_weight_param_allocation=False, + is_expert=False, + symmetric_ar_type=self.config.symmetric_ar_type, + tp_comm_buffer_name='proj', + tp_group=None, + ) + + + def forward( + self, + hidden_states, + attention_mask=None, + context=None, + context_mask=None, + context_signal=None, + rotary_pos_emb=None, + rotary_pos_cos=None, + rotary_pos_sin=None, + attention_bias=None, + inference_params=None, + packed_seq_params=None, + sequence_len_offset=None, + inference_context=None, + ): + + # log_checkpoint("before context") + + hidden_states, context = super().forward( + hidden_states, + attention_mask=attention_mask, + context=context, + context_mask=None, + rotary_pos_emb=rotary_pos_emb, + rotary_pos_cos=rotary_pos_cos, + rotary_pos_sin=rotary_pos_sin, + attention_bias=attention_bias, + inference_params=inference_params, + packed_seq_params=packed_seq_params, + sequence_len_offset=sequence_len_offset, + inference_context=inference_context, + ) + context_signal[self.idx] = self.context_proj(hidden_states)[0] + + # log_checkpoint("after context") + + return hidden_states, context_signal + + +import transformer_engine as te +def get_wan_block_with_transformer_engine_spec() -> ModuleSpec: + params = {"attn_mask_type": AttnMaskType.padding} + return ModuleSpec( + module=WanLayerWithAdaLN, + submodules=WanWithAdaLNSubmodules( + norm1=WanLayerNorm, + norm3=WanLayerNorm, + norm2=WanLayerNorm, + full_self_attention=ModuleSpec( + module=WanSelfAttention, + params=params, + submodules=WanSelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + layernorm_across_head=True, + q_layernorm=TENorm, + k_layernorm=TENorm, + ), + ), + cross_attention=ModuleSpec( + module=WanCrossAttention, + params=params, + submodules=WanCrossAttentionSubmodules( + linear_q=TEColumnParallelLinear, + linear_kv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + 
linear_proj=TERowParallelLinear, + layernorm_across_head=True, + q_layernorm=TENorm, + k_layernorm=TENorm, + ), + ), + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, + # by default, activation_func is openai_gelu, which is equivalent to nn.GELU(approximate='tanh') + linear_fc2=TERowParallelLinear, + ), + ), + ), + ) + + +def get_vace_base_block_with_transformer_engine_spec() -> ModuleSpec: + params = {"attn_mask_type": AttnMaskType.padding} + return ModuleSpec( + module=VACEBaseLayer, + submodules=WanWithAdaLNSubmodules( + norm1=WanLayerNorm, + norm3=WanLayerNorm, + norm2=WanLayerNorm, + full_self_attention=ModuleSpec( + module=WanSelfAttention, + params=params, + submodules=WanSelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + layernorm_across_head=True, + q_layernorm=TENorm, + k_layernorm=TENorm, + ), + ), + cross_attention=ModuleSpec( + module=WanCrossAttention, + params=params, + submodules=WanCrossAttentionSubmodules( + linear_q=TEColumnParallelLinear, + linear_kv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + layernorm_across_head=True, + q_layernorm=TENorm, + k_layernorm=TENorm, + ), + ), + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, + # by default, activation_func is openai_gelu, which is equivalent to nn.GELU(approximate='tanh') + linear_fc2=TERowParallelLinear, + ), + ), + ), + ) + + +def get_vace_context_block_with_transformer_engine_spec() -> ModuleSpec: + params = {"attn_mask_type": AttnMaskType.padding} + return ModuleSpec( + module=VACEContextLayer, + submodules=WanWithAdaLNSubmodules( + norm1=WanLayerNorm, + norm3=WanLayerNorm, + norm2=WanLayerNorm, + full_self_attention=ModuleSpec( + module=WanSelfAttention, + params=params, + submodules=WanSelfAttentionSubmodules( + linear_qkv=TEColumnParallelLinear, + 
core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + layernorm_across_head=True, + q_layernorm=TENorm, + k_layernorm=TENorm, + ), + ), + cross_attention=ModuleSpec( + module=WanCrossAttention, + params=params, + submodules=WanCrossAttentionSubmodules( + linear_q=TEColumnParallelLinear, + linear_kv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + layernorm_across_head=True, + q_layernorm=TENorm, + k_layernorm=TENorm, + ), + ), + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear, + # by default, activation_func is openai_gelu, which is equivalent to nn.GELU(approximate='tanh') + linear_fc2=TERowParallelLinear, + ), + ), + context_proj=TELinear + ), + ) + diff --git a/src/megatron/bridge/models/wan/wan_model.py b/src/megatron/bridge/models/wan/wan_model.py new file mode 100644 index 0000000000..9917246cce --- /dev/null +++ b/src/megatron/bridge/models/wan/wan_model.py @@ -0,0 +1,1413 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# pylint: disable=C0115,C0116,C0301 + +from typing import Dict, Literal, Optional, Tuple, List, Union +import copy + +import math +import torch +import torch.cuda.amp as amp +import torch.nn as nn +from megatron.core import parallel_state, tensor_parallel +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.models.common.vision_module.vision_module import VisionModule +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.transformer.enums import ModelType +from megatron.core.transformer.transformer_block import TransformerBlock, TransformerBlockSubmodules +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_sharded_tensor_for_checkpoint +from megatron.bridge.models.wan.wan_layer_spec import ( + get_wan_block_with_transformer_engine_spec as WanLayerWithAdaLNspec, + get_vace_base_block_with_transformer_engine_spec as VACEBaseLayerspec, + get_vace_context_block_with_transformer_engine_spec as VACEContextLayerspec, +) +from megatron.bridge.models.wan.wan_layer_spec import WanLayerNorm +from torch import Tensor +from .rope_utils import Wan3DRopeEmbeddings + +from contextlib import nullcontext +from megatron.core.fp4_utils import get_fp4_context +from megatron.core.fp8_utils import get_fp8_context +from megatron.core.enums import Fp8Recipe +from megatron.core.fusions.fused_layer_norm import FusedLayerNorm +from megatron.core.inference.contexts import BaseInferenceContext +from megatron.core.process_groups_config import ProcessGroupCollection +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_layer import get_transformer_layer_offset +from megatron.core.utils import ( + WrappedTensor, + deprecate_inference_params, + get_pg_rank, + make_viewless_tensor, +) + +try: + import transformer_engine.pytorch as te # pylint: disable=unused-import + + HAVE_TE = True +except ImportError: + 
HAVE_TE = False + +try: + import apex # pylint: disable=unused-import + + HAVE_APEX = True +except ImportError: + HAVE_APEX = False + +get_cpu_offload_context = None +te_checkpoint = None + +if HAVE_TE: + from megatron.core.extensions.transformer_engine import ( + TENorm, + get_cpu_offload_context, + te_checkpoint, + ) + + LayerNormImpl = TENorm + +elif HAVE_APEX: + LayerNormImpl = FusedLayerNorm + +else: + from megatron.core.transformer.torch_norm import WrappedTorchNorm + + LayerNormImpl = WrappedTorchNorm + +class BaseTransformerBlock(TransformerBlock): + def __init__( + self, + config: TransformerConfig, + spec: Union[TransformerBlockSubmodules, ModuleSpec], + post_layer_norm: bool = True, + pre_process: bool = True, + post_process: bool = True, + pg_collection: ProcessGroupCollection = None, + vp_stage: Optional[int] = None, + ): + # Pass block id and context_scale + self.vace_layers = [i for i in range(0, config.num_layers, 2)] if config.vace_layers is None else config.vace_layers + print(self.vace_layers) + assert 0 in self.vace_layers + self.vace_layers_mapping = {i: n for n, i in enumerate(self.vace_layers)} + + super().__init__( + config=config, + spec=spec, + post_layer_norm=post_layer_norm, + pre_process=pre_process, + post_process=post_process, + pg_collection=pg_collection, + vp_stage=vp_stage, + ) + + def _build_layers(self): + # Transformer layers. + # @jcasper can we improve how we deal with layer_number? + # currently it's only used in CoreAttention? 
+ # if self.apply_query_key_layer_scaling: + # coeff = self.layer_number + # self.norm_factor *= coeff + def build_layer(layer_spec, layer_number): + global_layer_number = layer_number + get_transformer_layer_offset( + self.config, self.vp_stage, get_pg_rank(self.pg_collection.pp) + ) # 1-based index + if self.config.heterogeneous_block_specs: + layer_config = self.config.get_config_for_layer(global_layer_number) + else: + layer_config = self.config + + # Get appropriate quantization context (FP8 and FP4 are mutually exclusive) + if layer_config.fp8: + quantization_context = get_fp8_context( + layer_config, global_layer_number - 1, is_init=True + ) + elif layer_config.fp4: + quantization_context = get_fp4_context( + layer_config, global_layer_number - 1, is_init=True + ) + else: + quantization_context = nullcontext() + + with quantization_context: + module = build_module( + layer_spec, + config=layer_config, + layer_number=layer_number, + pg_collection=self.pg_collection, + vp_stage=self.vp_stage, + ) + idx = global_layer_number - 1 + if idx in self.vace_layers: + module.idx = self.vace_layers_mapping[idx] + module.context_scale = self.config.context_scale + else: + module.idx = None + return module + + # offset is implicit in TransformerLayer + self.layers = torch.nn.ModuleList( + [ + build_layer(layer_spec, i + 1) + for i, layer_spec in enumerate(self.submodules.layer_specs) + ] + ) + + # @TODO: add back account_for_embedding_in_pipeline_split (see issue #293) + # In pipeline parallelism, we want to add this LN only to the last stage of the pipeline + # self.post_process and self.post_layer_norm guide this behavior + if self.submodules.layer_norm and self.post_process and self.post_layer_norm: + self.final_layernorm = build_module( + self.submodules.layer_norm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + else: + self.final_layernorm = None # Either this or nn.Identity + + def _checkpointed_forward( + 
self, + hidden_states: Tensor, + attention_mask: Tensor, + context: Tensor, + context_mask: Tensor, + context_signal: Tensor, + rotary_pos_emb: Tensor, + attention_bias: Tensor, + packed_seq_params: PackedSeqParams, + use_inner_quantization_context: bool, + ): + """Forward method with activation checkpointing.""" + + def custom(start: int, end: int): + def custom_forward( + hidden_states, attention_mask, context, context_mask, context_signal, rotary_pos_emb + ): + for index in range(start, end): + layer = self._get_layer(index) + + # Get appropriate inner quantization context + if use_inner_quantization_context: + if self.config.fp8: + inner_quantization_context = get_fp8_context( + self.config, layer.layer_number - 1 + ) + # TODO: check if fp4 is supported in this case + elif self.config.fp4: + inner_quantization_context = get_fp4_context( + self.config, layer.layer_number - 1 + ) + else: + inner_quantization_context = nullcontext() + else: + inner_quantization_context = nullcontext() + + with inner_quantization_context: + hidden_states, context = layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + context=context, + context_mask=context_mask, + context_signal=context_signal, + rotary_pos_emb=rotary_pos_emb, + attention_bias=attention_bias, + inference_context=None, + packed_seq_params=packed_seq_params, + ) + return hidden_states, context + + return custom_forward + + def checkpoint_handler(forward_func): + """Determines whether to use the `te_checkpoint` or `tensor_parallel.checkpoint`""" + # TODO: check if fp4 is supported in this case + if self.config.fp8 or self.config.fp4: + return te_checkpoint( + forward_func, + self.config.distribute_saved_activations, + tensor_parallel.random.get_cuda_rng_tracker, + self.pg_collection.tp, + hidden_states, + attention_mask, + context, + context_mask, + context_signal, + rotary_pos_emb, + ) + else: + return tensor_parallel.checkpoint( + forward_func, + self.config.distribute_saved_activations, + 
hidden_states, + attention_mask, + context, + context_mask, + context_signal, + rotary_pos_emb, + ) + + if self.config.recompute_method == 'uniform': + # Uniformly divide the total number of Transformer layers and checkpoint + # the input activation of each divided chunk. + # A method to further reduce memory usage reducing checkpoints. + layer_idx = 0 + while layer_idx < self.num_layers_per_pipeline_rank: + hidden_states, context = checkpoint_handler( + custom(layer_idx, layer_idx + self.config.recompute_num_layers) + ) + + layer_idx += self.config.recompute_num_layers + + elif self.config.recompute_method == 'block': + # Checkpoint the input activation of only a set number of individual + # Transformer layers and skip the rest. + # A method fully use the device memory removing redundant re-computation. + recompute_skip_num_layers = 0 + for layer_idx in range(self.num_layers_per_pipeline_rank): + # Skip recomputation when input grad computation is not needed. + # Need to have at least one input tensor with gradient computation + # for re-enterant autograd engine. 
+ # TODO: check if fp4 is supported in this case + if (self.config.fp8 or self.config.fp4) and not hidden_states.requires_grad: + recompute_skip_num_layers += 1 + if ( + layer_idx >= recompute_skip_num_layers + and layer_idx < self.config.recompute_num_layers + recompute_skip_num_layers + ): + hidden_states, context = checkpoint_handler(custom(layer_idx, layer_idx + 1)) + else: + hidden_states, context = custom(layer_idx, layer_idx + 1)( + hidden_states, attention_mask, context, context_mask, context_signal, rotary_pos_emb + ) + else: + raise ValueError("Invalid activation recompute method.") + + return hidden_states + + def forward( + self, + hidden_states: Union[Tensor, WrappedTensor], + attention_mask: Optional[Tensor], + context: Optional[Tensor] = None, + context_mask: Optional[Tensor] = None, + context_signal: Optional[Tensor] = None, + rotary_pos_emb: Optional[Tensor] = None, + rotary_pos_cos: Optional[Tensor] = None, + rotary_pos_sin: Optional[Tensor] = None, + rotary_pos_cos_sin: Optional[Tensor] = None, + attention_bias: Optional[Tensor] = None, + inference_context: Optional[BaseInferenceContext] = None, + packed_seq_params: Optional[PackedSeqParams] = None, + sequence_len_offset: Optional[Tensor] = None, + *, + inference_params: Optional[BaseInferenceContext] = None, + dynamic_inference_decode_only: Optional[bool] = None, + ): + """ + Perform the forward pass through the transformer block. + + This method handles the core computation of the transformer, including + self-attention, optional cross-attention, and feed-forward operations. + + Args: + hidden_states (Union[Tensor, WrappedTensor]): Input tensor of shape [s, b, h] + where s is the sequence length, b is the batch size, and h is the hidden size. + Can be passed as a WrappedTensor during inference to avoid an obsolete + reference in the calling function. + attention_mask (Tensor): Boolean tensor of shape [1, 1, s, s] for masking + self-attention. 
+ context (Tensor, optional): Context tensor for cross-attention. + context_mask (Tensor, optional): Mask for cross-attention context + context_signal (Tensor, optional): Signal from context tokens + rotary_pos_emb (Tensor, optional): Rotary positional embeddings. + rotary_pos_cos (Optional[Tensor]): Rotary embedding cosine. + rotary_pos_sin (Optional[Tensor]): Rotary embedding sine. + rotary_pos_cos_sin (Optional[Tensor]): Combined rotary embedding cosine and sine. + Currently used exclusively for inference with dynamic batching and flashinfer RoPE. + attention_bias (Tensor): Bias tensor for Q * K.T of shape in shape broadcastable + to [b, num_head, sq, skv], e.g. [1, 1, sq, skv]. + Used as an alternative to apply attention mask for TE cuDNN attention. + inference_context (BaseInferenceContext, optional): Parameters for inference-time + optimizations. + packed_seq_params (PackedSeqParams, optional): Parameters for packed sequence + processing. + dynamic_inference_decode_only: Optional[bool]: If true, indicates that the current + inference context is for decode-only. This args is only used to uniquely + identify decode and non-decode cuda graph runners in the cuda graph manager. + + Returns: + Union[Tensor, Tuple[Tensor, Tensor]]: The output hidden states tensor of shape + [s, b, h], and optionally the updated context tensor if cross-attention is used. + """ + + inference_context = deprecate_inference_params(inference_context, inference_params) + # Remove 'dynamic_inference_decode_only' from kwargs if present + # this is only used to uniquely identify decode and non-decode cuda graph + # runners in the cuda graph manager + + # Delete the obsolete reference to the initial input tensor if necessary + if isinstance(hidden_states, WrappedTensor): + hidden_states = hidden_states.unwrap() + + if not self.pre_process: + # See set_input_tensor() + hidden_states = self.input_tensor + + # Viewless tensor. 
+ # - We only need to create a viewless tensor in the case of micro batch + # size (mbs) == 1, since in this case, 'hidden_states.transpose()' + # above creates a view tensor, and '.contiguous()' is a pass-through. + # For mbs >= 2, '.contiguous()' creates a new tensor, eliminating + # the need to make it viewless. + # + # However, we don't explicitly check mbs == 1 here because + # make_viewless_tensor() has negligible overhead when its input + # is already viewless. + # + # - For the 'else' case above, calling make_viewless_tensor() here is + # likely redundant, since p2p_communication.py (likely originator) + # already creates viewless tensors. That said, make_viewless_tensor() + # is called here to be future-proof and corner-case-proof. + hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True) + + if self.config.sequence_parallel: + rng_context = tensor_parallel.get_cuda_rng_tracker().fork() + else: + rng_context = nullcontext() + + # If fp8_recipe is delayed, wrap the entire pass with get_fp8_context(), + # otherwise do nothing extra at the outer level + # if we are using other fp8 recipes, then the context manager enter&exit are free + # we can wrap fp8_context within the for loop over layers, so that we can fine-grained + # control which layer will be fp8 or bf16 + # For FP4: NVFP4BlockScaling doesn't have delayed scaling, always uses inner context + if self.config.fp8: + use_outer_quantization_context = self.config.fp8_recipe == Fp8Recipe.delayed + use_inner_quantization_context = self.config.fp8_recipe != Fp8Recipe.delayed + outer_quantization_context = ( + get_fp8_context(self.config) if use_outer_quantization_context else nullcontext() + ) + elif self.config.fp4: + use_outer_quantization_context = False + use_inner_quantization_context = True + outer_quantization_context = nullcontext() + else: + # No quantization + use_outer_quantization_context = False + use_inner_quantization_context = False + 
outer_quantization_context = nullcontext() + + with rng_context, outer_quantization_context: + # Forward pass. + if self.config.recompute_granularity == 'full' and self.training: + hidden_states = self._checkpointed_forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + context=context, + context_mask=context_mask, + context_signal=context_signal, + rotary_pos_emb=rotary_pos_emb, + attention_bias=attention_bias, + packed_seq_params=packed_seq_params, + use_inner_quantization_context=use_inner_quantization_context, + ) + else: + for l_no, layer in enumerate(self.layers): + # Get appropriate inner quantization context + if use_inner_quantization_context: + if self.config.fp8: + inner_quantization_context = get_fp8_context( + self.config, layer.layer_number - 1 + ) + elif self.config.fp4: + inner_quantization_context = get_fp4_context( + self.config, layer.layer_number - 1 + ) + else: + inner_quantization_context = nullcontext() + else: + inner_quantization_context = nullcontext() + + with self.offload_context, inner_quantization_context: + hidden_states, context = layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + context=context, + context_mask=context_mask, + context_signal=context_signal, + rotary_pos_emb=rotary_pos_emb, + rotary_pos_cos=rotary_pos_cos, + rotary_pos_sin=rotary_pos_sin, + # rotary_pos_cos_sin=rotary_pos_cos_sin, + attention_bias=attention_bias, + inference_context=inference_context, + packed_seq_params=packed_seq_params, + sequence_len_offset=sequence_len_offset, + ) + + if ( + torch.is_grad_enabled() + and self.config.cpu_offloading + and self.group_prefetch_offload_commit_async is not None + ): + hidden_states = self.group_prefetch_offload_commit_async(hidden_states) + + # Final layer norm. + if self.final_layernorm is not None: + hidden_states = self.final_layernorm(hidden_states) + # TENorm produces a "viewed" tensor. 
This will result in schedule.py's + # deallocate_output_tensor() throwing an error, so a viewless tensor is + # created to prevent this. + hidden_states = make_viewless_tensor( + inp=hidden_states, requires_grad=True, keep_graph=True + ) + + # If this TransformerBlock is empty, input and output hidden states will be the same node + # on the computational graph and will lead to unexpected errors in pipeline schedules. + if not self.pre_process and len(self.layers) == 0 and not self.final_layernorm: + hidden_states = hidden_states.clone() + + return hidden_states + +class ContextTransformerBlock(TransformerBlock): + def __init__( + self, + config: TransformerConfig, + spec: Union[TransformerBlockSubmodules, ModuleSpec], + post_layer_norm: bool = True, + pre_process: bool = True, + post_process: bool = True, + pg_collection: ProcessGroupCollection = None, + vp_stage: Optional[int] = None, + ): + # Pass block id and context_scale + self.vace_id = [i for i in range(0, config.num_layers)] if config.vace_layers is None else [i for i in range(0, len(config.vace_layers))] + print(self.vace_id) + assert 0 in self.vace_id + + super().__init__( + config=config, + spec=spec, + post_layer_norm=post_layer_norm, + pre_process=pre_process, + post_process=post_process, + pg_collection=pg_collection, + vp_stage=vp_stage, + ) + + def _build_layers(self): + # Transformer layers. + # @jcasper can we improve how we deal with layer_number? + # currently it's only used in CoreAttention? 
+ # if self.apply_query_key_layer_scaling: + # coeff = self.layer_number + # self.norm_factor *= coeff + def build_layer(layer_spec, layer_number): + global_layer_number = layer_number + get_transformer_layer_offset( + self.config, self.vp_stage, get_pg_rank(self.pg_collection.pp) + ) # 1-based index + if self.config.heterogeneous_block_specs: + layer_config = self.config.get_config_for_layer(global_layer_number) + else: + layer_config = self.config + + # Get appropriate quantization context (FP8 and FP4 are mutually exclusive) + if layer_config.fp8: + quantization_context = get_fp8_context( + layer_config, global_layer_number - 1, is_init=True + ) + elif layer_config.fp4: + quantization_context = get_fp4_context( + layer_config, global_layer_number - 1, is_init=True + ) + else: + quantization_context = nullcontext() + + with quantization_context: + module = build_module( + layer_spec, + config=layer_config, + layer_number=layer_number, + pg_collection=self.pg_collection, + vp_stage=self.vp_stage, + ) + idx = global_layer_number - 1 + if idx in self.vace_id: + module.idx = idx + else: + module.idx = None + return module + + # offset is implicit in TransformerLayer + self.layers = torch.nn.ModuleList( + [ + build_layer(layer_spec, i + 1) + for i, layer_spec in enumerate(self.submodules.layer_specs) + ] + ) + + # @TODO: add back account_for_embedding_in_pipeline_split (see issue #293) + # In pipeline parallelism, we want to add this LN only to the last stage of the pipeline + # self.post_process and self.post_layer_norm guide this behavior + if self.submodules.layer_norm and self.post_process and self.post_layer_norm: + self.final_layernorm = build_module( + self.submodules.layer_norm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + else: + self.final_layernorm = None # Either this or nn.Identity + + def _checkpointed_forward( + self, + hidden_states: Tensor, + attention_mask: Tensor, + context: Tensor, + 
context_mask: Tensor, + context_signal: Tensor, + rotary_pos_emb: Tensor, + attention_bias: Tensor, + packed_seq_params: PackedSeqParams, + use_inner_quantization_context: bool, + ): + """Forward method with activation checkpointing.""" + + def custom(start: int, end: int): + def custom_forward( + hidden_states, attention_mask, context, context_mask, context_signal, rotary_pos_emb + ): + for index in range(start, end): + layer = self._get_layer(index) + + # Get appropriate inner quantization context + if use_inner_quantization_context: + if self.config.fp8: + inner_quantization_context = get_fp8_context( + self.config, layer.layer_number - 1 + ) + # TODO: check if fp4 is supported in this case + elif self.config.fp4: + inner_quantization_context = get_fp4_context( + self.config, layer.layer_number - 1 + ) + else: + inner_quantization_context = nullcontext() + else: + inner_quantization_context = nullcontext() + + with inner_quantization_context: + hidden_states, context_signal = layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + context=context, + context_mask=context_mask, + context_signal=context_signal, + rotary_pos_emb=rotary_pos_emb, + attention_bias=attention_bias, + inference_context=None, + packed_seq_params=packed_seq_params, + ) + return hidden_states, context_signal + + return custom_forward + + def checkpoint_handler(forward_func): + """Determines whether to use the `te_checkpoint` or `tensor_parallel.checkpoint`""" + # TODO: check if fp4 is supported in this case + if self.config.fp8 or self.config.fp4: + return te_checkpoint( + forward_func, + self.config.distribute_saved_activations, + tensor_parallel.random.get_cuda_rng_tracker, + self.pg_collection.tp, + hidden_states, + attention_mask, + context, + context_mask, + context_signal, + rotary_pos_emb, + ) + else: + return tensor_parallel.checkpoint( + forward_func, + self.config.distribute_saved_activations, + hidden_states, + attention_mask, + context, + context_mask, + 
context_signal, + rotary_pos_emb, + ) + + if self.config.recompute_method == 'uniform': + # Uniformly divide the total number of Transformer layers and checkpoint + # the input activation of each divided chunk. + # A method to further reduce memory usage reducing checkpoints. + layer_idx = 0 + while layer_idx < self.num_layers_per_pipeline_rank: + hidden_states, context_signal = checkpoint_handler( + custom(layer_idx, layer_idx + self.config.recompute_num_layers) + ) + + layer_idx += self.config.recompute_num_layers + + elif self.config.recompute_method == 'block': + # Checkpoint the input activation of only a set number of individual + # Transformer layers and skip the rest. + # A method fully use the device memory removing redundant re-computation. + recompute_skip_num_layers = 0 + for layer_idx in range(self.num_layers_per_pipeline_rank): + # Skip recomputation when input grad computation is not needed. + # Need to have at least one input tensor with gradient computation + # for re-enterant autograd engine. 
+ # TODO: check if fp4 is supported in this case + if (self.config.fp8 or self.config.fp4) and not hidden_states.requires_grad: + recompute_skip_num_layers += 1 + if ( + layer_idx >= recompute_skip_num_layers + and layer_idx < self.config.recompute_num_layers + recompute_skip_num_layers + ): + hidden_states, context_signal = checkpoint_handler(custom(layer_idx, layer_idx + 1)) + else: + hidden_states, context_signal = custom(layer_idx, layer_idx + 1)( + hidden_states, attention_mask, context, context_mask, context_signal, rotary_pos_emb + ) + else: + raise ValueError("Invalid activation recompute method.") + + return hidden_states, context_signal + + def forward( + self, + hidden_states: Union[Tensor, WrappedTensor], + attention_mask: Optional[Tensor], + context: Optional[Tensor] = None, + context_mask: Optional[Tensor] = None, + context_signal: Optional[Tensor] = None, + rotary_pos_emb: Optional[Tensor] = None, + rotary_pos_cos: Optional[Tensor] = None, + rotary_pos_sin: Optional[Tensor] = None, + rotary_pos_cos_sin: Optional[Tensor] = None, + attention_bias: Optional[Tensor] = None, + inference_context: Optional[BaseInferenceContext] = None, + packed_seq_params: Optional[PackedSeqParams] = None, + sequence_len_offset: Optional[Tensor] = None, + *, + inference_params: Optional[BaseInferenceContext] = None, + dynamic_inference_decode_only: Optional[bool] = None, + ): + """ + Perform the forward pass through the transformer block. + + This method handles the core computation of the transformer, including + self-attention, optional cross-attention, and feed-forward operations. + + Args: + hidden_states (Union[Tensor, WrappedTensor]): Input tensor of shape [s, b, h] + where s is the sequence length, b is the batch size, and h is the hidden size. + Can be passed as a WrappedTensor during inference to avoid an obsolete + reference in the calling function. + attention_mask (Tensor): Boolean tensor of shape [1, 1, s, s] for masking + self-attention. 
+ context (Tensor, optional): Context tensor for cross-attention. + context_mask (Tensor, optional): Mask for cross-attention context + context_signal (Tensor, optional): Signal from context tokens + rotary_pos_emb (Tensor, optional): Rotary positional embeddings. + rotary_pos_cos (Optional[Tensor]): Rotary embedding cosine. + rotary_pos_sin (Optional[Tensor]): Rotary embedding sine. + rotary_pos_cos_sin (Optional[Tensor]): Combined rotary embedding cosine and sine. + Currently used exclusively for inference with dynamic batching and flashinfer RoPE. + attention_bias (Tensor): Bias tensor for Q * K.T of shape in shape broadcastable + to [b, num_head, sq, skv], e.g. [1, 1, sq, skv]. + Used as an alternative to apply attention mask for TE cuDNN attention. + inference_context (BaseInferenceContext, optional): Parameters for inference-time + optimizations. + packed_seq_params (PackedSeqParams, optional): Parameters for packed sequence + processing. + dynamic_inference_decode_only: Optional[bool]: If true, indicates that the current + inference context is for decode-only. This args is only used to uniquely + identify decode and non-decode cuda graph runners in the cuda graph manager. + + Returns: + Union[Tensor, Tuple[Tensor, Tensor]]: The output hidden states tensor of shape + [s, b, h], and optionally the updated context tensor if cross-attention is used. + """ + + inference_context = deprecate_inference_params(inference_context, inference_params) + # Remove 'dynamic_inference_decode_only' from kwargs if present + # this is only used to uniquely identify decode and non-decode cuda graph + # runners in the cuda graph manager + + # Delete the obsolete reference to the initial input tensor if necessary + if isinstance(hidden_states, WrappedTensor): + hidden_states = hidden_states.unwrap() + + if not self.pre_process: + # See set_input_tensor() + hidden_states = self.input_tensor + + # Viewless tensor. 
+ # - We only need to create a viewless tensor in the case of micro batch + # size (mbs) == 1, since in this case, 'hidden_states.transpose()' + # above creates a view tensor, and '.contiguous()' is a pass-through. + # For mbs >= 2, '.contiguous()' creates a new tensor, eliminating + # the need to make it viewless. + # + # However, we don't explicitly check mbs == 1 here because + # make_viewless_tensor() has negligible overhead when its input + # is already viewless. + # + # - For the 'else' case above, calling make_viewless_tensor() here is + # likely redundant, since p2p_communication.py (likely originator) + # already creates viewless tensors. That said, make_viewless_tensor() + # is called here to be future-proof and corner-case-proof. + hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True) + + if self.config.sequence_parallel: + rng_context = tensor_parallel.get_cuda_rng_tracker().fork() + else: + rng_context = nullcontext() + + # If fp8_recipe is delayed, wrap the entire pass with get_fp8_context(), + # otherwise do nothing extra at the outer level + # if we are using other fp8 recipes, then the context manager enter&exit are free + # we can wrap fp8_context within the for loop over layers, so that we can fine-grained + # control which layer will be fp8 or bf16 + # For FP4: NVFP4BlockScaling doesn't have delayed scaling, always uses inner context + if self.config.fp8: + use_outer_quantization_context = self.config.fp8_recipe == Fp8Recipe.delayed + use_inner_quantization_context = self.config.fp8_recipe != Fp8Recipe.delayed + outer_quantization_context = ( + get_fp8_context(self.config) if use_outer_quantization_context else nullcontext() + ) + elif self.config.fp4: + use_outer_quantization_context = False + use_inner_quantization_context = True + outer_quantization_context = nullcontext() + else: + # No quantization + use_outer_quantization_context = False + use_inner_quantization_context = False + 
outer_quantization_context = nullcontext() + + with rng_context, outer_quantization_context: + # Forward pass. + if self.config.recompute_granularity == 'full' and self.training: + hidden_states, context_signal = self._checkpointed_forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + context=context, + context_mask=context_mask, + context_signal=context_signal, + rotary_pos_emb=rotary_pos_emb, + attention_bias=attention_bias, + packed_seq_params=packed_seq_params, + use_inner_quantization_context=use_inner_quantization_context, + ) + else: + for l_no, layer in enumerate(self.layers): + # Get appropriate inner quantization context + if use_inner_quantization_context: + if self.config.fp8: + inner_quantization_context = get_fp8_context( + self.config, layer.layer_number - 1 + ) + elif self.config.fp4: + inner_quantization_context = get_fp4_context( + self.config, layer.layer_number - 1 + ) + else: + inner_quantization_context = nullcontext() + else: + inner_quantization_context = nullcontext() + + with self.offload_context, inner_quantization_context: + hidden_states, context_signal = layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + context=context, + context_mask=context_mask, + context_signal=context_signal, + rotary_pos_emb=rotary_pos_emb, + rotary_pos_cos=rotary_pos_cos, + rotary_pos_sin=rotary_pos_sin, + # rotary_pos_cos_sin=rotary_pos_cos_sin, + attention_bias=attention_bias, + inference_context=inference_context, + packed_seq_params=packed_seq_params, + sequence_len_offset=sequence_len_offset, + ) + + if ( + torch.is_grad_enabled() + and self.config.cpu_offloading + and self.group_prefetch_offload_commit_async is not None + ): + hidden_states = self.group_prefetch_offload_commit_async(hidden_states) + + # Final layer norm. + if self.final_layernorm is not None: + hidden_states = self.final_layernorm(hidden_states) + # TENorm produces a "viewed" tensor. 
This will result in schedule.py's + # deallocate_output_tensor() throwing an error, so a viewless tensor is + # created to prevent this. + hidden_states = make_viewless_tensor( + inp=hidden_states, requires_grad=True, keep_graph=True + ) + + # If this TransformerBlock is empty, input and output hidden states will be the same node + # on the computational graph and will lead to unexpected errors in pipeline schedules. + if not self.pre_process and len(self.layers) == 0 and not self.final_layernorm: + hidden_states = hidden_states.clone() + + return hidden_states, context_signal + +def sinusoidal_embedding_1d(dim, position): + # preprocess + assert dim % 2 == 0 + half = dim // 2 + position = position + + # calculation + sinusoid = torch.outer( + position, torch.pow(10000, -torch.arange(half).to(position).div(half))) + x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1) + return x + +class Head(nn.Module): + + def __init__(self, dim, out_dim, patch_size, eps=1e-6): + super().__init__() + self.dim = dim + self.out_dim = out_dim + self.patch_size = patch_size + self.eps = eps + + # layers + out_dim = math.prod(patch_size) * out_dim + self.norm = WanLayerNorm(dim, eps) + self.head = nn.Linear(dim, out_dim) + + # modulation + self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5) + + def forward(self, x, e): + r""" + Args: + x(Tensor): Shape [B, L1, C] + e(Tensor): Shape [B, C] + """ + e = (self.modulation + e.unsqueeze(1)).chunk(2, dim=1) + x = (self.head(self.norm(x) * (1 + e[1]) + e[0])) + return x + + +class WanModel(VisionModule): + """ + WanModel is a VisionModule that implements a Wan model. + Attributes: + config (TransformerConfig): Configuration for the transformer. + pre_process (bool): Whether to apply pre-processing steps. + post_process (bool): Whether to apply post-processing steps. + fp16_lm_cross_entropy (bool): Whether to use fp16 for cross-entropy loss. + parallel_output (bool): Whether to use parallel output. 
+ transformer_decoder_layer_spec (WanLayerWithAdaLNspec): Specification for the transformer decoder layer. + model_type (ModelType): Type of the model. + """ + + def __init__( + self, + config: TransformerConfig, + pre_process: bool = True, + post_process: bool = True, + fp16_lm_cross_entropy: bool = False, + parallel_output: bool = True, + transformer_decoder_layer_spec=WanLayerWithAdaLNspec, + **kwargs, + ): + super(WanModel, self).__init__(config=config) + + self.config: TransformerConfig = config + + self.transformer_decoder_layer_spec = transformer_decoder_layer_spec() + self.pre_process = pre_process + self.post_process = post_process + self.fp16_lm_cross_entropy = fp16_lm_cross_entropy + self.parallel_output = parallel_output + + # megatron core pipelining currently depends on model type + # TODO: remove this dependency ? + self.model_type = ModelType.encoder_or_decoder + + self.num_heads = self.config.num_attention_heads + self.freq_dim = self.config.freq_dim + self.in_channels = self.config.in_channels + self.out_channels = self.config.out_channels + self.patch_spatial = self.config.patch_spatial + self.patch_temporal = self.config.patch_temporal + self.patch_size = (self.patch_temporal, self.patch_spatial, self.patch_spatial) + + # these attributes are unused for images/videos, we just set because bridge training requires for LLMs + self.share_embeddings_and_output_weights = False + + ###################################### + ########## Wan architecture ########## + + # embeddings + if self.pre_process: + self.patch_embedding = nn.Conv3d( + self.in_channels, self.config.hidden_size, kernel_size=self.patch_size, stride=self.patch_size) + + self.text_embedding = nn.Sequential( + nn.Linear(self.config.text_dim, self.config.hidden_size), nn.GELU(approximate='tanh'), + nn.Linear(self.config.hidden_size, self.config.hidden_size)) + + self.time_embedding = nn.Sequential( + nn.Linear(self.freq_dim, self.config.hidden_size), nn.SiLU(), 
nn.Linear(self.config.hidden_size, self.config.hidden_size)) + self.time_projection = nn.Sequential(nn.SiLU(), nn.Linear(self.config.hidden_size, self.config.hidden_size * 6)) + + self.rope_embeddings = Wan3DRopeEmbeddings(dim_head = self.config.hidden_size // self.num_heads, max_position_len = 1024) + + # decoder blocks + self.decoder = TransformerBlock( + config=self.config, + spec=self.transformer_decoder_layer_spec, + pre_process=self.pre_process, + post_process=self.post_process, + post_layer_norm=False, + ) + + # output head + if self.post_process: + self.head = Head(self.config.hidden_size, self.out_channels, self.patch_size, eps = 1e-6) + + + def forward( + self, + x: Tensor, + grid_sizes: list[Tuple[int, int, int]], + t: Tensor, + context: Tensor, + max_seq_len: int, + packed_seq_params: PackedSeqParams = None, + **kwargs, + ) -> Tensor: + """Forward pass. + + Args: + x List[Tensor]: list of vae encoded data (s, b, c * pF * pH * pW) + grid_sizes List[Tuple[int, int, int]]: list of grid sizes (f, h, w) + t Tensor: timesteps + context List[Tensor]: list of context (text_len, hidden_size) + max_seq_len int: maximum sequence length + packed_seq_params PackedSeqParams: packed sequence parameters + + Returns: + Tensor: output tensor (still patchified) of shape [seq_len, batch_size, hidden_size] + """ + ################################# + ########## Wan forward ########## + + # ============= embedders ============= + + # run input embedding + if self.pre_process: + # x.shape [s, b, c * pF * pH * pW] + seq_len, batch_size, _ = x.shape + c = self.in_channels + pF, pH, pW = self.patch_size + x = x.reshape(seq_len * batch_size, pF, pH, pW, c) # output: x.shape [s * b, pF, pH, pW, c] + x = x.permute(0, 4, 1, 2, 3) # output: x.shape [s * b, c, pF, pH, pW] + x = self.patch_embedding(x) # output: x.shape [s * b, hidden_size, 1, 1, 1] + x = x.flatten(1) # output: x.shape [s * b, hidden_size] + x = x.reshape(seq_len, batch_size, -1) # output: x.shape [s, b, hidden_size] + 
+ # split sequence for sequence_parallel + # TODO: for PP, do we move scatter_to_sequence_parallel_region here or after "x = self.decoder.input_tensor" ??? + if self.config.sequence_parallel: + x = tensor_parallel.scatter_to_sequence_parallel_region(x) # output: x.shape [s * b // tp_size, hidden_size] + + else: + # intermediate stage of pipeline + x = self.decoder.input_tensor + + # time embeddings + e = self.time_embedding( + sinusoidal_embedding_1d(self.freq_dim, t).to(x.dtype) + ) + e0 = self.time_projection(e).unflatten(1, (6, self.config.hidden_size)) + + # context embeddings + context = self.text_embedding(context) # shape [text_len, b, hidden_size] + + + # ============= decoder ============= + # calculate rotary pos emb + n_head, dim_head = self.num_heads, self.config.hidden_size // self.num_heads + rotary_pos_emb = self.rope_embeddings(n_head, dim_head, max_seq_len, grid_sizes, t.device) # output: rotary_pos_emb.shape [s, b, 1, dim_head] + + # run decoder + x = self.decoder( + hidden_states=x, + attention_mask=e0, + context=context, + context_mask=None, + rotary_pos_emb=rotary_pos_emb, + rotary_pos_cos=None, + rotary_pos_sin=None, + packed_seq_params=packed_seq_params, + ) + + # return if not post_process + if not self.post_process: + return x + + # head + x = x.transpose(0, 1) # head expects shape [b, s, hidden_size] + x = self.head(x, e) # output: x.shape [b, s, c * pF * pH * pW] + x = x.transpose(0, 1) # reshape back to shape [s, b, c * pF * pH * pW] + + # gather outputs for sequence_parallel + # Note: in GPT models, because the vocab projection matrix is ColumnParallelLinear, the sequence is + # automatically gathered in ColumnParallelLinear forward pass. + # However, in Wan models, we need to gather the outputs manually. 
+ if self.config.sequence_parallel: + x = tensor_parallel.gather_from_sequence_parallel_region(x) + + return x # output: x.shape [s, b, c * pF * pH * pW] + + + def set_input_tensor(self, input_tensor: Tensor) -> None: + """Sets input tensor to the model. + + See megatron.model.transformer.set_input_tensor() + + Args: + input_tensor (Tensor): Sets the input tensor for the model. + """ + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + + assert len(input_tensor) == 1, "input_tensor should only be length 1 for gpt/bert" + self.decoder.set_input_tensor(input_tensor[0]) + + + def sharded_state_dict( + self, prefix: str = "", sharded_offsets: tuple = (), metadata: Optional[Dict] = None + ) -> ShardedStateDict: + """Sharded state dict implementation for GPTModel backward-compatibility (removing extra state). + + Args: + prefix (str): Module name prefix. + sharded_offsets (tuple): PP related offsets, expected to be empty at this module level. + metadata (Optional[Dict]): metadata controlling sharded state dict creation. 
+ + Returns: + ShardedStateDict: sharded state dict for the GPTModel + """ + sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata) + + # DEBUGGING + # for module in ["t_embedder"]: + # for param_name, param in getattr(self, module).named_parameters(): + # weight_key = f"{prefix}{module}.{param_name}" + # self._set_embedder_weights_replica_id(param, sharded_state_dict, weight_key) + # DEBUGGING + # Ensure replica ids for non-transformer embedder weights include pipeline dimension + for module in ["text_embedding", "time_embedding", "time_projection"]: + if hasattr(self, module): + for param_name, param in getattr(self, module).named_parameters(): + weight_key = f"{prefix}{module}.{param_name}" + if weight_key in sharded_state_dict: + self._set_embedder_weights_replica_id(param, sharded_state_dict, weight_key) + + return sharded_state_dict + + + def _set_embedder_weights_replica_id( + self, tensor: Tensor, sharded_state_dict: ShardedStateDict, embedder_weight_key: str + ) -> None: + """set replica ids of the weights in t_embedder for sharded state dict. + + Args: + sharded_state_dict (ShardedStateDict): state dict with the weight to tie + weight_key (str): key of the weight in the state dict. 
+ This entry will be replaced with a tied version + + Returns: None, acts in-place + """ + tp_rank = parallel_state.get_tensor_model_parallel_rank() + vpp_rank = parallel_state.get_virtual_pipeline_model_parallel_rank() + vpp_rank = vpp_rank if vpp_rank else 0 + vpp_world = parallel_state.get_virtual_pipeline_model_parallel_world_size() + vpp_world = vpp_world if vpp_world else 1 + pp_rank = parallel_state.get_pipeline_model_parallel_rank() + del sharded_state_dict[embedder_weight_key] + replica_id = ( + tp_rank, + (vpp_rank + pp_rank * vpp_world), + parallel_state.get_data_parallel_rank(with_context_parallel=True), + ) + + sharded_state_dict[embedder_weight_key] = make_sharded_tensor_for_checkpoint( + tensor=tensor, + key=embedder_weight_key, + replica_id=replica_id, + allow_shape_mismatch=False, + ) + + +class VACEModel(WanModel): + def __init__( + self, + config: TransformerConfig, + pre_process: bool = True, + post_process: bool = True, + fp16_lm_cross_entropy: bool = False, + parallel_output: bool = True, + transformer_decoder_layer_spec=VACEBaseLayerspec, + vace_transformer_decoder_layer_spec=VACEContextLayerspec, + **kwargs, + ): + super().__init__( + config, + pre_process, + post_process, + fp16_lm_cross_entropy, + parallel_output, + transformer_decoder_layer_spec, + **kwargs + ) + + self.vace_in_channels = self.config.vace_in_channels + self.vace_transformer_decoder_layer_spec = vace_transformer_decoder_layer_spec() + + if self.pre_process: + self.vace_patch_embedding = nn.Conv3d( + self.vace_in_channels, self.config.hidden_size, kernel_size=self.patch_size, stride=self.patch_size) + + self.decoder = BaseTransformerBlock( + config=self.config, + spec=self.transformer_decoder_layer_spec, + pre_process=self.pre_process, + post_process=self.post_process, + post_layer_norm=False, + ) + # print(self.decoder) + self.vace_config = copy.deepcopy(self.config) + self.vace_config.num_layers = len(self.decoder.vace_layers) + self.vace_decoder = 
ContextTransformerBlock( + config=self.vace_config, + spec=self.vace_transformer_decoder_layer_spec, + pre_process=self.pre_process, + post_process=self.post_process, + post_layer_norm=False, + ) + # print(self.vace_decoder.state_dict().keys()) + + self.vace_init_proj = nn.Linear(self.config.hidden_size, self.config.hidden_size) + + # Freeze base WAN parameters if specified + if getattr(self.config, 'freeze_base_model', False): + self.freeze_base_parameters() + + def freeze_base_parameters(self): + """ + Freeze all base WAN model parameters, only allow VACE-specific parameters to be trained. + + Frozen parameters (from base WAN model): + - patch_embedding + - text_embedding + - time_embedding + - time_projection + - rope_embeddings + - decoder.layers (base transformer layers, not VACE layers) + - head + + Trainable parameters (VACE-specific): + - vace_patch_embedding + - vace_decoder (separate transformer for VACE context) + - vace_init_proj + - decoder.vace_layers (VACE context attention layers within decoder) + """ + # Freeze base model embeddings + for param in self.patch_embedding.parameters(): + param.requires_grad = False + for param in self.text_embedding.parameters(): + param.requires_grad = False + for param in self.time_embedding.parameters(): + param.requires_grad = False + for param in self.time_projection.parameters(): + param.requires_grad = False + for param in self.rope_embeddings.parameters(): + param.requires_grad = False + + # Freeze output head + for param in self.head.parameters(): + param.requires_grad = False + + # Freeze base decoder layers (but not vace_layers) + if hasattr(self.decoder, 'layers'): + for layer in self.decoder.layers: + for param in layer.parameters(): + param.requires_grad = False + + print("[VACEModel] Frozen base WAN model parameters. 
Only VACE-specific parameters will be trained:") + print(f" - vace_patch_embedding") + print(f" - vace_decoder ({self.vace_config.num_layers} layers)") + print(f" - vace_init_proj") + if hasattr(self.decoder, 'vace_layers'): + print(f" - decoder.vace_layers ({len(self.decoder.vace_layers)} VACE context layers)") + + def forward( + self, + x: Tensor, + grid_sizes: list[Tuple[int, int, int]], + t: Tensor, + context: Tensor, + vace_context: Tensor, + max_seq_len: int, + packed_seq_params: PackedSeqParams = None, + **kwargs, + ) -> Tensor: + """Forward pass. + + Args: + x List[Tensor]: list of vae encoded data (s, b, c * pF * pH * pW) + grid_sizes List[Tuple[int, int, int]]: list of grid sizes (f, h, w) + t Tensor: timesteps + context List[Tensor]: list of context (text_len, hidden_size) + max_seq_len int: maximum sequence length + packed_seq_params PackedSeqParams: packed sequence parameters + + Returns: + Tensor: output tensor (still patchified) of shape [seq_len, batch_size, hidden_size] + """ + ################################# + ########## Wan forward ########## + + # ============= embedders ============= + + # run input embedding + if self.pre_process: + # x.shape [s, b, c * pF * pH * pW] + seq_len, batch_size, _ = x.shape + c = self.in_channels + pF, pH, pW = self.patch_size + x = x.reshape(seq_len * batch_size, pF, pH, pW, c) # output: x.shape [s * b, pF, pH, pW, c] + x = x.permute(0, 4, 1, 2, 3) # output: x.shape [s * b, c, pF, pH, pW] + x = self.patch_embedding(x) # output: x.shape [s * b, hidden_size, 1, 1, 1] + x = x.flatten(1) # output: x.shape [s * b, hidden_size] + x = x.reshape(seq_len, batch_size, -1) # output: x.shape [s, b, hidden_size] + + # vace_context.shape [s, b, c * pF * pH * pW] + vace_seq_len, _, vace_flat_dim = vace_context.shape + # Calculate actual channels from the tensor shape + vace_c = vace_flat_dim // (pF * pH * pW) + # pF, pH, pW = self.patch_size + vace_context = vace_context.reshape(vace_seq_len * batch_size, pF, pH, pW, vace_c) # 
output: vace_context.shape [s * b, pF, pH, pW, c] + vace_context = vace_context.permute(0, 4, 1, 2, 3) # output: vace_context.shape [s * b, c, pF, pH, pW] + # Use patch_embedding if vace_context has same channels as main input (self-editing mode) + # Otherwise use vace_patch_embedding for different channel counts + if vace_c == self.in_channels: + vace_context = self.patch_embedding(vace_context) # output: vace_context.shape [s * b, hidden_size, 1, 1, 1] + else: + vace_context = self.vace_patch_embedding(vace_context) # output: vace_context.shape [s * b, hidden_size, 1, 1, 1] + vace_context = vace_context.flatten(1) # output: vace_context.shape [s * b, hidden_size] + vace_context = vace_context.reshape(vace_seq_len, batch_size, -1) # output: vace_context.shape [s, b, hidden_size] + vace_context = self.vace_init_proj(vace_context) + x + # vace_context = vace_context.unsqueeze(0) + vace_context = torch.stack([vace_context] * (self.vace_config.num_layers)) + + # split sequence for sequence_parallel + # TODO: for PP, do we move scatter_to_sequence_parallel_region here or after "x = self.decoder.input_tensor" ??? 
+ if self.config.sequence_parallel: + x = tensor_parallel.scatter_to_sequence_parallel_region(x) # output: x.shape [s * b // tp_size, hidden_size] + vace_context = tensor_parallel.scatter_to_sequence_parallel_region(vace_context) # output: vace_context.shape [s * b // tp_size, hidden_size] + + else: + # intermediate stage of pipeline + x = self.decoder.input_tensor + vace_context = self.vace_decoder.input_tensor + + # run context token embedding + + # time embeddings + e = self.time_embedding( + sinusoidal_embedding_1d(self.freq_dim, t).to(x.dtype) + ) + e0 = self.time_projection(e).unflatten(1, (6, self.config.hidden_size)) + + # context embeddings + context = self.text_embedding(context) # shape [text_len, b, hidden_size] + + + # ============= decoder ============= + # calculate rotary pos emb + n_head, dim_head = self.num_heads, self.config.hidden_size // self.num_heads + rotary_pos_emb = self.rope_embeddings(n_head, dim_head, max_seq_len, grid_sizes, t.device) # output: rotary_pos_emb.shape [s, b, 1, dim_head] + + s, b, sq, h = rotary_pos_emb.shape + rotary_pos_emb = rotary_pos_emb.transpose(0, 1).reshape(s*b, 1, sq, h) + + # run vace decoder + vace_context = self.vace_decoder( + hidden_states=vace_context[0], + attention_mask=e0, + context=context, + context_mask=None, + context_signal=vace_context, + rotary_pos_emb=rotary_pos_emb, + rotary_pos_cos=None, + rotary_pos_sin=None, + packed_seq_params=packed_seq_params, + )[1] + + # run decoder + x = self.decoder( + hidden_states=x, + attention_mask=e0, + context=context, + context_mask=None, + context_signal=vace_context, + rotary_pos_emb=rotary_pos_emb, + rotary_pos_cos=None, + rotary_pos_sin=None, + packed_seq_params=packed_seq_params, + ) + + # return if not post_process + if not self.post_process: + return x + + # head + x = x.transpose(0, 1) # head expects shape [b, s, hidden_size] + x = self.head(x, e) # output: x.shape [b, s, c * pF * pH * pW] + x = x.transpose(0, 1) # reshape back to shape [s, b, c * pF * 
pH * pW] + + # gather outputs for sequence_parallel + # Note: in GPT models, because the vocab projection matrix is ColumnParallelLinear, the sequence is + # automatically gathered in ColumnParallelLinear forward pass. + # However, in Wan models, we need to gather the outputs manually. + if self.config.sequence_parallel: + x = tensor_parallel.gather_from_sequence_parallel_region(x) + + return x # output: x.shape [s, b, c * pF * pH * pW] \ No newline at end of file diff --git a/src/megatron/bridge/models/wan/wan_provider.py b/src/megatron/bridge/models/wan/wan_provider.py new file mode 100644 index 0000000000..6522a6c0c3 --- /dev/null +++ b/src/megatron/bridge/models/wan/wan_provider.py @@ -0,0 +1,110 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +from dataclasses import dataclass + +import torch +from megatron.core import parallel_state +from megatron.bridge.models.transformer_config import TransformerConfig + +from megatron.bridge.models.model_provider import ModelProviderMixin +from megatron.core.models.common.vision_module.vision_module import VisionModule +from megatron.bridge.models.wan.wan_model import WanModel, VACEModel + +logger = logging.getLogger(__name__) + +@dataclass +class WanModelProvider(TransformerConfig, ModelProviderMixin[VisionModule]): + crossattn_emb_size: int = 1536 + add_bias_linear: bool = True + gated_linear_unit: bool = False + + num_layers: int = 30 + hidden_size: int = 1536 + ffn_hidden_size: int = 8960 + num_attention_heads: int = 12 + layernorm_epsilon: float = 1e-6 + normalization: str = "RMSNorm" + layernorm_zero_centered_gamma: bool = False + add_qkv_bias: bool = True + rotary_interleaved: bool = True + hidden_dropout: float = 0 + attention_dropout: float = 0 + fp16_lm_cross_entropy: bool = False + parallel_output: bool = True + bf16: bool = False + params_dtype: torch.dtype = torch.float32 + # qkv_format: str = "sbhd" # "thd". 
NOTE: if we use context parallelism, we need to use "thd" + qkv_format: str = "thd" + # these attributes are unused for images/videos, we just set because bridge training requires for LLMs + seq_length: int = 1024 + share_embeddings_and_output_weights: bool = False + vocab_size: int = 25256 * 8 + make_vocab_size_divisible_by: int = 128 + + # images/videos attributes + in_channels: int = 16 + out_channels: int = 16 + patch_spatial: int = 2 + patch_temporal: int = 1 + freq_dim: int = 256 + text_len: int = 512 + text_dim: int = 4096 + + def provide(self, pre_process=None, post_process=None, vp_stage=None) -> WanModel: + vp_size = self.virtual_pipeline_model_parallel_size + if vp_size: + p_size = self.pipeline_model_parallel_size + assert (self.num_layers // p_size) % vp_size == 0, ( + "Make sure the number of model chunks is the same across all pipeline stages." + ) + + model = WanModel + + return model( + self, + pre_process=parallel_state.is_pipeline_first_stage(), + post_process=parallel_state.is_pipeline_last_stage(), + fp16_lm_cross_entropy=self.fp16_lm_cross_entropy, + parallel_output=self.parallel_output, + ) + + +@dataclass +class VACEModelProvider(WanModelProvider): + vace_layers: list = None + # vace_layers: list = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28] + vace_in_channels: int = 96 + base_num_layers: int = 30 + context_scale: float = 1.0 + freeze_base_model: bool = False + + def provide(self, pre_process=None, post_process=None, vp_stage=None) -> VACEModel: + vp_size = self.virtual_pipeline_model_parallel_size + if vp_size: + p_size = self.pipeline_model_parallel_size + assert (self.num_layers // p_size) % vp_size == 0, ( + "Make sure the number of model chunks is the same across all pipeline stages." 
+ ) + + model = VACEModel + + return model( + self, + pre_process=parallel_state.is_pipeline_first_stage(), + post_process=parallel_state.is_pipeline_last_stage(), + fp16_lm_cross_entropy=self.fp16_lm_cross_entropy, + parallel_output=self.parallel_output, + ) \ No newline at end of file diff --git a/src/megatron/bridge/models/wan/wan_step.py b/src/megatron/bridge/models/wan/wan_step.py new file mode 100644 index 0000000000..456437e4ae --- /dev/null +++ b/src/megatron/bridge/models/wan/wan_step.py @@ -0,0 +1,200 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +from functools import partial +from typing import Iterable + +import torch +from megatron.core import parallel_state +from megatron.core.models.common.vision_module.vision_module import VisionModule +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.utils import get_model_config +from megatron.bridge.models.wan.flow_matching.flow_pipeline import FlowPipeline, VACEFlowPipeline +from megatron.bridge.training.losses import masked_next_token_loss +from megatron.bridge.training.state import GlobalState + +logger = logging.getLogger(__name__) + +def wan_data_step(qkv_format, dataloader_iter): + batch = next(iter(dataloader_iter.iterable)) + + batch = {k: v.to(device="cuda", non_blocking=True) if torch.is_tensor(v) else v for k, v in batch.items()} + + # Construct packed sequence parameters + if ("seq_len_q" in batch) and ("seq_len_kv" in batch): + cu_seqlens = batch["seq_len_q"].cumsum(dim=0).to(torch.int32) + zero = torch.zeros(1, dtype=torch.int32, device="cuda") + cu_seqlens = torch.cat((zero, cu_seqlens)) + + cu_seqlens_kv = batch["seq_len_kv"].cumsum(dim=0).to(torch.int32) + cu_seqlens_kv = torch.cat((zero, cu_seqlens_kv)) + + batch["packed_seq_params"] = { + "self_attention": PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens, + qkv_format=qkv_format, + ), + "cross_attention": PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens_kv, + qkv_format=qkv_format, + ), + } + + return batch + + +class WanForwardStep: + def __init__(self): + self.diffusion_pipeline = FlowPipeline() + + + def __call__( + self, state: GlobalState, data_iterator: Iterable, model: VisionModule + ) -> tuple[torch.Tensor, partial]: + """ + Forward training step. 
+ """ + timers = state.timers + straggler_timer = state.straggler_timer + + config = get_model_config(model) + + timers("batch-generator", log_level=2).start() + + qkv_format = getattr(config, "qkv_format", "sbhd") + with straggler_timer(bdata=True): + batch = wan_data_step( + qkv_format, data_iterator + ) + timers("batch-generator").stop() + + check_for_nan_in_loss = state.cfg.rerun_state_machine.check_for_nan_in_loss + check_for_spiky_loss = state.cfg.rerun_state_machine.check_for_spiky_loss + + # run diffusion training step + with straggler_timer: + if parallel_state.is_pipeline_last_stage(): + output_batch, loss, split_loss_mask = self.diffusion_pipeline.training_step(model, batch) + output_tensor = torch.mean(loss, dim=-1) + batch["loss_mask"] = split_loss_mask + else: + output_tensor = self.diffusion_pipeline.training_step(model, batch) + + # DEBUGGING + # TODO: do we need to gather output with sequence or context parallelism here + # especially when we have pipeline parallelism + + loss = output_tensor + if "loss_mask" not in batch or batch["loss_mask"] is None: + loss_mask = torch.ones_like(loss) + loss_mask = batch["loss_mask"] + + loss_function = self._create_loss_function(loss_mask, check_for_nan_in_loss, check_for_spiky_loss) + + return output_tensor, loss_function + + + def _create_loss_function(self, loss_mask: torch.Tensor, check_for_nan_in_loss: bool, check_for_spiky_loss: bool) -> partial: + """Create a partial loss function with the specified configuration. 
+ + Args: + loss_mask: Used to mask out some portions of the loss + check_for_nan_in_loss: Whether to check for NaN values in the loss + check_for_spiky_loss: Whether to check for spiky loss values + + Returns: + A partial function that can be called with output_tensor to compute the loss + """ + return partial( + masked_next_token_loss, + loss_mask, + check_for_nan_in_loss=check_for_nan_in_loss, + check_for_spiky_loss=check_for_spiky_loss, + ) + + +class VACEForwardStep: + """ + Forward step for VACE (Video Editing) models. + + Uses VACEFlowPipeline which handles the additional vace_context input + required by VACEModel. + """ + + def __init__(self): + self.diffusion_pipeline = VACEFlowPipeline() + + + def __call__( + self, state: GlobalState, data_iterator: Iterable, model: VisionModule + ) -> tuple[torch.Tensor, partial]: + """ + Forward training step for VACE models. + """ + timers = state.timers + straggler_timer = state.straggler_timer + + config = get_model_config(model) + + timers("batch-generator", log_level=2).start() + + qkv_format = getattr(config, "qkv_format", "sbhd") + with straggler_timer(bdata=True): + batch = wan_data_step( + qkv_format, data_iterator + ) + timers("batch-generator").stop() + + check_for_nan_in_loss = state.cfg.rerun_state_machine.check_for_nan_in_loss + check_for_spiky_loss = state.cfg.rerun_state_machine.check_for_spiky_loss + + # run diffusion training step with VACE pipeline + with straggler_timer: + if parallel_state.is_pipeline_last_stage(): + output_batch, loss, split_loss_mask = self.diffusion_pipeline.training_step(model, batch) + output_tensor = torch.mean(loss, dim=-1) + batch["loss_mask"] = split_loss_mask + else: + output_tensor = self.diffusion_pipeline.training_step(model, batch) + + loss = output_tensor + if "loss_mask" not in batch or batch["loss_mask"] is None: + loss_mask = torch.ones_like(loss) + loss_mask = batch["loss_mask"] + + loss_function = self._create_loss_function(loss_mask, check_for_nan_in_loss, 
check_for_spiky_loss) + + return output_tensor, loss_function + + + def _create_loss_function(self, loss_mask: torch.Tensor, check_for_nan_in_loss: bool, check_for_spiky_loss: bool) -> partial: + """Create a partial loss function with the specified configuration. + + Args: + loss_mask: Used to mask out some portions of the loss + check_for_nan_in_loss: Whether to check for NaN values in the loss + check_for_spiky_loss: Whether to check for spiky loss values + + Returns: + A partial function that can be called with output_tensor to compute the loss + """ + return partial( + masked_next_token_loss, + loss_mask, + check_for_nan_in_loss=check_for_nan_in_loss, + check_for_spiky_loss=check_for_spiky_loss, + ) diff --git a/src/megatron/bridge/recipes/llama/llama32_1b.py b/src/megatron/bridge/recipes/DiTModel/dit.py similarity index 77% rename from src/megatron/bridge/recipes/llama/llama32_1b.py rename to src/megatron/bridge/recipes/DiTModel/dit.py index 92c1baf5ed..30f79c90c0 100644 --- a/src/megatron/bridge/recipes/llama/llama32_1b.py +++ b/src/megatron/bridge/recipes/DiTModel/dit.py @@ -15,36 +15,39 @@ import os from typing import List, Optional, Union +from megatron.bridge.data.Dit.data.diffusion_energon_datamodule import DiffusionDataModule, DiffusionDataModuleConfig +from megatron.bridge.data.Dit.data.diffusion_taskencoder import BasicDiffusionTaskEncoder +from megatron.bridge.models.DiTModel.dit_provider import DiTModelProvider import torch +from megatron.core.distributed import DistributedDataParallelConfig -from megatron.bridge.models.llama import Llama32ModelProvider1B +from megatron.bridge.models.gpt_provider import GPTProvider175B from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import 
CommOverlapConfig +from megatron.bridge.training.comm_overlap import CommOverlapConfig, userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048 from megatron.bridge.training.config import ( CheckpointConfig, ConfigContainer, - DistributedDataParallelConfig, GPTDatasetConfig, LoggerConfig, RNGConfig, - TokenizerConfig, + TokenizerConfig, TrainingConfig, ) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig +from megatron.bridge.training.mixed_precision import MixedPrecisionConfig, get_mixed_precision_config def model_config( tensor_parallelism: int = 1, pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, + pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = 1, context_parallelism: int = 1, sequence_parallelism: bool = False, -) -> Llama32ModelProvider1B: +) -> DiTModelProvider: """ - Configure the Llama3.2 1B model. + Configure the DiT-S model. Args: tensor_parallelism (int): Degree of tensor model parallelism. @@ -55,15 +58,16 @@ def model_config( sequence_parallelism (bool): Whether to use sequence parallelism. Returns: - Llama32ModelProvider1B: Configuration for the Llama3.2 1B model. + DiTModelProvider: Configuration for the DiT-S model. 
""" - return Llama32ModelProvider1B( + return DiTModelProvider( tensor_model_parallel_size=tensor_parallelism, pipeline_model_parallel_size=pipeline_parallelism, pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + virtual_pipeline_model_parallel_size=None, context_parallel_size=context_parallelism, sequence_parallel=sequence_parallelism, + seq_length=2048 ) @@ -81,26 +85,25 @@ def pretrain_config( # Model configuration tensor_parallelism: int = 1, pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, + pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = 1, context_parallelism: int = 1, sequence_parallelism: bool = False, use_megatron_fsdp: bool = False, # Training hyperparameters - train_iters: int = 1_168_251, - global_batch_size: int = 512, - micro_batch_size: int = 1, - seq_length: int = 8192, - lr: float = 3e-4, - min_lr: float = 3e-5, + train_iters: int = 10000, + global_batch_size: int = 4, + micro_batch_size: int = 2, + lr: float = 0.9e-4, lr_warmup_iters: int = 2000, - lr_decay_iters: Optional[int] = None, # Precision recipe precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", comm_overlap_config: Optional[CommOverlapConfig] = None, ) -> ConfigContainer: """ - Create a pre-training configuration for Llama3.2 1B model. + Create a pre-training configuration for GPT3 175B model. + + The default configuration is expected to run on 64 nodes with 8 GPUs each. Args: dir (Optional[str]): Base directory for saving logs and checkpoints. @@ -121,28 +124,21 @@ def pretrain_config( train_iters (int): Total number of training iterations. global_batch_size (int): Global batch size for training. micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for the model. 
+ seq_length (int): Sequence length for training data. lr (float): Learning rate. min_lr (float): Minimum learning rate for cosine decay. lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. comm_overlap_config (Optional[CommOverlapConfig]): Communication overlap configuration for the model. Returns: ConfigContainer: Configuration for pre-training. - - Note: - Sequence length is hardcoded to 8192 for Llama3.2 1B pretraining. """ base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") run_output_dir = os.path.join(base_output_dir, name) checkpoint_dir = os.path.join(run_output_dir, "checkpoints") tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) model_cfg = model_config( tensor_parallelism=tensor_parallelism, @@ -155,10 +151,16 @@ def pretrain_config( opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, + lr_decay_iters=train_iters, max_lr=lr, - min_lr=min_lr, ) + opt_config.use_precision_aware_optimizer = False + + if isinstance(precision_config, str): + precision_config = get_mixed_precision_config(precision_config) + + precision_config.grad_reduce_in_fp32 = False + # Config Container cfg = ConfigContainer( @@ -178,36 +180,30 @@ def pretrain_config( ddp=DistributedDataParallelConfig( check_for_nan_in_grad=True, grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, + overlap_grad_reduce=False, + overlap_param_gather=False, average_in_collective=True, use_distributed_optimizer=True, use_megatron_fsdp=use_megatron_fsdp, # need 
use_distributed_optimizer=True ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - num_workers=8, - skip_getting_attention_mask_from_dataset=True, - ), + dataset= DiffusionDataModuleConfig( + path="/opt/VFM/butterfly_webdataset", + seq_length=2048, + task_encoder_seq_length=2048, + micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size, + num_workers=10) + , logger=LoggerConfig( log_interval=10, tensorboard_dir=tensorboard_dir, + log_timers_to_tensorboard=True, ), tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), checkpoint=CheckpointConfig( save_interval=2000, save=checkpoint_dir, + load=checkpoint_dir, ckpt_format="torch_dist", fully_parallel_save=True, ), @@ -216,9 +212,4 @@ def pretrain_config( mixed_precision=precision_config, ) - if cfg.comm_overlap is None: - cfg.comm_overlap = CommOverlapConfig( - tp_comm_overlap=False, - ) - return cfg diff --git a/src/megatron/bridge/recipes/llama/__init__.py b/src/megatron/bridge/recipes/llama/__init__.py index e69de29bb2..e609301037 100644 --- a/src/megatron/bridge/recipes/llama/__init__.py +++ b/src/megatron/bridge/recipes/llama/__init__.py @@ -0,0 +1,57 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Llama2 models +from .llama2 import ( + llama2_7b_pretrain_config, +) + +# Llama3 models +from .llama3 import ( + llama3_8b_16k_pretrain_config, + llama3_8b_64k_pretrain_config, + llama3_8b_128k_pretrain_config, + llama3_8b_pretrain_config, + llama3_70b_16k_pretrain_config, + llama3_70b_64k_pretrain_config, + llama3_70b_pretrain_config, + # Llama3.1 models + llama31_8b_pretrain_config, + llama31_70b_pretrain_config, + llama31_405b_pretrain_config, + # Llama3.2 models + llama32_1b_pretrain_config, + llama32_3b_pretrain_config, +) + + +__all__ = [ + # Llama2 models + "llama2_7b_pretrain_config", + # Llama3 models + "llama3_8b_pretrain_config", + "llama3_8b_16k_pretrain_config", + "llama3_8b_64k_pretrain_config", + "llama3_8b_128k_pretrain_config", + "llama3_70b_pretrain_config", + "llama3_70b_16k_pretrain_config", + "llama3_70b_64k_pretrain_config", + # Llama3.1 models + "llama31_8b_pretrain_config", + "llama31_70b_pretrain_config", + "llama31_405b_pretrain_config", + # Llama3.2 models + "llama32_1b_pretrain_config", + "llama32_3b_pretrain_config", +] diff --git a/src/megatron/bridge/recipes/llama/llama2_7b.py b/src/megatron/bridge/recipes/llama/llama2.py similarity index 66% rename from src/megatron/bridge/recipes/llama/llama2_7b.py rename to src/megatron/bridge/recipes/llama/llama2.py index 4cf4e71518..0ef8a590e8 100644 --- a/src/megatron/bridge/recipes/llama/llama2_7b.py +++ b/src/megatron/bridge/recipes/llama/llama2.py @@ -16,8 +16,9 @@ from typing import List, Optional, Union import torch +from typing_extensions import TypedDict, Unpack -from megatron.bridge.models.llama import Llama2ModelProvider7B +from megatron.bridge import AutoBridge from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing from 
megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE @@ -35,39 +36,69 @@ from megatron.bridge.training.mixed_precision import MixedPrecisionConfig -def model_config( - tensor_parallelism: int = 2, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, -) -> Llama2ModelProvider7B: - """ - Configure the Llama2 7B model. +class Llama2CommonKwargs(TypedDict, total=False): + """Typed options accepted by Llama2 recipe helper functions.""" - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. 
+ # Core identifiers + hf_path: str + dir: Optional[str] + name: str + # Dataset configuration + data_paths: Optional[List[str]] + data_args_path: Optional[str] + train_data_path: Optional[List[str]] + valid_data_path: Optional[List[str]] + test_data_path: Optional[List[str]] + per_split_data_args_path: Optional[str] + mock: bool + # Model configuration + tensor_parallelism: int + pipeline_parallelism: int + pipeline_parallelism_dtype: Optional[torch.dtype] + virtual_pipeline_parallelism: Optional[int] + context_parallelism: int + sequence_parallelism: bool + use_megatron_fsdp: bool + # Training hyperparameters + train_iters: int + global_batch_size: int + micro_batch_size: int + seq_length: int + lr: float + min_lr: float + lr_warmup_iters: int + lr_decay_iters: Optional[int] + eval_interval: int + save_interval: int + use_null_tokenizer: bool + # Precision / overlap configs + precision_config: Optional[Union[MixedPrecisionConfig, str]] + comm_overlap_config: Optional[CommOverlapConfig] - Returns: - Llama2ModelProvider7B: Configuration for the Llama2 7B model. + +def llama2_7b_pretrain_config(**user_kwargs: Unpack[Llama2CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Llama-2 7B. + + See `_llama2_common` for the full list of parameters. """ - return Llama2ModelProvider7B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) + recommended_kwargs: Llama2CommonKwargs = { + "hf_path": "meta-llama/Llama-2-7b-hf", + "tensor_parallelism": 2, + "pipeline_parallelism": 1, + "train_iters": 1_168_251, + "global_batch_size": 512, + "micro_batch_size": 1, + "lr_warmup_iters": 2000, + "eval_interval": 2000, + "save_interval": 2000, + } + # Combine defaults with user kwargs; user values take precedence. 
+ combined_kwargs: Llama2CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _llama2_common(**combined_kwargs) -def pretrain_config( +def _llama2_common( + hf_path: str, dir: Optional[str] = None, name: str = "default", # Dataset configuration @@ -95,14 +126,18 @@ def pretrain_config( min_lr: float = 3e-5, lr_warmup_iters: int = 2000, lr_decay_iters: Optional[int] = None, + eval_interval: int = 2000, + save_interval: int = 2000, + use_null_tokenizer: bool = True, # Precision recipe precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", comm_overlap_config: Optional[CommOverlapConfig] = None, ) -> ConfigContainer: """ - Create a pre-training configuration for Llama2 7B model. + Create a pre-training configuration for Llama2 models using a given HuggingFace path. Args: + hf_path (str): HuggingFace model path (e.g., "meta-llama/Llama-2-7b-hf"). dir (Optional[str]): Base directory for saving logs and checkpoints. name (str): Name of the pre-training run. data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. @@ -118,6 +153,7 @@ def pretrain_config( virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. context_parallelism (int): Degree of context parallelism to be passed to model_config. sequence_parallelism (bool): Whether to use sequence parallelism. + use_megatron_fsdp (bool): Whether to use Megatron FSDP. train_iters (int): Total number of training iterations. global_batch_size (int): Global batch size for training. micro_batch_size (int): Micro batch size for training. @@ -125,7 +161,9 @@ def pretrain_config( lr (float): Learning rate. min_lr (float): Minimum learning rate for cosine decay. lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. + lr_decay_iters (Optional[int]): Number of iterations over which to decay the LR. 
+ eval_interval (int): Evaluation interval. + save_interval (int): Save interval. precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. comm_overlap_config (Optional[CommOverlapConfig]): Communication overlap configuration for the model. @@ -141,14 +179,15 @@ def pretrain_config( data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock ) - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) + bridge = AutoBridge.from_hf_pretrained(hf_path) + model_cfg = bridge.to_megatron_provider(load_weights=False) + model_cfg.tensor_model_parallel_size = tensor_parallelism + model_cfg.pipeline_model_parallel_size = pipeline_parallelism + model_cfg.pipeline_dtype = pipeline_parallelism_dtype + model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_parallelism + model_cfg.context_parallel_size = context_parallelism + model_cfg.sequence_parallel = sequence_parallelism + model_cfg.seq_length = seq_length opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( lr_warmup_iters=lr_warmup_iters, @@ -166,7 +205,7 @@ def pretrain_config( model=model_cfg, train=TrainingConfig( train_iters=train_iters, - eval_interval=2000, + eval_interval=eval_interval, eval_iters=32, global_batch_size=global_batch_size, micro_batch_size=micro_batch_size, @@ -205,10 +244,15 @@ def pretrain_config( log_interval=10, tensorboard_dir=tensorboard_dir, ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), + tokenizer=TokenizerConfig( + tokenizer_type="NullTokenizer" if use_null_tokenizer else "HuggingFaceTokenizer", + tokenizer_model=hf_path if not use_null_tokenizer 
else None, + vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE if use_null_tokenizer else None, + ), checkpoint=CheckpointConfig( - save_interval=2000, + save_interval=save_interval, save=checkpoint_dir, + load=checkpoint_dir, ckpt_format="torch_dist", fully_parallel_save=True, ), diff --git a/src/megatron/bridge/recipes/llama/llama3.py b/src/megatron/bridge/recipes/llama/llama3.py new file mode 100644 index 0000000000..f627108f65 --- /dev/null +++ b/src/megatron/bridge/recipes/llama/llama3.py @@ -0,0 +1,503 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from typing import List, Optional, Union + +import torch +from typing_extensions import TypedDict, Unpack + +from megatron.bridge import AutoBridge +from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths +from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing +from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE +from megatron.bridge.training.comm_overlap import ( + CommOverlapConfig, + userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, + userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192, +) +from megatron.bridge.training.config import ( + CheckpointConfig, + ConfigContainer, + DistributedDataParallelConfig, + GPTDatasetConfig, + LoggerConfig, + RNGConfig, + TokenizerConfig, + TrainingConfig, +) +from megatron.bridge.training.mixed_precision import MixedPrecisionConfig, bf16_mixed + + +class Llama3CommonKwargs(TypedDict, total=False): + """Typed options accepted by Llama3 family recipe helpers.""" + + # Core identifiers + hf_path: str + dir: Optional[str] + name: str + # Dataset configuration + data_paths: Optional[List[str]] + data_args_path: Optional[str] + train_data_path: Optional[List[str]] + valid_data_path: Optional[List[str]] + test_data_path: Optional[List[str]] + per_split_data_args_path: Optional[str] + mock: bool + # Model configuration + tensor_parallelism: int + pipeline_parallelism: int + pipeline_parallelism_dtype: Optional[torch.dtype] + virtual_pipeline_parallelism: Optional[int] + context_parallelism: int + sequence_parallelism: bool + use_megatron_fsdp: bool + account_for_embedding_in_pipeline_split: bool + account_for_loss_in_pipeline_split: bool + # Training hyperparameters + train_iters: int + global_batch_size: int + micro_batch_size: int + seq_length: int + lr: float + min_lr: float + lr_warmup_iters: int + lr_decay_iters: Optional[int] + eval_interval: int + save_interval: int + use_null_tokenizer: bool + 
# Precision / overlap configs + precision_config: Optional[Union[MixedPrecisionConfig, str]] + comm_overlap_config: Optional[CommOverlapConfig] + + +# Sequence length constants +SEQUENCE_LENGTH_16K: int = 16384 +SEQUENCE_LENGTH_64K: int = 65536 +SEQUENCE_LENGTH_128K: int = 131072 + + +# Llama3.2 models +def llama32_1b_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Llama 3.2 1B. + + See `_llama3_common` for the full list of parameters. + """ + recommended_kwargs: Llama3CommonKwargs = { + "hf_path": "meta-llama/Llama-3.2-1B", + "tensor_parallelism": 1, + "pipeline_parallelism": 1, + "context_parallelism": 1, + "sequence_parallelism": False, + } + combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _llama3_common(**combined_kwargs) + + +def llama32_3b_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Llama 3.2 3B. + + See `_llama3_common` for the full list of parameters. + """ + recommended_kwargs: Llama3CommonKwargs = { + "hf_path": "meta-llama/Llama-3.2-3B", + "tensor_parallelism": 1, + "pipeline_parallelism": 1, + "context_parallelism": 1, + "sequence_parallelism": False, + } + combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _llama3_common(**combined_kwargs) + + +# Llama3 8B models +def llama3_8b_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Llama 3 8B. + + See `_llama3_common` for the full list of parameters. 
+ """ + recommended_kwargs: Llama3CommonKwargs = { + "hf_path": "meta-llama/Meta-Llama-3-8B", + "tensor_parallelism": 1, + "pipeline_parallelism": 1, + "context_parallelism": 2, + "sequence_parallelism": False, + } + combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _llama3_common(**combined_kwargs) + + +def llama3_8b_16k_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Llama 3 8B 16K. + + See `_llama3_common` for the full list of parameters. + """ + recommended_kwargs: Llama3CommonKwargs = { + "hf_path": "meta-llama/Meta-Llama-3-8B", + "tensor_parallelism": 4, + "pipeline_parallelism": 2, + "pipeline_parallelism_dtype": torch.bfloat16, + "context_parallelism": 2, + "sequence_parallelism": True, + "seq_length": SEQUENCE_LENGTH_16K, + } + combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _llama3_common(**combined_kwargs) + + +def llama3_8b_64k_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Llama 3 8B 64K. + + See `_llama3_common` for the full list of parameters. + """ + recommended_kwargs: Llama3CommonKwargs = { + "hf_path": "meta-llama/Meta-Llama-3-8B", + "tensor_parallelism": 4, + "pipeline_parallelism": 2, + "pipeline_parallelism_dtype": torch.bfloat16, + "context_parallelism": 4, + "sequence_parallelism": True, + "seq_length": SEQUENCE_LENGTH_64K, + } + combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _llama3_common(**combined_kwargs) + + +def llama3_8b_128k_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Llama 3 8B 128K. + + See `_llama3_common` for the full list of parameters. 
+ """ + recommended_kwargs: Llama3CommonKwargs = { + "hf_path": "meta-llama/Meta-Llama-3-8B", + "tensor_parallelism": 4, + "pipeline_parallelism": 2, + "pipeline_parallelism_dtype": torch.bfloat16, + "context_parallelism": 8, + "sequence_parallelism": True, + "seq_length": SEQUENCE_LENGTH_128K, + } + combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _llama3_common(**combined_kwargs) + + +# Llama3 70B models +def llama3_70b_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Llama 3 70B. + + See `_llama3_common` for the full list of parameters. + """ + recommended_kwargs: Llama3CommonKwargs = { + "hf_path": "meta-llama/Meta-Llama-3-70B", + "tensor_parallelism": 4, + "pipeline_parallelism": 4, + "pipeline_parallelism_dtype": torch.bfloat16, + "virtual_pipeline_parallelism": 5, + "context_parallelism": 2, + "sequence_parallelism": True, + "comm_overlap_config": CommOverlapConfig( + tp_comm_overlap=True, + tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, + ), + "precision_config": bf16_mixed(), + } + combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _llama3_common(**combined_kwargs) + + +def llama3_70b_16k_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Llama 3 70B 16K. + + See `_llama3_common` for the full list of parameters. 
+ """ + recommended_kwargs: Llama3CommonKwargs = { + "hf_path": "meta-llama/Meta-Llama-3-70B", + "tensor_parallelism": 8, + "pipeline_parallelism": 2, + "pipeline_parallelism_dtype": torch.bfloat16, + "virtual_pipeline_parallelism": None, + "context_parallelism": 2, + "sequence_parallelism": True, + "seq_length": SEQUENCE_LENGTH_16K, + "comm_overlap_config": CommOverlapConfig( + tp_comm_overlap=True, + tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, + ), + "precision_config": bf16_mixed(), + } + combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _llama3_common(**combined_kwargs) + + +def llama3_70b_64k_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Llama 3 70B 64K. + + See `_llama3_common` for the full list of parameters. + """ + recommended_kwargs: Llama3CommonKwargs = { + "hf_path": "meta-llama/Meta-Llama-3-70B", + "tensor_parallelism": 8, + "pipeline_parallelism": 4, + "pipeline_parallelism_dtype": torch.bfloat16, + "virtual_pipeline_parallelism": None, + "context_parallelism": 8, + "sequence_parallelism": True, + "seq_length": SEQUENCE_LENGTH_64K, + "comm_overlap_config": CommOverlapConfig( + tp_comm_overlap=True, + tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, + ), + "precision_config": bf16_mixed(), + } + combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _llama3_common(**combined_kwargs) + + +# Llama3.1 models +def llama31_8b_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Llama 3.1 8B. + + See `_llama3_common` for the full list of parameters. 
+ """ + recommended_kwargs: Llama3CommonKwargs = { + "hf_path": "meta-llama/Meta-Llama-3.1-8B", + "tensor_parallelism": 1, + "pipeline_parallelism": 1, + "context_parallelism": 2, + "sequence_parallelism": False, + } + combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _llama3_common(**combined_kwargs) + + +def llama31_70b_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Llama 3.1 70B. + + See `_llama3_common` for the full list of parameters. + """ + recommended_kwargs: Llama3CommonKwargs = { + "hf_path": "meta-llama/Meta-Llama-3.1-70B", + "tensor_parallelism": 4, + "pipeline_parallelism": 4, + "pipeline_parallelism_dtype": torch.bfloat16, + "virtual_pipeline_parallelism": 5, + "context_parallelism": 2, + "sequence_parallelism": True, + "comm_overlap_config": CommOverlapConfig( + tp_comm_overlap=True, + tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, + ), + "precision_config": bf16_mixed(), + "seq_length": SEQUENCE_LENGTH_128K, + } + combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _llama3_common(**combined_kwargs) + + +def llama31_405b_pretrain_config(**user_kwargs: Unpack[Llama3CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Llama 3.1 405B. + + See `_llama3_common` for the full list of parameters. 
+ """ + recommended_kwargs: Llama3CommonKwargs = { + "hf_path": "meta-llama/Meta-Llama-3.1-405B", + "tensor_parallelism": 8, + "pipeline_parallelism": 8, + "pipeline_parallelism_dtype": torch.bfloat16, + "virtual_pipeline_parallelism": 2, + "context_parallelism": 4, + "sequence_parallelism": True, + "account_for_embedding_in_pipeline_split": True, + "account_for_loss_in_pipeline_split": True, + "comm_overlap_config": CommOverlapConfig( + tp_comm_overlap=True, + tp_comm_overlap_cfg=userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192, + ), + "precision_config": bf16_mixed(), + "micro_batch_size": 1, + "seq_length": SEQUENCE_LENGTH_128K, + } + combined_kwargs: Llama3CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _llama3_common(**combined_kwargs) + + +def _llama3_common( + hf_path: str, + dir: Optional[str] = None, + name: str = "default", + # Dataset configuration + data_paths: Optional[List[str]] = None, + data_args_path: Optional[str] = None, + train_data_path: Optional[List[str]] = None, + valid_data_path: Optional[List[str]] = None, + test_data_path: Optional[List[str]] = None, + per_split_data_args_path: Optional[str] = None, + mock: bool = False, + # Model configuration + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_dtype: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + use_megatron_fsdp: bool = False, + account_for_embedding_in_pipeline_split: bool = False, + account_for_loss_in_pipeline_split: bool = False, + # Training hyperparameters + train_iters: int = 1168251, + global_batch_size: int = 512, + micro_batch_size: int = 1, + seq_length: int = 8192, + lr: float = 3e-4, + min_lr: float = 3e-5, + lr_warmup_iters: int = 2000, + lr_decay_iters: Optional[int] = None, + eval_interval: int = 2000, + save_interval: int = 500, + use_null_tokenizer: bool = True, + # Precision recipe + precision_config: 
Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", + comm_overlap_config: Optional[CommOverlapConfig] = None, +) -> ConfigContainer: + """ + Create a pre-training configuration for Llama3 family models using a given HuggingFace path. + + Args: + hf_path (str): HuggingFace model path (e.g., "meta-llama/Meta-Llama-3-8B"). + dir (Optional[str]): Base directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. + data_args_path (Optional[str]): Path to file containing data arguments. + train_data_path (Optional[List[str]]): List of training data paths. + valid_data_path (Optional[List[str]]): List of validation data paths. + test_data_path (Optional[List[str]]): List of test data paths. + per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. + mock (bool): Whether to use mock data. If True, ignores data_paths. + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + use_megatron_fsdp (bool): Whether to use Megatron FSDP. + account_for_embedding_in_pipeline_split (bool): Whether to account for embedding in pipeline split. + account_for_loss_in_pipeline_split (bool): Whether to account for loss in pipeline split. + train_iters (int): Total number of training iterations. + global_batch_size (int): Global batch size for training. + micro_batch_size (int): Micro batch size for training. + seq_length (int): Sequence length for training data. + lr (float): Learning rate. + min_lr (float): Minimum learning rate for cosine decay. 
+ lr_warmup_iters (int): Number of warmup iterations for the learning rate. + lr_decay_iters (Optional[int]): Number of iterations over which to decay the LR. + precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. + comm_overlap_config (Optional[CommOverlapConfig]): Communication overlap configuration. + + Returns: + ConfigContainer: Configuration for pre-training. + """ + base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") + run_output_dir = os.path.join(base_output_dir, name) + checkpoint_dir = os.path.join(run_output_dir, "checkpoints") + tensorboard_dir = os.path.join(run_output_dir, "tb_logs") + + blend, blend_per_split, split = get_blend_fields_from_data_paths( + data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock + ) + + bridge = AutoBridge.from_hf_pretrained(hf_path) + model_cfg = bridge.to_megatron_provider(load_weights=False) + model_cfg.tensor_model_parallel_size = tensor_parallelism + model_cfg.pipeline_model_parallel_size = pipeline_parallelism + model_cfg.pipeline_dtype = pipeline_parallelism_dtype + model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_parallelism + model_cfg.context_parallel_size = context_parallelism + model_cfg.sequence_parallel = sequence_parallelism + model_cfg.seq_length = seq_length + + # Large model specific pipeline split configurations + if account_for_embedding_in_pipeline_split: + model_cfg.account_for_embedding_in_pipeline_split = True + if account_for_loss_in_pipeline_split: + model_cfg.account_for_loss_in_pipeline_split = True + + opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=lr_warmup_iters, + lr_decay_iters=lr_decay_iters, + max_lr=lr, + min_lr=min_lr, + ) + + # Config Container + cfg = ConfigContainer( + model=model_cfg, + train=TrainingConfig( + train_iters=train_iters, + eval_interval=eval_interval, + eval_iters=32, + 
global_batch_size=global_batch_size, + micro_batch_size=micro_batch_size, + manual_gc=True, + manual_gc_interval=100, + manual_gc_eval=100, + ), + optimizer=opt_config, + scheduler=scheduler, + ddp=DistributedDataParallelConfig( + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, + use_distributed_optimizer=True, + use_megatron_fsdp=use_megatron_fsdp, + ), + dataset=GPTDatasetConfig( + random_seed=1234, + reset_attention_mask=False, + reset_position_ids=False, + eod_mask_loss=False, + sequence_length=seq_length, + num_dataset_builder_threads=1, + blend=blend, + blend_per_split=blend_per_split, + split=split, + # Dataloader config parameters + data_sharding=True, + dataloader_type="single", + skip_getting_attention_mask_from_dataset=True, + ), + logger=LoggerConfig( + log_interval=10, + tensorboard_dir=tensorboard_dir, + log_timers_to_tensorboard=True, + ), + tokenizer=TokenizerConfig( + tokenizer_type="NullTokenizer" if use_null_tokenizer else "HuggingFaceTokenizer", + tokenizer_model=hf_path if not use_null_tokenizer else None, + vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE if use_null_tokenizer else None, + ), + checkpoint=CheckpointConfig( + save_interval=save_interval, + save=checkpoint_dir, + load=checkpoint_dir, + ckpt_format="torch_dist", + fully_parallel_save=True, + ), + rng=RNGConfig(seed=1234), + comm_overlap=comm_overlap_config, + mixed_precision=precision_config, + ) + + return cfg diff --git a/src/megatron/bridge/recipes/llama/llama31_405b.py b/src/megatron/bridge/recipes/llama/llama31_405b.py deleted file mode 100644 index 9f3b7d1baf..0000000000 --- a/src/megatron/bridge/recipes/llama/llama31_405b.py +++ /dev/null @@ -1,247 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.llama import Llama31ModelProvider405B -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.training.comm_overlap import ( - CommOverlapConfig, - userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192, -) -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig, bf16_mixed - - -def model_config( - tensor_parallelism: int = 8, - pipeline_parallelism: int = 8, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = 2, - context_parallelism: int = 4, - sequence_parallelism: bool = True, - account_for_embedding_in_pipeline_split: bool = True, - account_for_loss_in_pipeline_split: bool = True, -) -> Llama31ModelProvider405B: - """ - Configure the Llama3.1 405B model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. 
- context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - account_for_embedding_in_pipeline_split (bool): Whether to account for embedding in pipeline split. - account_for_loss_in_pipeline_split (bool): Whether to account for loss in pipeline split. - - Returns: - Llama31ModelProvider405B: Configuration for the Llama3.1 405B model. - """ - return Llama31ModelProvider405B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - account_for_embedding_in_pipeline_split=account_for_embedding_in_pipeline_split, - account_for_loss_in_pipeline_split=account_for_loss_in_pipeline_split, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 8, - pipeline_parallelism: int = 8, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = 2, - context_parallelism: int = 4, - sequence_parallelism: bool = True, - use_megatron_fsdp: bool = False, - account_for_embedding_in_pipeline_split: bool = True, - account_for_loss_in_pipeline_split: bool = True, - # Training hyperparameters - train_iters: int = 1_168_251, - global_batch_size: int = 512, - micro_batch_size: int = 1, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 2000, - lr_decay_iters: Optional[int] = None, - # Precision recipe - 
precision_config: Optional[Union[MixedPrecisionConfig, str]] = None, - comm_overlap_config: Optional[CommOverlapConfig] = None, - vocab_size: int = 128256, -) -> ConfigContainer: - """ - Create a pre-training configuration for Llama3.1 405B model. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - account_for_embedding_in_pipeline_split (bool): Whether to account for embedding in pipeline split. - account_for_loss_in_pipeline_split (bool): Whether to account for loss in pipeline split. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. 
- precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - comm_overlap_config (Optional[CommOverlapConfig]): Communication overlap configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. - - Note: - Sequence length is hardcoded to 8192 for Llama3.1 405B pretraining. - """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - account_for_embedding_in_pipeline_split=account_for_embedding_in_pipeline_split, - account_for_loss_in_pipeline_split=account_for_loss_in_pipeline_split, - ) - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - if precision_config is None: - precision_config = bf16_mixed() - if isinstance(precision_config, MixedPrecisionConfig): - precision_config.grad_reduce_in_fp32 = False - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=2000, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - 
ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=8192, # Hardcoded to 8192 for Llama3.1 405B pretraining - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - num_workers=8, - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=vocab_size), - checkpoint=CheckpointConfig( - save_interval=2000, - save=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - if cfg.comm_overlap is None: - cfg.comm_overlap = CommOverlapConfig( - tp_comm_overlap=True, - tp_comm_overlap_cfg=userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192, - defer_embedding_wgrad_compute=True, - wgrad_deferral_limit=50, - # 'overlap_param_gather_with_optimizer_step' is set automatically. Added here for user's knowledge - overlap_param_gather_with_optimizer_step=False, # Currently disabled due to an issue with checkpointing - ) - - return cfg diff --git a/src/megatron/bridge/recipes/llama/llama31_70b.py b/src/megatron/bridge/recipes/llama/llama31_70b.py deleted file mode 100644 index 51583f0959..0000000000 --- a/src/megatron/bridge/recipes/llama/llama31_70b.py +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.llama import Llama31ModelProvider70B -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import ( - CommOverlapConfig, - userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192, -) -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def model_config( - tensor_parallelism: int = 4, - pipeline_parallelism: int = 4, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = 5, - context_parallelism: int = 2, - sequence_parallelism: bool = True, -) -> Llama31ModelProvider70B: - """ - Configure the Llama3.1 70B model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. 
- virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Llama31ModelProvider70B: Configuration for the Llama3.1 70B model. - """ - return Llama31ModelProvider70B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 4, - pipeline_parallelism: int = 4, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = 5, - context_parallelism: int = 2, - sequence_parallelism: bool = True, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 1_168_251, - global_batch_size: int = 512, - micro_batch_size: int = 1, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 2000, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Llama3.1 70B model. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. 
- data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - comm_overlap_config (Optional[CommOverlapConfig]): Communication overlap configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. - - Note: - Sequence length is hardcoded to 8192 for Llama3.1 70B pretraining. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=2000, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=8192, # Hardcoded to 8192 for Llama3.1 70B pretraining - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - 
num_workers=8, - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - save_interval=2000, - save=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - if cfg.comm_overlap is None: - cfg.comm_overlap = CommOverlapConfig( - tp_comm_overlap=True, - tp_comm_overlap_cfg=userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192, - defer_embedding_wgrad_compute=True, - wgrad_deferral_limit=50, - overlap_param_gather_with_optimizer_step=False, # Currently disabled due to an issue with checkpointing - align_param_gather=True, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/llama/llama31_8b.py b/src/megatron/bridge/recipes/llama/llama31_8b.py deleted file mode 100644 index 38ab362eaa..0000000000 --- a/src/megatron/bridge/recipes/llama/llama31_8b.py +++ /dev/null @@ -1,236 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.llama import Llama31ModelProvider8B -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import ( - CommOverlapConfig, - userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192, -) -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def model_config( - tensor_parallelism: int = 1, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 2, - sequence_parallelism: bool = False, -) -> Llama31ModelProvider8B: - """ - Configure the Llama3.1 8B model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Llama31ModelProvider8B: Configuration for the Llama3.1 8B model. 
- """ - return Llama31ModelProvider8B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 1, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 2, - sequence_parallelism: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 1_168_251, - seq_length: int = 8192, - global_batch_size: int = 512, - micro_batch_size: int = 1, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 2000, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Llama3.1 8B model. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. 
- valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - train_iters (int): Total number of training iterations. - seq_length (int): Sequence length for training. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - comm_overlap_config (Optional[CommOverlapConfig]): Communication overlap configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=2000, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - 
num_workers=8, - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - save_interval=2000, - save=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - # TODO(ananthsub): Temporarily disabled as the extra allocations causes an OOM on a single node - # if cfg.comm_overlap is None: - # cfg.comm_overlap = get_comm_overlap_config() - - return cfg - - -def get_comm_overlap_config() -> CommOverlapConfig: - """Communication overlap configuration for the Llama3.1 8B model.""" - return CommOverlapConfig( - tp_comm_overlap=True, - tp_comm_overlap_cfg=userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192, - defer_embedding_wgrad_compute=True, - wgrad_deferral_limit=50, - overlap_param_gather_with_optimizer_step=False, # Currently disabled due to an issue with checkpointing - align_param_gather=True, - ) diff --git a/src/megatron/bridge/recipes/llama/llama3_70b.py b/src/megatron/bridge/recipes/llama/llama3_70b.py deleted file mode 100644 index 4fe6ec748d..0000000000 --- a/src/megatron/bridge/recipes/llama/llama3_70b.py +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.llama import Llama3ModelProvider70B -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.training.comm_overlap import CommOverlapConfig, userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192 -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig, bf16_mixed - - -def model_config( - tensor_parallelism: int = 4, - pipeline_parallelism: int = 4, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = 5, - context_parallelism: int = 2, - sequence_parallelism: bool = True, -) -> Llama3ModelProvider70B: - """ - Configure the Llama3 70B model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Llama3ModelProvider70B: Configuration for the Llama3 70B model. 
- """ - return Llama3ModelProvider70B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 4, - pipeline_parallelism: int = 4, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = 5, - context_parallelism: int = 2, - sequence_parallelism: bool = True, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 1_168_251, - global_batch_size: int = 512, - micro_batch_size: int = 1, - seq_length: int = 8192, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 2000, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = None, - comm_overlap_config: Optional[CommOverlapConfig] = None, - vocab_size: int = 128256, -) -> ConfigContainer: - """ - Create a pre-training configuration for Llama3 70B model. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. 
- valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int) Number of warmup iterations for the learning rate. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - comm_overlap_config (Optional[CommOverlapConfig]): Communication overlap configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - if precision_config is None: - precision_config = bf16_mixed() - if isinstance(precision_config, MixedPrecisionConfig): - precision_config.grad_reduce_in_fp32 = False - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=2000, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - 
blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - num_workers=8, - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=vocab_size), - checkpoint=CheckpointConfig( - save_interval=2000, - save=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - if cfg.comm_overlap is None: - cfg.comm_overlap = CommOverlapConfig( - tp_comm_overlap=True, - tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, - defer_embedding_wgrad_compute=True, - wgrad_deferral_limit=22, - # 'overlap_param_gather_with_optimizer_step' is set automatically. Added here for user's knowledge - overlap_param_gather_with_optimizer_step=False, # Currently disabled due to an issue with checkpointing. - ) - - return cfg diff --git a/src/megatron/bridge/recipes/llama/llama3_70b_16k.py b/src/megatron/bridge/recipes/llama/llama3_70b_16k.py deleted file mode 100644 index 48dd116299..0000000000 --- a/src/megatron/bridge/recipes/llama/llama3_70b_16k.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.llama import Llama3ModelProvider70B -from megatron.bridge.recipes.llama import llama3_70b -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.config import ConfigContainer -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -# 16k sequence length constant -SEQUENCE_LENGTH_16K = 16384 - - -def model_config( - tensor_parallelism: int = 8, - pipeline_parallelism: int = 2, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 2, - sequence_parallelism: bool = True, -) -> Llama3ModelProvider70B: - """ - Configure the Llama3 70B model for 16k sequence length training. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. Default optimized for 70B with 16k sequences. - pipeline_parallelism (int): Degree of pipeline model parallelism. Default optimized for 70B with 16k sequences. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. Default optimized for 70B with 16k sequences. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. Default optimized for 70B with 16k sequences. - context_parallelism (int): Degree of context parallelism. Default optimized for 70B with 16k sequences. - sequence_parallelism (bool): Whether to use sequence parallelism. Default optimized for 70B with 16k sequences. - - Returns: - Llama3ModelProvider70B: Configuration for the Llama3 70B model optimized for 16k sequences. 
- """ - # Get base model config and override specific parameters for 16k sequences - model_cfg = llama3_70b.model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - - # Override sequence length to 16k to match dataset config - model_cfg.seq_length = SEQUENCE_LENGTH_16K - - return model_cfg - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - defaults optimized for 70B with 16k sequences - tensor_parallelism: int = 8, - pipeline_parallelism: int = 2, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 2, - sequence_parallelism: bool = True, - # Training hyperparameters - train_iters: int = 1_168_251, - global_batch_size: int = 512, - micro_batch_size: int = 1, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 2000, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", -) -> ConfigContainer: - """ - Create a pre-training configuration for Llama3 70B model with 16k sequence length. - - This function inherits from llama3_70b.pretrain_config() and overrides specific parameters - optimized for 16k sequence length training. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. 
- data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. Default optimized for 70B with 16k sequences. - pipeline_parallelism (int): Degree of pipeline model parallelism. Default optimized for 70B with 16k sequences. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. Default optimized for 70B with 16k sequences. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. Default optimized for 70B with 16k sequences. - context_parallelism (int): Degree of context parallelism. Default optimized for 70B with 16k sequences. - sequence_parallelism (bool): Whether to use sequence parallelism. Default optimized for 70B with 16k sequences. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int) Number of warmup iterations for the learning rate. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. - - Note: - Sequence length is set to SEQUENCE_LENGTH_16K (16384) for extended sequence training. - Default parallelism settings are optimized for 70B model with 16k sequences efficiently. 
- """ - # Get base configuration from llama3_70b with 16k sequence length - config = llama3_70b.pretrain_config( - dir=dir, - name=name, - data_paths=data_paths, - data_args_path=data_args_path, - train_data_path=train_data_path, - valid_data_path=valid_data_path, - test_data_path=test_data_path, - per_split_data_args_path=per_split_data_args_path, - mock=mock, - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - train_iters=train_iters, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - seq_length=SEQUENCE_LENGTH_16K, # Override to 16k sequence length - lr=lr, - min_lr=min_lr, - lr_warmup_iters=lr_warmup_iters, - precision_config=precision_config, - vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE, - ) - - # Override the model configuration to use 16k sequence length - config.model = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - - return config diff --git a/src/megatron/bridge/recipes/llama/llama3_70b_64k.py b/src/megatron/bridge/recipes/llama/llama3_70b_64k.py deleted file mode 100644 index 96cc3555fc..0000000000 --- a/src/megatron/bridge/recipes/llama/llama3_70b_64k.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.llama import Llama3ModelProvider70B -from megatron.bridge.recipes.llama import llama3_70b -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.config import ConfigContainer -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -# 64k sequence length constant -SEQUENCE_LENGTH_64K = 65536 - - -def model_config( - tensor_parallelism: int = 8, - pipeline_parallelism: int = 4, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 8, - sequence_parallelism: bool = True, -) -> Llama3ModelProvider70B: - """ - Configure the Llama3 70B model for 64k sequence length training. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. Default optimized for 70B with 64k sequences. - pipeline_parallelism (int): Degree of pipeline model parallelism. Default optimized for 70B with 64k sequences. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. Default optimized for 70B with 64k sequences. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. Default optimized for 70B with 64k sequences. - context_parallelism (int): Degree of context parallelism. Default optimized for 70B with 64k sequences. - sequence_parallelism (bool): Whether to use sequence parallelism. Default optimized for 70B with 64k sequences. 
- - Returns: - Llama3ModelProvider70B: Configuration for the Llama3 70B model optimized for 64k sequences. - """ - # Get base model config and override specific parameters for 64k sequences - model_cfg = llama3_70b.model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - - # Override sequence length to 64k to match dataset config - model_cfg.seq_length = SEQUENCE_LENGTH_64K - - return model_cfg - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - defaults optimized for 70B with 64k sequences - tensor_parallelism: int = 8, - pipeline_parallelism: int = 4, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 8, - sequence_parallelism: bool = True, - # Training hyperparameters - train_iters: int = 1_168_251, - global_batch_size: int = 512, - micro_batch_size: int = 1, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 2000, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", -) -> ConfigContainer: - """ - Create a pre-training configuration for Llama3 70B model with 64k sequence length. - - This function inherits from llama3_70b.pretrain_config() and overrides specific parameters - optimized for 64k sequence length training. 
- - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. Default optimized for 70B with 64k sequences. - pipeline_parallelism (int): Degree of pipeline model parallelism. Default optimized for 70B with 64k sequences. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. Default optimized for 70B with 64k sequences. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. Default optimized for 70B with 64k sequences. - context_parallelism (int): Degree of context parallelism. Default optimized for 70B with 64k sequences. - sequence_parallelism (bool): Whether to use sequence parallelism. Default optimized for 70B with 64k sequences. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int) Number of warmup iterations for the learning rate. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision recipe for the model. - - Returns: - ConfigContainer: Configuration for pre-training. - - Note: - Sequence length is set to SEQUENCE_LENGTH_64K (65536) for extended sequence training. 
- Default parallelism settings are optimized for 70B model with 64k sequences efficiently. - """ - # Get base configuration from llama3_70b with 64k sequence length - cfg = llama3_70b.pretrain_config( - dir=dir, - name=name, - data_paths=data_paths, - data_args_path=data_args_path, - train_data_path=train_data_path, - valid_data_path=valid_data_path, - test_data_path=test_data_path, - per_split_data_args_path=per_split_data_args_path, - mock=mock, - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - train_iters=train_iters, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - seq_length=SEQUENCE_LENGTH_64K, # Override to 64k sequence length - lr=lr, - min_lr=min_lr, - lr_warmup_iters=lr_warmup_iters, - precision_config=precision_config, - vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE, - ) - - # Override the model configuration to use 64k sequence length - cfg.model = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/llama/llama3_8b.py b/src/megatron/bridge/recipes/llama/llama3_8b.py deleted file mode 100644 index 01f3eb706e..0000000000 --- a/src/megatron/bridge/recipes/llama/llama3_8b.py +++ /dev/null @@ -1,229 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.llama import Llama3ModelProvider8B -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig, bf16_mixed - - -def model_config( - tensor_parallelism: int = 1, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 2, - sequence_parallelism: bool = False, -) -> Llama3ModelProvider8B: - """ - Configure the Llama3 8B model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Llama3ModelProvider8B: Configuration for the Llama3 8B model. 
- """ - return Llama3ModelProvider8B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 1, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 2, - sequence_parallelism: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 1_168_251, - global_batch_size: int = 512, - micro_batch_size: int = 1, - seq_length: int = 8192, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 2000, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = None, - comm_overlap_config: Optional[CommOverlapConfig] = None, - vocab_size: int = 128256, -) -> ConfigContainer: - """ - Create a pre-training configuration for Llama3 8B model. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. 
- valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - adam_beta1=0.9, - adam_beta2=0.95, - adam_eps=1e-5, - weight_decay=0.1, - max_lr=lr, - min_lr=min_lr, - ) - - if precision_config is None: - precision_config = bf16_mixed() - if isinstance(precision_config, MixedPrecisionConfig): - precision_config.grad_reduce_in_fp32 = False - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=2000, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - 
sequence_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - num_workers=8, - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=vocab_size), - checkpoint=CheckpointConfig( - save_interval=2000, - save=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - if cfg.comm_overlap is None: - cfg.comm_overlap = CommOverlapConfig( - tp_comm_overlap=False, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/llama/llama3_8b_128k.py b/src/megatron/bridge/recipes/llama/llama3_8b_128k.py deleted file mode 100644 index 406dce89be..0000000000 --- a/src/megatron/bridge/recipes/llama/llama3_8b_128k.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.llama import Llama3ModelProvider8B -from megatron.bridge.recipes.llama import llama3_8b -from megatron.bridge.training.config import ConfigContainer -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -SEQUENCE_LENGTH_128K: int = 131072 - - -def model_config( - tensor_parallelism: int = 4, - pipeline_parallelism: int = 2, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 8, - sequence_parallelism: bool = True, -) -> Llama3ModelProvider8B: - """ - Configure the Llama3 8B model for 128k sequence length training. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. Default optimized for 128k sequences. - pipeline_parallelism (int): Degree of pipeline model parallelism. Default optimized for 128k sequences. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. Default optimized for 128k sequences. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. Default optimized for 128k sequences. - sequence_parallelism (bool): Whether to use sequence parallelism. Default optimized for 128k sequences. - - Returns: - Llama3ModelProvider8B: Configuration for the Llama3 8B model optimized for 128k sequences. 
- """ - # Get base model config and override sequence length to 128k - model_cfg = llama3_8b.model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - - model_cfg.seq_length = SEQUENCE_LENGTH_128K - - return model_cfg - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - defaults optimized for 128k sequences - tensor_parallelism: int = 4, - pipeline_parallelism: int = 2, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 8, - sequence_parallelism: bool = True, - # Training hyperparameters - train_iters: int = 1_168_251, - global_batch_size: int = 512, - micro_batch_size: int = 1, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 2000, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", -) -> ConfigContainer: - """ - Create a pre-training configuration for Llama3 8B model with 128k sequence length. - - This function inherits from llama3_8b.pretrain_config() and overrides specific parameters - optimized for 128k sequence length training. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. 
- data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. Default optimized for 128k sequences. - pipeline_parallelism (int): Degree of pipeline model parallelism. Default optimized for 128k sequences. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. Default optimized for 128k sequences. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. Default optimized for 128k sequences. - sequence_parallelism (bool): Whether to use sequence parallelism. Default optimized for 128k sequences. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int) Number of warmup iterations for the learning rate. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision recipe for the model. - - Returns: - ConfigContainer: Configuration for pre-training. - - Note: - Sequence length is set to SEQUENCE_LENGTH_128K (131072) for long sequence training. - Default parallelism settings are optimized for handling 128k sequences efficiently. 
- """ - # Get base configuration from llama3_8b with 128k sequence length - config = llama3_8b.pretrain_config( - dir=dir, - name=name, - data_paths=data_paths, - data_args_path=data_args_path, - train_data_path=train_data_path, - valid_data_path=valid_data_path, - test_data_path=test_data_path, - per_split_data_args_path=per_split_data_args_path, - mock=mock, - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - train_iters=train_iters, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - seq_length=SEQUENCE_LENGTH_128K, # Override to 128k sequence length - lr=lr, - min_lr=min_lr, - lr_warmup_iters=lr_warmup_iters, - precision_config=precision_config, - ) - - # Override the model configuration to use 128k sequence length - config.model = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - - return config diff --git a/src/megatron/bridge/recipes/llama/llama3_8b_16k.py b/src/megatron/bridge/recipes/llama/llama3_8b_16k.py deleted file mode 100644 index a78217d032..0000000000 --- a/src/megatron/bridge/recipes/llama/llama3_8b_16k.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.llama import Llama3ModelProvider8B -from megatron.bridge.recipes.llama import llama3_8b -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.config import ConfigContainer -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -SEQ_LENGTH: int = 16384 - - -def model_config( - tensor_parallelism: int = 4, - pipeline_parallelism: int = 2, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 2, - sequence_parallelism: bool = True, -) -> Llama3ModelProvider8B: - """ - Configure the Llama3 8B model with 16k sequence length optimizations. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Llama3ModelProvider8B: Configuration for the Llama3 8B model with 16k optimizations. 
- """ - cfg = Llama3ModelProvider8B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) - cfg.seq_length = SEQ_LENGTH - return cfg - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 4, - pipeline_parallelism: int = 2, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 2, - sequence_parallelism: bool = True, - # Training hyperparameters - train_iters: int = 1_168_251, - global_batch_size: int = 512, - micro_batch_size: int = 1, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 2000, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", -) -> ConfigContainer: - """ - Create a pre-training configuration for Llama3 8B model with 16k sequence length. - - This function extends the base llama3_8b configuration with optimizations for 16k sequences. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. 
- valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int) Number of warmup iterations for the learning rate. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training with 16k sequence length. 
- """ - # Start with base llama3_8b configuration - cfg = llama3_8b.pretrain_config( - dir=dir, - name=name, - data_paths=data_paths, - data_args_path=data_args_path, - train_data_path=train_data_path, - valid_data_path=valid_data_path, - test_data_path=test_data_path, - per_split_data_args_path=per_split_data_args_path, - mock=mock, - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - train_iters=train_iters, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - seq_length=SEQ_LENGTH, - lr=lr, - min_lr=min_lr, - lr_warmup_iters=lr_warmup_iters, - precision_config=precision_config, - vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE, - ) - - # Override model configuration with 16k-optimized defaults - cfg.model = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - - # Ensure dataset sequence length is set to 16k - cfg.dataset.sequence_length = SEQ_LENGTH - - return cfg diff --git a/src/megatron/bridge/recipes/llama/llama3_8b_64k.py b/src/megatron/bridge/recipes/llama/llama3_8b_64k.py deleted file mode 100644 index f47f9ef644..0000000000 --- a/src/megatron/bridge/recipes/llama/llama3_8b_64k.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.llama import Llama3ModelProvider8B -from megatron.bridge.recipes.llama import llama3_8b -from megatron.bridge.training.config import ConfigContainer -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -SEQUENCE_LENGTH_64K: int = 65536 - - -def model_config( - tensor_parallelism: int = 4, - pipeline_parallelism: int = 2, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 4, - sequence_parallelism: bool = True, -) -> Llama3ModelProvider8B: - """ - Configure the Llama3 8B model for 64k sequence length training. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. Default optimized for 64k sequences. - pipeline_parallelism (int): Degree of pipeline model parallelism. Default optimized for 64k sequences. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. Default optimized for 64k sequences. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. Default optimized for 64k sequences. - sequence_parallelism (bool): Whether to use sequence parallelism. Default optimized for 64k sequences. - - Returns: - Llama3ModelProvider8B: Configuration for the Llama3 8B model optimized for 64k sequences. 
- """ - # Get base model config and override sequence length to 64k - model_cfg = llama3_8b.model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - - model_cfg.seq_length = SEQUENCE_LENGTH_64K - - return model_cfg - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - defaults optimized for 64k sequences - tensor_parallelism: int = 4, - pipeline_parallelism: int = 2, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 4, - sequence_parallelism: bool = True, - # Training hyperparameters - train_iters: int = 1_168_251, - global_batch_size: int = 512, - micro_batch_size: int = 1, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 2000, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", -) -> ConfigContainer: - """ - Create a pre-training configuration for Llama3 8B model with 64k sequence length. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. 
- valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. Default optimized for 64k sequences. - pipeline_parallelism (int): Degree of pipeline model parallelism. Default optimized for 64k sequences. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. Default optimized for 64k sequences. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. Default optimized for 64k sequences. - sequence_parallelism (bool): Whether to use sequence parallelism. Default optimized for 64k sequences. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int) Number of warmup iterations for the learning rate. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision recipe for the model. - - Returns: - ConfigContainer: Configuration for pre-training. - - Note: - Sequence length is hardcoded to 65536 (64k) for long sequence training. - Default parallelism settings are optimized for handling 64k sequences efficiently. 
- """ - # Get base configuration from llama3_8b with 64k sequence length - cfg = llama3_8b.pretrain_config( - dir=dir, - name=name, - data_paths=data_paths, - data_args_path=data_args_path, - train_data_path=train_data_path, - valid_data_path=valid_data_path, - test_data_path=test_data_path, - per_split_data_args_path=per_split_data_args_path, - mock=mock, - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - train_iters=train_iters, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - seq_length=SEQUENCE_LENGTH_64K, - lr=lr, - min_lr=min_lr, - lr_warmup_iters=lr_warmup_iters, - precision_config=precision_config, - ) - - # Override the model configuration to use 64k sequence length - cfg.model = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/llama/llama4_e128.py b/src/megatron/bridge/recipes/llama/llama4_e128.py deleted file mode 100644 index 6d5ade36ec..0000000000 --- a/src/megatron/bridge/recipes/llama/llama4_e128.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.llama import Llama4Experts128ModelProvider -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def model_config( - tensor_parallelism: int = 4, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = True, - expert_tensor_parallelism: int = 4, - expert_model_parallelism: int = 128, -) -> Llama4Experts128ModelProvider: - """ - Configure the Llama4 128-Experts (Maverick) model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. 
- sequence_parallelism (bool): Whether to use sequence parallelism. - expert_tensor_parallelism (int): Degree of expert tensor parallelism. - expert_model_parallelism (int): Degree of expert model parallelism. - - Returns: - Llama4Experts128ModelProvider: Configuration for the Llama4 128-Experts (Maverick) model. - """ - return Llama4Experts128ModelProvider( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - expert_tensor_parallel_size=expert_tensor_parallelism, - expert_model_parallel_size=expert_model_parallelism, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 4, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = True, - expert_tensor_parallelism: int = 4, - expert_model_parallelism: int = 128, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 1_168_251, - global_batch_size: int = 512, - micro_batch_size: int = 1, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 2000, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", -) -> ConfigContainer: - """ - Create a pre-training configuration for Llama4 128-Experts 
(Maverick) model. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - expert_tensor_parallelism (int): Degree of expert tensor parallelism. - expert_model_parallelism (int): Degree of expert model parallelism. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. - - Note: - Sequence length is set to 8192 for Llama4 128-Experts pretraining. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - expert_tensor_parallelism=expert_tensor_parallelism, - expert_model_parallelism=expert_model_parallelism, - ) - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=2000, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=8192, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - 
num_workers=8, - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - save_interval=2000, - save=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - mixed_precision=precision_config, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/llama/llama4_e16.py b/src/megatron/bridge/recipes/llama/llama4_e16.py deleted file mode 100644 index ff8ec34e9e..0000000000 --- a/src/megatron/bridge/recipes/llama/llama4_e16.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.llama import Llama4Experts16ModelProvider -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def model_config( - tensor_parallelism: int = 4, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = True, - expert_tensor_parallelism: int = 4, - expert_model_parallelism: int = 16, -) -> Llama4Experts16ModelProvider: - """ - Configure the Llama4 16-Experts (Scout) model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - expert_tensor_parallelism (int): Degree of expert tensor parallelism. - expert_model_parallelism (int): Degree of expert model parallelism. - - Returns: - Llama4Experts16ModelProvider: Configuration for the Llama4 16-Experts (Scout) model. 
- """ - return Llama4Experts16ModelProvider( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - expert_tensor_parallel_size=expert_tensor_parallelism, - expert_model_parallel_size=expert_model_parallelism, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 4, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = True, - expert_tensor_parallelism: int = 4, - expert_model_parallelism: int = 16, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 1_168_251, - global_batch_size: int = 512, - micro_batch_size: int = 1, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 2000, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", -) -> ConfigContainer: - """ - Create a pre-training configuration for Llama4 16-Experts (Scout) model. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. 
- train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - expert_tensor_parallelism (int): Degree of expert tensor parallelism. - expert_model_parallelism (int): Degree of expert model parallelism. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. - - Note: - Sequence length is set to 8192 for Llama4 16-Experts pretraining. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - expert_tensor_parallelism=expert_tensor_parallelism, - expert_model_parallelism=expert_model_parallelism, - ) - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=2000, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=8192, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - 
num_workers=8, - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - save_interval=2000, - save=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - mixed_precision=precision_config, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/qwen/__init__.py b/src/megatron/bridge/recipes/qwen/__init__.py index 341a77c5bc..86f6e3313c 100644 --- a/src/megatron/bridge/recipes/qwen/__init__.py +++ b/src/megatron/bridge/recipes/qwen/__init__.py @@ -11,3 +11,60 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +# Qwen2 models +from .qwen2 import ( + qwen2_1p5b_pretrain_config, + qwen2_7b_pretrain_config, + qwen2_72b_pretrain_config, + qwen2_500m_pretrain_config, + # Qwen2.5 models + qwen25_1p5b_pretrain_config, + qwen25_7b_pretrain_config, + qwen25_14b_pretrain_config, + qwen25_32b_pretrain_config, + qwen25_72b_pretrain_config, + qwen25_500m_pretrain_config, +) + +# Qwen3 models +from .qwen3 import ( + qwen3_1p7b_pretrain_config, + qwen3_4b_pretrain_config, + qwen3_8b_pretrain_config, + qwen3_14b_pretrain_config, + qwen3_32b_pretrain_config, + qwen3_600m_pretrain_config, +) + +# Qwen3 MoE models +from .qwen3_moe import ( + qwen3_30b_a3b_pretrain_config, + qwen3_235b_a22b_pretrain_config, +) + + +__all__ = [ + # Qwen2 models + "qwen2_500m_pretrain_config", + "qwen2_1p5b_pretrain_config", + "qwen2_7b_pretrain_config", + "qwen2_72b_pretrain_config", + # Qwen2.5 models + "qwen25_500m_pretrain_config", + "qwen25_1p5b_pretrain_config", + "qwen25_7b_pretrain_config", + "qwen25_14b_pretrain_config", + "qwen25_32b_pretrain_config", + "qwen25_72b_pretrain_config", + # Qwen3 
models + "qwen3_600m_pretrain_config", + "qwen3_1p7b_pretrain_config", + "qwen3_4b_pretrain_config", + "qwen3_8b_pretrain_config", + "qwen3_14b_pretrain_config", + "qwen3_32b_pretrain_config", + # Qwen3 MoE models + "qwen3_30b_a3b_pretrain_config", + "qwen3_235b_a22b_pretrain_config", +] diff --git a/src/megatron/bridge/recipes/qwen/qwen2.py b/src/megatron/bridge/recipes/qwen/qwen2.py new file mode 100644 index 0000000000..dcbe076ec1 --- /dev/null +++ b/src/megatron/bridge/recipes/qwen/qwen2.py @@ -0,0 +1,398 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from typing import List, Optional, Union + +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from typing_extensions import TypedDict, Unpack + +from megatron.bridge import AutoBridge +from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths +from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing +from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE +from megatron.bridge.training.comm_overlap import CommOverlapConfig +from megatron.bridge.training.config import ( + CheckpointConfig, + ConfigContainer, + GPTDatasetConfig, + LoggerConfig, + RNGConfig, + TokenizerConfig, + TrainingConfig, +) +from megatron.bridge.training.mixed_precision import MixedPrecisionConfig + + +class Qwen2CommonKwargs(TypedDict, total=False): + """Typed options accepted by Qwen2/2.5 recipe helper functions.""" + + # Core identifiers + hf_path: str + dir: Optional[str] + name: str + # Dataset configuration + data_paths: Optional[List[str]] + data_args_path: Optional[str] + train_data_path: Optional[List[str]] + valid_data_path: Optional[List[str]] + test_data_path: Optional[List[str]] + per_split_data_args_path: Optional[str] + mock: bool + # Model configuration + tensor_parallelism: int + pipeline_parallelism: int + pipeline_parallelism_dtype: Optional[torch.dtype] + virtual_pipeline_parallelism: Optional[int] + context_parallelism: int + sequence_parallelism: bool + use_megatron_fsdp: bool + check_for_nan_in_grad: bool + # Training hyperparameters + train_iters: int + global_batch_size: int + micro_batch_size: int + seq_length: int + lr: float + min_lr: float + lr_warmup_iters: int + lr_decay_iters: Optional[int] + eval_interval: int + save_interval: int + use_null_tokenizer: bool + # Precision / overlap configs + precision_config: Optional[Union[MixedPrecisionConfig, str]] + comm_overlap_config: Optional[CommOverlapConfig] + + 
+def qwen2_500m_pretrain_config(**user_kwargs: Unpack[Qwen2CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Qwen2 0.5B. + + See `_qwen2_common` for the full list of parameters. + """ + recommended_kwargs: Qwen2CommonKwargs = { + "hf_path": "Qwen/Qwen2-0.5B", + "tensor_parallelism": 1, + "pipeline_parallelism": 1, + } + # Combine defaults with user kwargs; user values take precedence. + combined_kwargs: Qwen2CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen2_common(**combined_kwargs) + + +def qwen2_1p5b_pretrain_config(**user_kwargs: Unpack[Qwen2CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Qwen2 1.5B. + + See `_qwen2_common` for the full list of parameters. + """ + recommended_kwargs: Qwen2CommonKwargs = { + "hf_path": "Qwen/Qwen2-1.5B", + "tensor_parallelism": 1, + "pipeline_parallelism": 1, + } + # Combine defaults with user kwargs; user values take precedence. + combined_kwargs: Qwen2CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen2_common(**combined_kwargs) + + +def qwen2_7b_pretrain_config(**user_kwargs: Unpack[Qwen2CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Qwen2 7B. + + See `_qwen2_common` for the full list of parameters. + """ + recommended_kwargs: Qwen2CommonKwargs = { + "hf_path": "Qwen/Qwen2-7B", + "tensor_parallelism": 2, + "pipeline_parallelism": 1, + "use_megatron_fsdp": False, + } + # Combine defaults with user kwargs; user values take precedence. + combined_kwargs: Qwen2CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen2_common(**combined_kwargs) + + +def qwen2_72b_pretrain_config(**user_kwargs: Unpack[Qwen2CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Qwen2 72B. + + See `_qwen2_common` for the full list of parameters. 
+ """ + recommended_kwargs: Qwen2CommonKwargs = { + "hf_path": "Qwen/Qwen2-72B", + "tensor_parallelism": 8, + "pipeline_parallelism": 4, + "pipeline_parallelism_dtype": torch.bfloat16, + "use_megatron_fsdp": False, + } + # Combine defaults with user kwargs; user values take precedence. + combined_kwargs: Qwen2CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen2_common(**combined_kwargs) + + +def qwen25_500m_pretrain_config(**user_kwargs: Unpack[Qwen2CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Qwen2.5 0.5B. + + See `_qwen2_common` for the full list of parameters. + """ + recommended_kwargs: Qwen2CommonKwargs = { + "hf_path": "Qwen/Qwen2.5-0.5B", + "tensor_parallelism": 1, + "pipeline_parallelism": 1, + "check_for_nan_in_grad": True, + } + # Combine defaults with user kwargs; user values take precedence. + combined_kwargs: Qwen2CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen2_common(**combined_kwargs) + + +def qwen25_1p5b_pretrain_config(**user_kwargs: Unpack[Qwen2CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Qwen2.5 1.5B. + + See `_qwen2_common` for the full list of parameters. + """ + recommended_kwargs: Qwen2CommonKwargs = { + "hf_path": "Qwen/Qwen2.5-1.5B", + "tensor_parallelism": 1, + "pipeline_parallelism": 1, + "check_for_nan_in_grad": True, + } + # Combine defaults with user kwargs; user values take precedence. + combined_kwargs: Qwen2CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen2_common(**combined_kwargs) + + +def qwen25_7b_pretrain_config(**user_kwargs: Unpack[Qwen2CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Qwen2.5 7B. + + See `_qwen2_common` for the full list of parameters. + """ + recommended_kwargs: Qwen2CommonKwargs = { + "hf_path": "Qwen/Qwen2.5-7B", + "tensor_parallelism": 2, + "pipeline_parallelism": 1, + "check_for_nan_in_grad": True, + } + # Combine defaults with user kwargs; user values take precedence. 
+ combined_kwargs: Qwen2CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen2_common(**combined_kwargs) + + +def qwen25_14b_pretrain_config(**user_kwargs: Unpack[Qwen2CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Qwen2.5 14B. + + See `_qwen2_common` for the full list of parameters. + """ + recommended_kwargs: Qwen2CommonKwargs = { + "hf_path": "Qwen/Qwen2.5-14B", + "tensor_parallelism": 4, + "pipeline_parallelism": 1, + "check_for_nan_in_grad": True, + "use_megatron_fsdp": False, + } + # Combine defaults with user kwargs; user values take precedence. + combined_kwargs: Qwen2CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen2_common(**combined_kwargs) + + +def qwen25_32b_pretrain_config(**user_kwargs: Unpack[Qwen2CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Qwen2.5 32B. + + See `_qwen2_common` for the full list of parameters. + """ + recommended_kwargs: Qwen2CommonKwargs = { + "hf_path": "Qwen/Qwen2.5-32B", + "tensor_parallelism": 8, + "pipeline_parallelism": 2, + "pipeline_parallelism_dtype": torch.bfloat16, + "check_for_nan_in_grad": True, + } + # Combine defaults with user kwargs; user values take precedence. + combined_kwargs: Qwen2CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen2_common(**combined_kwargs) + + +def qwen25_72b_pretrain_config(**user_kwargs: Unpack[Qwen2CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Qwen2.5 72B. + + See `_qwen2_common` for the full list of parameters. + """ + recommended_kwargs: Qwen2CommonKwargs = { + "hf_path": "Qwen/Qwen2.5-72B", + "tensor_parallelism": 8, + "pipeline_parallelism": 4, + "pipeline_parallelism_dtype": torch.bfloat16, + "check_for_nan_in_grad": True, + } + # Combine defaults with user kwargs; user values take precedence. 
+ combined_kwargs: Qwen2CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen2_common(**combined_kwargs) + + +def _qwen2_common( + hf_path: str, + dir: Optional[str] = None, + name: str = "default", + # Dataset configuration + data_paths: Optional[List[str]] = None, + data_args_path: Optional[str] = None, + train_data_path: Optional[List[str]] = None, + valid_data_path: Optional[List[str]] = None, + test_data_path: Optional[List[str]] = None, + per_split_data_args_path: Optional[str] = None, + mock: bool = False, + # Model configuration + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_dtype: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + use_megatron_fsdp: bool = False, + check_for_nan_in_grad: bool = False, + # Training hyperparameters + train_iters: int = 300000, + global_batch_size: int = 32, + micro_batch_size: int = 2, + seq_length: int = 4096, + lr: float = 3e-4, + min_lr: float = 3e-5, + lr_warmup_iters: int = 500, + lr_decay_iters: Optional[int] = None, + eval_interval: int = 500, + save_interval: int = 500, + use_null_tokenizer: bool = True, + # Precision recipe + precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", + comm_overlap_config: Optional[CommOverlapConfig] = None, +) -> ConfigContainer: + """ + Create a pre-training configuration for Qwen2/Qwen2.5 models using a given HuggingFace path. + + Args: + hf_path (str): HuggingFace model path (e.g., "Qwen/Qwen2-1.5B", "Qwen/Qwen2.5-7B"). + dir (Optional[str]): Base directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. + data_args_path (Optional[str]): Path to file containing data arguments. + train_data_path (Optional[List[str]]): List of training data paths. 
+ valid_data_path (Optional[List[str]]): List of validation data paths. + test_data_path (Optional[List[str]]): List of test data paths. + per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. + mock (bool): Whether to use mock data. If True, ignores data_paths. + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism to be passed to model_config. + sequence_parallelism (bool): Whether to use sequence parallelism. + use_megatron_fsdp (bool): Whether to use Megatron FSDP. + check_for_nan_in_grad (bool): Whether to check for NaN in gradients. + train_iters (int): Total number of training iterations. + global_batch_size (int): Global batch size for training. + micro_batch_size (int): Micro batch size for training. + seq_length (int): Sequence length for training data. + lr (float): Learning rate. + min_lr (float): Minimum learning rate for cosine decay. + lr_warmup_iters (int): Number of warmup iterations for the learning rate. + lr_decay_iters (Optional[int]): Number of iterations over which to decay the LR. + precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. + comm_overlap_config (Optional[CommOverlapConfig]): Communication overlap configuration. + + Returns: + ConfigContainer: Configuration for pre-training. 
+ """ + base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") + run_output_dir = os.path.join(base_output_dir, name) + checkpoint_dir = os.path.join(run_output_dir, "checkpoints") + tensorboard_dir = os.path.join(run_output_dir, "tb_logs") + + blend, blend_per_split, split = get_blend_fields_from_data_paths( + data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock + ) + + bridge = AutoBridge.from_hf_pretrained(hf_path) + model_cfg = bridge.to_megatron_provider(load_weights=False) + model_cfg.tensor_model_parallel_size = tensor_parallelism + model_cfg.pipeline_model_parallel_size = pipeline_parallelism + model_cfg.pipeline_dtype = pipeline_parallelism_dtype + model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_parallelism + model_cfg.context_parallel_size = context_parallelism + model_cfg.sequence_parallel = sequence_parallelism + model_cfg.seq_length = seq_length + + opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=lr_warmup_iters, + lr_decay_iters=lr_decay_iters, + max_lr=lr, + min_lr=min_lr, + ) + + # Config Container + cfg = ConfigContainer( + model=model_cfg, + train=TrainingConfig( + train_iters=train_iters, + eval_interval=eval_interval, + eval_iters=32, + global_batch_size=global_batch_size, + micro_batch_size=micro_batch_size, + manual_gc=True, + manual_gc_interval=100, + manual_gc_eval=100, + ), + optimizer=opt_config, + scheduler=scheduler, + ddp=DistributedDataParallelConfig( + check_for_nan_in_grad=check_for_nan_in_grad, + use_distributed_optimizer=True, + use_megatron_fsdp=use_megatron_fsdp, + ), + dataset=GPTDatasetConfig( + random_seed=1234, + reset_attention_mask=False, + reset_position_ids=False, + eod_mask_loss=False, + sequence_length=seq_length, + num_dataset_builder_threads=1, + blend=blend, + blend_per_split=blend_per_split, + split=split, + # Dataloader config parameters + data_sharding=True, + 
dataloader_type="single", + skip_getting_attention_mask_from_dataset=True, + ), + logger=LoggerConfig( + log_interval=10, + tensorboard_dir=tensorboard_dir, + log_timers_to_tensorboard=True, + ), + tokenizer=TokenizerConfig( + tokenizer_type="NullTokenizer" if use_null_tokenizer else "HuggingFaceTokenizer", + tokenizer_model=hf_path if not use_null_tokenizer else None, + vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE if use_null_tokenizer else None, + ), + checkpoint=CheckpointConfig( + save_interval=save_interval, + save=checkpoint_dir, + load=checkpoint_dir, + ckpt_format="torch_dist", + fully_parallel_save=True, + ), + rng=RNGConfig(seed=1234), + comm_overlap=comm_overlap_config, + mixed_precision=precision_config, + ) + + return cfg diff --git a/src/megatron/bridge/recipes/qwen/qwen25_14b.py b/src/megatron/bridge/recipes/qwen/qwen25_14b.py deleted file mode 100644 index 189aba0e88..0000000000 --- a/src/megatron/bridge/recipes/qwen/qwen25_14b.py +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.qwen import Qwen25ModelProvider14B -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def model_config( - tensor_parallelism: int = 4, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, -) -> Qwen25ModelProvider14B: - """ - Configure the Qwen2.5 14B model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Qwen25ModelProvider14B: Configuration for the Qwen2.5 14B model. 
- """ - return Qwen25ModelProvider14B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 4, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Qwen2.5 14B model. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. 
- valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=500, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - 
tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - save_interval=500, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/qwen/qwen25_1p5b.py b/src/megatron/bridge/recipes/qwen/qwen25_1p5b.py deleted file mode 100644 index 0b6f499104..0000000000 --- a/src/megatron/bridge/recipes/qwen/qwen25_1p5b.py +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.qwen import Qwen25ModelProvider1P5B -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def model_config( - tensor_parallelism: int = 1, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, -) -> Qwen25ModelProvider1P5B: - """ - Configure the Qwen2.5 1.5B model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Qwen25ModelProvider1P5B: Configuration for the Qwen2.5 1.5B model. 
- """ - return Qwen25ModelProvider1P5B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 1, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Qwen2.5 1.5B model. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. 
- valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=500, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - 
tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - save_interval=500, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/qwen/qwen25_32b.py b/src/megatron/bridge/recipes/qwen/qwen25_32b.py deleted file mode 100644 index 30deb5a79a..0000000000 --- a/src/megatron/bridge/recipes/qwen/qwen25_32b.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.qwen import Qwen25ModelProvider32B -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def model_config( - tensor_parallelism: int = 8, - pipeline_parallelism: int = 2, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, -) -> Qwen25ModelProvider32B: - """ - Configure the Qwen2.5 32B model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Qwen25ModelProvider32B: Configuration for the Qwen2.5 32B model. 
- """ - return Qwen25ModelProvider32B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 8, - pipeline_parallelism: int = 2, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Qwen2.5 32B model. The default configuration is for 2 nodes with 8 GPUs per node. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. 
- train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - use_megatron_fsdp (bool): Whether to use Megatron FSDP. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=500, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - 
tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - save_interval=500, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/qwen/qwen25_500m.py b/src/megatron/bridge/recipes/qwen/qwen25_500m.py deleted file mode 100644 index 0a923b1f87..0000000000 --- a/src/megatron/bridge/recipes/qwen/qwen25_500m.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.qwen import Qwen25ModelProvider500M -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def model_config( - tensor_parallelism: int = 1, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, -) -> Qwen25ModelProvider500M: - """ - Configure the Qwen2.5 500M model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Qwen25ModelProvider500M: Configuration for the Qwen2.5 500M model. 
- """ - return Qwen25ModelProvider500M( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 1, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Qwen2.5 500M model. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. 
- valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - use_megatron_fsdp (bool): Whether to use Megatron FSDP. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=500, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - 
tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - save_interval=500, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/qwen/qwen25_72b.py b/src/megatron/bridge/recipes/qwen/qwen25_72b.py deleted file mode 100644 index 077d9e2547..0000000000 --- a/src/megatron/bridge/recipes/qwen/qwen25_72b.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.qwen import Qwen25ModelProvider72B -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def model_config( - tensor_parallelism: int = 8, - pipeline_parallelism: int = 4, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, -) -> Qwen25ModelProvider72B: - """ - Configure the Qwen2.5 72B model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Qwen25ModelProvider72B: Configuration for the Qwen2.5 72B model. 
- """ - return Qwen25ModelProvider72B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 8, - pipeline_parallelism: int = 4, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Qwen2.5 72B model. The default configuration is for 4 nodes with 8 GPUs per node. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. 
- valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=500, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig(check_for_nan_in_grad=True, use_distributed_optimizer=True), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - 
tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - save_interval=500, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/qwen/qwen25_7b.py b/src/megatron/bridge/recipes/qwen/qwen25_7b.py deleted file mode 100644 index fbab8a0148..0000000000 --- a/src/megatron/bridge/recipes/qwen/qwen25_7b.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.qwen import Qwen25ModelProvider7B -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def model_config( - tensor_parallelism: int = 2, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, -) -> Qwen25ModelProvider7B: - """ - Configure the Qwen2.5 7B model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Qwen25ModelProvider7B: Configuration for the Qwen2.5 7B model. 
- """ - return Qwen25ModelProvider7B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 2, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Qwen2.5 7B model. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. 
- test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=500, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig(check_for_nan_in_grad=True, use_distributed_optimizer=True), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - 
tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - save_interval=500, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/qwen/qwen2_1p5b.py b/src/megatron/bridge/recipes/qwen/qwen2_1p5b.py deleted file mode 100644 index a74416e8cf..0000000000 --- a/src/megatron/bridge/recipes/qwen/qwen2_1p5b.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.qwen import Qwen2ModelProvider1P5B -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def model_config( - tensor_parallelism: int = 1, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, -) -> Qwen2ModelProvider1P5B: - """ - Configure the Qwen2 1.5B model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Qwen2ModelProvider1P5B: Configuration for the Qwen2 1.5B model. 
- """ - return Qwen2ModelProvider1P5B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 1, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Qwen2 1.5B model. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. 
- test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=500, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig(use_distributed_optimizer=True), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", 
vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - save_interval=500, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/qwen/qwen2_500m.py b/src/megatron/bridge/recipes/qwen/qwen2_500m.py deleted file mode 100644 index 7ee1953144..0000000000 --- a/src/megatron/bridge/recipes/qwen/qwen2_500m.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.qwen import Qwen2ModelProvider500M -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def model_config( - tensor_parallelism: int = 1, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, -) -> Qwen2ModelProvider500M: - """ - Configure the Qwen2 500M model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Qwen2ModelProvider500M: Configuration for the Qwen2 500M model. 
- """ - return Qwen2ModelProvider500M( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 1, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Qwen2 500M model. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. 
- test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=500, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig(use_distributed_optimizer=True), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", 
vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - save_interval=500, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/qwen/qwen2_72b.py b/src/megatron/bridge/recipes/qwen/qwen2_72b.py deleted file mode 100644 index 5ecb666364..0000000000 --- a/src/megatron/bridge/recipes/qwen/qwen2_72b.py +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.qwen import Qwen2ModelProvider72B -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def model_config( - tensor_parallelism: int = 8, - pipeline_parallelism: int = 4, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, -) -> Qwen2ModelProvider72B: - """ - Configure the Qwen2 72B model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Qwen2ModelProvider72B: Configuration for the Qwen2 72B model. 
- """ - return Qwen2ModelProvider72B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 8, - pipeline_parallelism: int = 4, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Qwen2 72B model. The default configuration is for 4 nodes with 8 GPUs per node. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. 
- valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - use_megatron_fsdp (bool): Whether to use Megatron FSDP. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=500, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - 
log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - save_interval=500, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/qwen/qwen2_7b.py b/src/megatron/bridge/recipes/qwen/qwen2_7b.py deleted file mode 100644 index a7f01b906f..0000000000 --- a/src/megatron/bridge/recipes/qwen/qwen2_7b.py +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.qwen import Qwen2ModelProvider7B -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def model_config( - tensor_parallelism: int = 2, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, -) -> Qwen2ModelProvider7B: - """ - Configure the Qwen2 7B model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Qwen2ModelProvider7B: Configuration for the Qwen2 7B model. 
- """ - return Qwen2ModelProvider7B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 2, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Qwen2 7B model. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. 
- test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - use_megatron_fsdp (bool): Whether to use Megatron FSDP. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=500, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - 
log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - save_interval=500, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/qwen/qwen3.py b/src/megatron/bridge/recipes/qwen/qwen3.py new file mode 100644 index 0000000000..c83da4c816 --- /dev/null +++ b/src/megatron/bridge/recipes/qwen/qwen3.py @@ -0,0 +1,340 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from typing import List, Optional, Union + +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from typing_extensions import TypedDict, Unpack + +from megatron.bridge import AutoBridge +from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths +from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing +from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE +from megatron.bridge.training.comm_overlap import CommOverlapConfig +from megatron.bridge.training.config import ( + CheckpointConfig, + ConfigContainer, + GPTDatasetConfig, + LoggerConfig, + RNGConfig, + TokenizerConfig, + TrainingConfig, +) +from megatron.bridge.training.mixed_precision import MixedPrecisionConfig + + +class Qwen3CommonKwargs(TypedDict, total=False): + """Typed options accepted by Qwen3 recipe helper functions.""" + + # Core identifiers + hf_path: str + dir: Optional[str] + name: str + # Dataset configuration + data_paths: Optional[List[str]] + data_args_path: Optional[str] + train_data_path: Optional[List[str]] + valid_data_path: Optional[List[str]] + test_data_path: Optional[List[str]] + per_split_data_args_path: Optional[str] + mock: bool + # Model configuration + tensor_parallelism: int + pipeline_parallelism: int + pipeline_parallelism_dtype: Optional[torch.dtype] + virtual_pipeline_parallelism: Optional[int] + context_parallelism: int + sequence_parallelism: bool + use_megatron_fsdp: bool + use_null_tokenizer: bool + enable_recompute: bool + # Training hyperparameters + train_iters: int + global_batch_size: int + micro_batch_size: int + seq_length: int + lr: float + min_lr: float + lr_warmup_iters: int + lr_decay_iters: Optional[int] + eval_interval: int + save_interval: int + # Precision / overlap configs + precision_config: Optional[Union[MixedPrecisionConfig, str]] + comm_overlap_config: Optional[CommOverlapConfig] + + +def 
qwen3_600m_pretrain_config(**user_kwargs: Unpack[Qwen3CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Qwen3 0.6B. + + See `_qwen3_common` for the full list of parameters. + """ + recommended_kwargs: Qwen3CommonKwargs = { + "hf_path": "Qwen/Qwen3-0.6B", + "tensor_parallelism": 1, + "pipeline_parallelism": 1, + } + # Combine defaults with user kwargs; user values take precedence. + combined_kwargs: Qwen3CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen3_common(**combined_kwargs) + + +def qwen3_1p7b_pretrain_config(**user_kwargs: Unpack[Qwen3CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Qwen3 1.7B. + + See `_qwen3_common` for the full list of parameters. + """ + recommended_kwargs: Qwen3CommonKwargs = { + "hf_path": "Qwen/Qwen3-1.7B", + "tensor_parallelism": 1, + "pipeline_parallelism": 1, + } + # Combine defaults with user kwargs; user values take precedence. + combined_kwargs: Qwen3CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen3_common(**combined_kwargs) + + +def qwen3_4b_pretrain_config(**user_kwargs: Unpack[Qwen3CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Qwen3 4B. + + See `_qwen3_common` for the full list of parameters. + """ + recommended_kwargs: Qwen3CommonKwargs = { + "hf_path": "Qwen/Qwen3-4B", + "tensor_parallelism": 2, + "pipeline_parallelism": 1, + } + # Combine defaults with user kwargs; user values take precedence. + combined_kwargs: Qwen3CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen3_common(**combined_kwargs) + + +def qwen3_8b_pretrain_config(**user_kwargs: Unpack[Qwen3CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Qwen3 8B. + + See `_qwen3_common` for the full list of parameters. + """ + recommended_kwargs: Qwen3CommonKwargs = { + "hf_path": "Qwen/Qwen3-8B", + "tensor_parallelism": 4, + "pipeline_parallelism": 1, + } + # Combine defaults with user kwargs; user values take precedence. 
+ combined_kwargs: Qwen3CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen3_common(**combined_kwargs) + + +def qwen3_14b_pretrain_config(**user_kwargs: Unpack[Qwen3CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Qwen3 14B. + + See `_qwen3_common` for the full list of parameters. + """ + recommended_kwargs: Qwen3CommonKwargs = { + "hf_path": "Qwen/Qwen3-14B", + "tensor_parallelism": 8, + "pipeline_parallelism": 1, + } + # Combine defaults with user kwargs; user values take precedence. + combined_kwargs: Qwen3CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen3_common(**combined_kwargs) + + +def qwen3_32b_pretrain_config(**user_kwargs: Unpack[Qwen3CommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Qwen3 32B. + + See `_qwen3_common` for the full list of parameters. + """ + recommended_kwargs: Qwen3CommonKwargs = { + "hf_path": "Qwen/Qwen3-32B", + "tensor_parallelism": 8, + "pipeline_parallelism": 2, + "pipeline_parallelism_dtype": torch.bfloat16, + "enable_recompute": True, + } + # Combine defaults with user kwargs; user values take precedence. 
+ combined_kwargs: Qwen3CommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen3_common(**combined_kwargs) + + +def _qwen3_common( + hf_path: str, + dir: Optional[str] = None, + name: str = "default", + # Dataset configuration + data_paths: Optional[List[str]] = None, + data_args_path: Optional[str] = None, + train_data_path: Optional[List[str]] = None, + valid_data_path: Optional[List[str]] = None, + test_data_path: Optional[List[str]] = None, + per_split_data_args_path: Optional[str] = None, + mock: bool = False, + # Model configuration + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_dtype: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + use_megatron_fsdp: bool = False, + use_null_tokenizer: bool = False, + enable_recompute: bool = False, + # Training hyperparameters + train_iters: int = 300000, + global_batch_size: int = 32, + micro_batch_size: int = 2, + seq_length: int = 4096, + lr: float = 3e-4, + min_lr: float = 3e-5, + lr_warmup_iters: int = 500, + lr_decay_iters: Optional[int] = None, + eval_interval: int = 500, + save_interval: int = 500, + # Precision recipe + precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", + comm_overlap_config: Optional[CommOverlapConfig] = None, +) -> ConfigContainer: + """ + Create a pre-training configuration for Qwen3 models using a given HuggingFace path. + + Args: + hf_path (str): HuggingFace model path (e.g., "Qwen/Qwen3-1.7B"). + dir (Optional[str]): Base directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. + data_args_path (Optional[str]): Path to file containing data arguments. + train_data_path (Optional[List[str]]): List of training data paths. 
+ valid_data_path (Optional[List[str]]): List of validation data paths. + test_data_path (Optional[List[str]]): List of test data paths. + per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. + mock (bool): Whether to use mock data. If True, ignores data_paths. + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism to be passed to model_config. + sequence_parallelism (bool): Whether to use sequence parallelism. + use_megatron_fsdp (bool): Whether to use Megatron FSDP. + use_null_tokenizer (bool): Whether to use NullTokenizer instead of HuggingFaceTokenizer. + enable_recompute (bool): Whether to enable recompute for memory optimization. + train_iters (int): Total number of training iterations. + global_batch_size (int): Global batch size for training. + micro_batch_size (int): Micro batch size for training. + seq_length (int): Sequence length for training data. + lr (float): Learning rate. + min_lr (float): Minimum learning rate for cosine decay. + lr_warmup_iters (int): Number of warmup iterations for the learning rate. + lr_decay_iters (Optional[int]): Number of iterations over which to decay the LR. + precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. + comm_overlap_config (Optional[CommOverlapConfig]): Communication overlap configuration. + + Returns: + ConfigContainer: Configuration for pre-training. 
+ """ + base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") + run_output_dir = os.path.join(base_output_dir, name) + checkpoint_dir = os.path.join(run_output_dir, "checkpoints") + tensorboard_dir = os.path.join(run_output_dir, "tb_logs") + + blend, blend_per_split, split = get_blend_fields_from_data_paths( + data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock + ) + + bridge = AutoBridge.from_hf_pretrained(hf_path) + model_cfg = bridge.to_megatron_provider(load_weights=False) + model_cfg.tensor_model_parallel_size = tensor_parallelism + model_cfg.pipeline_model_parallel_size = pipeline_parallelism + model_cfg.pipeline_dtype = pipeline_parallelism_dtype + model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_parallelism + model_cfg.context_parallel_size = context_parallelism + model_cfg.sequence_parallel = sequence_parallelism + model_cfg.seq_length = seq_length + + # Add recompute settings for memory optimization (used by larger models like 32B) + if enable_recompute: + model_cfg.recompute_granularity = "full" + model_cfg.recompute_method = "uniform" + model_cfg.recompute_num_layers = 1 + + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=lr_warmup_iters, + lr_decay_iters=lr_decay_iters, + max_lr=lr, + min_lr=min_lr, + ) + + # Config Container + cfg_container = ConfigContainer( + model=model_cfg, + train=TrainingConfig( + train_iters=train_iters, + eval_interval=eval_interval, + eval_iters=32, + global_batch_size=global_batch_size, + micro_batch_size=micro_batch_size, + manual_gc=True, + manual_gc_interval=100, + manual_gc_eval=100, + ), + optimizer=opt_cfg, + scheduler=scheduler_cfg, + ddp=DistributedDataParallelConfig( + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, # Not supported for custom FSDP for now, need to be set 
to False if using FSDP + data_parallel_sharding_strategy="optim_grads_params", # For custom FSDP only + use_distributed_optimizer=True, + use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True + ), + dataset=GPTDatasetConfig( + random_seed=1234, + reset_attention_mask=False, + reset_position_ids=False, + eod_mask_loss=False, + sequence_length=seq_length, + num_dataset_builder_threads=1, + blend=blend, + blend_per_split=blend_per_split, + split=split, + # Dataloader config parameters + data_sharding=True, + dataloader_type="single", + skip_getting_attention_mask_from_dataset=True, + ), + logger=LoggerConfig( + log_interval=10, + tensorboard_dir=tensorboard_dir, + log_timers_to_tensorboard=True, + ), + tokenizer=TokenizerConfig( + tokenizer_type="NullTokenizer" if use_null_tokenizer else "HuggingFaceTokenizer", + tokenizer_model=hf_path if not use_null_tokenizer else None, + vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE if use_null_tokenizer else None, + ), + checkpoint=CheckpointConfig( + save_interval=save_interval, + save=checkpoint_dir, + load=checkpoint_dir, + ckpt_format="torch_dist", + fully_parallel_save=True, + ), + rng=RNGConfig(seed=1234), + comm_overlap=comm_overlap_config, + mixed_precision=precision_config, + ) + + return cfg_container diff --git a/src/megatron/bridge/recipes/qwen/qwen3_14b.py b/src/megatron/bridge/recipes/qwen/qwen3_14b.py deleted file mode 100644 index 2c6ac3ef74..0000000000 --- a/src/megatron/bridge/recipes/qwen/qwen3_14b.py +++ /dev/null @@ -1,218 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.qwen import Qwen3ModelProvider14B -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def model_config( - tensor_parallelism: int = 8, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, -) -> Qwen3ModelProvider14B: - """ - Configure the Qwen3 14B model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Qwen3ModelProvider14B: Configuration for the Qwen3 14B model. 
- """ - return Qwen3ModelProvider14B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 8, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Qwen3 14B model. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. 
- test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - use_megatron_fsdp (bool): Whether to use Megatron FSDP. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=500, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, # Not supported for custom FSDP for now, need to be set to False if using FSDP - data_parallel_sharding_strategy="optim_grads_params", # For custom FSDP only - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - num_dataset_builder_threads=1, 
- blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig(tokenizer_type="HuggingFaceTokenizer", tokenizer_model="Qwen/Qwen3-14B"), - checkpoint=CheckpointConfig( - save_interval=500, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/qwen/qwen3_1p7b.py b/src/megatron/bridge/recipes/qwen/qwen3_1p7b.py deleted file mode 100644 index 9eee593fb9..0000000000 --- a/src/megatron/bridge/recipes/qwen/qwen3_1p7b.py +++ /dev/null @@ -1,217 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.qwen import Qwen3ModelProvider1P7B -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def model_config( - tensor_parallelism: int = 1, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, -) -> Qwen3ModelProvider1P7B: - """ - Configure the Qwen3 1.7B model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Qwen3ModelProvider1P7B: Configuration for the Qwen3 1.7B model. 
- """ - return Qwen3ModelProvider1P7B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 1, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Qwen3 1.7B model. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. 
- valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=500, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, # Not supported for custom FSDP for now, need to be set to False if using FSDP - data_parallel_sharding_strategy="optim_grads_params", # For custom FSDP only - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - num_dataset_builder_threads=1, 
- blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig(tokenizer_type="HuggingFaceTokenizer", tokenizer_model="Qwen/Qwen3-1.7B"), - checkpoint=CheckpointConfig( - save_interval=500, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/qwen/qwen3_235b_a22b.py b/src/megatron/bridge/recipes/qwen/qwen3_235b_a22b.py deleted file mode 100644 index ef97213a84..0000000000 --- a/src/megatron/bridge/recipes/qwen/qwen3_235b_a22b.py +++ /dev/null @@ -1,234 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.qwen import Qwen3MoEModelProvider235B_A22B -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig, bf16_mixed - - -def model_config( - tensor_parallelism: int = 4, - pipeline_parallelism: int = 16, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 2, - expert_parallelism: Optional[int] = 8, - sequence_parallelism: bool = True, -) -> Qwen3MoEModelProvider235B_A22B: - """ - Configure the Qwen3 235B-A22B MoE model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - expert_parallelism (Optional[int]): Degree of expert parallelism for MoE. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Qwen3MoEModelProvider235B_A22B: Configuration for the Qwen3 235B-A22B MoE model. 
- """ - model_cfg = Qwen3MoEModelProvider235B_A22B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - expert_model_parallel_size=expert_parallelism, - expert_tensor_parallel_size=1, - sequence_parallel=sequence_parallelism, - account_for_embedding_in_pipeline_split=True, - account_for_loss_in_pipeline_split=True, - ) - - return model_cfg - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 4, - pipeline_parallelism: int = 16, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 2, - expert_parallelism: Optional[int] = 8, - sequence_parallelism: bool = True, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 1, # Reduced for very large model - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = None, - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Qwen3 235B-A22B MoE model. The default configuration is for 16 nodes with 8 GPUs per node. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. 
- name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - expert_parallelism (Optional[int]): Degree of expert parallelism for MoE. - sequence_parallelism (bool): Whether to use sequence parallelism. - use_megatron_fsdp (bool): Whether to use Megatron FSDP. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - expert_parallelism=expert_parallelism, - sequence_parallelism=sequence_parallelism, - ) - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - if precision_config is None: - precision_config = bf16_mixed() - if isinstance(precision_config, MixedPrecisionConfig): - precision_config.grad_reduce_in_fp32 = False - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=500, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, # Not supported for Megatron FSDP for now, need to be set to False if using Megatron FSDP - data_parallel_sharding_strategy="optim_grads_params", # For Megatron FSDP only - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need 
use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig(tokenizer_type="HuggingFaceTokenizer", tokenizer_model="Qwen/Qwen3-235B-A22B"), - checkpoint=CheckpointConfig( - save_interval=500, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/qwen/qwen3_32b.py b/src/megatron/bridge/recipes/qwen/qwen3_32b.py deleted file mode 100644 index 627070519f..0000000000 --- a/src/megatron/bridge/recipes/qwen/qwen3_32b.py +++ /dev/null @@ -1,225 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.qwen import Qwen3ModelProvider32B -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def model_config( - tensor_parallelism: int = 8, - pipeline_parallelism: int = 2, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, -) -> Qwen3ModelProvider32B: - """ - Configure the Qwen3 32B model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Qwen3ModelProvider32B: Configuration for the Qwen3 32B model. 
- """ - model_cfg = Qwen3ModelProvider32B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) - - # Add recompute settings for memory optimization - model_cfg.recompute_granularity = "full" - model_cfg.recompute_method = "uniform" - model_cfg.recompute_num_layers = 1 - - return model_cfg - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 8, - pipeline_parallelism: int = 2, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Qwen3 32B model. The default configuration is for 2 nodes with 8 GPUs per node. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. 
- data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - use_megatron_fsdp (bool): Whether to use Megatron FSDP. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=500, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, # Not supported for Megatron FSDP for now, need to be set to False if using Megatron FSDP - data_parallel_sharding_strategy="optim_grads_params", # For Megatron FSDP only - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - 
num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig(tokenizer_type="HuggingFaceTokenizer", tokenizer_model="Qwen/Qwen3-32B"), - checkpoint=CheckpointConfig( - save_interval=500, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/qwen/qwen3_4b.py b/src/megatron/bridge/recipes/qwen/qwen3_4b.py deleted file mode 100644 index e5efcde9fe..0000000000 --- a/src/megatron/bridge/recipes/qwen/qwen3_4b.py +++ /dev/null @@ -1,218 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.qwen import Qwen3ModelProvider4B -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def model_config( - tensor_parallelism: int = 2, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, -) -> Qwen3ModelProvider4B: - """ - Configure the Qwen3 4B model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Qwen3ModelProvider4B: Configuration for the Qwen3 4B model. 
- """ - return Qwen3ModelProvider4B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 2, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Qwen3 4B model. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. 
- test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - use_megatron_fsdp (bool): Whether to use Megatron FSDP. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=500, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, # Not supported for Megatron FSDP for now, need to be set to False if using Megatron FSDP - data_parallel_sharding_strategy="optim_grads_params", # For Megatron FSDP only - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - 
num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig(tokenizer_type="HuggingFaceTokenizer", tokenizer_model="Qwen/Qwen3-4B"), - checkpoint=CheckpointConfig( - save_interval=500, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/qwen/qwen3_600m.py b/src/megatron/bridge/recipes/qwen/qwen3_600m.py deleted file mode 100644 index 4c40c09e9a..0000000000 --- a/src/megatron/bridge/recipes/qwen/qwen3_600m.py +++ /dev/null @@ -1,219 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.qwen import Qwen3ModelProvider600M -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def model_config( - tensor_parallelism: int = 1, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, -) -> Qwen3ModelProvider600M: - """ - Configure the Qwen3 600M model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Qwen3ModelProvider600M: Configuration for the Qwen3 600M model. 
- """ - return Qwen3ModelProvider600M( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 1, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Qwen3 600M model. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. 
- valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - use_megatron_fsdp (bool): Whether to use Megatron FSDP. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=500, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, # Not supported for Megatron FSDP for now, need to be set to False if using Megatron FSDP - data_parallel_sharding_strategy="optim_grads_params", # For Megatron FSDP only - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - 
num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), - checkpoint=CheckpointConfig( - save_interval=500, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/qwen/qwen3_8b.py b/src/megatron/bridge/recipes/qwen/qwen3_8b.py deleted file mode 100644 index f3d7e81250..0000000000 --- a/src/megatron/bridge/recipes/qwen/qwen3_8b.py +++ /dev/null @@ -1,218 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import List, Optional, Union - -import torch - -from megatron.bridge.models.qwen import Qwen3ModelProvider8B -from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig - - -def model_config( - tensor_parallelism: int = 4, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, -) -> Qwen3ModelProvider8B: - """ - Configure the Qwen3 8B model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - - Returns: - Qwen3ModelProvider8B: Configuration for the Qwen3 8B model. 
- """ - return Qwen3ModelProvider8B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - ) - - -def pretrain_config( - dir: Optional[str] = None, - name: str = "default", - # Dataset configuration - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, - # Model configuration - tensor_parallelism: int = 4, - pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, - use_megatron_fsdp: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 2, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", - comm_overlap_config: Optional[CommOverlapConfig] = None, -) -> ConfigContainer: - """ - Create a pre-training configuration for Qwen3 8B model. - - Args: - dir (Optional[str]): Base directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. 
- test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism to be passed to model_config. - sequence_parallelism (bool): Whether to use sequence parallelism. - use_megatron_fsdp (bool): Whether to use Megatron FSDP. - train_iters (int): Total number of training iterations. - global_batch_size (int): Global batch size for training. - micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for training data. - lr (float): Learning rate. - min_lr (float): Minimum learning rate for cosine decay. - lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. - precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. - - Returns: - ConfigContainer: Configuration for pre-training. 
- """ - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) - - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - model_cfg.seq_length = seq_length - - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, - max_lr=lr, - min_lr=min_lr, - ) - - # Config Container - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - eval_interval=500, - eval_iters=32, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, # Not supported for Megatron FSDP for now, need to be set to False if using Megatron FSDP - data_parallel_sharding_strategy="optim_grads_params", # For Megatron FSDP only - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True - ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - 
num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - tokenizer=TokenizerConfig(tokenizer_type="HuggingFaceTokenizer", tokenizer_model="Qwen/Qwen3-8B"), - checkpoint=CheckpointConfig( - save_interval=500, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, - ) - - return cfg diff --git a/src/megatron/bridge/recipes/qwen/qwen3_30b_a3b.py b/src/megatron/bridge/recipes/qwen/qwen3_moe.py similarity index 55% rename from src/megatron/bridge/recipes/qwen/qwen3_30b_a3b.py rename to src/megatron/bridge/recipes/qwen/qwen3_moe.py index ae15a3f105..a1d6691e47 100644 --- a/src/megatron/bridge/recipes/qwen/qwen3_30b_a3b.py +++ b/src/megatron/bridge/recipes/qwen/qwen3_moe.py @@ -16,15 +16,17 @@ from typing import List, Optional, Union import torch +from megatron.core.distributed import DistributedDataParallelConfig +from typing_extensions import TypedDict, Unpack -from megatron.bridge.models.qwen import Qwen3MoEModelProvider30B_A3B +from megatron.bridge import AutoBridge from megatron.bridge.recipes.utils.dataset_utils import get_blend_fields_from_data_paths from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing +from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE from megatron.bridge.training.comm_overlap import CommOverlapConfig from megatron.bridge.training.config import ( CheckpointConfig, ConfigContainer, - DistributedDataParallelConfig, GPTDatasetConfig, LoggerConfig, RNGConfig, @@ -34,50 +36,94 @@ from megatron.bridge.training.mixed_precision 
import MixedPrecisionConfig, bf16_mixed -def model_config( - tensor_parallelism: int = 4, - pipeline_parallelism: int = 2, - pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - expert_parallelism: Optional[int] = 4, - sequence_parallelism: bool = True, -) -> Qwen3MoEModelProvider30B_A3B: - """ - Configure the Qwen3 30B-A3B MoE model. +class Qwen3MoeCommonKwargs(TypedDict, total=False): + """Typed options accepted by Qwen3 MoE recipe helpers.""" - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - expert_parallelism (Optional[int]): Degree of expert parallelism for MoE. - sequence_parallelism (bool): Whether to use sequence parallelism. 
+ # Core identifiers + hf_path: str + dir: Optional[str] + name: str + # Dataset configuration + data_paths: Optional[List[str]] + data_args_path: Optional[str] + train_data_path: Optional[List[str]] + valid_data_path: Optional[List[str]] + test_data_path: Optional[List[str]] + per_split_data_args_path: Optional[str] + mock: bool + # Model configuration + tensor_parallelism: int + pipeline_parallelism: int + pipeline_parallelism_dtype: Optional[torch.dtype] + virtual_pipeline_parallelism: Optional[int] + context_parallelism: int + expert_parallelism: Optional[int] + expert_tensor_parallelism: int + sequence_parallelism: bool + use_megatron_fsdp: bool + enable_recompute: bool + account_for_embedding_in_pipeline_split: bool + account_for_loss_in_pipeline_split: bool + # Training hyperparameters + train_iters: int + global_batch_size: int + micro_batch_size: int + seq_length: int + lr: float + min_lr: float + lr_warmup_iters: int + lr_decay_iters: Optional[int] + eval_interval: int + save_interval: int + use_null_tokenizer: bool + # Precision / overlap configs + precision_config: Optional[Union[MixedPrecisionConfig, str]] + comm_overlap_config: Optional[CommOverlapConfig] - Returns: - Qwen3MoEModelProvider30B_A3B: Configuration for the Qwen3 30B-A3B MoE model. + +def qwen3_30b_a3b_pretrain_config(**user_kwargs: Unpack[Qwen3MoeCommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Qwen3-30B-A3B MoE. + + See `_qwen3_moe_common` for the full list of parameters. 
""" - model_cfg = Qwen3MoEModelProvider30B_A3B( - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - expert_model_parallel_size=expert_parallelism, - expert_tensor_parallel_size=1, - sequence_parallel=sequence_parallelism, - ) + recommended_kwargs: Qwen3MoeCommonKwargs = { + "hf_path": "Qwen/Qwen3-30B-A3B", + "tensor_parallelism": 4, + "pipeline_parallelism": 2, + "pipeline_parallelism_dtype": torch.bfloat16, + "expert_parallelism": 4, + "sequence_parallelism": True, + "enable_recompute": True, + } + # Combine defaults with user kwargs; user values take precedence. + combined_kwargs: Qwen3MoeCommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen3_moe_common(**combined_kwargs) - # Add recompute settings for memory optimization - model_cfg.recompute_granularity = "full" - model_cfg.recompute_method = "uniform" - model_cfg.recompute_num_layers = 1 - return model_cfg +def qwen3_235b_a22b_pretrain_config(**user_kwargs: Unpack[Qwen3MoeCommonKwargs]) -> ConfigContainer: + """Return a pre-training config for Qwen3-235B-A22B MoE. + + See `_qwen3_moe_common` for the full list of parameters. + """ + recommended_kwargs: Qwen3MoeCommonKwargs = { + "hf_path": "Qwen/Qwen3-235B-A22B", + "tensor_parallelism": 4, + "pipeline_parallelism": 16, + "pipeline_parallelism_dtype": torch.bfloat16, + "context_parallelism": 2, + "expert_parallelism": 8, + "sequence_parallelism": True, + "micro_batch_size": 1, + "account_for_embedding_in_pipeline_split": True, + "account_for_loss_in_pipeline_split": True, + } + # Combine defaults with user kwargs; user values take precedence. 
+ combined_kwargs: Qwen3MoeCommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen3_moe_common(**combined_kwargs) -def pretrain_config( +def _qwen3_moe_common( + hf_path: str, dir: Optional[str] = None, name: str = "default", # Dataset configuration @@ -95,8 +141,12 @@ def pretrain_config( virtual_pipeline_parallelism: Optional[int] = None, context_parallelism: int = 1, expert_parallelism: Optional[int] = 4, + expert_tensor_parallelism: int = 1, sequence_parallelism: bool = True, use_megatron_fsdp: bool = False, + enable_recompute: bool = False, + account_for_embedding_in_pipeline_split: bool = False, + account_for_loss_in_pipeline_split: bool = False, # Training hyperparameters train_iters: int = 300000, global_batch_size: int = 32, @@ -106,14 +156,18 @@ def pretrain_config( min_lr: float = 3e-5, lr_warmup_iters: int = 500, lr_decay_iters: Optional[int] = None, + eval_interval: int = 500, + save_interval: int = 500, + use_null_tokenizer: bool = False, # Precision recipe precision_config: Optional[Union[MixedPrecisionConfig, str]] = None, comm_overlap_config: Optional[CommOverlapConfig] = None, ) -> ConfigContainer: """ - Create a pre-training configuration for Qwen3 30B-A3B MoE model. + Create a pre-training configuration for Qwen3 MoE models using a given HuggingFace path. Args: + hf_path (str): HuggingFace model path (e.g., "Qwen/Qwen3-30B-A3B", "Qwen/Qwen3-235B-A22B"). dir (Optional[str]): Base directory for saving logs and checkpoints. name (str): Name of the pre-training run. data_paths (Optional[List[str]]): List of paths to dataset files. If None, mock data will be used. @@ -129,8 +183,12 @@ def pretrain_config( virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. context_parallelism (int): Degree of context parallelism to be passed to model_config. expert_parallelism (Optional[int]): Degree of expert parallelism for MoE. + expert_tensor_parallelism (int): Expert tensor parallelism for MoE. 
sequence_parallelism (bool): Whether to use sequence parallelism. use_megatron_fsdp (bool): Whether to use Megatron FSDP. + enable_recompute (bool): Whether to enable recompute for memory optimization. + account_for_embedding_in_pipeline_split (bool): Whether to account for embedding in pipeline split. + account_for_loss_in_pipeline_split (bool): Whether to account for loss in pipeline split. train_iters (int): Total number of training iterations. global_batch_size (int): Global batch size for training. micro_batch_size (int): Micro batch size for training. @@ -138,8 +196,9 @@ def pretrain_config( lr (float): Learning rate. min_lr (float): Minimum learning rate for cosine decay. lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. + lr_decay_iters (Optional[int]): Number of iterations over which to decay the LR. precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. + comm_overlap_config (Optional[CommOverlapConfig]): Communication overlap configuration. Returns: ConfigContainer: Configuration for pre-training. 
@@ -153,15 +212,33 @@ def pretrain_config( data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock ) - model_cfg = model_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - pipeline_parallelism_dtype=pipeline_parallelism_dtype, - virtual_pipeline_parallelism=virtual_pipeline_parallelism, - context_parallelism=context_parallelism, - expert_parallelism=expert_parallelism, - sequence_parallelism=sequence_parallelism, - ) + bridge = AutoBridge.from_hf_pretrained(hf_path) + model_cfg = bridge.to_megatron_provider(load_weights=False) + model_cfg.tensor_model_parallel_size = tensor_parallelism + model_cfg.pipeline_model_parallel_size = pipeline_parallelism + model_cfg.pipeline_dtype = pipeline_parallelism_dtype + model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_parallelism + model_cfg.context_parallel_size = context_parallelism + model_cfg.expert_model_parallel_size = expert_parallelism + model_cfg.expert_tensor_parallel_size = expert_tensor_parallelism + model_cfg.sequence_parallel = sequence_parallelism + + if precision_config is None: + precision_config = bf16_mixed() + if isinstance(precision_config, MixedPrecisionConfig): + precision_config.grad_reduce_in_fp32 = False + + # MoE-specific pipeline split configurations + if account_for_embedding_in_pipeline_split: + model_cfg.account_for_embedding_in_pipeline_split = True + if account_for_loss_in_pipeline_split: + model_cfg.account_for_loss_in_pipeline_split = True + + # Add recompute settings for memory optimization (used by some MoE models) + if enable_recompute: + model_cfg.recompute_granularity = "full" + model_cfg.recompute_method = "uniform" + model_cfg.recompute_num_layers = 1 model_cfg.seq_length = seq_length opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( @@ -171,17 +248,12 @@ def pretrain_config( min_lr=min_lr, ) - if precision_config is None: - precision_config = bf16_mixed() 
- if isinstance(precision_config, MixedPrecisionConfig): - precision_config.grad_reduce_in_fp32 = False - # Config Container cfg = ConfigContainer( model=model_cfg, train=TrainingConfig( train_iters=train_iters, - eval_interval=500, + eval_interval=eval_interval, eval_iters=32, global_batch_size=global_batch_size, micro_batch_size=micro_batch_size, @@ -221,9 +293,13 @@ def pretrain_config( tensorboard_dir=tensorboard_dir, log_timers_to_tensorboard=True, ), - tokenizer=TokenizerConfig(tokenizer_type="HuggingFaceTokenizer", tokenizer_model="Qwen/Qwen3-30B-A3B"), + tokenizer=TokenizerConfig( + tokenizer_type="NullTokenizer" if use_null_tokenizer else "HuggingFaceTokenizer", + tokenizer_model=hf_path if not use_null_tokenizer else None, + vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE if use_null_tokenizer else None, + ), checkpoint=CheckpointConfig( - save_interval=500, + save_interval=save_interval, save=checkpoint_dir, load=checkpoint_dir, ckpt_format="torch_dist", diff --git a/src/megatron/bridge/recipes/wan/vace.py b/src/megatron/bridge/recipes/wan/vace.py new file mode 100644 index 0000000000..1f8aef2169 --- /dev/null +++ b/src/megatron/bridge/recipes/wan/vace.py @@ -0,0 +1,292 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from typing import List, Optional, Union + +from megatron.bridge.data.wan.wan_energon_datamodule import WanDataModuleConfig, VaceDataModuleConfig +from megatron.bridge.models.wan.wan_provider import VACEModelProvider +import torch +from megatron.core.distributed import DistributedDataParallelConfig + +from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing +from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE +from megatron.bridge.training.comm_overlap import CommOverlapConfig +from megatron.bridge.training.config import ( + CheckpointConfig, + ConfigContainer, + LoggerConfig, + RNGConfig, + TokenizerConfig, + TrainingConfig, +) +from megatron.bridge.training.mixed_precision import MixedPrecisionConfig, get_mixed_precision_config + + +def vace_model_config( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = 1, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + seq_length: int = 1024, + vace_layers: Optional[List[int]] = None, + vace_in_channels: int = 96, + base_num_layers: int = 30, + context_scale: float = 1.0, + freeze_base_model: bool = False, +) -> VACEModelProvider: + """ + Configure the VACE model. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + seq_length (int): Sequence length for the model. + vace_layers (Optional[List[int]]): List of layer indices for VACE context layers. 
+ vace_in_channels (int): Number of input channels for VACE. + base_num_layers (int): Base number of layers in the model. + context_scale (float): Scale factor for context attention. + freeze_base_model (bool): Whether to freeze base WAN model parameters (only train VACE layers). + Returns: + VACEModelProvider: Configuration for the VACE model. + """ + return VACEModelProvider( + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_dtype, + virtual_pipeline_model_parallel_size=None,  # NOTE(review): the 'virtual_pipeline_parallelism' argument is ignored here -- TODO confirm intended + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + seq_length=seq_length, + vace_layers=vace_layers, + vace_in_channels=vace_in_channels, + base_num_layers=base_num_layers, + context_scale=context_scale, + freeze_base_model=freeze_base_model, + ) + + +def vace_pretrain_config( + dir: Optional[str] = None, + name: str = "vace_pretrain", + # Dataset configuration + data_path: Optional[str] = None, + data_args_path: Optional[str] = None, + train_data_path: Optional[List[str]] = None, + valid_data_path: Optional[List[str]] = None, + test_data_path: Optional[List[str]] = None, + per_split_data_args_path: Optional[str] = None, + mock: bool = False, + # Model configuration + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = 1, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + use_megatron_fsdp: bool = False, + # VACE-specific configuration + vace_layers: Optional[List[int]] = None, + vace_in_channels: int = 96, + base_num_layers: int = 30, + context_scale: float = 1.0, + freeze_base_model: bool = True, + # Training hyperparameters + train_iters: int = 10000, + global_batch_size: int = 4, + micro_batch_size: int = 1, + lr: float = 5e-6, + min_lr: float = 5e-6, + lr_warmup_iters: int = 0, + lr_decay_style: str = "constant", +
# Checkpoint configuration + pretrained_checkpoint: Optional[str] = None, + load_optim: bool = False, + save_interval: int = 200, + # Sequence length + seq_length: int = 24, + # Precision recipe + precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", + comm_overlap_config: Optional[CommOverlapConfig] = None, + # Logging + log_interval: int = 1, + eval_iters: int = 0, + eval_interval: int = 200, + wandb_project: Optional[str] = None, + wandb_exp_name: Optional[str] = None, +) -> ConfigContainer: + """ + Create a finetuning configuration for VACE model. + + Args: + dir (Optional[str]): Base directory for saving logs and checkpoints. + name (str): Name of the finetuning run. + data_path (Optional[str]): Path to the energon dataset directory. + data_args_path (Optional[str]): Path to file containing data arguments. + train_data_path (Optional[List[str]]): List of training data paths. + valid_data_path (Optional[List[str]]): List of validation data paths. + test_data_path (Optional[List[str]]): List of test data paths. + per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. + mock (bool): Whether to use mock data. If True, ignores data_path. + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism to be passed to model_config. + sequence_parallelism (bool): Whether to use sequence parallelism. + use_megatron_fsdp (bool): Whether to use Megatron FSDP. + vace_layers (Optional[List[int]]): List of layer indices for VACE context layers. + vace_in_channels (int): Number of input channels for VACE. + base_num_layers (int): Base number of layers in the model. 
+ context_scale (float): Scale factor for context attention. + freeze_base_model (bool): Whether to freeze base WAN model parameters (only train VACE layers). + train_iters (int): Total number of training iterations. + global_batch_size (int): Global batch size for training. + micro_batch_size (int): Micro batch size for training. + seq_length (int): Sequence length for training data. + lr (float): Learning rate. + min_lr (float): Minimum learning rate for cosine decay. + lr_warmup_iters (int): Number of warmup iterations for the learning rate. + lr_decay_style (str): Learning rate decay style ('constant', 'cosine', etc.). + pretrained_checkpoint (Optional[str]): Path to pretrained checkpoint to load. + load_optim (bool): Whether to load optimizer state from checkpoint. + save_interval (int): Interval for saving checkpoints. + precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. + comm_overlap_config (Optional[CommOverlapConfig]): Communication overlap configuration for the model. + log_interval (int): Interval for logging. + eval_iters (int): Number of evaluation iterations. + eval_interval (int): Interval for evaluation. + wandb_project (Optional[str]): Weights & Biases project name. + wandb_exp_name (Optional[str]): Weights & Biases experiment name. + + Returns: + ConfigContainer: Configuration for finetuning. 
+ """ + base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "checkpoints_ft") + run_output_dir = os.path.join(base_output_dir, name) + checkpoint_dir = os.path.join(run_output_dir, "checkpoints") + tensorboard_dir = os.path.join(run_output_dir, "tb_logs") + + model_cfg = vace_model_config( + tensor_parallelism=tensor_parallelism, + pipeline_parallelism=pipeline_parallelism, + pipeline_parallelism_dtype=pipeline_parallelism_dtype, + virtual_pipeline_parallelism=virtual_pipeline_parallelism, + context_parallelism=context_parallelism, + sequence_parallelism=sequence_parallelism, + seq_length=seq_length, + vace_layers=vace_layers, + vace_in_channels=vace_in_channels, + base_num_layers=base_num_layers, + context_scale=context_scale, + freeze_base_model=freeze_base_model, + ) + + # Setup optimizer and scheduler + if lr_decay_style == "constant": + opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=lr_warmup_iters, + lr_decay_iters=train_iters, + max_lr=lr, + min_lr=min_lr, + ) + else: + opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=lr_warmup_iters, + lr_decay_iters=train_iters, + max_lr=lr, + min_lr=min_lr, + ) + + opt_config.use_precision_aware_optimizer = False + + if isinstance(precision_config, str): + precision_config = get_mixed_precision_config(precision_config) + + precision_config.grad_reduce_in_fp32 = False + + # Configure checkpoint settings + checkpoint_cfg = CheckpointConfig( + save_interval=save_interval, + save=checkpoint_dir, + load=pretrained_checkpoint if pretrained_checkpoint else checkpoint_dir, + ckpt_format="torch_dist", + fully_parallel_save=True, + load_optim=load_optim, + ) + + # Configure logging + logger_cfg = LoggerConfig( + log_interval=log_interval, + tensorboard_dir=tensorboard_dir, + log_timers_to_tensorboard=True, + ) + + # Add wandb configuration if provided + if wandb_project: + logger_cfg.wandb_project = wandb_project + if wandb_exp_name: 
+ logger_cfg.wandb_exp_name = wandb_exp_name + if checkpoint_dir: + logger_cfg.wandb_save_dir = checkpoint_dir + + # Config Container + cfg = ConfigContainer( + model=model_cfg, + train=TrainingConfig( + train_iters=train_iters, + eval_interval=eval_interval, + eval_iters=eval_iters, + global_batch_size=global_batch_size, + micro_batch_size=micro_batch_size, + manual_gc=True, + manual_gc_interval=100, + manual_gc_eval=100, + ), + optimizer=opt_config, + scheduler=scheduler, + ddp=DistributedDataParallelConfig( + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=False, + overlap_param_gather=False, + average_in_collective=True, + use_distributed_optimizer=True, + use_megatron_fsdp=use_megatron_fsdp, + ), + dataset=VaceDataModuleConfig( + path=data_path, + seq_length=seq_length, + micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size, + num_workers=10 + ), + logger=logger_cfg, + tokenizer=TokenizerConfig( + tokenizer_type="NullTokenizer", + vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE + ), + checkpoint=checkpoint_cfg, + rng=RNGConfig(seed=1234), + comm_overlap=comm_overlap_config, + mixed_precision=precision_config, + ) + + return cfg diff --git a/src/megatron/bridge/recipes/llama/llama32_3b.py b/src/megatron/bridge/recipes/wan/wan.py similarity index 76% rename from src/megatron/bridge/recipes/llama/llama32_3b.py rename to src/megatron/bridge/recipes/wan/wan.py index 34e67a1d04..b4975ad5a9 100644 --- a/src/megatron/bridge/recipes/llama/llama32_3b.py +++ b/src/megatron/bridge/recipes/wan/wan.py @@ -15,36 +15,36 @@ import os from typing import List, Optional, Union +from megatron.bridge.data.wan.wan_energon_datamodule import WanDataModuleConfig +from megatron.bridge.models.wan.wan_provider import WanModelProvider import torch +from megatron.core.distributed import DistributedDataParallelConfig -from megatron.bridge.models.llama import Llama32ModelProvider3B -from megatron.bridge.recipes.utils.dataset_utils import 
get_blend_fields_from_data_paths from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE from megatron.bridge.training.comm_overlap import CommOverlapConfig from megatron.bridge.training.config import ( CheckpointConfig, ConfigContainer, - DistributedDataParallelConfig, - GPTDatasetConfig, LoggerConfig, RNGConfig, - TokenizerConfig, + TokenizerConfig, TrainingConfig, ) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig +from megatron.bridge.training.mixed_precision import MixedPrecisionConfig, get_mixed_precision_config def model_config( tensor_parallelism: int = 1, pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, + pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = 1, context_parallelism: int = 1, sequence_parallelism: bool = False, -) -> Llama32ModelProvider3B: + seq_length: int = 1024, +) -> WanModelProvider: """ - Configure the Llama3.2 3B model. + Configure the Wan model. Args: tensor_parallelism (int): Degree of tensor model parallelism. @@ -53,17 +53,18 @@ def model_config( virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. context_parallelism (int): Degree of context parallelism. sequence_parallelism (bool): Whether to use sequence parallelism. - + seq_length (int): Sequence length for the model. Returns: - Llama32ModelProvider3B: Configuration for the Llama3.2 3B model. + WanModelProvider: Configuration for the Wan model. 
""" - return Llama32ModelProvider3B( + return WanModelProvider( tensor_model_parallel_size=tensor_parallelism, pipeline_model_parallel_size=pipeline_parallelism, pipeline_dtype=pipeline_parallelism_dtype, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + virtual_pipeline_model_parallel_size=None, context_parallel_size=context_parallelism, sequence_parallel=sequence_parallelism, + seq_length=seq_length, ) @@ -81,26 +82,32 @@ def pretrain_config( # Model configuration tensor_parallelism: int = 1, pipeline_parallelism: int = 1, - pipeline_parallelism_dtype: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, + pipeline_parallelism_dtype: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = 1, context_parallelism: int = 1, sequence_parallelism: bool = False, use_megatron_fsdp: bool = False, # Training hyperparameters - train_iters: int = 1_168_251, - global_batch_size: int = 512, + train_iters: int = 10000, + global_batch_size: int = 4, micro_batch_size: int = 1, - seq_length: int = 8192, - lr: float = 3e-4, - min_lr: float = 3e-5, + lr: float = 0.9e-4, lr_warmup_iters: int = 2000, - lr_decay_iters: Optional[int] = None, # Precision recipe + # DEBUGGING precision_config: Optional[Union[MixedPrecisionConfig, str]] = "bf16_mixed", + # precision_config: Optional[Union[MixedPrecisionConfig, str]] = MixedPrecisionConfig( + # fp32=True, + # params_dtype=torch.float32, + # pipeline_dtype=torch.float32, + # autocast_enabled=False, + # ), comm_overlap_config: Optional[CommOverlapConfig] = None, ) -> ConfigContainer: """ - Create a pre-training configuration for Llama3.2 3B model. + Create a pre-training configuration for GPT3 175B model. + + The default configuration is expected to run on 64 nodes with 8 GPUs each. Args: dir (Optional[str]): Base directory for saving logs and checkpoints. @@ -121,28 +128,21 @@ def pretrain_config( train_iters (int): Total number of training iterations. 
global_batch_size (int): Global batch size for training. micro_batch_size (int): Micro batch size for training. - seq_length (int): Sequence length for the model. + seq_length (int): Sequence length for training data. lr (float): Learning rate. min_lr (float): Minimum learning rate for cosine decay. lr_warmup_iters (int): Number of warmup iterations for the learning rate. - lr_decay_iters (Optional[int]): Number of iterations for learning rate decay. precision_config (Optional[Union[MixedPrecisionConfig, str]]): Precision configuration for the model. comm_overlap_config (Optional[CommOverlapConfig]): Communication overlap configuration for the model. Returns: ConfigContainer: Configuration for pre-training. - - Note: - Sequence length is hardcoded to 8192 for Llama3.2 3B pretraining. """ base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") run_output_dir = os.path.join(base_output_dir, name) checkpoint_dir = os.path.join(run_output_dir, "checkpoints") tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - blend, blend_per_split, split = get_blend_fields_from_data_paths( - data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path, mock - ) model_cfg = model_config( tensor_parallelism=tensor_parallelism, @@ -151,14 +151,21 @@ def pretrain_config( virtual_pipeline_parallelism=virtual_pipeline_parallelism, context_parallelism=context_parallelism, sequence_parallelism=sequence_parallelism, + seq_length=1024, ) opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters, + lr_decay_iters=train_iters, max_lr=lr, - min_lr=min_lr, ) + opt_config.use_precision_aware_optimizer = False + + if isinstance(precision_config, str): + precision_config = get_mixed_precision_config(precision_config) + + precision_config.grad_reduce_in_fp32 = False + # Config Container cfg = ConfigContainer( @@ -178,36 +185,29 @@ def 
pretrain_config( ddp=DistributedDataParallelConfig( check_for_nan_in_grad=True, grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, + overlap_grad_reduce=False, + overlap_param_gather=False, average_in_collective=True, use_distributed_optimizer=True, use_megatron_fsdp=use_megatron_fsdp, # need use_distributed_optimizer=True ), - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - sequence_length=seq_length, - num_dataset_builder_threads=1, - blend=blend, - blend_per_split=blend_per_split, - split=split, - # Dataloader config parameters - data_sharding=True, - dataloader_type="single", - num_workers=8, - skip_getting_attention_mask_from_dataset=True, - ), + dataset= WanDataModuleConfig( + path=None, + seq_length=1024, # we don't need to use this value, just add because Bridge training requires for LLMs + micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size, + num_workers=10) + , logger=LoggerConfig( log_interval=10, tensorboard_dir=tensorboard_dir, + log_timers_to_tensorboard=True, ), tokenizer=TokenizerConfig(tokenizer_type="NullTokenizer", vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE), checkpoint=CheckpointConfig( save_interval=2000, save=checkpoint_dir, + load=checkpoint_dir, ckpt_format="torch_dist", fully_parallel_save=True, ), @@ -216,9 +216,4 @@ def pretrain_config( mixed_precision=precision_config, ) - if cfg.comm_overlap is None: - cfg.comm_overlap = CommOverlapConfig( - tp_comm_overlap=False, - ) - return cfg diff --git a/src/megatron/bridge/training/checkpointing.py b/src/megatron/bridge/training/checkpointing.py index 44f946e1f3..56e45fdfff 100644 --- a/src/megatron/bridge/training/checkpointing.py +++ b/src/megatron/bridge/training/checkpointing.py @@ -137,6 +137,45 @@ def get_checkpoint_version() -> Optional[float]: return _CHECKPOINT_VERSION +def delete_extra_state(state_dict): + """Delete all extra state keys from the model 
state dictionary. + + This function removes all keys containing '_extra_state' from the model + portion of the state dictionary. This is useful for cleaning up corrupted + or problematic extra state that can cause issues during model loading. + + Args: + state_dict: The state dictionary. Can be either: + - A full checkpoint dict with a "model" key, or + - A model state dict directly + + Returns: + The modified state dictionary with extra state keys removed. + """ + # Handle both cases: full checkpoint dict with "model" key or direct model state dict + if isinstance(state_dict, dict) and "model" in state_dict: + # Full checkpoint dict case + target_dict = state_dict["model"] + else: + # Direct model state dict case + target_dict = state_dict + + # If target is not a mapping-like object, nothing to clean + if not hasattr(target_dict, "keys"): + return state_dict + + # Some objects may implement keys() but not be directly iterable into a list (e.g., mocks) + try: + keys = list(target_dict.keys()) + except Exception: + return state_dict + + for key in keys: + if isinstance(key, str) and "_extra_state" in key: + del target_dict[key] + return state_dict + + def _get_checkpoint_format(checkpoint_path: str) -> str: """Determine the checkpoint format by examining the checkpoint directory. @@ -226,7 +265,7 @@ def read_metadata(tracker_filename: str) -> tuple[int, bool]: # iteration across all ranks. 
if iteration != max_iter: rank = torch.distributed.get_rank() - print( + print_rank_0( "WARNING: on rank {} found iteration {} in the " "metadata while max iteration across the ranks " "is {}, replacing it with max iteration.".format(rank, iteration, max_iter), @@ -626,6 +665,7 @@ def save_checkpoint( train_state_local_filename = get_checkpoint_train_state_filename(checkpoint_name) train_state_global_filename = get_checkpoint_train_state_filename(save_dir, prefix=TRACKER_PREFIX) config_filename = get_checkpoint_run_config_filename(checkpoint_name) + tracker_filename = get_checkpoint_tracker_filename(save_dir) if ckpt_type == CheckpointType.LOCAL: def train_state_finalize_fn(): @@ -646,9 +686,15 @@ def train_state_finalize_fn() -> None: msc = MultiStorageClientFeature.import_package() msc.torch.save(train_state_dict, train_state_local_filename) msc.torch.save(train_state_dict, train_state_global_filename) + # Write Megatron-LM tracker file for compatibility + with msc.open(tracker_filename, "w") as f: + f.write(str(train_state.step)) else: torch.save(train_state_dict, train_state_local_filename) shutil.copy(train_state_local_filename, train_state_global_filename) + # Write Megatron-LM tracker file for compatibility + with open(tracker_filename, "w") as f: + f.write(str(train_state.step)) cfg.to_yaml(config_filename) @@ -784,7 +830,7 @@ def maybe_save_dataloader_state(train_iterator: Any, iteration: int, dataloader_ return dp_rank = mpu.get_data_parallel_rank() - print(f"saving dataloader checkpoint at iteration {iteration} to {dataloader_save_path}") + print_rank_0(f"saving dataloader checkpoint at iteration {iteration} to {dataloader_save_path}") train_dataloader_state_dict = train_iterator.iterable.save_state() # Get the base directory for the current iteration iter_dir = get_checkpoint_name(dataloader_save_path, iteration) @@ -976,6 +1022,9 @@ def _load_model_weights_from_checkpoint( state_dict = dist_checkpointing.load( sharded_state_dict, checkpoint_path, 
load_strategy, strict=dist_ckpt_strictness ) + # we keep weights only for bridge use, remove extra state + # because they are not needed and could cause unexpected issues. + delete_extra_state(state_dict) if return_state_dict: return state_dict @@ -1048,11 +1097,15 @@ def _load_model_state_dict(module: torch.nn.Module, state_dict: dict[str, Any], """Helper function to load state dict with fallback for missing extra states.""" try: module.load_state_dict(state_dict, strict=strict) - except Exception: + except Exception as e: if strict: # Fallback support for backward compatibility breaking changes in TransformerEngine + print_rank_0(f"Warning: Exception during strict loading: {e}") load_return = module.load_state_dict(state_dict, strict=False) - print(f"load_return: {load_return}") + print_rank_0(f"load_return: {load_return}") + else: + # Re-raise if we were already in non-strict mode + raise def _load_checkpoint_from_path( @@ -1376,7 +1429,7 @@ def _load_checkpoint_from_path( if "rerun_state_machine" in state_dict: get_rerun_state_machine().load_state_dict(state_dict["rerun_state_machine"]) except Exception as e: - print(f"Unable to restore RerunMachine from checkpoint: {e}. Skipping.") + print_rank_0(f"Unable to restore RerunMachine from checkpoint: {e}. 
Skipping.") sys.exit() # Load RNG states diff --git a/src/megatron/bridge/training/config.py b/src/megatron/bridge/training/config.py index fd5c515ac8..028c50b6ec 100644 --- a/src/megatron/bridge/training/config.py +++ b/src/megatron/bridge/training/config.py @@ -20,6 +20,8 @@ from pathlib import Path from typing import Any, Literal, Optional, Tuple, Union +import torch + from megatron.core.datasets.gpt_dataset import GPTDatasetConfig as MCoreGPTDatasetConfig from megatron.core.distributed import DistributedDataParallelConfig as MCoreDistributedDataParallelConfig from megatron.core.optimizer import OptimizerConfig as MCoreOptimizerConfig @@ -768,6 +770,11 @@ def finalize(self) -> None: assert not (self.use_pytorch_profiler and self.use_nsys_profiler), ( "Exactly one of pytorch or nsys profiler should be enabled, not both, when ProfilingConfig is active." ) + assert self.profile_step_start >= 0, f"profile_step_start must be >= 0, got {self.profile_step_start}" + assert self.profile_step_end >= 0, f"profile_step_end must be >= 0, got {self.profile_step_end}" + assert self.profile_step_end >= self.profile_step_start, ( + f"profile_step_end ({self.profile_step_end}) must be >= profile_step_start ({self.profile_step_start})" + ) @dataclass @@ -1155,7 +1162,10 @@ def validate(self) -> None: if isinstance(self.dataset, FinetuningDatasetConfig) else self.dataset.sequence_length ) - + # Place pdb on rank 0 + # import pdb;pdb.set_trace() + # if torch.distributed.get_rank() == 0: + # import pdb; pdb.set_trace() assert self.model.seq_length == data_seq_length, ( f"Please ensure sequence length configuration in model config and " f"dataset config match.\nSequence length in model config: {self.model.seq_length}, " diff --git a/src/megatron/bridge/training/eval.py b/src/megatron/bridge/training/eval.py index 163e7b295b..d218eb8bbb 100644 --- a/src/megatron/bridge/training/eval.py +++ b/src/megatron/bridge/training/eval.py @@ -25,14 +25,15 @@ from megatron.bridge.training import 
fault_tolerance from megatron.bridge.training.config import ConfigContainer +from megatron.bridge.training.forward_step_func_types import ForwardStepCallable from megatron.bridge.training.state import GlobalState -from megatron.bridge.training.utils.train_utils import check_forward_step_func_num_args, maybe_inject_state +from megatron.bridge.training.utils.train_utils import prepare_forward_step_func from megatron.bridge.utils.common_utils import is_last_rank, print_rank_0, print_rank_last def evaluate( state: GlobalState, - forward_step_func: Callable, + forward_step_func: ForwardStepCallable, data_iterator: Optional[Union[RerunDataIterator, list[RerunDataIterator]]], model: list[MegatronModule], process_non_loss_data_func: Optional[Callable], @@ -58,8 +59,9 @@ def evaluate( - collected_non_loss_data: Data collected by non_loss_data_func. - timelimit_hit: Boolean indicating if the time limit was reached. """ - # Check num args to forward_step_func - num_fw_args = check_forward_step_func_num_args(forward_step_func) + # Prepare forward_step_func (check signature and inject state if needed) + # This is done once to prevent creating new partial objects every eval iteration + wrapped_forward_step = prepare_forward_step_func(forward_step_func, state) timers = state.timers timers("evaluate", log_level=0).start(barrier=True) @@ -88,7 +90,6 @@ def evaluate( if verbose: print_rank_0(f"Evaluating iter {iteration}/{state.cfg.train.eval_iters}") - wrapped_forward_step = maybe_inject_state(forward_step_func, state, num_fw_args=num_fw_args) forward_backward_func = get_forward_backward_func() # Don't care about timing during evaluation config.timers = None @@ -177,7 +178,7 @@ def evaluate( def evaluate_and_print_results( state: GlobalState, prefix: str, - forward_step_func: Callable, + forward_step_func: ForwardStepCallable, data_iterator: Optional[Union[RerunDataIterator, list[RerunDataIterator]]], model: list[MegatronModule], config: ConfigContainer, diff --git 
a/src/megatron/bridge/training/finetune.py b/src/megatron/bridge/training/finetune.py index b0b3620a41..8ab3956148 100644 --- a/src/megatron/bridge/training/finetune.py +++ b/src/megatron/bridge/training/finetune.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Callable - from megatron.bridge.training.config import ConfigContainer +from megatron.bridge.training.forward_step_func_types import ForwardStepCallable from megatron.bridge.training.pretrain import pretrain from megatron.bridge.utils.decorators import experimental_fn @@ -22,14 +21,24 @@ @experimental_fn def finetune( config: ConfigContainer, - forward_step_func: Callable, + forward_step_func: ForwardStepCallable, ) -> None: """Main function to run the finetuning. Args: config: The main configuration container holding all necessary parameters. - forward_step_func: A callable that performs a single forward and backward - step, returning the loss and any computed metrics. + forward_step_func: A callable (function or functor) that performs a single + forward and backward step, returning the loss and any computed + metrics. Supports the following signatures: + - 2 args: (data_iterator, model) + - 3 args: (data_iterator, model, return_schedule_plan=False) + OR (state: GlobalState, data_iterator, model) + - 4 args: (state: GlobalState, data_iterator, model, return_schedule_plan=False) + + Note: + Use the signature with GlobalState type hint for full access to configuration, timers, and training state. + State injection is automatic based on type hints or parameter names. + Functors (classes with __call__) are fully supported. 
Warnings: This is an experimental API and is subject to change in backwards diff --git a/src/megatron/bridge/training/forward_step_func_types.py b/src/megatron/bridge/training/forward_step_func_types.py new file mode 100644 index 0000000000..6e9d33ed51 --- /dev/null +++ b/src/megatron/bridge/training/forward_step_func_types.py @@ -0,0 +1,279 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Type definitions for forward step function definitions. + +This module provides comprehensive type definitions for forward step functions used in +Megatron Bridge training. Forward step functions are the core of the training loop, +responsible for performing a single forward pass and returning both the output tensor +and a loss function. + +Key Types: + - ForwardStepCallable: Union of all supported forward step signatures (functions + functors) + - LossFunction: The partial function returned by forward step functions + - LossFunctionReturn: The possible return types when calling a loss function + +Example Usage: + >>> from functools import partial + >>> from megatron.bridge.training.state import GlobalState + >>> + >>> def my_forward_step(state: GlobalState, data_iterator, model, return_schedule_plan=False): + ... # Access configuration, timers, and training state + ... timers = state.timers + ... config = state.cfg + ... + ... # Get batch data + ... batch = next(data_iterator) + ... + ... 
# Forward pass with timing + ... timers("forward-step").start() + ... output_tensor = model(batch['input_ids']) + ... timers("forward-step").stop() + ... + ... # Create loss function + ... def loss_func(output_tensor): + ... loss = compute_loss(output_tensor, batch['labels']) + ... num_tokens = batch['labels'].numel() + ... loss_reduced = {"lm_loss": loss.detach()} + ... return loss, num_tokens, loss_reduced # ThreeTupleLossReturn + ... + ... return output_tensor, partial(loss_func) + ... + >>> # State injection is automatic - no manual binding needed! + >>> pretrain(config, my_forward_step) + >>> + >>> # Functor example (for stateful forward steps) + >>> class StatefulForwardStep: + ... def __init__(self, loss_scale: float = 1.0): + ... self.loss_scale = loss_scale + ... self.step_count = 0 + ... + ... def __call__(self, state: GlobalState, data_iterator, model, return_schedule_plan=False): + ... self.step_count += 1 + ... # ... forward step logic with state tracking ... + ... return output_tensor, partial(loss_func) + ... 
+ >>> functor = StatefulForwardStep(loss_scale=2.0) + >>> pretrain(config, functor) +""" + +from functools import partial +from typing import Any, Iterable, Protocol, overload + +import torch +from megatron.core.models.gpt import GPTModel + +from megatron.bridge.training.state import GlobalState + + +# Loss function return types +LossReduced = dict[str, torch.Tensor] # Dictionary of loss metrics for logging +TwoTupleLossReturn = tuple[torch.Tensor, LossReduced] # (loss, loss_reduced) - legacy format +ThreeTupleLossReturn = tuple[ + torch.Tensor, torch.Tensor, LossReduced +] # (loss, num_tokens, loss_reduced) - per-token loss +InferenceLossReturn = Any # Any data for inference/non-loss collection (when collect_non_loss_data=True) + +# Union of all possible loss function return types +LossFunctionReturn = TwoTupleLossReturn | ThreeTupleLossReturn | InferenceLossReturn + +# Type for the loss function that gets called with output_tensor +# This is a partial function that when called returns one of the LossFunctionReturn types +LossFunction = partial[LossFunctionReturn] + + +class TwoArgForwardStep(Protocol): + """Protocol for forward step functions with 2 arguments. + + This represents forward step functions that don't need access to GlobalState + and don't support schedule plan return mode. + + Args: + data_iterator: Iterator providing training data batches + model: The GPT model to train + + Returns: + Tuple of (output_tensor, loss_function) + """ + + def __call__( + self, + data_iterator: Iterable, + model: GPTModel, + ) -> tuple[torch.Tensor, LossFunction]: ... + + +class ThreeArgStateForwardStep(Protocol): + """Protocol for forward step functions with 3 arguments including state. + + This represents forward step functions that need access to GlobalState + but don't support schedule plan return mode. 
+ + Args: + state: Global training state containing configuration and runtime objects + data_iterator: Iterator providing training data batches + model: The GPT model to train + + Returns: + Tuple of (output_tensor, loss_function) + """ + + def __call__( + self, + state: GlobalState, + data_iterator: Iterable, + model: GPTModel, + ) -> tuple[torch.Tensor, LossFunction]: ... + + +class ThreeArgForwardStep(Protocol): + """Protocol for forward step functions with 3 arguments. + + This represents forward step functions that don't need access to GlobalState + but support schedule plan return mode. These are typically 4-arg functions + that have had GlobalState pre-bound via functools.partial. + + Args: + data_iterator: Iterator providing training data batches + model: The GPT model to train + return_schedule_plan: Whether to return schedule plan instead of output tensor + + Returns: + Tuple of (output_tensor, loss_function) or (schedule_plan, loss_function) + """ + + def __call__( + self, + data_iterator: Iterable, + model: GPTModel, + return_schedule_plan: bool = False, + ) -> tuple[torch.Tensor, LossFunction]: ... + + +class FourArgForwardStep(Protocol): + """Protocol for forward step functions with 4 arguments. + + This represents forward step functions that need access to GlobalState + and support schedule plan return mode. These are the most complete + forward step function signatures. + + Args: + state: Global training state containing configuration and runtime objects + data_iterator: Iterator providing training data batches + model: The GPT model to train + return_schedule_plan: Whether to return schedule plan instead of output tensor + + Returns: + Tuple of (output_tensor, loss_function) or (schedule_plan, loss_function) + """ + + def __call__( + self, + state: GlobalState, + data_iterator: Iterable, + model: GPTModel, + return_schedule_plan: bool = False, + ) -> tuple[torch.Tensor, LossFunction]: ... 
+ + +class ForwardStepFunctor(Protocol): + """Protocol for forward step functors (callable classes). + + This protocol represents classes that implement __call__ with one of the + supported forward step function signatures. Functors are useful when you + need to maintain state between forward step calls or implement complex + forward step logic that benefits from object-oriented design. + + The __call__ method must match one of the supported signatures: + - (data_iterator, model) + - (data_iterator, model, return_schedule_plan=False) + OR (state: GlobalState, data_iterator, model) + - (state: GlobalState, data_iterator, model, return_schedule_plan=False) + + RECOMMENDED: Use GlobalState type hint for automatic state injection and full access + to configuration, timers, and training state. + + Examples: + >>> class MyForwardFunctor: + ... def __init__(self, loss_scale: float = 1.0): + ... self.loss_scale = loss_scale + ... self.call_count = 0 + ... + ... def __call__(self, state: GlobalState, data_iterator, model, return_schedule_plan=False): + ... self.call_count += 1 + ... # Access training infrastructure + ... timers = state.timers + ... config = state.cfg + ... # ... forward step logic ... + ... return output_tensor, loss_function + ... + >>> functor = MyForwardFunctor(loss_scale=2.0) + >>> pretrain(config, functor) # State injection is automatic! + """ + + @overload + def __call__( + self, + data_iterator: Iterable, + model: GPTModel, + ) -> tuple[torch.Tensor, LossFunction]: + """2-argument signature: (data_iterator, model).""" + ... + + @overload + def __call__( + self, + data_iterator: Iterable, + model: GPTModel, + return_schedule_plan: bool = False, + ) -> tuple[torch.Tensor, LossFunction]: + """3-argument signature: (data_iterator, model, return_schedule_plan).""" + ... 
+ + @overload + def __call__( + self, + state: GlobalState, + data_iterator: Iterable, + model: GPTModel, + ) -> tuple[torch.Tensor, LossFunction]: + """3-argument signature with state: (state, data_iterator, model).""" + ... + + @overload + def __call__( + self, + state: GlobalState, + data_iterator: Iterable, + model: GPTModel, + return_schedule_plan: bool = False, + ) -> tuple[torch.Tensor, LossFunction]: + """4-argument signature: (state, data_iterator, model, return_schedule_plan).""" + ... + + def __call__(self, *args, **kwargs) -> tuple[torch.Tensor, LossFunction]: + """Execute the forward step. + + The actual implementation must match one of the overloaded signatures above. + This fallback signature is required by the Protocol but should not be used + directly - type checkers will use the @overload signatures for validation. + """ + ... + + +# Union type for all supported forward step function signatures +ForwardStepFunc = TwoArgForwardStep | ThreeArgStateForwardStep | ThreeArgForwardStep | FourArgForwardStep + +# Type alias that includes both functions and functors +ForwardStepCallable = ForwardStepFunc | ForwardStepFunctor diff --git a/src/megatron/bridge/training/model_load_save.py b/src/megatron/bridge/training/model_load_save.py index 844b7ab4b8..bcd169ffff 100644 --- a/src/megatron/bridge/training/model_load_save.py +++ b/src/megatron/bridge/training/model_load_save.py @@ -27,7 +27,7 @@ from megatron.core.transformer import MegatronModule, TransformerConfig from megatron.core.utils import get_model_config -from megatron.bridge.models.model_provider import ModelProviderMixin +from megatron.bridge.models.model_provider import ModelParallelKwargs, ModelProviderMixin from megatron.bridge.training.checkpointing import save_checkpoint from megatron.bridge.training.config import CheckpointConfig, ConfigContainer, LoggerConfig from megatron.bridge.training.state import GlobalState @@ -307,6 +307,7 @@ def load_megatron_model( return_state_dict: bool = False, 
use_cpu_init: bool = False, skip_temp_dist_context: Optional[bool] = None, + mp_overrides: Optional[ModelParallelKwargs] = None, ) -> Union[Any, dict[str, torch.Tensor]]: """Load a Megatron model from a distributed checkpoint. @@ -323,13 +324,31 @@ def load_megatron_model( skip_temp_dist_context: If True, skip temporary distributed context setup. If None, automatically skip if distributed is already initialized. Default: None. + mp_overrides: Optional model-parallel overrides to apply to the loaded config. + Only provided fields are overridden. Returns: The model instance with loaded weights if return_state_dict is False, otherwise returns a dictionary containing the full, unsharded model state_dict. """ - model_cfg, mlm_args = load_model_config(checkpoint_path) + # If in single GPU environment, reset additional parallel settings + model_cfg.tensor_model_parallel_size = 1 + model_cfg.pipeline_model_parallel_size = 1 + model_cfg.context_parallel_size = 1 + model_cfg.expert_model_parallel_size = 1 + model_cfg.expert_tensor_parallel_size = 1 + model_cfg.moe_extended_tp = False + model_cfg.sequence_parallel = False + model_cfg.virtual_pipeline_model_parallel_size = None + model_cfg.hierarchical_context_parallel_sizes = None + + # Apply model-parallel overrides if provided + if mp_overrides: + for key, value in mp_overrides.items(): + if hasattr(model_cfg, key) and value is not None: + setattr(model_cfg, key, value) + return build_and_load_model( checkpoint_path, model_cfg, model_type, mlm_args, return_state_dict, use_cpu_init, skip_temp_dist_context ) @@ -363,7 +382,7 @@ def save_megatron_model( >>> save_megatron_model( ... megatron_model, ... "./megatron_checkpoint", - ... hf_tokenizer_path="meta-llama/Llama-3-8B" + ... hf_tokenizer_path="meta-llama/Meta-Llama-3-8B" ... 
) Note: diff --git a/src/megatron/bridge/training/pretrain.py b/src/megatron/bridge/training/pretrain.py index c1eab48540..d1539726be 100644 --- a/src/megatron/bridge/training/pretrain.py +++ b/src/megatron/bridge/training/pretrain.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Callable, Optional +from typing import Optional import torch.distributed as dist from nvidia_resiliency_ext.inprocess import CallWrapper @@ -21,6 +21,7 @@ from megatron.bridge.training.checkpointing import save_checkpoint from megatron.bridge.training.config import ConfigContainer, runtime_config_update from megatron.bridge.training.eval import evaluate_and_print_results +from megatron.bridge.training.forward_step_func_types import ForwardStepCallable from megatron.bridge.training.setup import setup from megatron.bridge.training.state import GlobalState from megatron.bridge.training.train import _finish_train, train @@ -32,7 +33,7 @@ @experimental_fn def pretrain( config: ConfigContainer, - forward_step_func: Callable, + forward_step_func: ForwardStepCallable, ) -> None: """Main function to run the training pipeline. @@ -42,8 +43,18 @@ def pretrain( Args: config: The main configuration container holding all necessary parameters. - forward_step_func: A callable that performs a single forward and backward - step, returning the loss and any computed metrics. + forward_step_func: A callable (function or functor) that performs a single + forward and backward step, returning the loss and any computed + metrics. Supports the following signatures: + - 2 args: (data_iterator, model) + - 3 args: (data_iterator, model, return_schedule_plan=False) + OR (state: GlobalState, data_iterator, model) + - 4 args: (state: GlobalState, data_iterator, model, return_schedule_plan=False) + + Note: + Use the signature with GlobalState type hint for full access to configuration, timers, and training state. 
+ State injection is automatic based on type hints or parameter names. + Functors (classes with __call__) are fully supported. Warnings: This is an experimental API and is subject to change in backwards @@ -73,7 +84,7 @@ def pretrain( def _pretrain( state: GlobalState, - forward_step_func: Callable, + forward_step_func: ForwardStepCallable, store: Optional[dist.Store] = None, inprocess_call_wrapper: Optional[CallWrapper] = None, ) -> None: @@ -81,7 +92,7 @@ def _pretrain( Args: state: Global training state containing the validated configuration and runtime objects - forward_step_func: Function that performs a single forward/backward step + forward_step_func: Function or functor that performs a single forward/backward step store: Optional distributed Store used by in-process restart for coordination inprocess_call_wrapper: Optional wrapper injected by nvrx to expose restart iteration """ diff --git a/src/megatron/bridge/training/profiling.py b/src/megatron/bridge/training/profiling.py new file mode 100644 index 0000000000..167a1e11ff --- /dev/null +++ b/src/megatron/bridge/training/profiling.py @@ -0,0 +1,156 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Profiling utilities for training loop.""" + +from typing import Optional + +import torch +import torch.profiler + +from megatron.bridge.training.config import ProfilingConfig + + +# Type alias for NVTX context manager +TNvtxContext = torch.autograd.profiler.emit_nvtx + + +def should_profile_rank(config: Optional[ProfilingConfig], rank: int) -> bool: + """Check if current rank should be profiled. + + Args: + config: Profiling configuration + rank: Current process rank + + Returns: + True if this rank should be profiled + """ + if config is None: + return False + return rank in config.profile_ranks + + +def handle_profiling_step( + config: Optional[ProfilingConfig], + iteration: int, + rank: int, + pytorch_prof: Optional[torch.profiler.profile], +) -> Optional[TNvtxContext]: + """Handle profiling logic for a single training step. + + Args: + config: Profiling configuration + iteration: Current training iteration + rank: Current process rank + pytorch_prof: PyTorch profiler instance (if using PyTorch profiler) + + Returns: + NVTX context if nsys profiling was started at this step, None otherwise + """ + if not should_profile_rank(config, rank): + return None + + if config.use_pytorch_profiler and pytorch_prof is not None: + pytorch_prof.step() + + if config.use_nsys_profiler: + if iteration == config.profile_step_start: + return start_nsys_profiler(config) + + return None + + +def handle_profiling_stop( + config: Optional[ProfilingConfig], + iteration: int, + rank: int, + pytorch_prof: Optional[torch.profiler.profile], + nsys_nvtx_context: Optional[TNvtxContext] = None, +) -> None: + """Handle profiling cleanup at designated stop iteration. 
+ + Args: + config: Profiling configuration + iteration: Current training iteration + rank: Current process rank + pytorch_prof: PyTorch profiler instance (if using PyTorch profiler) + nsys_nvtx_context: NVTX context from handle_profiling_step (if using nsys profiler) + """ + if not should_profile_rank(config, rank): + return + + if iteration != config.profile_step_end: + return + + if config.use_pytorch_profiler and pytorch_prof is not None: + pytorch_prof.stop() + + if config.use_nsys_profiler: + stop_nsys_profiler(nsys_nvtx_context) + + +def initialize_pytorch_profiler( + config: ProfilingConfig, + tensorboard_dir: str, +) -> torch.profiler.profile: + """Initialize PyTorch profiler with config settings. + + Args: + config: Profiling configuration + tensorboard_dir: Directory for tensorboard outputs + + Returns: + Initialized (but not started) PyTorch profiler + """ + prof = torch.profiler.profile( + schedule=torch.profiler.schedule( + wait=max(config.profile_step_start - 1, 0), + warmup=1 if config.profile_step_start > 0 else 0, + active=config.profile_step_end - config.profile_step_start, + repeat=1, + ), + on_trace_ready=torch.profiler.tensorboard_trace_handler(tensorboard_dir), + record_shapes=config.record_shapes, + with_stack=True, + ) + return prof + + +def start_nsys_profiler(config: ProfilingConfig) -> TNvtxContext: + """Start CUDA profiler for nsys profiling. + + Args: + config: Profiling configuration + + Returns: + NVTX context manager that must be passed to stop_nsys_profiler + """ + torch.cuda.check_error(torch.cuda.cudart().cudaProfilerStart()) + if config.record_shapes: + nvtx_context = torch.autograd.profiler.emit_nvtx(record_shapes=True) + else: + nvtx_context = torch.autograd.profiler.emit_nvtx() + nvtx_context.__enter__() + return nvtx_context + + +def stop_nsys_profiler(nvtx_context: Optional[TNvtxContext]) -> None: + """Stop CUDA profiler for nsys profiling. 
+ + Args: + nvtx_context: NVTX context manager returned from start_nsys_profiler + """ + torch.cuda.check_error(torch.cuda.cudart().cudaProfilerStop()) + if nvtx_context is not None: + nvtx_context.__exit__(None, None, None) diff --git a/src/megatron/bridge/training/tokenizers/tokenizer.py b/src/megatron/bridge/training/tokenizers/tokenizer.py index 924a1feb2d..88cb6c68b9 100644 --- a/src/megatron/bridge/training/tokenizers/tokenizer.py +++ b/src/megatron/bridge/training/tokenizers/tokenizer.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import Dict, List, Optional -from megatron.core.datasets.megatron_tokenizer import MegatronLegacyTokenizer as MegatronTokenizerCore +from megatron.core.datasets.megatron_tokenizer import MegatronLegacyTokenizer as MegatronTokenizer from megatron.bridge.training.tokenizers.bert_tokenization import FullTokenizer as FullBertTokenizer from megatron.bridge.training.tokenizers.config import TokenizerConfig @@ -16,7 +16,7 @@ from megatron.bridge.utils.common_utils import get_rank_safe, print_rank_0 -class MegatronTokenizer(MegatronTokenizerCore): +class MegatronTokenizer(MegatronTokenizer): """Base tokenizer class, extending the MegatronTokenizer from megatron core. This class provides a common interface for various tokenizers used within the NeMo framework. 
diff --git a/src/megatron/bridge/training/train.py b/src/megatron/bridge/training/train.py index f273fdc419..2699de8b2c 100644 --- a/src/megatron/bridge/training/train.py +++ b/src/megatron/bridge/training/train.py @@ -42,19 +42,26 @@ from megatron.bridge.training.checkpointing import maybe_finalize_async_save, save_checkpoint from megatron.bridge.training.config import ConfigContainer from megatron.bridge.training.eval import evaluate_and_print_results +from megatron.bridge.training.forward_step_func_types import ForwardStepCallable from megatron.bridge.training.initialize import destroy_global_state from megatron.bridge.training.nvrx_straggler import ( check_nvrx_straggler_detection, safe_shutdown_nvrx_straggler_manager, ) +from megatron.bridge.training.profiling import ( + TNvtxContext, + handle_profiling_step, + handle_profiling_stop, + initialize_pytorch_profiler, + should_profile_rank, +) from megatron.bridge.training.state import GlobalState from megatron.bridge.training.utils import flop_utils from megatron.bridge.training.utils.log_utils import append_to_progress_log, barrier_and_log from megatron.bridge.training.utils.train_utils import ( calc_params_l2_norm, - check_forward_step_func_num_args, logical_and_across_model_parallel_group, - maybe_inject_state, + prepare_forward_step_func, reduce_max_stat_across_model_parallel_group, training_log, ) @@ -62,7 +69,7 @@ def train( - forward_step_func: Callable, + forward_step_func: ForwardStepCallable, model: list[MegatronModule], optimizer: MegatronOptimizer, scheduler: OptimizerParamScheduler, @@ -101,8 +108,18 @@ def train( straggler_timer = global_state.straggler_timer energy_monitor = global_state.energy_monitor - # Check num args to forward_step_func - num_fw_args = check_forward_step_func_num_args(forward_step_func) + # Prepare forward_step_func (check signature and inject state if needed). + # This is done once to prevent creating new partial objects every iteration. 
+ # + # Note on reference semantics: + # - functools.partial stores a reference to global_state, not a copy + # - When global_state.train_state.step changes, the partial sees the updated value + # - This is safe because GlobalState is a mutable object passed by reference + # + # For functors (classes with __call__ defined): + # - For functors: partial(functor_instance, state) still allows functor's internal state to work + # - inspect.signature() properly inspects the __call__ method of functors + wrapped_forward_step_func = prepare_forward_step_func(forward_step_func, global_state) # Turn on training mode which enables dropout. for model_module in model: @@ -170,20 +187,12 @@ def train( eval_iterations = 0 prof = None + nsys_nvtx_context = None # NVTX context for nsys profiling prof_config = config.profiling - if prof_config and torch.distributed.get_rank() in prof_config.profile_ranks and prof_config.use_pytorch_profiler: - prof = torch.profiler.profile( - schedule=torch.profiler.schedule( - wait=max(prof_config.profile_step_start - 1, 0), - warmup=1 if prof_config.profile_step_start > 0 else 0, - active=prof_config.profile_step_end - prof_config.profile_step_start, - repeat=1, - ), - on_trace_ready=torch.profiler.tensorboard_trace_handler(config.logger.tensorboard_dir), - record_shapes=prof_config.record_shapes, - with_stack=True, - ) - prof.start() + if prof_config and should_profile_rank(prof_config, torch.distributed.get_rank()): + if prof_config.use_pytorch_profiler: + prof = initialize_pytorch_profiler(prof_config, config.logger.tensorboard_dir) + prof.start() start_iteration = global_state.train_state.step # Megatron FSDP and FSDP2 does not have this hook @@ -223,13 +232,15 @@ def train( # Run training iterations till done. 
while global_state.train_state.step < train_config.train_iters: - if prof_config and torch.distributed.get_rank() in prof_config.profile_ranks: - if prof_config.use_pytorch_profiler: - prof.step() - if prof_config.use_nsys_profiler: - if global_state.train_state.step == prof_config.profile_step_start: - torch.cuda.check_error(torch.cuda.cudart().cudaProfilerStart()) - torch.autograd.profiler.emit_nvtx(record_shapes=prof_config.record_shapes).__enter__() + # Handle profiling for this step + nvtx_ctx = handle_profiling_step( + prof_config, + global_state.train_state.step, + torch.distributed.get_rank(), + prof, + ) + if nvtx_ctx is not None: + nsys_nvtx_context = nvtx_ctx fault_tolerance.on_checkpointing_start(global_state) maybe_finalize_async_save(global_state=global_state, ckpt_cfg=config.checkpoint, blocking=False) @@ -275,7 +286,7 @@ def train( # Run training step. fault_tolerance.on_training_step_start(global_state) loss_dict, skipped_iter, should_checkpoint, should_exit, exit_code, grad_norm, num_zeros_in_grad = train_step( - forward_step_func, num_fw_args, train_data_iterator, model, optimizer, scheduler, global_state + wrapped_forward_step_func, train_data_iterator, model, optimizer, scheduler, global_state ) fault_tolerance.on_training_step_end(global_state) if should_checkpoint: @@ -414,6 +425,7 @@ def train( prof, config, should_toggle_forward_pre_hook, + nsys_nvtx_context, ) # Checkpoint and decide whether to exit. @@ -464,8 +476,7 @@ def train( def train_step( - forward_step_func: Callable, - num_fw_args: int, + forward_step_func: ForwardStepCallable, data_iterator: Optional[Union[RerunDataIterator, list[RerunDataIterator]]], model: list[MegatronModule], optimizer: MegatronOptimizer, @@ -475,8 +486,7 @@ def train_step( """Single training step. 
Args: - forward_step_func: Function that performs a forward step - num_fw_args: Number of arguments expected by forward_step_func + forward_step_func: Function that performs a forward step (already wrapped if needed) data_iterator: Iterator over training data model: list of model chunks optimizer: Optimizer for model parameters @@ -506,9 +516,6 @@ def train_step( model_chunk.zero_grad_buffer() optimizer.zero_grad() - # Optionally inject state into forward step - wrapped_forward_step = maybe_inject_state(forward_step_func, global_state, num_fw_args=num_fw_args) - _handle_mxfp8_param_buffer_copy( optimizer=optimizer, reuse_grad_buf_for_mxfp8_param_ag=cfg.optimizer.reuse_grad_buf_for_mxfp8_param_ag, @@ -517,8 +524,8 @@ def train_step( # Forward pass. forward_backward_func = get_forward_backward_func() losses_reduced = forward_backward_func( - forward_step_func=wrapped_forward_step, + forward_step_func=forward_step_func, data_iterator=data_iterator, model=model, num_microbatches=get_num_microbatches(), @@ -601,6 +609,7 @@ def post_training_step_callbacks( prof: Optional[torch.profiler.profile], config: ConfigContainer, should_toggle_forward_pre_hook: bool, + nsys_nvtx_context: Optional[TNvtxContext] = None, ) -> None: """Run all post-training-step functions (e.g., FT heartbeats, GC). @@ -612,6 +621,7 @@ def post_training_step_callbacks( prof: PyTorch profiler instance config: Configuration container should_toggle_forward_pre_hook: Whether to toggle forward pre-hook + nsys_nvtx_context: NVTX context for nsys profiling (if active) """ train_config = config.train @@ -644,16 +654,13 @@ def post_training_step_callbacks( enable_forward_pre_hook(model) # Profiling.
- if ( - config.profiling - and iteration == config.profiling.profile_step_end - and torch.distributed.get_rank() in config.profiling.profile_ranks - ): - if config.profiling.use_pytorch_profiler: - assert prof is not None - prof.stop() - if config.profiling.use_nsys_profiler: - torch.cuda.check_error(torch.cuda.cudart().cudaProfilerStop()) + handle_profiling_stop( + config.profiling, + iteration, + torch.distributed.get_rank(), + prof, + nsys_nvtx_context, + ) # Manual garbage collection. if train_config.manual_gc: diff --git a/src/megatron/bridge/training/utils/flop_utils.py b/src/megatron/bridge/training/utils/flop_utils.py index 10ea5b64e3..45b83a4405 100644 --- a/src/megatron/bridge/training/utils/flop_utils.py +++ b/src/megatron/bridge/training/utils/flop_utils.py @@ -278,6 +278,7 @@ def transformer_flops(): cfg.model.vocab_size, cfg.model.make_vocab_size_divisible_by, cfg.model.tensor_model_parallel_size, + logging_enabled=False, ) total_floating_point_operations = ( diff --git a/src/megatron/bridge/training/utils/train_utils.py b/src/megatron/bridge/training/utils/train_utils.py index 156355165a..c8793a0ebe 100644 --- a/src/megatron/bridge/training/utils/train_utils.py +++ b/src/megatron/bridge/training/utils/train_utils.py @@ -16,7 +16,7 @@ from collections import defaultdict from datetime import datetime from functools import partial -from typing import Any, Callable, Optional, Union +from typing import Any, Optional, Union import torch import torch.nn as nn @@ -29,6 +29,7 @@ from megatron.core.utils import get_data_parallel_group_if_dtensor, to_local_if_dtensor from megatron.bridge.training.config import ConfigContainer +from megatron.bridge.training.forward_step_func_types import ForwardStepCallable from megatron.bridge.training.state import GlobalState from megatron.bridge.training.utils.flop_utils import num_floating_point_operations from megatron.bridge.training.utils.theoretical_memory_utils import report_theoretical_memory @@ -612,57 +613,101 @@ 
def report_memory(name: str) -> None: print("[Rank {}] {}".format(torch.distributed.get_rank(), string), flush=True) -def maybe_inject_state(forward_step_func: Callable, state: GlobalState, num_fw_args: Optional[int] = None) -> Callable: - """Optionally inject GlobalState into a 4-arg forward_step function. +def prepare_forward_step_func(forward_step_func: ForwardStepCallable, state: GlobalState) -> ForwardStepCallable: + """Convenience function to check and inject GlobalState in one call. - - If the function has 4 parameters (state, data_iterator, model, return_schedule_plan), - bind the provided state via functools.partial to produce a callable that accepts - (data_iterator, model, return_schedule_plan). - - If the function already has 3 parameters (data_iterator, model, return_schedule_plan) - or 2 parameters (data_iterator, model), return it unchanged. + This combines needs_global_state_injection() and maybe_inject_state() for cleaner code. + Call this once at the beginning of train() or evaluate() to prevent creating new + partial objects every iteration. + + Wrapping once is safe since: + - functools.partial stores a reference to the state object, not a copy + - When state.train_state.step or other fields change, the partial sees those changes + - No staleness issues because GlobalState is mutable and passed by reference + + Functor support: + - Works with both regular functions (def forward_step(...)) and callable classes + - For functors: inspect.signature() inspects the __call__ method + - For functors: partial(functor_instance, state) preserves functor's internal state + - Example: If functor has self.call_count, it still increments correctly Args: - forward_step_func: The original forward step function. - state: The GlobalState object to potentially inject. - num_fw_args: The number of arguments the forward_step_func expects (optional, - will be inspected if None). 
+ forward_step_func: The original forward step function or functor + state: The GlobalState object to inject if needed Returns: - The original function or a partial function with GlobalState injected. + The wrapped function (if injection needed) or original function """ - if not num_fw_args: - num_fw_args = len(inspect.signature(forward_step_func).parameters) - if num_fw_args == 4: # megatron bridge gpt_step.py forward_step has 4 args - # inject global_state - return partial(forward_step_func, state) - else: - return forward_step_func + needs_injection = needs_global_state_injection(forward_step_func) + return maybe_inject_state(forward_step_func, state, needs_injection=needs_injection) -def check_forward_step_func_num_args(forward_step_func: Callable) -> int: - """Check if the forward step function has a supported number of arguments. +def needs_global_state_injection(forward_step_func: ForwardStepCallable) -> bool: + """Check if a forward step function needs GlobalState injection. - Currently supports 2, 3, or 4 arguments: - - func(data_iterator, model) - - func(data_iterator, model, return_schedule_plan: bool = False) # state pre-bound via partial - - func(state, data_iterator, model, return_schedule_plan: bool = False) + This function does the signature inspection once to determine if state should be injected. + It's more efficient than repeated signature inspection in the training loop. + + Detection logic: + 1. First checks for GlobalState type annotation in any parameter + 2. Falls back to checking if first parameter is named 'state' or 'global_state' Args: - forward_step_func: The function to check. + forward_step_func: The forward step function to inspect. Returns: - The number of arguments the function takes. - - Raises: - AssertionError: If the function does not have 2 or 4 arguments. + True if GlobalState should be injected, False otherwise. 
""" - num_fw_args = len(inspect.signature(forward_step_func).parameters) - fail_msg = f""" - forward_step_func has {num_fw_args} arguments. Only the following signatures are supported: - 2 args: forward_step_func(data_iterator: Iterable, model: GPTModel) - 3 args: forward_step_func(data_iterator: Iterable, model: GPTModel, return_schedule_plan: bool = False) - 4 args: forward_step_func(state: GlobalState, data_iterator: Iterable, model: GPTModel, return_schedule_plan: bool = False) + signature = inspect.signature(forward_step_func) + parameters = signature.parameters + param_names = list(parameters.keys()) + + # Check for GlobalState type annotation in any parameter + for param_name, param in parameters.items(): + if param.annotation != inspect.Parameter.empty: + # Handle both direct GlobalState and string annotations + if ( + param.annotation == GlobalState + or (isinstance(param.annotation, str) and "GlobalState" in param.annotation) + or (hasattr(param.annotation, "__name__") and param.annotation.__name__ == "GlobalState") + ): + # Found GlobalState annotation - needs injection + return True + + # Fallback: Check if the first parameter is named 'state' or 'global_state' + return param_names and param_names[0] in ("state", "global_state") + + +def maybe_inject_state( + forward_step_func: ForwardStepCallable, state: GlobalState, needs_injection: Optional[bool] = None +) -> ForwardStepCallable: + """Optionally inject GlobalState into forward_step functions that expect it. + + Determines whether to inject state by inspecting function signature: + 1. First checks for GlobalState type annotation in any parameter + 2. Falls back to checking if first parameter is named 'state' + 3. 
Otherwise assumes the function doesn't expect state + + Supported signatures: + - (data_iterator, model) → no injection + - (data_iterator, model, return_schedule_plan) → no injection + - (state: GlobalState, data_iterator, model) → inject state + - (state: GlobalState, data_iterator, model, return_schedule_plan) → inject state + - (state, data_iterator, model) → inject state (fallback to name-based detection) + + Args: + forward_step_func: The original forward step function. + state: The GlobalState object to potentially inject. + needs_injection: Whether injection is needed (optional, will be inspected if None). + Pass this to avoid repeated signature inspection in training loops. + + Returns: + The original function or a partial function with GlobalState injected. """ - assert num_fw_args in (2, 3, 4), fail_msg + if needs_injection is None: + needs_injection = needs_global_state_injection(forward_step_func) - return num_fw_args + if needs_injection: + return partial(forward_step_func, state) + else: + return forward_step_func \ No newline at end of file diff --git a/tests/end_to_end_tests/train_from_recipe.py b/tests/end_to_end_tests/train_from_recipe.py index b4edf3f592..ba6264d9a7 100644 --- a/tests/end_to_end_tests/train_from_recipe.py +++ b/tests/end_to_end_tests/train_from_recipe.py @@ -173,6 +173,8 @@ def apply_args_to_config(config, args): config.checkpoint.save = args.save_dir if args.save_interval: config.checkpoint.save_interval = args.save_interval + if args.async_save: + config.checkpoint.async_save = args.async_save # Dataset configuration logging.info(f"Configuring dataset: type={args.data}") @@ -294,7 +296,7 @@ def setup_argument_parser(): # Model specification parser.add_argument("--model-family", required=True, help="Model family (e.g., llama)") - parser.add_argument("--recipe-name", required=True, help="Recipe name (e.g., pretrain_llama3_8b)") + parser.add_argument("--recipe-name", required=True, help="Recipe name (e.g., 
llama3_8b_pretrain_config)") parser.add_argument("--exp-name", required=True, help="Experiment name for logging and checkpoints") # Training modes @@ -333,6 +335,7 @@ def setup_argument_parser(): parser.add_argument("--pretrained-checkpoint", type=str, help="Path to pretrained checkpoint") parser.add_argument("--save-dir", type=str, help="Directory to save checkpoints") parser.add_argument("--save-interval", type=int, help="Number of iterations between checkpoint saves") + parser.add_argument("--async-save", action="store_true", help="Enable async checkpoint saving", default=False) # Data parser.add_argument( @@ -387,22 +390,75 @@ def main(): # Parse plugin config overrides from unknown arguments plugin_config_overrides = parse_plugin_config_overrides(unknown_args) - # Import recipe dynamically - recipe_module_path = f"megatron.bridge.recipes.{args.model_family}.{args.recipe_name}" - logging.info(f"Loading recipe module path: {recipe_module_path}") - recipe_module = importlib.import_module(recipe_module_path) - - # Get base configuration from recipe based on training mode - if args.pretrain: - config_name = args.config_name or "pretrain_config" - elif args.finetune: - config_name = args.config_name or "finetune_config" - else: - raise ValueError("Must specify either --pretrain or --finetune") - - if not hasattr(recipe_module, config_name): - raise ValueError(f"Recipe {recipe_module_path} must have '{config_name}' function") - base_config = getattr(recipe_module, config_name)(dir="/nemo_run/", name=args.exp_name) + # Import recipe dynamically using merged naming convention with legacy fallback. + # + # Supported cases (in order): + # 1) New merged-name API (preferred): + # - Path: megatron.bridge.recipes.. 
+ # - Args: --model-family llama --recipe-name llama3_8b_pretrain_config --pretrain + # - Example resolved symbol: megatron.bridge.recipes.llama.llama3_8b_pretrain_config + # + # 2) Legacy module API (single module exposes config function): + # - Path: megatron.bridge.recipes... + # - Args: --model-family llama --recipe-name llama3 --pretrain + # - Example resolved symbol: megatron.bridge.recipes.llama.llama3.pretrain_config + # + # 3) Oldest attribute API (family __init__ exposes suffixed names): + # - Path: megatron.bridge.recipes.._ + # - Args: --model-family llama --recipe-name llama3_8b --pretrain + # - Example resolved symbol: megatron.bridge.recipes.llama.llama3_8b_pretrain_config + # + # The resolver below tries (1) then (2) then (3), raising a clear error if none match. + merged_attr = args.recipe_name + family_pkg_path = f"megatron.bridge.recipes.{args.model_family}" + logging.info(f"Attempting merged-name import: {family_pkg_path}.{merged_attr}") + + try: + family_pkg = importlib.import_module(family_pkg_path) + if not hasattr(family_pkg, merged_attr): + raise AttributeError + config_builder = getattr(family_pkg, merged_attr) + logging.info(f"Using merged recipe API: {family_pkg_path}.{merged_attr}") + except Exception: + # Legacy fallback paths + # 1) args.recipe_name is a module under the family exposing pretrain_config/finetune_config + legacy_module_path = f"{family_pkg_path}.{args.recipe_name}" + logging.info(f"Merged import failed; trying legacy module path: {legacy_module_path}") + + # Determine function name by mode + if args.pretrain: + config_name = args.config_name or "pretrain_config" + elif args.finetune: + config_name = args.config_name or "finetune_config" + else: + raise ValueError("Must specify either --pretrain or --finetune") + + try: + recipe_module = importlib.import_module(legacy_module_path) + if not hasattr(recipe_module, config_name): + raise AttributeError + config_builder = getattr(recipe_module, config_name) + 
logging.info(f"Using legacy module API: {legacy_module_path}.{config_name}") + except Exception: + # 2) Oldest style: attribute on family package named _ + # Avoid double suffixing if user already passed a merged name + if merged_attr.endswith("_pretrain_config") or merged_attr.endswith("_finetune_config"): + legacy_attr = merged_attr + else: + legacy_attr = f"{args.recipe_name}_{config_name}" + logging.info(f"Trying oldest legacy attribute: {family_pkg_path}.{legacy_attr}") + family_pkg = importlib.import_module(family_pkg_path) + if not hasattr(family_pkg, legacy_attr): + raise ValueError( + "Unable to resolve recipe. Tried: " + f"(1) {family_pkg_path}.{merged_attr}, " + f"(2) {legacy_module_path}.{config_name}, " + f"(3) {family_pkg_path}.{legacy_attr}" + ) + config_builder = getattr(family_pkg, legacy_attr) + logging.info(f"Using oldest legacy API: {family_pkg_path}.{legacy_attr}") + + base_config = config_builder(dir="/nemo_run/", name=args.exp_name) # Apply plugin config overrides first (lower priority) if plugin_config_overrides: diff --git a/tests/functional_tests/models/test_gemma2_conversion.py b/tests/functional_tests/models/test_gemma2_conversion.py new file mode 100644 index 0000000000..bee06eeb87 --- /dev/null +++ b/tests/functional_tests/models/test_gemma2_conversion.py @@ -0,0 +1,278 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import subprocess +from pathlib import Path + +import pytest +import torch +from transformers import Gemma2Config, Gemma2ForCausalLM, GemmaTokenizer + + +HF_GEMMA2_TOY_MODEL_CONFIG = { + "architectures": ["Gemma2ForCausalLM"], + "attention_bias": False, + "attention_dropout": 0.0, + "attn_logit_softcapping": 50.0, + "bos_token_id": 2, + "cache_implementation": "hybrid", + "eos_token_id": 1, + "final_logit_softcapping": 30.0, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": "gelu_pytorch_tanh", + "hidden_size": 1024, # Smaller than real 2B for faster testing + "initializer_range": 0.02, + "intermediate_size": 2048, # Reduced for TP compatibility testing + "max_position_embeddings": 8192, + "model_type": "gemma2", + "num_attention_heads": 8, + "num_hidden_layers": 2, # Much smaller for testing + "num_key_value_heads": 2, # Changed from 4 to 2 to be divisible by TP=2 + "pad_token_id": 0, + "query_pre_attn_scalar": 256, + "rms_norm_eps": 1e-06, + "rope_theta": 10000.0, + "sliding_window": 4096, + "torch_dtype": "bfloat16", + "transformers_version": "4.42.4", + "use_cache": True, + "vocab_size": 256000, +} + + +class TestGemma2Conversion: + """ + Test Gemma2 model conversion from local HuggingFace model with different parallelism configurations. + """ + + @pytest.fixture(scope="class") + def gemma2_toy_model_path(self, tmp_path_factory): + """ + Create and save a HuggingFace Gemma2 toy model from config to a temporary directory. 
+ + Args: + tmp_path_factory: Pytest temporary path factory for class-scoped fixtures + + Returns: + str: Path to the saved HuggingFace model directory + """ + # Create a temporary directory for this test class + temp_dir = tmp_path_factory.mktemp("gemma2_toy_model") + model_dir = temp_dir / "gemma2_toy" + + # Create Gemma2 config from the toy model config + config = Gemma2Config(**HF_GEMMA2_TOY_MODEL_CONFIG) + config.torch_dtype = torch.bfloat16 # Explicitly set the torch_dtype in config + + # Create model with random weights and convert to bfloat16 + model = Gemma2ForCausalLM(config) + model = model.bfloat16() # Use .bfloat16() method instead of .to() + + # Debug: Check model dtype before saving + for name, param in model.named_parameters(): + print(f"Before save - {name}: {param.dtype}") + break # Just check the first parameter + + # Download and save tokenizer from a reference Gemma model + # We use the smallest available Gemma model for tokenizer artifacts + # First try to load from pre-mounted test data, then fall back to HuggingFace download + pre_downloaded_path = "/home/TestData/megatron_bridge/tokenizers/google/gemma-2b" + # Try loading from pre-downloaded location first + if Path(pre_downloaded_path).exists(): + print(f"Loading tokenizer from pre-downloaded path: {pre_downloaded_path}") + tokenizer = GemmaTokenizer.from_pretrained(pre_downloaded_path) + else: + # Fall back to downloading from HuggingFace + print("Pre-downloaded tokenizer not found, attempting to download from HuggingFace") + tokenizer = GemmaTokenizer.from_pretrained("google/gemma-2b") + tokenizer.save_pretrained(model_dir) + + # Save model and config to directory + model.save_pretrained(model_dir, safe_serialization=True) + + # Also save config.json explicitly to ensure compatibility with correct torch_dtype + config_to_save = HF_GEMMA2_TOY_MODEL_CONFIG.copy() + config_path = model_dir / "config.json" + with open(config_path, "w") as f: + json.dump(config_to_save, f, indent=2) + + 
return str(model_dir) + + def test_toy_model_creation(self, gemma2_toy_model_path): + """ + Test that the toy model is created correctly and can be loaded. + + Args: + gemma2_toy_model_path: Path to the toy Gemma2 model (from fixture) + """ + # Verify the model directory exists + model_path = Path(gemma2_toy_model_path) + assert model_path.exists(), f"Model directory not found at {model_path}" + + # Check essential files exist + config_file = model_path / "config.json" + assert config_file.exists(), f"config.json not found at {config_file}" + + # Check for model weights (safetensors preferred) + weights_file = model_path / "model.safetensors" + if not weights_file.exists(): + weights_file = model_path / "pytorch_model.bin" + assert weights_file.exists(), f"Model weights file not found in {model_path}" + + # Check for tokenizer files + tokenizer_config_file = model_path / "tokenizer_config.json" + assert tokenizer_config_file.exists(), f"tokenizer_config.json not found at {tokenizer_config_file}" + + # Load and verify config + with open(config_file) as f: + config_data = json.load(f) + + assert config_data["model_type"] == "gemma2" + assert config_data["hidden_size"] == 1024 + assert config_data["intermediate_size"] == 2048 + assert config_data["num_hidden_layers"] == 2 + assert config_data["num_attention_heads"] == 8 + assert config_data["num_key_value_heads"] == 2 + assert config_data["vocab_size"] == 256000 + assert config_data["head_dim"] == 256 + # Check Gemma2-specific parameters + assert config_data["attn_logit_softcapping"] == 50.0 + assert config_data["final_logit_softcapping"] == 30.0 + assert config_data["query_pre_attn_scalar"] == 256 + assert config_data["sliding_window"] == 4096 + + # Try loading the model to verify it's valid + try: + model = Gemma2ForCausalLM.from_pretrained( + gemma2_toy_model_path, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=False, # Ensure full loading + ) + + # Try loading the tokenizer as well + try: + tokenizer = 
GemmaTokenizer.from_pretrained(gemma2_toy_model_path) + print(f"Tokenizer loaded successfully with vocab_size: {tokenizer.vocab_size}") + except Exception as e: + print(f"Warning: Could not load tokenizer (this might be OK for conversion testing): {e}") + + # Verify model structure + assert hasattr(model, "model") + assert hasattr(model.model, "layers") + assert len(model.model.layers) == 2 # num_hidden_layers + + print(f"SUCCESS: Toy model created and validated at {gemma2_toy_model_path}") + print("Model weights are correctly in bfloat16 format") + + except Exception as e: + assert False, f"Failed to load created toy model: {e}" + + @pytest.mark.run_only_on("GPU") + @pytest.mark.parametrize( + "tp,pp,test_name", + [ + (2, 1, "TP"), + (1, 2, "PP"), + ], + ) + def test_gemma2_conversion_parallelism(self, gemma2_toy_model_path, tmp_path, tp, pp, test_name): + """ + Test Gemma2 model conversion with different parallelism configurations. + + Args: + gemma2_toy_model_path: Path to the toy Gemma2 model (from fixture) + tmp_path: Pytest temporary path fixture + tp: Tensor parallelism size + pp: Pipeline parallelism size + test_name: Name of the test for identification + """ + # Create temporary output directory for conversion results + test_output_dir = tmp_path / f"gemma2_{test_name}" + test_output_dir.mkdir(exist_ok=True) + + cmd = [ + "python", + "-m", + "torch.distributed.run", + "--nproc_per_node=2", + "--nnodes=1", + "-m", + "coverage", + "run", + "--data-file=/workspace/.coverage", + "--source=/workspace/", + "--parallel-mode", + "examples/conversion/hf_megatron_roundtrip_multi_gpu.py", + "--hf-model-id", + gemma2_toy_model_path, + "--output-dir", + str(test_output_dir), + "--tp", + str(tp), + "--pp", + str(pp), + ] + + try: + result = subprocess.run( + cmd, capture_output=True, text=True, cwd=Path(__file__).parent.parent.parent.parent + ) + + # Check that the conversion completed successfully + if result.returncode != 0: + print(f"STDOUT: {result.stdout}") + 
print(f"STDERR: {result.stderr}") + assert False, f"Gemma2 {test_name} conversion failed with return code {result.returncode}" + + # Verify that the converted model was saved + # The output directory should be named after the last part of the model path + model_name = Path(gemma2_toy_model_path).name # "gemma2_toy" + converted_model_dir = test_output_dir / model_name + assert converted_model_dir.exists(), f"Converted model directory not found at {converted_model_dir}" + + # Check that essential model files exist + config_file = converted_model_dir / "config.json" + assert config_file.exists(), f"config.json not found in converted model at {config_file}" + + # Check for model weights file (could be either safetensors or pytorch_model.bin) + weights_file_safetensors = converted_model_dir / "model.safetensors" + weights_file_pytorch = converted_model_dir / "pytorch_model.bin" + assert weights_file_safetensors.exists() or weights_file_pytorch.exists(), ( + f"Model weights file not found in converted model at {converted_model_dir}" + ) + + # Verify the config contains Gemma2-specific parameters + with open(config_file) as f: + saved_config = json.load(f) + + assert saved_config["model_type"] == "gemma2", "Model type should be gemma2" + assert saved_config["hidden_size"] == 1024, "Hidden size should match toy config" + assert saved_config["intermediate_size"] == 2048, "Intermediate size should match toy config" + assert saved_config["num_attention_heads"] == 8, "Number of attention heads should match toy config" + assert saved_config["num_key_value_heads"] == 2, "Number of key-value heads should match toy config" + assert saved_config["head_dim"] == 256, "Head dimension should match toy config" + # Verify Gemma2-specific parameters + assert saved_config["attn_logit_softcapping"] == 50.0, "Attention logit softcapping should match" + assert saved_config["final_logit_softcapping"] == 30.0, "Final logit softcapping should match" + assert saved_config["query_pre_attn_scalar"] 
== 256, "Query pre-attention scalar should match" + assert saved_config["sliding_window"] == 4096, "Sliding window should match" + + print(f"SUCCESS: Gemma2 {test_name} conversion test completed successfully") + print(f"Converted model saved at: {converted_model_dir}") + + except Exception as e: + print(f"Error during Gemma2 {test_name} conversion test: {e}") + raise diff --git a/tests/functional_tests/models/test_gemma2_provider.py b/tests/functional_tests/models/test_gemma2_provider.py new file mode 100644 index 0000000000..f5e92c2ca4 --- /dev/null +++ b/tests/functional_tests/models/test_gemma2_provider.py @@ -0,0 +1,56 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import pytest + +from megatron.bridge.models.conversion.auto_bridge import AutoBridge +from megatron.bridge.models.gemma import ( + Gemma2ModelProvider2B, + Gemma2ModelProvider9B, + Gemma2ModelProvider27B, +) +from tests.functional_tests.utils import compare_provider_configs + + +HF_MODEL_ID_TO_BRIDGE_MODEL_PROVIDER = { + "google/gemma-2-2b": Gemma2ModelProvider2B, + "google/gemma-2-9b": Gemma2ModelProvider9B, + "google/gemma-2-27b": Gemma2ModelProvider27B, +} + +ROOT_PATH: str = "/home/TestData/megatron_bridge/hf_home" + +HF_MODEL_ID_PATH_TO_MODEL_PROVIDER = { + os.path.join(ROOT_PATH, hf_model_id): provider_class + for hf_model_id, provider_class in HF_MODEL_ID_TO_BRIDGE_MODEL_PROVIDER.items() +} + + +class TestGemma2ModelProviderMapping: + """Test that bridge provider configs are equivalent to predefined provider configs.""" + + @pytest.mark.parametrize("hf_model_id,provider_class", list(HF_MODEL_ID_PATH_TO_MODEL_PROVIDER.items())) + def test_bridge_vs_predefined_provider_config_equivalence(self, hf_model_id, provider_class): + """Test that bridge converted provider config matches predefined provider config.""" + # Create bridge from HF model + bridge = AutoBridge.from_hf_pretrained(hf_model_id) + converted_provider = bridge.to_megatron_provider(load_weights=False) + + # Create predefined provider + predefined_provider = provider_class() + + # Compare configs + compare_provider_configs(converted_provider, predefined_provider, hf_model_id) diff --git a/tests/functional_tests/models/test_gemma_conversion.py b/tests/functional_tests/models/test_gemma_conversion.py new file mode 100644 index 0000000000..381e4d3c26 --- /dev/null +++ b/tests/functional_tests/models/test_gemma_conversion.py @@ -0,0 +1,261 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import subprocess +from pathlib import Path + +import pytest +import torch +from transformers import GemmaConfig, GemmaForCausalLM, GemmaTokenizer + + +HF_GEMMA_TOY_MODEL_CONFIG = { + "architectures": ["GemmaForCausalLM"], + "attention_bias": False, + "attention_dropout": 0.0, + "bos_token_id": 2, + "eos_token_id": 1, + "head_dim": 256, + "hidden_act": "gelu", + "hidden_size": 1024, # Smaller than real 2B for faster testing + "initializer_range": 0.02, + "intermediate_size": 4096, # Smaller than real 2B for faster testing + "max_position_embeddings": 8192, + "model_type": "gemma", + "num_attention_heads": 8, + "num_hidden_layers": 2, # Much smaller for testing + "num_key_value_heads": 2, # Changed from 1 to 2 to be divisible by TP=2 + "pad_token_id": 0, + "rms_norm_eps": 1e-06, + "rope_scaling": None, + "rope_theta": 10000.0, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.0.dev0", + "use_cache": True, + "vocab_size": 256000, +} + + +class TestGemmaConversion: + """ + Test Gemma model conversion from local HuggingFace model with different parallelism configurations. + """ + + @pytest.fixture(scope="class") + def gemma_toy_model_path(self, tmp_path_factory): + """ + Create and save a HuggingFace Gemma toy model from config to a temporary directory. 
+ + Args: + tmp_path_factory: Pytest temporary path factory for class-scoped fixtures + + Returns: + str: Path to the saved HuggingFace model directory + """ + # Create a temporary directory for this test class + temp_dir = tmp_path_factory.mktemp("gemma_toy_model") + model_dir = temp_dir / "gemma_toy" + + # Create Gemma config from the toy model config + config = GemmaConfig(**HF_GEMMA_TOY_MODEL_CONFIG) + config.torch_dtype = torch.bfloat16 # Explicitly set the torch_dtype in config + + # Create model with random weights and convert to bfloat16 + model = GemmaForCausalLM(config) + model = model.bfloat16() # Use .bfloat16() method instead of .to() + + # Debug: Check model dtype before saving + for name, param in model.named_parameters(): + print(f"Before save - {name}: {param.dtype}") + break # Just check the first parameter + + # Download and save tokenizer from a reference Gemma model + # We use the smallest available Gemma model for tokenizer artifacts + # First try to load from pre-mounted test data, then fall back to HuggingFace download + pre_downloaded_path = "/home/TestData/megatron_bridge/tokenizers/google/gemma-2b" + # Try loading from pre-downloaded location first + if Path(pre_downloaded_path).exists(): + print(f"Loading tokenizer from pre-downloaded path: {pre_downloaded_path}") + tokenizer = GemmaTokenizer.from_pretrained(pre_downloaded_path) + else: + # Fall back to downloading from HuggingFace + print("Pre-downloaded tokenizer not found, attempting to download from HuggingFace") + tokenizer = GemmaTokenizer.from_pretrained("google/gemma-2b") + tokenizer.save_pretrained(model_dir) + + # Save model and config to directory + model.save_pretrained(model_dir, safe_serialization=True) + + # Also save config.json explicitly to ensure compatibility with correct torch_dtype + config_to_save = HF_GEMMA_TOY_MODEL_CONFIG.copy() + config_path = model_dir / "config.json" + with open(config_path, "w") as f: + json.dump(config_to_save, f, indent=2) + + return 
str(model_dir) + + def test_toy_model_creation(self, gemma_toy_model_path): + """ + Test that the toy model is created correctly and can be loaded. + + Args: + gemma_toy_model_path: Path to the toy Gemma model (from fixture) + """ + # Verify the model directory exists + model_path = Path(gemma_toy_model_path) + assert model_path.exists(), f"Model directory not found at {model_path}" + + # Check essential files exist + config_file = model_path / "config.json" + assert config_file.exists(), f"config.json not found at {config_file}" + + # Check for model weights (safetensors preferred) + weights_file = model_path / "model.safetensors" + if not weights_file.exists(): + weights_file = model_path / "pytorch_model.bin" + assert weights_file.exists(), f"Model weights file not found in {model_path}" + + # Check for tokenizer files + tokenizer_config_file = model_path / "tokenizer_config.json" + assert tokenizer_config_file.exists(), f"tokenizer_config.json not found at {tokenizer_config_file}" + + # Load and verify config + with open(config_file) as f: + config_data = json.load(f) + + assert config_data["model_type"] == "gemma" + assert config_data["hidden_size"] == 1024 + assert config_data["num_hidden_layers"] == 2 + assert config_data["num_attention_heads"] == 8 + assert config_data["num_key_value_heads"] == 2 + assert config_data["vocab_size"] == 256000 + assert config_data["head_dim"] == 256 + + # Try loading the model to verify it's valid + try: + model = GemmaForCausalLM.from_pretrained( + gemma_toy_model_path, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=False, # Ensure full loading + ) + + # Try loading the tokenizer as well + try: + tokenizer = GemmaTokenizer.from_pretrained(gemma_toy_model_path) + print(f"Tokenizer loaded successfully with vocab_size: {tokenizer.vocab_size}") + except Exception as e: + print(f"Warning: Could not load tokenizer (this might be OK for conversion testing): {e}") + + # Verify model structure + assert hasattr(model, "model") + 
assert hasattr(model.model, "layers") + assert len(model.model.layers) == 2 # num_hidden_layers + + print(f"SUCCESS: Toy model created and validated at {gemma_toy_model_path}") + print("Model weights are correctly in bfloat16 format") + + except Exception as e: + assert False, f"Failed to load created toy model: {e}" + + @pytest.mark.run_only_on("GPU") + @pytest.mark.parametrize( + "tp,pp,test_name", + [ + (2, 1, "TP"), + (1, 2, "PP"), + ], + ) + def test_gemma_conversion_parallelism(self, gemma_toy_model_path, tmp_path, tp, pp, test_name): + """ + Test Gemma model conversion with different parallelism configurations. + + Args: + gemma_toy_model_path: Path to the toy Gemma model (from fixture) + tmp_path: Pytest temporary path fixture + tp: Tensor parallelism size + pp: Pipeline parallelism size + test_name: Name of the test for identification + """ + + # Create temporary output directory for conversion results + test_output_dir = tmp_path / f"gemma_{test_name}" + test_output_dir.mkdir(exist_ok=True) + + cmd = [ + "python", + "-m", + "torch.distributed.run", + "--nproc_per_node=2", + "--nnodes=1", + "-m", + "coverage", + "run", + "--data-file=/workspace/.coverage", + "--source=/workspace/", + "--parallel-mode", + "examples/conversion/hf_megatron_roundtrip_multi_gpu.py", + "--hf-model-id", + gemma_toy_model_path, + "--output-dir", + str(test_output_dir), + "--tp", + str(tp), + "--pp", + str(pp), + ] + + try: + result = subprocess.run( + cmd, capture_output=True, text=True, cwd=Path(__file__).parent.parent.parent.parent + ) + # Check that the conversion completed successfully + if result.returncode != 0: + print(f"STDOUT: {result.stdout}") + print(f"STDERR: {result.stderr}") + assert False, f"Gemma {test_name} conversion failed with return code {result.returncode}" + + # Verify that the converted model was saved + # The output directory should be named after the last part of the model path + model_name = Path(gemma_toy_model_path).name # "gemma_toy" + 
converted_model_dir = test_output_dir / model_name + assert converted_model_dir.exists(), f"Converted model directory not found at {converted_model_dir}" + + # Check that essential model files exist + config_file = converted_model_dir / "config.json" + assert config_file.exists(), f"config.json not found in converted model at {config_file}" + + # Check for model weights file (could be either safetensors or pytorch_model.bin) + weights_file_safetensors = converted_model_dir / "model.safetensors" + weights_file_pytorch = converted_model_dir / "pytorch_model.bin" + assert weights_file_safetensors.exists() or weights_file_pytorch.exists(), ( + f"Model weights file not found in converted model at {converted_model_dir}" + ) + + # Verify the config contains Gemma-specific parameters + with open(config_file) as f: + saved_config = json.load(f) + + assert saved_config["model_type"] == "gemma", "Model type should be gemma" + assert saved_config["hidden_size"] == 1024, "Hidden size should match toy config" + assert saved_config["num_attention_heads"] == 8, "Number of attention heads should match toy config" + assert saved_config["num_key_value_heads"] == 2, "Number of key-value heads should match toy config" + assert saved_config["head_dim"] == 256, "Head dimension should match toy config" + + print(f"SUCCESS: Gemma {test_name} conversion test completed successfully") + print(f"Converted model saved at: {converted_model_dir}") + + except Exception as e: + print(f"Error during Gemma {test_name} conversion test: {e}") + raise diff --git a/tests/functional_tests/models/test_gemma_provider.py b/tests/functional_tests/models/test_gemma_provider.py new file mode 100644 index 0000000000..db4811cc10 --- /dev/null +++ b/tests/functional_tests/models/test_gemma_provider.py @@ -0,0 +1,58 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pytest + +from megatron.bridge.models.conversion.auto_bridge import AutoBridge +from megatron.bridge.models.gemma import ( + CodeGemmaModelProvider2B, + CodeGemmaModelProvider7B, + GemmaModelProvider2B, + GemmaModelProvider7B, +) +from tests.functional_tests.utils import compare_provider_configs + + +HF_MODEL_ID_TO_BRIDGE_MODEL_PROVIDER = { + "google/gemma-2b": GemmaModelProvider2B, + "google/gemma-7b": GemmaModelProvider7B, + "google/codegemma-2b": CodeGemmaModelProvider2B, + "google/codegemma-7b": CodeGemmaModelProvider7B, +} + +ROOT_PATH: str = "/home/TestData/megatron_bridge/hf_home" + +HF_MODEL_ID_PATH_TO_MODEL_PROVIDER = { + os.path.join(ROOT_PATH, hf_model_id): provider_class + for hf_model_id, provider_class in HF_MODEL_ID_TO_BRIDGE_MODEL_PROVIDER.items() +} + + +class TestGemmaModelProviderMapping: + """Test that bridge provider configs are equivalent to predefined provider configs.""" + + @pytest.mark.parametrize("hf_model_id,provider_class", list(HF_MODEL_ID_PATH_TO_MODEL_PROVIDER.items())) + def test_bridge_vs_predefined_provider_config_equivalence(self, hf_model_id, provider_class): + """Test that bridge converted provider config matches predefined provider config.""" + # Create bridge from HF model + bridge = AutoBridge.from_hf_pretrained(hf_model_id) + converted_provider = bridge.to_megatron_provider(load_weights=False) + + # Create predefined provider + predefined_provider = 
provider_class() + + # Compare configs + compare_provider_configs(converted_provider, predefined_provider, hf_model_id) diff --git a/tests/functional_tests/recipes/test_llama_recipes.py b/tests/functional_tests/recipes/test_llama_recipes_pretrain.py similarity index 90% rename from tests/functional_tests/recipes/test_llama_recipes.py rename to tests/functional_tests/recipes/test_llama_recipes_pretrain.py index 74aec66b14..5d2b7d84fc 100644 --- a/tests/functional_tests/recipes/test_llama_recipes.py +++ b/tests/functional_tests/recipes/test_llama_recipes_pretrain.py @@ -16,8 +16,12 @@ import pytest -from megatron.bridge.recipes.llama.llama32_1b import pretrain_config as llama32_1b_config -from megatron.bridge.recipes.llama.llama32_3b import pretrain_config as llama32_3b_config +from megatron.bridge.recipes.llama import ( + llama32_1b_pretrain_config as llama32_1b_config, +) +from megatron.bridge.recipes.llama import ( + llama32_3b_pretrain_config as llama32_3b_config, +) from tests.functional_tests.recipes.utils import run_pretrain_config_override_test, run_pretrain_recipe_test diff --git a/tests/functional_tests/recipes/test_mamba_recipes.py b/tests/functional_tests/recipes/test_mamba_recipes_pretrain.py similarity index 100% rename from tests/functional_tests/recipes/test_mamba_recipes.py rename to tests/functional_tests/recipes/test_mamba_recipes_pretrain.py diff --git a/tests/functional_tests/recipes/test_qwen_recipes_pretrain.py b/tests/functional_tests/recipes/test_qwen_recipes_pretrain.py new file mode 100644 index 0000000000..72bf22e8f2 --- /dev/null +++ b/tests/functional_tests/recipes/test_qwen_recipes_pretrain.py @@ -0,0 +1,42 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional smoke tests for Qwen recipe configurations.""" + +import pytest + +from megatron.bridge.recipes.qwen import ( + qwen2_500m_pretrain_config as qwen2_500m_config, +) +from megatron.bridge.recipes.qwen import ( + qwen25_500m_pretrain_config as qwen25_500m_config, +) +from tests.functional_tests.recipes.utils import run_pretrain_recipe_test + + +QWEN_PRETRAIN_RECIPES = [ + # (config_func, name, parallelism_overrides) + (qwen2_500m_config, "qwen2_500m", {}), + (qwen25_500m_config, "qwen25_500m", {}), +] + + +class TestQwenRecipes: + """Test class for Qwen recipe functional tests.""" + + @pytest.mark.run_only_on("GPU") + @pytest.mark.parametrize("config_func,recipe_name,parallelism_overrides", QWEN_PRETRAIN_RECIPES) + def test_qwen_pretrain_recipes(self, config_func, recipe_name, parallelism_overrides, tmp_path): + """Functional test for Qwen recipes with appropriate parallelism configurations.""" + run_pretrain_recipe_test(config_func, recipe_name, tmp_path, **parallelism_overrides) diff --git a/tests/functional_tests/training/test_megatron_fsdp.py b/tests/functional_tests/training/test_megatron_fsdp.py index d3b741d9a8..209183ab59 100644 --- a/tests/functional_tests/training/test_megatron_fsdp.py +++ b/tests/functional_tests/training/test_megatron_fsdp.py @@ -364,8 +364,8 @@ def test_fsdp_pretrain_save_resume(self, tmp_path): torch.distributed.barrier() - # Verify FSDP DTensor checkpoint files from second run - verify_checkpoint_files(checkpoint_dir, checkpoint_iters, ckpt_format=cfg_second.checkpoint.ckpt_format) + # Verify FSDP DTensor 
checkpoint files from second run (should be at total_iters=20) + verify_checkpoint_files(checkpoint_dir, total_iters, ckpt_format=cfg_second.checkpoint.ckpt_format) finally: clear_directories(shared_base_dir) diff --git a/tests/functional_tests/utils.py b/tests/functional_tests/utils.py index 87cdcb964c..76a29d6c4c 100644 --- a/tests/functional_tests/utils.py +++ b/tests/functional_tests/utils.py @@ -18,6 +18,13 @@ import torch +from megatron.bridge.training.utils.checkpoint_utils import ( + TRACKER_PREFIX, + get_checkpoint_name, + get_checkpoint_tracker_filename, + get_checkpoint_train_state_filename, +) + def initialize_distributed() -> None: """Initialize global process group for distributed execution.""" @@ -107,10 +114,22 @@ def verify_checkpoint_files(checkpoint_dir: str, iteration_count: int, ckpt_form torch.distributed.barrier() if torch.distributed.get_rank() == 0: - latest_tracker_file = os.path.join(checkpoint_dir, "latest_train_state.pt") + # Verify Megatron-Bridge tracker file + latest_tracker_file = get_checkpoint_train_state_filename(checkpoint_dir, prefix=TRACKER_PREFIX) assert os.path.exists(latest_tracker_file), "Latest checkpoint tracker file not found" - final_iter_dir = os.path.join(checkpoint_dir, f"iter_{iteration_count:07d}") + # Verify Megatron-LM compatibility tracker file + megatron_lm_tracker = get_checkpoint_tracker_filename(checkpoint_dir) + assert os.path.exists(megatron_lm_tracker), "Megatron-LM tracker file not found" + + # Verify the tracker file contains the correct iteration + with open(megatron_lm_tracker, "r") as f: + saved_iteration = f.read().strip() + assert saved_iteration == str(iteration_count), ( + f"Megatron-LM tracker file contains '{saved_iteration}', expected '{iteration_count}'" + ) + + final_iter_dir = get_checkpoint_name(checkpoint_dir, iteration_count, release=False) assert os.path.exists(final_iter_dir), f"Final checkpoint directory not found at {final_iter_dir}" metadata_file = os.path.join(final_iter_dir, 
".metadata") diff --git a/tests/unit_tests/data/test_loaders.py b/tests/unit_tests/data/test_loaders.py index 11f4e578c4..aae8c3741c 100644 --- a/tests/unit_tests/data/test_loaders.py +++ b/tests/unit_tests/data/test_loaders.py @@ -22,7 +22,7 @@ get_blend_and_blend_per_split, ) from megatron.bridge.data.utils import get_dataset_provider -from megatron.bridge.recipes.llama.llama3_8b import pretrain_config +from megatron.bridge.recipes.llama.llama3 import llama3_8b_pretrain_config as pretrain_config from megatron.bridge.training.state import TrainState @@ -87,7 +87,17 @@ def test_build_train_valid_test_data_loaders( ): mock_get_data_parallel_rank.return_value = 0 mock_get_data_parallel_world_size.return_value = 1 - cfg = pretrain_config() + # Avoid HF download by mocking AutoBridge + with mock.patch("megatron.bridge.recipes.llama.llama3.AutoBridge.from_hf_pretrained") as mock_from: + + class _DummyBridge: + def to_megatron_provider(self, load_weights=False): + from megatron.bridge.models.llama.llama_provider import Llama3ModelProvider + + return Llama3ModelProvider() + + mock_from.return_value = _DummyBridge() + cfg = pretrain_config() cfg.train.train_iters = 1000 cfg.dataset.finalize() dataset_provider = get_dataset_provider(cfg.dataset) @@ -111,7 +121,17 @@ def test_build_train_valid_test_data_loaders_eval_iters_0( ): mock_get_data_parallel_rank.return_value = 0 mock_get_data_parallel_world_size.return_value = 1 - cfg = pretrain_config() + # Avoid HF download by mocking AutoBridge + with mock.patch("megatron.bridge.recipes.llama.llama3.AutoBridge.from_hf_pretrained") as mock_from: + + class _DummyBridge: + def to_megatron_provider(self, load_weights=False): + from megatron.bridge.models.llama.llama_provider import Llama3ModelProvider + + return Llama3ModelProvider() + + mock_from.return_value = _DummyBridge() + cfg = pretrain_config() cfg.train.train_iters = 1000 cfg.train.eval_iters = 0 cfg.dataset.finalize() diff --git a/tests/unit_tests/data/test_samplers.py 
b/tests/unit_tests/data/test_samplers.py index 7d3bce341c..324226b5d0 100644 --- a/tests/unit_tests/data/test_samplers.py +++ b/tests/unit_tests/data/test_samplers.py @@ -18,7 +18,7 @@ build_pretraining_data_loader, ) from megatron.bridge.data.utils import get_dataset_provider -from megatron.bridge.recipes.llama.llama3_8b import pretrain_config +from megatron.bridge.recipes.llama.llama3 import llama3_8b_pretrain_config as pretrain_config class TestDataSamplers: @@ -35,8 +35,19 @@ def test_build_pretraining_data_loader(self): assert dataloader == None def test_build_pretraining_data_loader_single(self): - # Setup dataloader params - cfg = pretrain_config() + # Setup dataloader params (mock AutoBridge to avoid HF downloads) + from unittest import mock as _mock + + with _mock.patch("megatron.bridge.recipes.llama.llama3.AutoBridge.from_hf_pretrained") as mock_from: + + class _DummyBridge: + def to_megatron_provider(self, load_weights=False): + from megatron.bridge.models.llama.llama_provider import Llama3ModelProvider + + return Llama3ModelProvider() + + mock_from.return_value = _DummyBridge() + cfg = pretrain_config() cfg.train.train_iters = 1000 cfg.dataset.finalize() dataset_provider = get_dataset_provider(cfg.dataset) @@ -67,8 +78,19 @@ def test_build_pretraining_data_loader_single(self): assert dataloader.num_workers == 0 def test_build_pretraining_data_loader_cyclic(self): - # Setup dataloader params - cfg = pretrain_config() + # Setup dataloader params (mock AutoBridge to avoid HF downloads) + from unittest import mock as _mock + + with _mock.patch("megatron.bridge.recipes.llama.llama3.AutoBridge.from_hf_pretrained") as mock_from: + + class _DummyBridge: + def to_megatron_provider(self, load_weights=False): + from megatron.bridge.models.llama.llama_provider import Llama3ModelProvider + + return Llama3ModelProvider() + + mock_from.return_value = _DummyBridge() + cfg = pretrain_config() cfg.train.train_iters = 1000 cfg.dataset.finalize() dataset_provider = 
get_dataset_provider(cfg.dataset) @@ -108,7 +130,19 @@ def test_build_pretraining_data_loader_cyclic(self): assert dataloader.num_workers == 0 def test_build_pretraining_data_loader_external(self): - cfg = pretrain_config() + # Mock AutoBridge to avoid HF downloads + from unittest import mock as _mock + + with _mock.patch("megatron.bridge.recipes.llama.llama3.AutoBridge.from_hf_pretrained") as mock_from: + + class _DummyBridge: + def to_megatron_provider(self, load_weights=False): + from megatron.bridge.models.llama.llama_provider import Llama3ModelProvider + + return Llama3ModelProvider() + + mock_from.return_value = _DummyBridge() + cfg = pretrain_config() cfg.train.train_iters = 1000 cfg.dataset.finalize() dataset_provider = get_dataset_provider(cfg.dataset) diff --git a/tests/unit_tests/recipes/llama/__init__.py b/tests/unit_tests/models/gemma/__init__.py similarity index 100% rename from tests/unit_tests/recipes/llama/__init__.py rename to tests/unit_tests/models/gemma/__init__.py diff --git a/tests/unit_tests/models/gemma/test_gemma2_bridge.py b/tests/unit_tests/models/gemma/test_gemma2_bridge.py new file mode 100644 index 0000000000..456a1c3377 --- /dev/null +++ b/tests/unit_tests/models/gemma/test_gemma2_bridge.py @@ -0,0 +1,667 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import tempfile +from pathlib import Path +from unittest.mock import Mock, patch + +import pytest +import torch +from transformers import Gemma2Config, Gemma2ForCausalLM, GenerationConfig + +from megatron.bridge.models import AutoBridge +from megatron.bridge.models.conversion.model_bridge import MegatronModelBridge +from megatron.bridge.models.gemma.gemma2_bridge import Gemma2Bridge +from megatron.bridge.models.gemma.gemma2_provider import Gemma2ModelProvider +from megatron.bridge.models.hf_pretrained.causal_lm import PreTrainedCausalLM + + +class TestMegatronGemma2Bridge: + """Test cases for MegatronGemma2Bridge class.""" + + @pytest.fixture + def gemma2_2b_config_dict(self): + """Create a sample Gemma2 2B configuration.""" + return { + "architectures": ["Gemma2ForCausalLM"], + "attention_bias": False, + "attention_dropout": 0.0, + "attn_logit_softcapping": 50.0, + "bos_token_id": 2, + "cache_implementation": "hybrid", + "eos_token_id": 1, + "final_logit_softcapping": 30.0, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": "gelu_pytorch_tanh", + "hidden_size": 2304, + "initializer_range": 0.02, + "intermediate_size": 9216, + "max_position_embeddings": 8192, + "model_type": "gemma2", + "num_attention_heads": 8, + "num_hidden_layers": 26, + "num_key_value_heads": 4, + "pad_token_id": 0, + "query_pre_attn_scalar": 256, + "rms_norm_eps": 1e-06, + "rope_theta": 10000.0, + "sliding_window": 4096, + "torch_dtype": "float32", + "transformers_version": "4.42.4", + "use_cache": True, + "vocab_size": 256000, + } + + @pytest.fixture + def gemma2_9b_config_dict(self): + """Create a sample Gemma2 9B configuration.""" + return { + "architectures": ["Gemma2ForCausalLM"], + "attention_bias": False, + "attention_dropout": 0.0, + "attn_logit_softcapping": 50.0, + "bos_token_id": 2, + "cache_implementation": "hybrid", + "eos_token_id": 1, + "final_logit_softcapping": 30.0, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": 
"gelu_pytorch_tanh", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "model_type": "gemma2", + "num_attention_heads": 16, + "num_hidden_layers": 42, + "num_key_value_heads": 8, + "pad_token_id": 0, + "query_pre_attn_scalar": 256, + "rms_norm_eps": 1e-06, + "rope_theta": 10000.0, + "sliding_window": 4096, + "sliding_window_size": 4096, + "torch_dtype": "float32", + "transformers_version": "4.42.0.dev0", + "use_cache": True, + "vocab_size": 256000, + } + + @pytest.fixture + def gemma2_27b_config_dict(self): + """Create a sample Gemma2 27B configuration.""" + return { + "architectures": ["Gemma2ForCausalLM"], + "attention_bias": False, + "attention_dropout": 0.0, + "attn_logit_softcapping": 50.0, + "bos_token_id": 2, + "cache_implementation": "hybrid", + "eos_token_id": 1, + "final_logit_softcapping": 30.0, + "head_dim": 128, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": "gelu_pytorch_tanh", + "hidden_size": 4608, + "initializer_range": 0.02, + "intermediate_size": 36864, + "max_position_embeddings": 8192, + "model_type": "gemma2", + "num_attention_heads": 32, + "num_hidden_layers": 46, + "num_key_value_heads": 16, + "pad_token_id": 0, + "query_pre_attn_scalar": 144, + "rms_norm_eps": 1e-06, + "rope_theta": 10000.0, + "sliding_window": 4096, + "sliding_window_size": 4096, + "torch_dtype": "float32", + "transformers_version": "4.42.0.dev0", + "use_cache": True, + "vocab_size": 256000, + "_attn_implementation": "eager", + } + + @pytest.fixture + def gemma2_2b_config(self, gemma2_2b_config_dict): + """Create a Gemma2Config instance for 2B model.""" + return Gemma2Config(**gemma2_2b_config_dict) + + @pytest.fixture + def gemma2_9b_config(self, gemma2_9b_config_dict): + """Create a Gemma2Config instance for 9B model.""" + return Gemma2Config(**gemma2_9b_config_dict) + + @pytest.fixture + def gemma2_27b_config(self, gemma2_27b_config_dict): + """Create a Gemma2Config instance for 27B 
model.""" + return Gemma2Config(**gemma2_27b_config_dict) + + @pytest.fixture + def mock_gemma2_2b_model(self, gemma2_2b_config): + """Create a mock Gemma2ForCausalLM 2B model.""" + mock_model = Mock(spec=Gemma2ForCausalLM) + mock_model.config = gemma2_2b_config + mock_model.dtype = torch.bfloat16 + return mock_model + + @pytest.fixture + def mock_gemma2_9b_model(self, gemma2_9b_config): + """Create a mock Gemma2ForCausalLM 9B model.""" + mock_model = Mock(spec=Gemma2ForCausalLM) + mock_model.config = gemma2_9b_config + mock_model.dtype = torch.bfloat16 + return mock_model + + @pytest.fixture + def mock_gemma2_27b_model(self, gemma2_27b_config): + """Create a mock Gemma2ForCausalLM 27B model.""" + mock_model = Mock(spec=Gemma2ForCausalLM) + mock_model.config = gemma2_27b_config + mock_model.dtype = torch.bfloat16 + return mock_model + + @pytest.fixture + def mock_pretrained_gemma2_2b(self, gemma2_2b_config): + """Create a mock PreTrainedCausalLM with Gemma2 2B model.""" + mock_pretrained = Mock(spec=PreTrainedCausalLM) + mock_pretrained.config = gemma2_2b_config + mock_pretrained.generation_config = Mock(spec=GenerationConfig) + mock_pretrained.model = Mock(spec=Gemma2ForCausalLM) + mock_pretrained.model.dtype = torch.bfloat16 + return mock_pretrained + + @pytest.fixture + def mock_pretrained_gemma2_9b(self, gemma2_9b_config): + """Create a mock PreTrainedCausalLM with Gemma2 9B model.""" + mock_pretrained = Mock(spec=PreTrainedCausalLM) + mock_pretrained.config = gemma2_9b_config + mock_pretrained.generation_config = Mock(spec=GenerationConfig) + mock_pretrained.model = Mock(spec=Gemma2ForCausalLM) + mock_pretrained.model.dtype = torch.bfloat16 + return mock_pretrained + + @pytest.fixture + def mock_pretrained_gemma2_27b(self, gemma2_27b_config): + """Create a mock PreTrainedCausalLM with Gemma2 27B model.""" + mock_pretrained = Mock(spec=PreTrainedCausalLM) + mock_pretrained.config = gemma2_27b_config + mock_pretrained.generation_config = 
Mock(spec=GenerationConfig) + mock_pretrained.model = Mock(spec=Gemma2ForCausalLM) + mock_pretrained.model.dtype = torch.bfloat16 + return mock_pretrained + + def test_bridge_registration(self): + """Test that MegatronGemma2Bridge is properly registered.""" + # The @MegatronModelBridge.register_bridge decorator should register the bridge + # Check that the class exists and has the expected base class + assert issubclass(Gemma2Bridge, MegatronModelBridge) + + def test_provider_bridge_basic_2b(self, mock_pretrained_gemma2_2b, gemma2_2b_config): + """Test basic provider_bridge functionality for Gemma2 2B.""" + bridge = Gemma2Bridge() + + # Call provider_bridge + result = bridge.provider_bridge(mock_pretrained_gemma2_2b) + + # Check that it returns a Gemma2ModelProvider instance + assert isinstance(result, Gemma2ModelProvider) + + # Check basic configuration mapping + assert result.num_layers == gemma2_2b_config.num_hidden_layers + assert result.hidden_size == gemma2_2b_config.hidden_size + assert result.num_attention_heads == gemma2_2b_config.num_attention_heads + assert result.seq_length == gemma2_2b_config.max_position_embeddings + assert result.rotary_base == gemma2_2b_config.rope_theta + + def test_provider_bridge_basic_9b(self, mock_pretrained_gemma2_9b, gemma2_9b_config): + """Test basic provider_bridge functionality for Gemma2 9B.""" + bridge = Gemma2Bridge() + + # Call provider_bridge + result = bridge.provider_bridge(mock_pretrained_gemma2_9b) + + # Check that it returns a Gemma2ModelProvider instance + assert isinstance(result, Gemma2ModelProvider) + + # Check basic configuration mapping + assert result.num_layers == gemma2_9b_config.num_hidden_layers + assert result.hidden_size == gemma2_9b_config.hidden_size + assert result.num_attention_heads == gemma2_9b_config.num_attention_heads + assert result.seq_length == gemma2_9b_config.max_position_embeddings + assert result.rotary_base == gemma2_9b_config.rope_theta + + def test_provider_bridge_basic_27b(self, 
mock_pretrained_gemma2_27b, gemma2_27b_config): + """Test basic provider_bridge functionality for Gemma2 27B.""" + bridge = Gemma2Bridge() + + # Call provider_bridge + result = bridge.provider_bridge(mock_pretrained_gemma2_27b) + + # Check that it returns a Gemma2ModelProvider instance + assert isinstance(result, Gemma2ModelProvider) + + # Check basic configuration mapping + assert result.num_layers == gemma2_27b_config.num_hidden_layers + assert result.hidden_size == gemma2_27b_config.hidden_size + assert result.num_attention_heads == gemma2_27b_config.num_attention_heads + assert result.seq_length == gemma2_27b_config.max_position_embeddings + assert result.rotary_base == gemma2_27b_config.rope_theta + + def test_provider_bridge_vocabulary(self, mock_pretrained_gemma2_2b, gemma2_2b_config): + """Test vocabulary size mapping.""" + bridge = Gemma2Bridge() + + result = bridge.provider_bridge(mock_pretrained_gemma2_2b) + + # Check vocabulary configuration + assert result.vocab_size == gemma2_2b_config.vocab_size + # Gemma2 uses tied embeddings by default + assert result.share_embeddings_and_output_weights == True + + def test_provider_bridge_attention_config(self, mock_pretrained_gemma2_2b, gemma2_2b_config): + """Test attention configuration mapping.""" + bridge = Gemma2Bridge() + + result = bridge.provider_bridge(mock_pretrained_gemma2_2b) + + # Check attention configuration + assert result.num_attention_heads == gemma2_2b_config.num_attention_heads + assert result.num_query_groups == gemma2_2b_config.num_key_value_heads + + def test_provider_bridge_mlp_config(self, mock_pretrained_gemma2_2b, gemma2_2b_config): + """Test MLP configuration mapping.""" + bridge = Gemma2Bridge() + + result = bridge.provider_bridge(mock_pretrained_gemma2_2b) + + # Check MLP configuration + assert result.ffn_hidden_size == gemma2_2b_config.intermediate_size + assert result.gated_linear_unit == True # Gemma2 uses gated MLP + + def test_provider_bridge_normalization(self, 
mock_pretrained_gemma2_2b, gemma2_2b_config): + """Test normalization configuration.""" + bridge = Gemma2Bridge() + + result = bridge.provider_bridge(mock_pretrained_gemma2_2b) + + # Check normalization settings + assert result.layernorm_epsilon == gemma2_2b_config.rms_norm_eps + + def test_provider_bridge_position_embedding(self, mock_pretrained_gemma2_2b, gemma2_2b_config): + """Test position embedding configuration.""" + bridge = Gemma2Bridge() + + result = bridge.provider_bridge(mock_pretrained_gemma2_2b) + + # Check position embedding + assert result.rotary_base == gemma2_2b_config.rope_theta + + def test_provider_bridge_gemma2_specific_features(self, mock_pretrained_gemma2_2b, gemma2_2b_config): + """Test Gemma2-specific features.""" + bridge = Gemma2Bridge() + + result = bridge.provider_bridge(mock_pretrained_gemma2_2b) + + # Check Gemma2-specific features + assert result.query_pre_attn_scalar == gemma2_2b_config.query_pre_attn_scalar + assert result.attn_logit_softcapping == gemma2_2b_config.attn_logit_softcapping + assert result.final_logit_softcapping == gemma2_2b_config.final_logit_softcapping + assert result.window_size == (gemma2_2b_config.sliding_window, 0) + assert result.add_bias_linear == False # Gemma2 doesn't use bias in linear layers + assert result.layernorm_zero_centered_gamma == True # Gemma2-specific RMSNorm behavior + + def test_provider_bridge_head_dim_calculation_2b(self, mock_pretrained_gemma2_2b, gemma2_2b_config): + """Test head dimension calculation for Gemma2 2B.""" + bridge = Gemma2Bridge() + + result = bridge.provider_bridge(mock_pretrained_gemma2_2b) + + # Gemma2 2B should use the explicit head_dim from config + assert result.kv_channels == gemma2_2b_config.head_dim # 256 + # Verify this matches the HF config + assert result.kv_channels == 256 + + def test_provider_bridge_head_dim_calculation_9b(self, mock_pretrained_gemma2_9b, gemma2_9b_config): + """Test head dimension calculation for Gemma2 9B.""" + bridge = Gemma2Bridge() + + 
result = bridge.provider_bridge(mock_pretrained_gemma2_9b) + + # Gemma2 9B should use the explicit head_dim from config + assert result.kv_channels == gemma2_9b_config.head_dim # 256 + # Verify this is different from standard calculation + standard_calculation = gemma2_9b_config.hidden_size // gemma2_9b_config.num_attention_heads # 3584 / 16 = 224 + assert result.kv_channels != standard_calculation + assert result.kv_channels == 256 + + def test_provider_bridge_head_dim_calculation_27b(self, mock_pretrained_gemma2_27b, gemma2_27b_config): + """Test head dimension calculation for Gemma2 27B - this is where NeMo has a bug.""" + bridge = Gemma2Bridge() + + result = bridge.provider_bridge(mock_pretrained_gemma2_27b) + + # Gemma2 27B should use the explicit head_dim from config + assert result.kv_channels == gemma2_27b_config.head_dim # 128 + # Verify this is different from both standard calculation and NeMo default + standard_calculation = ( + gemma2_27b_config.hidden_size // gemma2_27b_config.num_attention_heads + ) # 4608 / 32 = 144 + nemo_default = 256 # What NeMo incorrectly uses + assert result.kv_channels != standard_calculation + assert result.kv_channels != nemo_default + assert result.kv_channels == 128 # Correct value from HF config + + def test_provider_bridge_dtype_handling(self, gemma2_2b_config): + """Test dtype handling in provider_bridge.""" + # Create model with specific dtype - set it in the config + mock_pretrained = Mock(spec=PreTrainedCausalLM) + mock_pretrained.config = gemma2_2b_config + mock_pretrained.config.torch_dtype = torch.bfloat16 # Set config dtype to bfloat16 + mock_pretrained.model = Mock(spec=Gemma2ForCausalLM) + mock_pretrained.generation_config = Mock(spec=GenerationConfig) + + bridge = Gemma2Bridge() + result = bridge.provider_bridge(mock_pretrained) + + # The provider should respect the config's dtype + assert result.params_dtype == torch.bfloat16 + assert result.bf16 == True + assert result.fp16 == False + + def 
test_provider_bridge_fp16_dtype_handling(self, gemma2_2b_config): + """Test FP16 dtype handling in provider_bridge.""" + # Create model with FP16 dtype - set it in the config + mock_pretrained = Mock(spec=PreTrainedCausalLM) + mock_pretrained.config = gemma2_2b_config + mock_pretrained.config.torch_dtype = torch.float16 # Set config dtype to fp16 + mock_pretrained.model = Mock(spec=Gemma2ForCausalLM) + mock_pretrained.generation_config = Mock(spec=GenerationConfig) + + bridge = Gemma2Bridge() + result = bridge.provider_bridge(mock_pretrained) + + # The provider should respect the config's dtype + assert result.params_dtype == torch.float16 + assert result.fp16 == True + assert result.bf16 == False + + def test_provider_bridge_sliding_window_config(self, mock_pretrained_gemma2_2b, gemma2_2b_config): + """Test sliding window configuration.""" + bridge = Gemma2Bridge() + + result = bridge.provider_bridge(mock_pretrained_gemma2_2b) + + # Check sliding window configuration specific to Gemma2 + assert result.window_size == (gemma2_2b_config.sliding_window, 0) + assert result.window_size == (4096, 0) + + def test_provider_bridge_query_pre_attn_scalar_variants(self, mock_pretrained_gemma2_27b, gemma2_27b_config): + """Test query_pre_attn_scalar for 27B model which has different value.""" + bridge = Gemma2Bridge() + + result = bridge.provider_bridge(mock_pretrained_gemma2_27b) + + # 27B model has different query_pre_attn_scalar + assert result.query_pre_attn_scalar == gemma2_27b_config.query_pre_attn_scalar + assert result.query_pre_attn_scalar == 144 # Different from 2B/9B which use 256 + + def test_mapping_registry_implementation(self, mock_pretrained_gemma2_2b): + """Test that mapping_registry returns a proper MegatronMappingRegistry.""" + bridge = Gemma2Bridge() + + # Get the mapping registry + mapping_registry = bridge.mapping_registry() + + # Check it's not None + assert mapping_registry is not None + # Check it has param mappings (they are passed as args to __init__) 
+ # The mapping registry should have embedding, layer norm, attention, and MLP mappings + + def test_provider_bridge_make_vocab_size_divisible_by(self, mock_pretrained_gemma2_2b): + """Test make_vocab_size_divisible_by calculation.""" + bridge = Gemma2Bridge() + + result = bridge.provider_bridge(mock_pretrained_gemma2_2b) + + # The method should calculate a reasonable divisor based on vocab size + assert hasattr(result, "make_vocab_size_divisible_by") + assert result.make_vocab_size_divisible_by > 0 + + def test_provider_bridge_generation_config(self, mock_pretrained_gemma2_2b): + """Test that generation config is passed through.""" + bridge = Gemma2Bridge() + + result = bridge.provider_bridge(mock_pretrained_gemma2_2b) + + # Generation config should be passed from the pretrained model + assert result.generation_config == mock_pretrained_gemma2_2b.generation_config + + +class TestAutoBridgeIntegration: + """Integration tests for AutoBridge with Gemma2 models.""" + + @pytest.fixture + def gemma2_configs(self): + """Different Gemma2 model configurations for testing.""" + return { + "gemma2-2b": { + "architectures": ["Gemma2ForCausalLM"], + "model_type": "gemma2", + "hidden_size": 2304, + "num_hidden_layers": 26, + "num_attention_heads": 8, + "num_key_value_heads": 4, + "intermediate_size": 9216, + "vocab_size": 256000, + "max_position_embeddings": 8192, + "rope_theta": 10000.0, + "rms_norm_eps": 1e-06, + "head_dim": 256, + "attention_bias": False, + "torch_dtype": "bfloat16", + "query_pre_attn_scalar": 256, + "attn_logit_softcapping": 50.0, + "final_logit_softcapping": 30.0, + "sliding_window": 4096, + }, + "gemma2-9b": { + "architectures": ["Gemma2ForCausalLM"], + "model_type": "gemma2", + "hidden_size": 3584, + "num_hidden_layers": 42, + "num_attention_heads": 16, + "num_key_value_heads": 8, + "intermediate_size": 14336, + "vocab_size": 256000, + "max_position_embeddings": 8192, + "rope_theta": 10000.0, + "rms_norm_eps": 1e-06, + "head_dim": 256, + 
"attention_bias": False, + "torch_dtype": "bfloat16", + "query_pre_attn_scalar": 256, + "attn_logit_softcapping": 50.0, + "final_logit_softcapping": 30.0, + "sliding_window": 4096, + }, + "gemma2-27b": { + "architectures": ["Gemma2ForCausalLM"], + "model_type": "gemma2", + "hidden_size": 4608, + "num_hidden_layers": 46, + "num_attention_heads": 32, + "num_key_value_heads": 16, + "intermediate_size": 36864, + "vocab_size": 256000, + "max_position_embeddings": 8192, + "rope_theta": 10000.0, + "rms_norm_eps": 1e-06, + "head_dim": 128, + "attention_bias": False, + "torch_dtype": "bfloat16", + "query_pre_attn_scalar": 144, + "attn_logit_softcapping": 50.0, + "final_logit_softcapping": 30.0, + "sliding_window": 4096, + }, + } + + def create_mock_model_files(self, config_dict, save_dir): + """Create mock model files in a directory.""" + import json + + # Save config + config_path = Path(save_dir) / "config.json" + with open(config_path, "w") as f: + json.dump(config_dict, f, indent=2) + + # Create a dummy safetensors index file + index_path = Path(save_dir) / "model.safetensors.index.json" + index_data = { + "metadata": {"total_size": 1000000}, + "weight_map": { + "model.embed_tokens.weight": "model-00001-of-00001.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", + }, + } + with open(index_path, "w") as f: + json.dump(index_data, f, indent=2) + + # Create tokenizer files + tokenizer_config = { + "tokenizer_class": "GemmaTokenizer", + "model_max_length": config_dict["max_position_embeddings"], + } + tokenizer_path = Path(save_dir) / "tokenizer_config.json" + with open(tokenizer_path, "w") as f: + json.dump(tokenizer_config, f, indent=2) + + # Create dummy tokenizer.json + tokenizer_json_path = Path(save_dir) / "tokenizer.json" + tokenizer_data = { + "version": "1.0", + "model": {"type": "BPE"}, + } + with open(tokenizer_json_path, "w") as f: + json.dump(tokenizer_data, f, indent=2) + + 
@patch("megatron.bridge.models.conversion.auto_bridge.PreTrainedCausalLM.from_pretrained") + @patch("megatron.bridge.models.hf_pretrained.safe_config_loader.AutoConfig.from_pretrained") + def test_from_pretrained_with_temp_dir(self, mock_autoconfig, mock_pretrained, gemma2_configs): + """Test AutoBridge.from_hf_pretrained with temporary directory.""" + with tempfile.TemporaryDirectory() as temp_dir: + # Test with Gemma2 2B config + config_dict = gemma2_configs["gemma2-2b"] + self.create_mock_model_files(config_dict, temp_dir) + + # Mock the config loading + config = Gemma2Config(**config_dict) + mock_autoconfig.return_value = config + + # Mock the pretrained model + mock_model = Mock(spec=PreTrainedCausalLM) + mock_model.config = config + mock_model.model_name_or_path = temp_dir + mock_pretrained.return_value = mock_model + + # Create bridge from the temp directory + bridge = AutoBridge.from_hf_pretrained(temp_dir) + + # Verify + assert isinstance(bridge, AutoBridge) + assert bridge.hf_pretrained == mock_model + mock_autoconfig.assert_called_once_with(temp_dir, trust_remote_code=False) + mock_pretrained.assert_called_once_with(temp_dir) + + def test_supports_gemma2_architectures(self, gemma2_configs): + """Test that AutoBridge.supports correctly identifies Gemma2 models.""" + for model_name, config_dict in gemma2_configs.items(): + config = Gemma2Config(**config_dict) + assert AutoBridge.supports(config) == True + + # Test non-causal LM architecture + non_causal_config = Mock() + non_causal_config.architectures = ["Gemma2Model"] # Not ForCausalLM + assert AutoBridge.supports(non_causal_config) == False + + +class TestGemma2BridgeParameterMapping: + """Test parameter mapping functionality in Gemma2Bridge.""" + + @pytest.fixture + def mock_gemma2_state_dict(self): + """Create a mock state dict with Gemma2 parameter names.""" + return { + "model.embed_tokens.weight": torch.randn(256000, 2304), + "model.norm.weight": torch.randn(2304), + 
"model.layers.0.input_layernorm.weight": torch.randn(2304), + "model.layers.0.pre_feedforward_layernorm.weight": torch.randn(2304), + "model.layers.0.post_feedforward_layernorm.weight": torch.randn(2304), + "model.layers.0.post_attention_layernorm.weight": torch.randn(2304), + "model.layers.0.self_attn.q_proj.weight": torch.randn(2304, 2304), + "model.layers.0.self_attn.k_proj.weight": torch.randn(1024, 2304), # GQA: different size for K + "model.layers.0.self_attn.v_proj.weight": torch.randn(1024, 2304), # GQA: different size for V + "model.layers.0.self_attn.o_proj.weight": torch.randn(2304, 2304), + "model.layers.0.mlp.gate_proj.weight": torch.randn(9216, 2304), + "model.layers.0.mlp.up_proj.weight": torch.randn(9216, 2304), + "model.layers.0.mlp.down_proj.weight": torch.randn(2304, 9216), + } + + def test_mapping_registry_has_gemma2_specific_mappings(self): + """Test that mapping registry includes Gemma2-specific mappings.""" + bridge = Gemma2Bridge() + mapping_registry = bridge.mapping_registry() + + # This test verifies that the mapping registry was created + # The actual parameter mappings are tested in integration tests + assert mapping_registry is not None + + def test_gemma2_tied_embeddings_mapping(self): + """Test that Gemma2 bridge handles tied embeddings correctly.""" + bridge = Gemma2Bridge() + mapping_registry = bridge.mapping_registry() + + # Gemma2 uses tied embeddings, so there should be no separate lm_head.weight mapping + # This is reflected in the mapping registry not including lm_head.weight + assert mapping_registry is not None + + def test_gemma2_no_bias_mapping(self): + """Test that Gemma2 bridge doesn't include bias mappings.""" + bridge = Gemma2Bridge() + mapping_registry = bridge.mapping_registry() + + # Gemma2 doesn't have bias in linear layers + # This is reflected in the QKVMapping and other mappings not including bias terms + assert mapping_registry is not None + + def test_gemma2_gated_mlp_mapping(self): + """Test that Gemma2 bridge 
includes gated MLP mappings.""" + bridge = Gemma2Bridge() + mapping_registry = bridge.mapping_registry() + + # Gemma2 uses gated MLP, so it should have GatedMLPMapping + # This combines gate_proj and up_proj into linear_fc1 + assert mapping_registry is not None + + def test_gemma2_additional_layer_norms_mapping(self): + """Test that Gemma2 bridge includes additional layer norm mappings.""" + bridge = Gemma2Bridge() + mapping_registry = bridge.mapping_registry() + + # Gemma2 has additional layer normalizations compared to original Gemma + # pre_feedforward_layernorm, post_feedforward_layernorm, post_attention_layernorm + assert mapping_registry is not None diff --git a/tests/unit_tests/models/gemma/test_gemma2_provider.py b/tests/unit_tests/models/gemma/test_gemma2_provider.py new file mode 100644 index 0000000000..ad9886ccd5 --- /dev/null +++ b/tests/unit_tests/models/gemma/test_gemma2_provider.py @@ -0,0 +1,258 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from unittest.mock import Mock, patch + +from megatron.core.activations import fast_gelu + +from megatron.bridge.models.gemma.gemma2_provider import ( + Gemma2ModelProvider, + Gemma2ModelProvider2B, + Gemma2ModelProvider9B, + Gemma2ModelProvider27B, +) + + +class TestGemma2ModelProvider: + """Test cases for base Gemma2ModelProvider class.""" + + def test_gemma2_model_provider_initialization(self): + """Test Gemma2ModelProvider can be initialized with default values.""" + provider = Gemma2ModelProvider( + num_layers=26, + hidden_size=2304, + num_attention_heads=8, + ) + + # Check required transformer config fields + assert provider.num_layers == 26 + assert provider.hidden_size == 2304 + assert provider.num_attention_heads == 8 + + # Check Gemma2-specific defaults + assert provider.normalization == "RMSNorm" + assert provider.activation_func == fast_gelu + assert provider.gated_linear_unit is True + assert provider.position_embedding_type == "rope" + assert provider.add_bias_linear is False + assert provider.seq_length == 8192 + assert provider.kv_channels == 256 + assert provider.attention_dropout == 0.0 + assert provider.hidden_dropout == 0.0 + assert provider.share_embeddings_and_output_weights is True + assert provider.layernorm_zero_centered_gamma is True + + # Check Gemma2-specific parameters + assert provider.layernorm_epsilon == 1e-6 + assert provider.rotary_base == 10000 + assert provider.window_size == (4096, 0) + assert provider.vocab_size == 256000 + assert provider.gradient_accumulation_fusion is False + assert provider.query_pre_attn_scalar == 224 + assert provider.attn_logit_softcapping == 50.0 + assert provider.final_logit_softcapping == 30.0 + + @patch("megatron.bridge.models.gemma.gemma2_provider.parallel_state") + @patch("megatron.bridge.models.gemma.gemma2_provider.extend_instance") + def test_gemma2_provider_provide_with_embedding_scaling(self, mock_extend_instance, mock_parallel_state): + """Test that provide method applies embedding scaling 
when appropriate.""" + # Mock the parent provide method + mock_model = Mock() + mock_model.embedding = Mock() + + provider = Gemma2ModelProvider( + num_layers=26, + hidden_size=2304, + num_attention_heads=8, + ) + + with patch.object(provider.__class__.__bases__[0], "provide", return_value=mock_model): + # Mock both pipeline stages + mock_parallel_state.is_pipeline_first_stage.return_value = True + mock_parallel_state.is_pipeline_last_stage.return_value = False + + result = provider.provide(vp_stage=0) + + # Verify that parent provide was called + assert result == mock_model + + # Verify that is_pipeline_first_stage was called with correct parameters + mock_parallel_state.is_pipeline_first_stage.assert_called_once_with( + ignore_virtual=False, + vp_stage=0, + ) + + # Verify that extend_instance was called for embedding scaling + assert mock_extend_instance.call_count == 1 + args = mock_extend_instance.call_args_list[0][0] + assert args[0] == mock_model.embedding + + @patch("megatron.bridge.models.gemma.gemma2_provider.parallel_state") + @patch("megatron.bridge.models.gemma.gemma2_provider.extend_instance") + def test_gemma2_provider_provide_with_output_layer_scaling(self, mock_extend_instance, mock_parallel_state): + """Test that provide method applies output layer modifications when appropriate.""" + # Mock the parent provide method + mock_model = Mock() + mock_model.embedding = Mock() + mock_model.output_layer = Mock() + + provider = Gemma2ModelProvider( + num_layers=26, + hidden_size=2304, + num_attention_heads=8, + ) + + with patch.object(provider.__class__.__bases__[0], "provide", return_value=mock_model): + # Mock both pipeline stages + mock_parallel_state.is_pipeline_first_stage.return_value = False + mock_parallel_state.is_pipeline_last_stage.return_value = True + + result = provider.provide(vp_stage=1) + + # Verify that parent provide was called + assert result == mock_model + + # Verify that is_pipeline_last_stage was called with correct parameters + 
mock_parallel_state.is_pipeline_last_stage.assert_called_once_with( + ignore_virtual=False, + vp_stage=1, + ) + + # Verify that extend_instance was called for output layer modifications + assert mock_extend_instance.call_count == 1 + args = mock_extend_instance.call_args_list[0][0] + assert args[0] == mock_model.output_layer + + @patch("megatron.bridge.models.gemma.gemma2_provider.parallel_state") + @patch("megatron.bridge.models.gemma.gemma2_provider.extend_instance") + def test_gemma2_provider_provide_both_stages(self, mock_extend_instance, mock_parallel_state): + """Test provide method when model is both first and last stage.""" + mock_model = Mock() + mock_model.embedding = Mock() + mock_model.output_layer = Mock() + + provider = Gemma2ModelProvider( + num_layers=26, + hidden_size=2304, + num_attention_heads=8, + ) + + with patch.object(provider.__class__.__bases__[0], "provide", return_value=mock_model): + # Mock both pipeline stages as True (single stage setup) + mock_parallel_state.is_pipeline_first_stage.return_value = True + mock_parallel_state.is_pipeline_last_stage.return_value = True + + result = provider.provide(vp_stage=0) + + # Verify that parent provide was called + assert result == mock_model + + # Both should be called + mock_parallel_state.is_pipeline_first_stage.assert_called_once() + mock_parallel_state.is_pipeline_last_stage.assert_called_once() + + # Verify that extend_instance was called twice (embedding + output layer) + assert mock_extend_instance.call_count == 2 + + +class TestGemma2ModelProvider2B: + """Test cases for Gemma2ModelProvider2B class.""" + + def test_gemma2_2b_configuration(self): + """Test that Gemma2ModelProvider2B has correct configuration values.""" + provider = Gemma2ModelProvider2B() + + # Test 2B specific values + assert provider.num_layers == 26 + assert provider.hidden_size == 2304 + assert provider.num_attention_heads == 8 + assert provider.num_query_groups == 4 + assert provider.ffn_hidden_size == 9216 + assert 
provider.query_pre_attn_scalar == 256 + + # Test inherited Gemma2 defaults + assert provider.normalization == "RMSNorm" + assert provider.activation_func == fast_gelu + assert provider.gated_linear_unit is True + assert provider.window_size == (4096, 0) + assert provider.attn_logit_softcapping == 50.0 + assert provider.final_logit_softcapping == 30.0 + + def test_gemma2_2b_inheritance(self): + """Test that Gemma2ModelProvider2B properly inherits from Gemma2ModelProvider.""" + provider = Gemma2ModelProvider2B() + assert isinstance(provider, Gemma2ModelProvider) + + +class TestGemma2ModelProvider9B: + """Test cases for Gemma2ModelProvider9B class.""" + + def test_gemma2_9b_configuration(self): + """Test that Gemma2ModelProvider9B has correct configuration values.""" + provider = Gemma2ModelProvider9B() + + # Test 9B specific values + assert provider.num_layers == 42 + assert provider.hidden_size == 3584 + assert provider.num_attention_heads == 16 + assert provider.num_query_groups == 8 + assert provider.ffn_hidden_size == 14336 + assert provider.query_pre_attn_scalar == 256 + + # Test inherited Gemma2 defaults + assert provider.normalization == "RMSNorm" + assert provider.activation_func == fast_gelu + assert provider.gated_linear_unit is True + + def test_gemma2_9b_inheritance(self): + """Test that Gemma2ModelProvider9B properly inherits from Gemma2ModelProvider.""" + provider = Gemma2ModelProvider9B() + assert isinstance(provider, Gemma2ModelProvider) + + +class TestGemma2ModelProvider27B: + """Test cases for Gemma2ModelProvider27B class.""" + + def test_gemma2_27b_configuration(self): + """Test that Gemma2ModelProvider27B has correct configuration values.""" + provider = Gemma2ModelProvider27B() + + # Test 27B specific values + assert provider.num_layers == 46 + assert provider.hidden_size == 4608 + assert provider.num_attention_heads == 32 + assert provider.num_query_groups == 16 + assert provider.ffn_hidden_size == 36864 + assert provider.query_pre_attn_scalar 
== 144 + + def test_gemma2_27b_inheritance(self): + """Test that Gemma2ModelProvider27B properly inherits from Gemma2ModelProvider.""" + provider = Gemma2ModelProvider27B() + assert isinstance(provider, Gemma2ModelProvider) + + +class TestGemma2ModelProviderIntegration: + """Integration tests for Gemma2 model providers.""" + + def test_all_providers_have_provide_method(self): + """Test that all provider classes have the provide method.""" + providers = [ + Gemma2ModelProvider2B(), + Gemma2ModelProvider9B(), + Gemma2ModelProvider27B(), + ] + + for provider in providers: + assert hasattr(provider, "provide") + assert callable(getattr(provider, "provide")) diff --git a/tests/unit_tests/models/gemma/test_gemma_bridge.py b/tests/unit_tests/models/gemma/test_gemma_bridge.py new file mode 100644 index 0000000000..7c6a4c80ff --- /dev/null +++ b/tests/unit_tests/models/gemma/test_gemma_bridge.py @@ -0,0 +1,610 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import tempfile +from pathlib import Path +from unittest.mock import Mock, patch + +import pytest +import torch +from transformers import GemmaConfig, GemmaForCausalLM, GenerationConfig + +from megatron.bridge.models import AutoBridge +from megatron.bridge.models.conversion.model_bridge import MegatronModelBridge +from megatron.bridge.models.gemma.gemma_bridge import GemmaBridge +from megatron.bridge.models.gemma.gemma_provider import GemmaModelProvider +from megatron.bridge.models.hf_pretrained.causal_lm import PreTrainedCausalLM + + +class TestMegatronGemmaBridge: + """Test cases for MegatronGemmaBridge class.""" + + @pytest.fixture + def gemma_2b_config_dict(self): + """Create a sample Gemma 2B configuration.""" + return { + "architectures": ["GemmaForCausalLM"], + "attention_bias": False, + "attention_dropout": 0.0, + "bos_token_id": 2, + "eos_token_id": 1, + "head_dim": 256, + "hidden_act": "gelu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 16384, + "max_position_embeddings": 8192, + "model_type": "gemma", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "pad_token_id": 0, + "rms_norm_eps": 1e-06, + "rope_scaling": None, + "rope_theta": 10000.0, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.0.dev0", + "use_cache": True, + "vocab_size": 256000, + } + + @pytest.fixture + def gemma_7b_config_dict(self): + """Create a sample Gemma 7B configuration.""" + return { + "architectures": ["GemmaForCausalLM"], + "attention_bias": False, + "attention_dropout": 0.0, + "bos_token_id": 2, + "eos_token_id": 1, + "head_dim": 256, + "hidden_act": "gelu", + "hidden_size": 3072, + "initializer_range": 0.02, + "intermediate_size": 24576, + "max_position_embeddings": 8192, + "model_type": "gemma", + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 16, + "pad_token_id": 0, + "rms_norm_eps": 1e-06, + "rope_scaling": None, + "rope_theta": 10000.0, + "torch_dtype": 
"bfloat16", + "transformers_version": "4.38.0.dev0", + "use_cache": True, + "vocab_size": 256000, + } + + @pytest.fixture + def gemma_2b_config(self, gemma_2b_config_dict): + """Create a GemmaConfig instance for 2B model.""" + return GemmaConfig(**gemma_2b_config_dict) + + @pytest.fixture + def gemma_7b_config(self, gemma_7b_config_dict): + """Create a GemmaConfig instance for 7B model.""" + return GemmaConfig(**gemma_7b_config_dict) + + @pytest.fixture + def mock_gemma_2b_model(self, gemma_2b_config): + """Create a mock GemmaForCausalLM 2B model.""" + mock_model = Mock(spec=GemmaForCausalLM) + mock_model.config = gemma_2b_config + mock_model.dtype = torch.bfloat16 + return mock_model + + @pytest.fixture + def mock_gemma_7b_model(self, gemma_7b_config): + """Create a mock GemmaForCausalLM 7B model.""" + mock_model = Mock(spec=GemmaForCausalLM) + mock_model.config = gemma_7b_config + mock_model.dtype = torch.bfloat16 + return mock_model + + @pytest.fixture + def mock_pretrained_gemma_2b(self, gemma_2b_config): + """Create a mock PreTrainedCausalLM with Gemma 2B model.""" + mock_pretrained = Mock(spec=PreTrainedCausalLM) + mock_pretrained.config = gemma_2b_config + mock_pretrained.generation_config = Mock(spec=GenerationConfig) + mock_pretrained.model = Mock(spec=GemmaForCausalLM) + mock_pretrained.model.dtype = torch.bfloat16 + return mock_pretrained + + @pytest.fixture + def mock_pretrained_gemma_7b(self, gemma_7b_config): + """Create a mock PreTrainedCausalLM with Gemma 7B model.""" + mock_pretrained = Mock(spec=PreTrainedCausalLM) + mock_pretrained.config = gemma_7b_config + mock_pretrained.generation_config = Mock(spec=GenerationConfig) + mock_pretrained.model = Mock(spec=GemmaForCausalLM) + mock_pretrained.model.dtype = torch.bfloat16 + return mock_pretrained + + def test_bridge_registration(self): + """Test that MegatronGemmaBridge is properly registered.""" + # The @MegatronModelBridge.register_bridge decorator should register the bridge + # Check that the 
class exists and has the expected base class + assert issubclass(GemmaBridge, MegatronModelBridge) + + def test_provider_bridge_basic_2b(self, mock_pretrained_gemma_2b, gemma_2b_config): + """Test basic provider_bridge functionality for Gemma 2B.""" + bridge = GemmaBridge() + + # Call provider_bridge + result = bridge.provider_bridge(mock_pretrained_gemma_2b) + + # Check that it returns a GemmaModelProvider instance + assert isinstance(result, GemmaModelProvider) + + # Check basic configuration mapping + assert result.num_layers == gemma_2b_config.num_hidden_layers + assert result.hidden_size == gemma_2b_config.hidden_size + assert result.num_attention_heads == gemma_2b_config.num_attention_heads + assert result.seq_length == gemma_2b_config.max_position_embeddings + assert result.rotary_base == gemma_2b_config.rope_theta + + def test_provider_bridge_basic_7b(self, mock_pretrained_gemma_7b, gemma_7b_config): + """Test basic provider_bridge functionality for Gemma 7B.""" + bridge = GemmaBridge() + + # Call provider_bridge + result = bridge.provider_bridge(mock_pretrained_gemma_7b) + + # Check that it returns a GemmaModelProvider instance + assert isinstance(result, GemmaModelProvider) + + # Check basic configuration mapping + assert result.num_layers == gemma_7b_config.num_hidden_layers + assert result.hidden_size == gemma_7b_config.hidden_size + assert result.num_attention_heads == gemma_7b_config.num_attention_heads + assert result.seq_length == gemma_7b_config.max_position_embeddings + assert result.rotary_base == gemma_7b_config.rope_theta + + def test_provider_bridge_vocabulary(self, mock_pretrained_gemma_2b, gemma_2b_config): + """Test vocabulary size mapping.""" + bridge = GemmaBridge() + + result = bridge.provider_bridge(mock_pretrained_gemma_2b) + + # Check vocabulary configuration + assert result.vocab_size == gemma_2b_config.vocab_size + # Gemma uses tied embeddings by default + assert result.share_embeddings_and_output_weights == True + + def 
test_provider_bridge_attention_config(self, mock_pretrained_gemma_2b, gemma_2b_config): + """Test attention configuration mapping.""" + bridge = GemmaBridge() + + result = bridge.provider_bridge(mock_pretrained_gemma_2b) + + # Check attention configuration + assert result.num_attention_heads == gemma_2b_config.num_attention_heads + assert result.num_query_groups == gemma_2b_config.num_key_value_heads + + def test_provider_bridge_mlp_config(self, mock_pretrained_gemma_2b, gemma_2b_config): + """Test MLP configuration mapping.""" + bridge = GemmaBridge() + + result = bridge.provider_bridge(mock_pretrained_gemma_2b) + + # Check MLP configuration + assert result.ffn_hidden_size == gemma_2b_config.intermediate_size + assert result.gated_linear_unit == True # Gemma uses gated MLP + + def test_provider_bridge_normalization(self, mock_pretrained_gemma_2b, gemma_2b_config): + """Test normalization configuration.""" + bridge = GemmaBridge() + + result = bridge.provider_bridge(mock_pretrained_gemma_2b) + + # Check normalization settings + assert result.layernorm_epsilon == gemma_2b_config.rms_norm_eps + + def test_provider_bridge_position_embedding(self, mock_pretrained_gemma_2b, gemma_2b_config): + """Test position embedding configuration.""" + bridge = GemmaBridge() + + result = bridge.provider_bridge(mock_pretrained_gemma_2b) + + # Check position embedding + assert result.rotary_base == gemma_2b_config.rope_theta + + def test_provider_bridge_gemma_specific_features(self, mock_pretrained_gemma_2b, gemma_2b_config): + """Test Gemma-specific features.""" + bridge = GemmaBridge() + + result = bridge.provider_bridge(mock_pretrained_gemma_2b) + + # Check Gemma-specific features + assert result.kv_channels == gemma_2b_config.head_dim # Gemma has explicit head_dim + assert result.add_bias_linear == False # Gemma doesn't use bias in linear layers + assert result.layernorm_zero_centered_gamma == True # Gemma-specific RMSNorm behavior + + def 
test_provider_bridge_head_dim_calculation(self, mock_pretrained_gemma_7b, gemma_7b_config): + """Test head dimension calculation for Gemma 7B.""" + bridge = GemmaBridge() + + result = bridge.provider_bridge(mock_pretrained_gemma_7b) + + # Gemma 7B should use the explicit head_dim from config + assert result.kv_channels == gemma_7b_config.head_dim # 256 + # Verify this is different from standard calculation + standard_calculation = gemma_7b_config.hidden_size // gemma_7b_config.num_attention_heads # 3072 / 16 = 192 + assert result.kv_channels != standard_calculation + assert result.kv_channels == 256 # Gemma uses 256 regardless of model size + + def test_provider_bridge_head_dim_fallback(self, gemma_2b_config): + """Test head dimension fallback when head_dim is not in config.""" + # Create config without head_dim + config_dict = gemma_2b_config.to_dict() + del config_dict["head_dim"] + config = GemmaConfig(**config_dict) + + mock_pretrained = Mock(spec=PreTrainedCausalLM) + mock_pretrained.config = config + mock_pretrained.model = Mock(spec=GemmaForCausalLM) + mock_pretrained.model.dtype = torch.bfloat16 + mock_pretrained.generation_config = Mock(spec=GenerationConfig) + + bridge = GemmaBridge() + result = bridge.provider_bridge(mock_pretrained) + + # Should fallback to standard calculation + expected_kv_channels = config.hidden_size // config.num_attention_heads # 2048 / 8 = 256 + assert result.kv_channels == expected_kv_channels + + def test_provider_bridge_dtype_handling(self, gemma_2b_config): + """Test dtype handling in provider_bridge.""" + # Create model with specific dtype + mock_pretrained = Mock(spec=PreTrainedCausalLM) + mock_pretrained.config = gemma_2b_config + mock_pretrained.model = Mock(spec=GemmaForCausalLM) + mock_pretrained.model.dtype = torch.bfloat16 + mock_pretrained.generation_config = Mock(spec=GenerationConfig) + + bridge = GemmaBridge() + result = bridge.provider_bridge(mock_pretrained) + + # The provider should respect the model's dtype + 
assert result.params_dtype == torch.bfloat16 + assert result.bf16 == True + assert result.fp16 == False + + def test_provider_bridge_fp16_dtype_handling(self, gemma_2b_config): + """Test FP16 dtype handling in provider_bridge.""" + # Create model with FP16 dtype - set it in the config + mock_pretrained = Mock(spec=PreTrainedCausalLM) + mock_pretrained.config = gemma_2b_config + mock_pretrained.config.torch_dtype = torch.float16 # Set config dtype to fp16 + mock_pretrained.model = Mock(spec=GemmaForCausalLM) + mock_pretrained.generation_config = Mock(spec=GenerationConfig) + + bridge = GemmaBridge() + result = bridge.provider_bridge(mock_pretrained) + + # The provider should respect the config's dtype + assert result.params_dtype == torch.float16 + assert result.fp16 == True + assert result.bf16 == False + + def test_provider_bridge_without_tie_embeddings(self, gemma_2b_config): + """Test provider_bridge when tie_word_embeddings is not present.""" + # Remove tie_word_embeddings from config if it exists + config_dict = gemma_2b_config.to_dict() + if "tie_word_embeddings" in config_dict: + del config_dict["tie_word_embeddings"] + config = GemmaConfig(**config_dict) + + mock_pretrained = Mock(spec=PreTrainedCausalLM) + mock_pretrained.config = config + mock_pretrained.model = Mock(spec=GemmaForCausalLM) + mock_pretrained.model.dtype = torch.float32 + mock_pretrained.generation_config = None + + bridge = GemmaBridge() + result = bridge.provider_bridge(mock_pretrained) + + # Gemma should default to True for tied embeddings + assert result.share_embeddings_and_output_weights == True + + def test_mapping_registry_implementation(self, mock_pretrained_gemma_2b): + """Test that mapping_registry returns a proper MegatronMappingRegistry.""" + bridge = GemmaBridge() + + # Get the mapping registry + mapping_registry = bridge.mapping_registry() + + # Check it's not None + assert mapping_registry is not None + # Check it has param mappings (they are passed as args to __init__) + # 
The mapping registry should have embedding, layer norm, attention, and MLP mappings + + def test_provider_bridge_make_vocab_size_divisible_by(self, mock_pretrained_gemma_2b): + """Test make_vocab_size_divisible_by calculation.""" + bridge = GemmaBridge() + + result = bridge.provider_bridge(mock_pretrained_gemma_2b) + + # The method should calculate a reasonable divisor based on vocab size + assert hasattr(result, "make_vocab_size_divisible_by") + assert result.make_vocab_size_divisible_by > 0 + + def test_provider_bridge_generation_config(self, mock_pretrained_gemma_2b): + """Test that generation config is passed through.""" + bridge = GemmaBridge() + + result = bridge.provider_bridge(mock_pretrained_gemma_2b) + + # Generation config should be passed from the pretrained model + assert result.generation_config == mock_pretrained_gemma_2b.generation_config + + +class TestAutoBridgeIntegration: + """Integration tests for AutoBridge with Gemma models.""" + + @pytest.fixture + def gemma_configs(self): + """Different Gemma model configurations for testing.""" + return { + "gemma-2b": { + "architectures": ["GemmaForCausalLM"], + "model_type": "gemma", + "hidden_size": 2048, + "num_hidden_layers": 18, + "num_attention_heads": 8, + "num_key_value_heads": 1, + "intermediate_size": 16384, + "vocab_size": 256000, + "max_position_embeddings": 8192, + "rope_theta": 10000.0, + "rms_norm_eps": 1e-06, + "head_dim": 256, + "attention_bias": False, + "torch_dtype": "bfloat16", + }, + "gemma-7b": { + "architectures": ["GemmaForCausalLM"], + "model_type": "gemma", + "hidden_size": 3072, + "num_hidden_layers": 28, + "num_attention_heads": 16, + "num_key_value_heads": 16, + "intermediate_size": 24576, + "vocab_size": 256000, + "max_position_embeddings": 8192, + "rope_theta": 10000.0, + "rms_norm_eps": 1e-06, + "head_dim": 256, + "attention_bias": False, + "torch_dtype": "bfloat16", + }, + } + + def create_mock_model_files(self, config_dict, save_dir): + """Create mock model files in a 
directory.""" + import json + + # Save config + config_path = Path(save_dir) / "config.json" + with open(config_path, "w") as f: + json.dump(config_dict, f, indent=2) + + # Create a dummy safetensors index file + index_path = Path(save_dir) / "model.safetensors.index.json" + index_data = { + "metadata": {"total_size": 1000000}, + "weight_map": { + "model.embed_tokens.weight": "model-00001-of-00001.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", + }, + } + with open(index_path, "w") as f: + json.dump(index_data, f, indent=2) + + # Create tokenizer files + tokenizer_config = { + "tokenizer_class": "GemmaTokenizer", + "model_max_length": config_dict["max_position_embeddings"], + } + tokenizer_path = Path(save_dir) / "tokenizer_config.json" + with open(tokenizer_path, "w") as f: + json.dump(tokenizer_config, f, indent=2) + + # Create dummy tokenizer.json + tokenizer_json_path = Path(save_dir) / "tokenizer.json" + tokenizer_data = { + "version": "1.0", + "model": {"type": "BPE"}, + } + with open(tokenizer_json_path, "w") as f: + json.dump(tokenizer_data, f, indent=2) + + @patch("megatron.bridge.models.conversion.auto_bridge.PreTrainedCausalLM.from_pretrained") + @patch("megatron.bridge.models.conversion.auto_bridge.safe_load_config_with_retry") + def test_from_pretrained_with_temp_dir(self, mock_safe_load_config, mock_pretrained, gemma_configs): + """Test AutoBridge.from_hf_pretrained with temporary directory.""" + with tempfile.TemporaryDirectory() as temp_dir: + # Test with Gemma 2B config + config_dict = gemma_configs["gemma-2b"] + self.create_mock_model_files(config_dict, temp_dir) + + # Mock the config loading + config = GemmaConfig(**config_dict) + mock_safe_load_config.return_value = config + + # Mock the pretrained model + mock_model = Mock(spec=PreTrainedCausalLM) + mock_model.config = config + mock_model.model_name_or_path = temp_dir + mock_pretrained.return_value = mock_model + + # Create bridge from the temp 
directory + bridge = AutoBridge.from_hf_pretrained(temp_dir) + + # Verify + assert isinstance(bridge, AutoBridge) + assert bridge.hf_pretrained == mock_model + mock_safe_load_config.assert_called_once_with(temp_dir, trust_remote_code=False) + mock_pretrained.assert_called_once_with(temp_dir) + + @patch("megatron.bridge.models.conversion.auto_bridge.PreTrainedCausalLM.from_pretrained") + @patch("megatron.bridge.models.conversion.auto_bridge.safe_load_config_with_retry") + def test_from_pretrained_multiple_models(self, mock_safe_load_config, mock_pretrained, gemma_configs): + """Test AutoBridge.from_hf_pretrained with different Gemma model configs.""" + for model_name, config_dict in gemma_configs.items(): + with tempfile.TemporaryDirectory() as temp_dir: + self.create_mock_model_files(config_dict, temp_dir) + + # Mock the config loading + config = GemmaConfig(**config_dict) + mock_safe_load_config.return_value = config + + # Mock the pretrained model + mock_model = Mock(spec=PreTrainedCausalLM) + mock_model.config = config + mock_model.model_name_or_path = temp_dir + mock_pretrained.return_value = mock_model + + # Create bridge + bridge = AutoBridge.from_hf_pretrained(temp_dir, torch_dtype=torch.float16) + + # Verify + assert isinstance(bridge, AutoBridge) + + # Get the provider to verify model-specific settings + # Since _model_bridge is a property, we need to patch the method it calls + with patch( + "megatron.bridge.models.conversion.auto_bridge.model_bridge.get_model_bridge" + ) as mock_get_bridge: + mock_bridge = Mock() + mock_provider = Mock(spec=GemmaModelProvider) + mock_bridge.provider_bridge.return_value = mock_provider + mock_get_bridge.return_value = mock_bridge + + _ = bridge.to_megatron_provider(load_weights=False) + + # Verify provider_bridge was called with correct model + mock_bridge.provider_bridge.assert_called_once_with(mock_model) + + # Clear mocks for next iteration + mock_safe_load_config.reset_mock() + mock_pretrained.reset_mock() + + 
@patch("megatron.bridge.models.conversion.auto_bridge.PreTrainedCausalLM.from_pretrained") + @patch("megatron.bridge.models.conversion.auto_bridge.safe_load_config_with_retry") + def test_from_pretrained_with_kwargs(self, mock_safe_load_config, mock_pretrained, gemma_configs): + """Test AutoBridge.from_hf_pretrained with various kwargs.""" + with tempfile.TemporaryDirectory() as temp_dir: + config_dict = gemma_configs["gemma-7b"] + self.create_mock_model_files(config_dict, temp_dir) + + # Mock the config loading + config = GemmaConfig(**config_dict) + mock_safe_load_config.return_value = config + + # Mock the pretrained model + mock_model = Mock(spec=PreTrainedCausalLM) + mock_model.config = config + mock_pretrained.return_value = mock_model + + # Test with various kwargs + kwargs = { + "torch_dtype": torch.bfloat16, + "device_map": "auto", + "trust_remote_code": True, + "attn_implementation": "flash_attention_2", + } + + _ = AutoBridge.from_hf_pretrained(temp_dir, **kwargs) + + # Verify kwargs were passed through + mock_pretrained.assert_called_once_with(temp_dir, **kwargs) + + def test_supports_gemma_architectures(self, gemma_configs): + """Test that AutoBridge.supports correctly identifies Gemma models.""" + for model_name, config_dict in gemma_configs.items(): + config = GemmaConfig(**config_dict) + assert AutoBridge.supports(config) == True + + # Test non-causal LM architecture + non_causal_config = Mock() + non_causal_config.architectures = ["GemmaModel"] # Not ForCausalLM + assert AutoBridge.supports(non_causal_config) == False + + def test_list_supported_models(self): + """Test list_supported_models includes GemmaForCausalLM.""" + # This test requires the dispatch system to be set up + # Since we're testing in isolation, we'll skip this test + # In a real environment, this would work if the bridges are registered + pass # Skip for now as it requires full dispatch setup + + +class TestGemmaBridgeParameterMapping: + """Test parameter mapping functionality in 
GemmaBridge.""" + + @pytest.fixture + def mock_gemma_state_dict(self): + """Create a mock state dict with Gemma parameter names.""" + return { + "model.embed_tokens.weight": torch.randn(256000, 2048), + "model.norm.weight": torch.randn(2048), + "model.layers.0.input_layernorm.weight": torch.randn(2048), + "model.layers.0.post_attention_layernorm.weight": torch.randn(2048), + "model.layers.0.self_attn.q_proj.weight": torch.randn(2048, 2048), + "model.layers.0.self_attn.k_proj.weight": torch.randn(256, 2048), # GQA: different size for K + "model.layers.0.self_attn.v_proj.weight": torch.randn(256, 2048), # GQA: different size for V + "model.layers.0.self_attn.o_proj.weight": torch.randn(2048, 2048), + "model.layers.0.mlp.gate_proj.weight": torch.randn(16384, 2048), + "model.layers.0.mlp.up_proj.weight": torch.randn(16384, 2048), + "model.layers.0.mlp.down_proj.weight": torch.randn(2048, 16384), + } + + def test_mapping_registry_has_gemma_specific_mappings(self): + """Test that mapping registry includes Gemma-specific mappings.""" + bridge = GemmaBridge() + mapping_registry = bridge.mapping_registry() + + # This test verifies that the mapping registry was created + # The actual parameter mappings are tested in integration tests + assert mapping_registry is not None + + def test_gemma_tied_embeddings_mapping(self): + """Test that Gemma bridge handles tied embeddings correctly.""" + bridge = GemmaBridge() + mapping_registry = bridge.mapping_registry() + + # Gemma uses tied embeddings, so there should be no separate lm_head.weight mapping + # This is reflected in the mapping registry not including lm_head.weight + assert mapping_registry is not None + + def test_gemma_no_bias_mapping(self): + """Test that Gemma bridge doesn't include bias mappings.""" + bridge = GemmaBridge() + mapping_registry = bridge.mapping_registry() + + # Gemma doesn't have bias in linear layers + # This is reflected in the QKVMapping and other mappings not including bias terms + assert 
mapping_registry is not None + + def test_gemma_gated_mlp_mapping(self): + """Test that Gemma bridge includes gated MLP mappings.""" + bridge = GemmaBridge() + mapping_registry = bridge.mapping_registry() + + # Gemma uses gated MLP, so it should have GatedMLPMapping + # This combines gate_proj and up_proj into linear_fc1 + assert mapping_registry is not None diff --git a/tests/unit_tests/models/gemma/test_gemma_provider.py b/tests/unit_tests/models/gemma/test_gemma_provider.py new file mode 100644 index 0000000000..91738cb332 --- /dev/null +++ b/tests/unit_tests/models/gemma/test_gemma_provider.py @@ -0,0 +1,267 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from unittest.mock import Mock, patch + +from megatron.core.activations import fast_gelu +from megatron.core.transformer.enums import AttnBackend + +from megatron.bridge.models.gemma.gemma_provider import ( + CodeGemmaModelProvider2B, + CodeGemmaModelProvider7B, + GemmaModelProvider, + GemmaModelProvider2B, + GemmaModelProvider7B, +) + + +class TestGemmaModelProvider: + """Test cases for base GemmaModelProvider class.""" + + def test_gemma_model_provider_initialization(self): + """Test GemmaModelProvider can be initialized with default values.""" + provider = GemmaModelProvider( + num_layers=18, + hidden_size=2048, + num_attention_heads=8, + ) + + # Check required transformer config fields + assert provider.num_layers == 18 + assert provider.hidden_size == 2048 + assert provider.num_attention_heads == 8 + + # Check Gemma-specific defaults + assert provider.normalization == "RMSNorm" + assert provider.activation_func == fast_gelu + assert provider.gated_linear_unit is True + assert provider.position_embedding_type == "rope" + assert provider.add_bias_linear is False + assert provider.seq_length == 8192 + assert provider.kv_channels == 256 + assert provider.attention_dropout == 0.0 + assert provider.hidden_dropout == 0.0 + assert provider.share_embeddings_and_output_weights is True + assert provider.layernorm_zero_centered_gamma is True + assert provider.attention_backend == AttnBackend.flash + + @patch("megatron.bridge.models.gemma.gemma_provider.parallel_state") + @patch("megatron.bridge.models.gemma.modules.extend_instance") + def test_gemma_model_provider_provide_with_embedding_scaling(self, mock_extend_instance, mock_parallel_state): + """Test that provide method applies embedding scaling when appropriate.""" + # Mock the parent provide method + mock_model = Mock() + mock_model.embedding = Mock() + + provider = GemmaModelProvider( + num_layers=18, + hidden_size=2048, + num_attention_heads=8, + ) + + with patch.object(provider.__class__.__bases__[0], 
"provide", return_value=mock_model): + # Test case: First pipeline stage + mock_parallel_state.is_pipeline_first_stage.return_value = True + + result = provider.provide(vp_stage=0) + + # Verify that parent provide was called + assert result == mock_model + + # Verify that is_pipeline_first_stage was called with correct parameters + mock_parallel_state.is_pipeline_first_stage.assert_called_once_with( + ignore_virtual=False, + vp_stage=0, + ) + + # Verify that extend_instance was called with embedding scaling mixin + mock_extend_instance.assert_called_once() + args = mock_extend_instance.call_args[0] + assert args[0] == mock_model.embedding # First arg should be the embedding + # Second arg should be the EmbeddingScalingMixin class + + @patch("megatron.bridge.models.gemma.gemma_provider.parallel_state") + @patch("megatron.bridge.models.gemma.modules.extend_instance") + def test_gemma_model_provider_provide_no_embedding_scaling(self, mock_extend_instance, mock_parallel_state): + """Test that provide method doesn't apply embedding scaling when not first stage.""" + mock_model = Mock() + mock_model.embedding = Mock() + + provider = GemmaModelProvider( + num_layers=18, + hidden_size=2048, + num_attention_heads=8, + ) + + with patch.object(provider.__class__.__bases__[0], "provide", return_value=mock_model): + # Test case: Not first pipeline stage + mock_parallel_state.is_pipeline_first_stage.return_value = False + + result = provider.provide(vp_stage=1) + + # Verify that parent provide was called + assert result == mock_model + + # Verify that is_pipeline_first_stage was called with correct parameters + mock_parallel_state.is_pipeline_first_stage.assert_called_once_with( + ignore_virtual=False, + vp_stage=1, + ) + + # Verify that extend_instance was NOT called + mock_extend_instance.assert_not_called() + + @patch("megatron.bridge.models.gemma.gemma_provider.parallel_state") + @patch("megatron.bridge.models.gemma.modules.extend_instance") + def 
test_gemma_model_provider_provide_virtual_pipeline_none(self, mock_extend_instance, mock_parallel_state): + """Test provide method when vp_stage is None (no virtual pipeline).""" + mock_model = Mock() + mock_model.embedding = Mock() + + provider = GemmaModelProvider( + num_layers=18, + hidden_size=2048, + num_attention_heads=8, + ) + + with patch.object(provider.__class__.__bases__[0], "provide", return_value=mock_model): + # Test case: No virtual pipeline (vp_stage=None) + mock_parallel_state.is_pipeline_first_stage.return_value = True + + _ = provider.provide(vp_stage=None) + + # Verify that is_pipeline_first_stage was called with vp_stage=None + mock_parallel_state.is_pipeline_first_stage.assert_called_once_with( + ignore_virtual=False, + vp_stage=None, + ) + + # Verify that extend_instance was called since it's first stage + mock_extend_instance.assert_called_once() + + +class TestGemmaModelProvider2B: + """Test cases for GemmaModelProvider2B class.""" + + def test_gemma_2b_configuration(self): + """Test that GemmaModelProvider2B has correct configuration values.""" + provider = GemmaModelProvider2B() + + # Test 2B specific values + assert provider.num_layers == 18 + assert provider.hidden_size == 2048 + assert provider.num_attention_heads == 8 + assert provider.num_query_groups == 1 + assert provider.ffn_hidden_size == 16384 + + # Test inherited Gemma defaults + assert provider.normalization == "RMSNorm" + assert provider.activation_func == fast_gelu + assert provider.gated_linear_unit is True + assert provider.attention_backend == AttnBackend.flash + + def test_gemma_2b_inheritance(self): + """Test that GemmaModelProvider2B properly inherits from GemmaModelProvider.""" + provider = GemmaModelProvider2B() + assert isinstance(provider, GemmaModelProvider) + + +class TestGemmaModelProvider7B: + """Test cases for GemmaModelProvider7B class.""" + + def test_gemma_7b_configuration(self): + """Test that GemmaModelProvider7B has correct configuration values.""" + 
provider = GemmaModelProvider7B() + + # Test 7B specific values + assert provider.num_layers == 28 + assert provider.hidden_size == 3072 + assert provider.num_attention_heads == 16 + assert provider.num_query_groups == 16 + assert provider.ffn_hidden_size == 24576 + + # Test inherited Gemma defaults + assert provider.normalization == "RMSNorm" + assert provider.activation_func == fast_gelu + assert provider.gated_linear_unit is True + assert provider.attention_backend == AttnBackend.flash + + def test_gemma_7b_inheritance(self): + """Test that GemmaModelProvider7B properly inherits from GemmaModelProvider.""" + provider = GemmaModelProvider7B() + assert isinstance(provider, GemmaModelProvider) + + +class TestCodeGemmaModelProviders: + """Test cases for Code Gemma model provider classes.""" + + def test_code_gemma_2b_configuration(self): + """Test that CodeGemmaModelProvider2B has correct 2B configuration values.""" + provider = CodeGemmaModelProvider2B() + + # Test 2B specific values + assert provider.num_layers == 18 + assert provider.hidden_size == 2048 + assert provider.num_attention_heads == 8 + assert provider.num_query_groups == 1 + assert provider.ffn_hidden_size == 16384 + + # Test inherited Gemma defaults + assert provider.normalization == "RMSNorm" + assert provider.activation_func == fast_gelu + assert provider.gated_linear_unit is True + assert provider.attention_backend == AttnBackend.flash + + def test_code_gemma_7b_configuration(self): + """Test that CodeGemmaModelProvider7B has correct 7B configuration values.""" + provider = CodeGemmaModelProvider7B() + + # Test 7B specific values + assert provider.num_layers == 28 + assert provider.hidden_size == 3072 + assert provider.num_attention_heads == 16 + assert provider.num_query_groups == 16 + assert provider.ffn_hidden_size == 24576 + + # Test inherited Gemma defaults + assert provider.normalization == "RMSNorm" + assert provider.activation_func == fast_gelu + assert provider.gated_linear_unit is True + 
assert provider.attention_backend == AttnBackend.flash + + def test_code_gemma_inheritance_chain(self): + """Test the inheritance chain for Code Gemma providers.""" + provider_2b = CodeGemmaModelProvider2B() + provider_7b = CodeGemmaModelProvider7B() + + # Check inheritance chain - both should inherit directly from GemmaModelProvider + assert isinstance(provider_2b, GemmaModelProvider) + assert isinstance(provider_7b, GemmaModelProvider) + + +class TestGemmaModelProviderIntegration: + """Integration tests for Gemma model providers.""" + + def test_all_providers_have_provide_method(self): + """Test that all provider classes have the provide method.""" + providers = [ + GemmaModelProvider2B(), + GemmaModelProvider7B(), + CodeGemmaModelProvider2B(), + CodeGemmaModelProvider7B(), + ] + + for provider in providers: + assert hasattr(provider, "provide") + assert callable(getattr(provider, "provide")) diff --git a/tests/unit_tests/models/gemma/test_modules.py b/tests/unit_tests/models/gemma/test_modules.py new file mode 100644 index 0000000000..cd9cffc422 --- /dev/null +++ b/tests/unit_tests/models/gemma/test_modules.py @@ -0,0 +1,179 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math +from unittest.mock import Mock + +import torch +import torch.nn as nn + +from megatron.bridge.models.gemma.modules import EmbeddingScalingMixin, extend_instance + + +class TestExtendInstance: + """Test suite for the extend_instance function.""" + + def test_extend_instance_basic_functionality(self): + """Test basic functionality of extend_instance.""" + + # Create a simple base class + class BaseClass: + def method(self): + return "base" + + # Create a mixin class + class Mixin: + def method(self): + return f"mixin -> {super().method()}" + + def new_method(self): + return "new_method" + + # Create an instance and extend it + obj = BaseClass() + original_class = obj.__class__ + extend_instance(obj, Mixin) + + # Test that the class has changed + assert obj.__class__ != original_class + assert obj.__class__.__name__ == "BaseClass" + + # Test that the mixin method is called first + assert obj.method() == "mixin -> base" + + # Test that new methods are available + assert obj.new_method() == "new_method" + + def test_extend_instance_preserves_attributes(self): + """Test that extend_instance preserves object attributes.""" + + class BaseClass: + def __init__(self, value): + self.value = value + + class Mixin: + def get_doubled_value(self): + return self.value * 2 + + # Create an instance with attributes + obj = BaseClass(42) + extend_instance(obj, Mixin) + + # Test that attributes are preserved + assert obj.value == 42 + assert obj.get_doubled_value() == 84 + + def test_extend_instance_method_resolution_order(self): + """Test that extend_instance correctly sets the method resolution order.""" + + class BaseClass: + def identify(self): + return "base" + + class Mixin: + def identify(self): + return "mixin" + + obj = BaseClass() + extend_instance(obj, Mixin) + + # Mixin should be first in MRO, so its method should be called + assert obj.identify() == "mixin" + + # Check MRO + mro = obj.__class__.__mro__ + assert len(mro) >= 3 # NewClass, Mixin, BaseClass, 
object + assert mro[1] == Mixin + + def test_extend_instance_multiple_extensions(self): + """Test applying multiple mixins in sequence.""" + + class BaseClass: + def value(self): + return 1 + + class FirstMixin: + def value(self): + return super().value() + 10 + + class SecondMixin: + def value(self): + return super().value() + 100 + + obj = BaseClass() + extend_instance(obj, FirstMixin) + extend_instance(obj, SecondMixin) + + # Should be 1 + 10 + 100 = 111 + assert obj.value() == 111 + + def test_extend_instance_with_torch_module(self): + """Test extend_instance with PyTorch modules.""" + + class SimpleModule(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(10, 5) + + def forward(self, x): + return self.linear(x) + + class ModuleMixin: + def forward(self, x): + result = super().forward(x) + return result * 2 # Scale output by 2 + + module = SimpleModule() + x = torch.randn(3, 10) + + # Get original output + original_output = module(x) + + # Extend the module + extend_instance(module, ModuleMixin) + + # Get new output + new_output = module(x) + + # Should be doubled + assert torch.allclose(new_output, original_output * 2) + + +class TestEmbeddingScalingMixin: + """Test suite for the EmbeddingScalingMixin class.""" + + def test_embedding_scaling_mixin(self): + """Test basic functionality of EmbeddingScalingMixin.""" + + # Create a mock embedding class + class MockEmbedding(nn.Module): + def __init__(self, hidden_size): + super().__init__() + self.config = Mock() + self.config.hidden_size = hidden_size + + def forward(self, **kwargs): + # Return a simple tensor for testing + return torch.ones(2, 3, self.config.hidden_size) + + # Create an embedding and extend it + embedding = MockEmbedding(hidden_size=64) + extend_instance(embedding, EmbeddingScalingMixin) + + # Test forward pass + result = embedding.forward() + expected_scale = math.sqrt(64) + expected_result = torch.ones(2, 3, 64) * expected_scale + + assert torch.allclose(result, 
expected_result) diff --git a/tests/unit_tests/models/test_auto_bridge.py b/tests/unit_tests/models/test_auto_bridge.py index 14cd95bc47..459f891305 100644 --- a/tests/unit_tests/models/test_auto_bridge.py +++ b/tests/unit_tests/models/test_auto_bridge.py @@ -143,8 +143,8 @@ def test_can_handle_supported_model(self, llama_config_mock): ) as mock_safe_load_config: mock_safe_load_config.return_value = llama_config_mock - assert AutoBridge.can_handle("meta-llama/Llama-3-8B") is True - mock_safe_load_config.assert_called_with("meta-llama/Llama-3-8B", trust_remote_code=False) + assert AutoBridge.can_handle("meta-llama/Meta-Llama-3-8B") is True + mock_safe_load_config.assert_called_with("meta-llama/Meta-Llama-3-8B", trust_remote_code=False) def test_can_handle_unsupported_model(self, bert_config): """Test can_handle returns False for unsupported models.""" @@ -685,13 +685,13 @@ def test_import_ckpt_basic(self, mock_from_hf_pretrained, mock_to_megatron_model mock_bridge.save_megatron_model = Mock() # Test import_ckpt - AutoBridge.import_ckpt("meta-llama/Llama-3-8B", "./megatron_checkpoint") + AutoBridge.import_ckpt("meta-llama/Meta-Llama-3-8B", "./megatron_checkpoint") # Assertions - mock_from_hf_pretrained.assert_called_once_with("meta-llama/Llama-3-8B") + mock_from_hf_pretrained.assert_called_once_with("meta-llama/Meta-Llama-3-8B") mock_bridge.to_megatron_model.assert_called_once_with(wrap_with_ddp=False, use_cpu_initialization=True) mock_bridge.save_megatron_model.assert_called_once_with( - mock_megatron_model, "./megatron_checkpoint", hf_tokenizer_path="meta-llama/Llama-3-8B" + mock_megatron_model, "./megatron_checkpoint", hf_tokenizer_path="meta-llama/Meta-Llama-3-8B" ) @patch.object(AutoBridge, "save_megatron_model") @@ -800,11 +800,11 @@ def test_save_megatron_model_with_tokenizer(self): with patch("megatron.bridge.training.model_load_save.save_megatron_model") as mock_save_megatron_model: bridge.save_megatron_model( - mock_megatron_model, "./checkpoint_path", 
hf_tokenizer_path="meta-llama/Llama-3-8B" + mock_megatron_model, "./checkpoint_path", hf_tokenizer_path="meta-llama/Meta-Llama-3-8B" ) mock_save_megatron_model.assert_called_once_with( - mock_megatron_model, "./checkpoint_path", hf_tokenizer_path="meta-llama/Llama-3-8B" + mock_megatron_model, "./checkpoint_path", hf_tokenizer_path="meta-llama/Meta-Llama-3-8B" ) def test_save_megatron_model_import_error(self): @@ -885,3 +885,52 @@ def test_load_megatron_model_with_iter_folder(self): mock_load_megatron_model.assert_called_once() mock_iterdir.assert_called_once() # Should use the latest iteration (iter_0000020) + + def test_load_megatron_model_with_mp_overrides(self): + """Test load_megatron_model with model-parallel overrides argument.""" + + mock_hf_model = Mock(spec=PreTrainedCausalLM) + mock_config = Mock(spec=PretrainedConfig) + mock_config.architectures = ["LlamaForCausalLM"] + mock_hf_model.config = mock_config + + bridge = AutoBridge.__new__(AutoBridge) + bridge.hf_pretrained = mock_hf_model + + # Create model-parallel overrides + mp_overrides = { + "tensor_model_parallel_size": 2, + "pipeline_model_parallel_size": 1, + } + + with patch("megatron.bridge.training.model_load_save.load_megatron_model") as mock_load_megatron_model: + with patch("torch.distributed.is_available", return_value=False): + with patch("torch.distributed.is_initialized", return_value=False): + from pathlib import Path + + with patch.object(Path, "iterdir") as mock_iterdir: + # Setup mocks + mock_model = Mock() + mock_load_megatron_model.return_value = mock_model + + # Mock iterdir to return empty list (no iter_ folders) + mock_iterdir.return_value = [] + + # Call load_megatron_model with model-parallel overrides + result = bridge.load_megatron_model( + "checkpoint_path", mp_overrides=mp_overrides, wrap_with_ddp=False + ) + + # Verify the result + assert result == [mock_model] + + # Verify that load_megatron_model was called with mp_overrides + mock_load_megatron_model.assert_called_once() 
+ call_args = mock_load_megatron_model.call_args + + # Check that mp_overrides was passed correctly + assert call_args.kwargs["mp_overrides"] == mp_overrides + + # Check other expected arguments + assert call_args.args[0] == "checkpoint_path" # path argument + assert "skip_temp_dist_context" in call_args.kwargs diff --git a/tests/unit_tests/recipes/llama/test_llama2_7b.py b/tests/unit_tests/recipes/llama/test_llama2_7b.py deleted file mode 100644 index 943dc04572..0000000000 --- a/tests/unit_tests/recipes/llama/test_llama2_7b.py +++ /dev/null @@ -1,392 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import tempfile -from unittest.mock import patch - -import pytest -import torch - -from megatron.bridge.models.llama import Llama2ModelProvider7B -from megatron.bridge.recipes.llama.llama2_7b import model_config, pretrain_config -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Llama2ModelProvider7B) - assert config.tensor_model_parallel_size == 2 - assert config.pipeline_model_parallel_size == 1 - assert config.pipeline_dtype is None - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.sequence_parallel is False - - def test_model_config_custom_tensor_parallelism(self): - """Test model_config with custom tensor parallelism.""" - config = model_config(tensor_parallelism=4) - - assert config.tensor_model_parallel_size == 4 - assert config.pipeline_model_parallel_size == 1 # default - assert config.context_parallel_size == 1 # default - - def test_model_config_custom_pipeline_parallelism(self): - """Test model_config with custom pipeline parallelism.""" - config = model_config(pipeline_parallelism=8, pipeline_parallelism_dtype=torch.float16) - - assert config.tensor_model_parallel_size == 2 # default - assert config.pipeline_model_parallel_size == 8 - assert config.pipeline_dtype is torch.float16 - - def test_model_config_with_pipeline_dtype(self): - """Test model_config with pipeline dtype specified.""" - config = model_config(pipeline_parallelism=2, pipeline_parallelism_dtype=torch.float16) - - assert config.pipeline_model_parallel_size == 2 - assert 
config.pipeline_dtype == torch.float16 - - def test_model_config_virtual_pipeline_parallelism(self): - """Test model_config with virtual pipeline parallelism.""" - config = model_config(virtual_pipeline_parallelism=4) - - assert config.virtual_pipeline_model_parallel_size == 4 - - def test_model_config_context_parallelism(self): - """Test model_config with custom context parallelism.""" - config = model_config(context_parallelism=8) - - assert config.context_parallel_size == 8 - - def test_model_config_sequence_parallelism_enabled(self): - """Test model_config with sequence parallelism enabled.""" - config = model_config(sequence_parallelism=True, tensor_parallelism=2) - - assert config.sequence_parallel is True - - def test_model_config_all_custom_parameters(self): - """Test model_config with all parameters customized.""" - config = model_config( - tensor_parallelism=2, - pipeline_parallelism=4, - pipeline_parallelism_dtype=torch.bfloat16, - virtual_pipeline_parallelism=8, - context_parallelism=16, - sequence_parallelism=True, - ) - - assert config.tensor_model_parallel_size == 2 - assert config.pipeline_model_parallel_size == 4 - assert config.pipeline_dtype == torch.bfloat16 - assert config.virtual_pipeline_model_parallel_size == 8 - assert config.context_parallel_size == 16 - assert config.sequence_parallel is True - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Llama2ModelProvider7B) - - # Check training configuration - assert config.train.train_iters == 1_168_251 - assert config.train.global_batch_size == 512 - assert config.train.micro_batch_size == 1 - assert config.train.eval_interval == 2000 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert 
config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.weight_decay == 0.1 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 4096 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=2, - seq_length=4096, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 2 - assert config.dataset.sequence_length == 4096 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 # Note: fixed in scheduler config - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_custom_model_parameters(self): - """Test pretrain_config with custom model parameters.""" - config = pretrain_config( - tensor_parallelism=4, - pipeline_parallelism=2, - context_parallelism=8, - sequence_parallelism=True, - pipeline_parallelism_dtype=torch.bfloat16, - ) - - assert config.model.tensor_model_parallel_size == 4 - assert config.model.pipeline_model_parallel_size == 2 - assert config.model.context_parallel_size == 8 - assert config.model.sequence_parallel is True - assert config.model.pipeline_dtype == torch.bfloat16 - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - 
expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Check that non-mock mode is configured - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - data_paths=["/path/to/data1", "/path/to/data2"], - ) - - # Should 
prioritize blend over blend_per_split - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def test_pretrain_config_fallback_to_mock_when_no_weights(self, mock_get_blend): - """Test pretrain_config falls back to mock when no weights are returned.""" - # Mock function returns None for both weights - mock_get_blend.return_value = (None, None) - - config = pretrain_config(data_paths=["/some/path"]) - - # Should fall back to mock mode - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 2000 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - assert config.checkpoint.async_save is False - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True - assert config.ddp.overlap_grad_reduce is True # DP size > 1 with default config - assert config.ddp.overlap_param_gather is True # DP size > 1 with default config - assert config.ddp.average_in_collective is True - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_manual_gc(self): - """Test manual garbage collection configuration.""" - config = pretrain_config() - - assert config.train.manual_gc is True - assert config.train.manual_gc_interval == 100 - assert config.train.manual_gc_eval == 100 - - def test_pretrain_config_default_comm_overlap(self): - """Test default CommOverlapConfig setup.""" - config = pretrain_config() - - # 
Default setup should have TP comm overlap disabled for 7B model - assert config.comm_overlap is not None - - def test_pretrain_config_custom_comm_overlap(self): - """Test custom CommOverlapConfig.""" - custom_overlap = CommOverlapConfig( - tp_comm_overlap=True, - defer_embedding_wgrad_compute=True, - wgrad_deferral_limit=50, - data_parallel_size=1, # Add this to avoid None - ) - config = pretrain_config(comm_overlap_config=custom_overlap) - - # Should use the custom config - # Since default TP size is 1, it should be disabled - assert config.comm_overlap is not None - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 2000 - assert config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - assert config.scheduler.override_opt_param_scheduler is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - - def test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 
- assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - assert config.dataset.num_workers == 8 - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize( - "tensor_parallelism,pipeline_parallelism,context_parallelism", - [ - (1, 1, 1), - (2, 1, 4), - (1, 4, 2), - (2, 2, 2), # Changed from 8 to 2 to fit in 8 GPUs - (4, 2, 1), # Changed from 4,4,16 to fit in 8 GPUs - ], - ) - def test_pretrain_config_parallelism_combinations( - self, tensor_parallelism, pipeline_parallelism, context_parallelism - ): - """Test various parallelism combinations.""" - config = pretrain_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - context_parallelism=context_parallelism, - pipeline_parallelism_dtype=torch.bfloat16, - ) - - assert config.model.tensor_model_parallel_size == tensor_parallelism - assert config.model.pipeline_model_parallel_size == pipeline_parallelism - assert config.model.context_parallel_size == context_parallelism - - @pytest.mark.parametrize( - "global_batch_size,micro_batch_size", - [ - (128, 1), - (512, 2), - (1024, 4), - (256, 8), - ], - ) - def test_pretrain_config_batch_size_combinations(self, global_batch_size, micro_batch_size): - """Test various batch size combinations.""" - config = pretrain_config(global_batch_size=global_batch_size, micro_batch_size=micro_batch_size) - - assert config.train.global_batch_size == global_batch_size - assert config.train.micro_batch_size == micro_batch_size - - @pytest.mark.parametrize("seq_length", [1024, 2048, 4096, 8192, 16384]) - def test_pretrain_config_sequence_lengths(self, seq_length): - """Test various sequence lengths.""" - config = pretrain_config(seq_length=seq_length) - - assert config.dataset.sequence_length == seq_length - - 
@pytest.mark.parametrize("precision", ["fp16_mixed", "bf16_with_fp8_delayed_scaling_mixed"]) - def test_precision_recipes(self, precision): - cfg = pretrain_config(precision_config=precision) - assert cfg.mixed_precision == precision diff --git a/tests/unit_tests/recipes/llama/test_llama31_405b.py b/tests/unit_tests/recipes/llama/test_llama31_405b.py deleted file mode 100644 index 677fe39aa7..0000000000 --- a/tests/unit_tests/recipes/llama/test_llama31_405b.py +++ /dev/null @@ -1,453 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import tempfile -from unittest.mock import patch - -import pytest -import torch - -from megatron.bridge.models.llama import Llama31ModelProvider405B -from megatron.bridge.recipes.llama.llama31_405b import model_config, pretrain_config -from megatron.bridge.training.comm_overlap import ( - CommOverlapConfig, - userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192, -) -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Llama31ModelProvider405B) - assert config.tensor_model_parallel_size == 8 - assert config.pipeline_model_parallel_size == 8 - assert config.pipeline_dtype == torch.bfloat16 - assert config.virtual_pipeline_model_parallel_size == 2 - assert config.context_parallel_size == 4 - assert config.sequence_parallel is True - assert config.account_for_embedding_in_pipeline_split is True - assert config.account_for_loss_in_pipeline_split is True - - def test_model_config_custom_tensor_parallelism(self): - """Test model_config with custom tensor parallelism.""" - config = model_config(tensor_parallelism=8) - - assert config.tensor_model_parallel_size == 8 - assert config.pipeline_model_parallel_size == 8 # default - assert config.context_parallel_size == 4 # default - - def test_model_config_custom_pipeline_parallelism(self): - """Test model_config with custom pipeline parallelism.""" - config = model_config(pipeline_parallelism=16, pipeline_parallelism_dtype=torch.float16) - - assert config.tensor_model_parallel_size == 8 # default - assert config.pipeline_model_parallel_size == 16 - assert config.pipeline_dtype is torch.float16 - - def test_model_config_with_pipeline_dtype(self): - """Test model_config with pipeline dtype specified.""" - config = model_config(pipeline_parallelism=4, 
pipeline_parallelism_dtype=torch.float16) - - assert config.pipeline_model_parallel_size == 4 - assert config.pipeline_dtype == torch.float16 - - def test_model_config_virtual_pipeline_parallelism(self): - """Test model_config with virtual pipeline parallelism.""" - config = model_config(virtual_pipeline_parallelism=4) - - assert config.virtual_pipeline_model_parallel_size == 4 - - def test_model_config_context_parallelism(self): - """Test model_config with custom context parallelism.""" - config = model_config(context_parallelism=8) - - assert config.context_parallel_size == 8 - - def test_model_config_sequence_parallelism_disabled(self): - """Test model_config with sequence parallelism disabled.""" - config = model_config(sequence_parallelism=False) - - assert config.sequence_parallel is False - - def test_model_config_405b_specific_parameters(self): - """Test model_config with 405B-specific parameters.""" - config = model_config( - account_for_embedding_in_pipeline_split=False, - account_for_loss_in_pipeline_split=False, - ) - - assert config.account_for_embedding_in_pipeline_split is False - assert config.account_for_loss_in_pipeline_split is False - - def test_model_config_all_custom_parameters(self): - """Test model_config with all parameters customized.""" - config = model_config( - tensor_parallelism=8, - pipeline_parallelism=16, - pipeline_parallelism_dtype=torch.float32, - virtual_pipeline_parallelism=4, - context_parallelism=8, - sequence_parallelism=False, - account_for_embedding_in_pipeline_split=False, - account_for_loss_in_pipeline_split=False, - ) - - assert config.tensor_model_parallel_size == 8 - assert config.pipeline_model_parallel_size == 16 - assert config.pipeline_dtype == torch.float32 - assert config.virtual_pipeline_model_parallel_size == 4 - assert config.context_parallel_size == 8 - assert config.sequence_parallel is False - assert config.account_for_embedding_in_pipeline_split is False - assert config.account_for_loss_in_pipeline_split 
is False - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Llama31ModelProvider405B) - - # Check training configuration - assert config.train.train_iters == 1_168_251 - assert config.train.global_batch_size == 512 - assert config.train.micro_batch_size == 1 - assert config.train.eval_interval == 2000 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.weight_decay == 0.1 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 8192 # Hardcoded to 8192 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=2, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 2 - assert config.dataset.sequence_length == 8192 # Always 8192 for Llama3.1 405B - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 # Note: fixed in scheduler config - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_custom_model_parameters(self): - """Test pretrain_config 
with custom model parameters.""" - config = pretrain_config( - tensor_parallelism=8, - pipeline_parallelism=16, - context_parallelism=8, - sequence_parallelism=False, - pipeline_parallelism_dtype=torch.float32, - virtual_pipeline_parallelism=4, - account_for_embedding_in_pipeline_split=False, - account_for_loss_in_pipeline_split=False, - ) - - assert config.model.tensor_model_parallel_size == 8 - assert config.model.pipeline_model_parallel_size == 16 - assert config.model.context_parallel_size == 8 - assert config.model.sequence_parallel is False - assert config.model.pipeline_dtype == torch.float32 - assert config.model.virtual_pipeline_model_parallel_size == 4 - assert config.model.account_for_embedding_in_pipeline_split is False - assert config.model.account_for_loss_in_pipeline_split is False - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Check that non-mock mode is configured - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - 
assert config.dataset.blend_per_split is None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - data_paths=["/path/to/data1", "/path/to/data2"], - ) - - # Should prioritize blend over blend_per_split - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def test_pretrain_config_fallback_to_mock_when_no_weights(self, mock_get_blend): - """Test pretrain_config falls back to mock when no weights are returned.""" - # Mock function returns None for both weights - mock_get_blend.return_value = (None, None) - - config = pretrain_config(data_paths=["/some/path"]) - - # Should fall back to mock mode - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 2000 - assert config.checkpoint.ckpt_format 
== "torch_dist" - assert config.checkpoint.fully_parallel_save is True - assert config.checkpoint.async_save is False - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True - # Note: overlap_grad_reduce and overlap_param_gather are now controlled by CommOverlapConfig - # and default to False when data_parallel_size is None or <= 1 - assert config.ddp.overlap_grad_reduce is True - assert config.ddp.overlap_param_gather is True - assert config.ddp.average_in_collective is True - assert config.ddp.use_distributed_optimizer is True - # align_param_gather is True when PP > 1 and VP > 1 (which is the case for 405B defaults) - # However, without proper distributed setup, data_parallel_size might be None, - # so align_param_gather would be False - assert config.ddp.align_param_gather is False - - def test_pretrain_config_manual_gc(self): - """Test manual garbage collection configuration.""" - config = pretrain_config() - - assert config.train.manual_gc is True - assert config.train.manual_gc_interval == 100 - assert config.train.manual_gc_eval == 100 - - def test_pretrain_config_default_comm_overlap(self): - """Test default CommOverlapConfig setup.""" - config = pretrain_config() - - # Default setup should have TP comm overlap disabled due to TP size being 1 - assert config.comm_overlap is not None - - def test_pretrain_config_custom_comm_overlap(self): - """Test custom CommOverlapConfig.""" - custom_overlap = CommOverlapConfig( - tp_comm_overlap=True, - tp_comm_overlap_cfg=userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192, - defer_embedding_wgrad_compute=True, - wgrad_deferral_limit=80, - data_parallel_size=2, - ) - config = pretrain_config(comm_overlap_config=custom_overlap) - - # Should apply custom config - assert config.comm_overlap.defer_embedding_wgrad_compute is True - assert 
config.model.wgrad_deferral_limit == 0 - - def test_pretrain_config_comm_overlap_with_tp(self): - """Test CommOverlapConfig with tensor parallelism enabled.""" - # Mock HAVE_TE to True to simulate transformer engine being available - with patch("megatron.bridge.training.comm_overlap.HAVE_TE", True): - config = pretrain_config(tensor_parallelism=8, sequence_parallelism=True) - - # With TP > 1 and sequence parallelism, comm_overlap should be configured - assert config.comm_overlap is not None - assert config.comm_overlap.tp_comm_overlap is True - assert config.comm_overlap.defer_embedding_wgrad_compute is True - assert config.model.wgrad_deferral_limit == 0 - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 2000 - assert config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - assert config.scheduler.override_opt_param_scheduler is True - - @pytest.mark.parametrize("vocab_size", [2048, 4096, 8192, 16384]) - def test_pretrain_config_tokenizer_configuration(self, vocab_size): - """Test tokenizer configuration.""" - config = pretrain_config(vocab_size=vocab_size) - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == vocab_size - - def test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert 
config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - assert config.dataset.num_workers == 8 - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize( - "tensor_parallelism,pipeline_parallelism,context_parallelism", - [ - (8, 8, 4), - (8, 8, 4), - (8, 16, 2), - (8, 16, 2), - ], - ) - def test_pretrain_config_parallelism_combinations( - self, tensor_parallelism, pipeline_parallelism, context_parallelism - ): - """Test various parallelism combinations for 405B model.""" - config = pretrain_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - context_parallelism=context_parallelism, - pipeline_parallelism_dtype=torch.bfloat16, - ) - - assert config.model.tensor_model_parallel_size == tensor_parallelism - assert config.model.pipeline_model_parallel_size == pipeline_parallelism - assert config.model.context_parallel_size == context_parallelism - - @pytest.mark.parametrize( - "global_batch_size,micro_batch_size", - [ - (256, 1), - (512, 1), - (1024, 2), - (512, 4), - ], - ) - def test_pretrain_config_batch_size_combinations(self, global_batch_size, micro_batch_size): - """Test various batch size combinations.""" - config = pretrain_config(global_batch_size=global_batch_size, micro_batch_size=micro_batch_size) - - assert config.train.global_batch_size == global_batch_size - assert config.train.micro_batch_size == micro_batch_size - - def test_pretrain_config_llama31_405b_optimized_defaults(self): - """Test that Llama3.1 405B specific optimizations are applied by default.""" - config = pretrain_config() 
- - # Check model defaults optimized for Llama3.1 405B - assert config.model.tensor_model_parallel_size == 8 # Higher than smaller models - assert config.model.pipeline_model_parallel_size == 8 # Higher than smaller models - assert config.model.pipeline_dtype == torch.bfloat16 # Optimized dtype - assert config.model.sequence_parallel is True # Enabled for efficiency - assert config.model.context_parallel_size == 4 # Higher for 405B - assert config.model.virtual_pipeline_model_parallel_size == 2 # Lower for 405B - - # Check 405B-specific parameters - assert config.model.account_for_embedding_in_pipeline_split is True - assert config.model.account_for_loss_in_pipeline_split is True - - # Check dataset defaults - assert config.dataset.sequence_length == 8192 # Hardcoded sequence length - - @pytest.mark.parametrize("virtual_pipeline_parallelism", [None, 1, 2, 4, 8]) - def test_pretrain_config_virtual_pipeline_parallelism(self, virtual_pipeline_parallelism): - """Test various virtual pipeline parallelism settings.""" - config = pretrain_config(virtual_pipeline_parallelism=virtual_pipeline_parallelism) - - assert config.model.virtual_pipeline_model_parallel_size == virtual_pipeline_parallelism - - @pytest.mark.parametrize("precision", ["fp16_mixed", "bf16_with_fp8_delayed_scaling_mixed"]) - def test_precision_recipes(self, precision): - cfg = pretrain_config(precision_config=precision) - assert cfg.mixed_precision == precision diff --git a/tests/unit_tests/recipes/llama/test_llama31_70b.py b/tests/unit_tests/recipes/llama/test_llama31_70b.py deleted file mode 100644 index e732fb19e7..0000000000 --- a/tests/unit_tests/recipes/llama/test_llama31_70b.py +++ /dev/null @@ -1,428 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile -from unittest.mock import patch - -import pytest -import torch - -from megatron.bridge.models.llama import Llama31ModelProvider70B -from megatron.bridge.recipes.llama.llama31_70b import model_config, pretrain_config -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig, userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192 -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Llama31ModelProvider70B) - assert config.tensor_model_parallel_size == 4 - assert config.pipeline_model_parallel_size == 4 - assert config.pipeline_dtype == torch.bfloat16 - assert config.virtual_pipeline_model_parallel_size == 5 - assert config.context_parallel_size == 2 - assert config.sequence_parallel is True - - def test_model_config_custom_tensor_parallelism(self): - """Test model_config with custom tensor parallelism.""" - config = model_config(tensor_parallelism=8) - - assert config.tensor_model_parallel_size == 8 - assert config.pipeline_model_parallel_size == 4 # default - assert config.context_parallel_size == 2 # default - - def test_model_config_custom_pipeline_parallelism(self): - """Test model_config with custom pipeline parallelism.""" - config = 
model_config(pipeline_parallelism=8, pipeline_parallelism_dtype=torch.float16) - - assert config.tensor_model_parallel_size == 4 # default - assert config.pipeline_model_parallel_size == 8 - assert config.pipeline_dtype is torch.float16 - - def test_model_config_with_pipeline_dtype(self): - """Test model_config with pipeline dtype specified.""" - config = model_config(pipeline_parallelism=2, pipeline_parallelism_dtype=torch.float16) - - assert config.pipeline_model_parallel_size == 2 - assert config.pipeline_dtype == torch.float16 - - def test_model_config_virtual_pipeline_parallelism(self): - """Test model_config with virtual pipeline parallelism.""" - config = model_config(virtual_pipeline_parallelism=8) - - assert config.virtual_pipeline_model_parallel_size == 8 - - def test_model_config_context_parallelism(self): - """Test model_config with custom context parallelism.""" - config = model_config(context_parallelism=8) - - assert config.context_parallel_size == 8 - - def test_model_config_sequence_parallelism_disabled(self): - """Test model_config with sequence parallelism disabled.""" - config = model_config(sequence_parallelism=False) - - assert config.sequence_parallel is False - - def test_model_config_all_custom_parameters(self): - """Test model_config with all parameters customized.""" - config = model_config( - tensor_parallelism=8, - pipeline_parallelism=8, - pipeline_parallelism_dtype=torch.float32, - virtual_pipeline_parallelism=10, - context_parallelism=4, - sequence_parallelism=False, - ) - - assert config.tensor_model_parallel_size == 8 - assert config.pipeline_model_parallel_size == 8 - assert config.pipeline_dtype == torch.float32 - assert config.virtual_pipeline_model_parallel_size == 10 - assert config.context_parallel_size == 4 - assert config.sequence_parallel is False - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test 
pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Llama31ModelProvider70B) - - # Check training configuration - assert config.train.train_iters == 1_168_251 - assert config.train.global_batch_size == 512 - assert config.train.micro_batch_size == 1 - assert config.train.eval_interval == 2000 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.weight_decay == 0.1 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 8192 # Hardcoded to 8192 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=2, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 2 - assert config.dataset.sequence_length == 8192 # Always 8192 for Llama3.1 70B - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 # Note: fixed in scheduler config - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_custom_model_parameters(self): - """Test pretrain_config with custom model parameters.""" - config = pretrain_config( - tensor_parallelism=8, - pipeline_parallelism=8, - context_parallelism=4, - sequence_parallelism=False, - 
pipeline_parallelism_dtype=torch.float32, - virtual_pipeline_parallelism=10, - ) - - assert config.model.tensor_model_parallel_size == 8 - assert config.model.pipeline_model_parallel_size == 8 - assert config.model.context_parallel_size == 4 - assert config.model.sequence_parallel is False - assert config.model.pipeline_dtype == torch.float32 - assert config.model.virtual_pipeline_model_parallel_size == 10 - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Check that non-mock mode is configured - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", 
"/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - data_paths=["/path/to/data1", "/path/to/data2"], - ) - - # Should prioritize blend over blend_per_split - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def test_pretrain_config_fallback_to_mock_when_no_weights(self, mock_get_blend): - """Test pretrain_config falls back to mock when no weights are returned.""" - # Mock function returns None for both weights - mock_get_blend.return_value = (None, None) - - config = pretrain_config(data_paths=["/some/path"]) - - # Should fall back to mock mode - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 2000 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - assert config.checkpoint.async_save is False - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True - # Note: overlap_grad_reduce and 
overlap_param_gather are now controlled by CommOverlapConfig - # and default to False when data_parallel_size is None or <= 1 - assert config.ddp.overlap_grad_reduce is True - assert config.ddp.overlap_param_gather is True - assert config.ddp.average_in_collective is True - assert config.ddp.use_distributed_optimizer is True - # align_param_gather is set by comm_overlap config during setup, not in recipe - - def test_pretrain_config_manual_gc(self): - """Test manual garbage collection configuration.""" - config = pretrain_config() - - assert config.train.manual_gc is True - assert config.train.manual_gc_interval == 100 - assert config.train.manual_gc_eval == 100 - - def test_pretrain_config_default_comm_overlap(self): - """Test default CommOverlapConfig setup.""" - config = pretrain_config() - - # Default setup should have TP comm overlap disabled due to TP size being 1 - assert config.comm_overlap is not None - - def test_pretrain_config_custom_comm_overlap(self): - """Test custom CommOverlapConfig.""" - custom_overlap = CommOverlapConfig( - tp_comm_overlap=True, - tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, - defer_embedding_wgrad_compute=True, - wgrad_deferral_limit=22, - data_parallel_size=2, - ) - config = pretrain_config(comm_overlap_config=custom_overlap) - - # Should use the custom config - assert config.comm_overlap is not None # TP size is 1 by default - - def test_pretrain_config_comm_overlap_with_tp(self): - """Test CommOverlapConfig with tensor parallelism enabled.""" - # Mock HAVE_TE to True to simulate transformer engine being available - with patch("megatron.bridge.training.comm_overlap.HAVE_TE", True): - config = pretrain_config( - tensor_parallelism=4, - pipeline_parallelism=2, - context_parallelism=2, - sequence_parallelism=True, - ) - - # With TP > 1 and sequence parallelism, comm_overlap should be configured - assert config.comm_overlap is not None - assert config.comm_overlap.tp_comm_overlap is True - assert 
config.comm_overlap.defer_embedding_wgrad_compute is True - assert config.comm_overlap.wgrad_deferral_limit == 50 # Default from recipe - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 2000 - assert config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - assert config.scheduler.override_opt_param_scheduler is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - - def test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - assert config.dataset.num_workers == 8 - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize( - 
"tensor_parallelism,pipeline_parallelism,context_parallelism", - [ - (2, 2, 1), - (4, 4, 2), - (8, 2, 2), # Changed from 8,4,4 to fit in 32 GPUs - (4, 4, 2), # Changed from 4,8,2 to fit in 32 GPUs - (8, 4, 1), # Changed from 8,8,4 to fit in 32 GPUs - ], - ) - def test_pretrain_config_parallelism_combinations( - self, tensor_parallelism, pipeline_parallelism, context_parallelism - ): - """Test various parallelism combinations for 70B model.""" - config = pretrain_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - context_parallelism=context_parallelism, - pipeline_parallelism_dtype=torch.bfloat16, - ) - - assert config.model.tensor_model_parallel_size == tensor_parallelism - assert config.model.pipeline_model_parallel_size == pipeline_parallelism - assert config.model.context_parallel_size == context_parallelism - - @pytest.mark.parametrize( - "global_batch_size,micro_batch_size", - [ - (256, 1), - (512, 1), - (1024, 2), - (512, 4), - ], - ) - def test_pretrain_config_batch_size_combinations(self, global_batch_size, micro_batch_size): - """Test various batch size combinations.""" - config = pretrain_config(global_batch_size=global_batch_size, micro_batch_size=micro_batch_size) - - assert config.train.global_batch_size == global_batch_size - assert config.train.micro_batch_size == micro_batch_size - - def test_pretrain_config_llama31_70b_optimized_defaults(self): - """Test that Llama3.1 70B specific optimizations are applied by default.""" - config = pretrain_config() - - # Check model defaults optimized for Llama3.1 70B - assert config.model.tensor_model_parallel_size == 4 # Higher than smaller models - assert config.model.pipeline_model_parallel_size == 4 # Higher than smaller models - assert config.model.pipeline_dtype == torch.bfloat16 # Optimized dtype - assert config.model.sequence_parallel is True # Enabled for efficiency - assert config.model.context_parallel_size == 2 # Llama3.1 specific - assert 
config.model.virtual_pipeline_model_parallel_size == 5 # Virtual PP for large model - - # Check dataset defaults - assert config.dataset.sequence_length == 8192 # Hardcoded sequence length - - @pytest.mark.parametrize("virtual_pipeline_parallelism", [None, 3, 5, 7, 10]) - def test_pretrain_config_virtual_pipeline_parallelism(self, virtual_pipeline_parallelism): - """Test various virtual pipeline parallelism settings.""" - config = pretrain_config(virtual_pipeline_parallelism=virtual_pipeline_parallelism) - - assert config.model.virtual_pipeline_model_parallel_size == virtual_pipeline_parallelism - - @pytest.mark.parametrize("precision", ["fp16_mixed", "bf16_with_fp8_delayed_scaling_mixed"]) - def test_precision_recipes(self, precision): - cfg = pretrain_config(precision_config=precision) - assert cfg.mixed_precision == precision diff --git a/tests/unit_tests/recipes/llama/test_llama31_8b.py b/tests/unit_tests/recipes/llama/test_llama31_8b.py deleted file mode 100644 index 0daf172277..0000000000 --- a/tests/unit_tests/recipes/llama/test_llama31_8b.py +++ /dev/null @@ -1,450 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import tempfile -from unittest.mock import patch - -import pytest -import torch - -from megatron.bridge.models.llama import Llama31ModelProvider8B -from megatron.bridge.recipes.llama.llama31_8b import get_comm_overlap_config, model_config, pretrain_config -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig, userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192 -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Llama31ModelProvider8B) - assert config.tensor_model_parallel_size == 1 - assert config.pipeline_model_parallel_size == 1 - assert config.pipeline_dtype is None - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 2 - assert config.sequence_parallel is False - - def test_model_config_custom_tensor_parallelism(self): - """Test model_config with custom tensor parallelism.""" - config = model_config(tensor_parallelism=4) - - assert config.tensor_model_parallel_size == 4 - assert config.pipeline_model_parallel_size == 1 # default - assert config.context_parallel_size == 2 # default - - def test_model_config_custom_pipeline_parallelism(self): - """Test model_config with custom pipeline parallelism.""" - config = model_config(pipeline_parallelism=8, pipeline_parallelism_dtype=torch.float16) - - assert config.tensor_model_parallel_size == 1 # default - assert config.pipeline_model_parallel_size == 8 - assert config.pipeline_dtype is torch.float16 - - def test_model_config_with_pipeline_dtype(self): - """Test model_config with pipeline dtype specified.""" - config = model_config(pipeline_parallelism=2, 
pipeline_parallelism_dtype=torch.float16) - - assert config.pipeline_model_parallel_size == 2 - assert config.pipeline_dtype == torch.float16 - - def test_model_config_virtual_pipeline_parallelism(self): - """Test model_config with virtual pipeline parallelism.""" - config = model_config(virtual_pipeline_parallelism=4) - - assert config.virtual_pipeline_model_parallel_size == 4 - - def test_model_config_context_parallelism(self): - """Test model_config with custom context parallelism.""" - config = model_config(context_parallelism=8) - - assert config.context_parallel_size == 8 - - def test_model_config_sequence_parallelism_enabled(self): - """Test model_config with sequence parallelism enabled.""" - config = model_config(sequence_parallelism=True, tensor_parallelism=2) - - assert config.sequence_parallel is True - - def test_model_config_all_custom_parameters(self): - """Test model_config with all parameters customized.""" - config = model_config( - tensor_parallelism=2, - pipeline_parallelism=4, - pipeline_parallelism_dtype=torch.bfloat16, - virtual_pipeline_parallelism=8, - context_parallelism=16, - sequence_parallelism=True, - ) - - assert config.tensor_model_parallel_size == 2 - assert config.pipeline_model_parallel_size == 4 - assert config.pipeline_dtype == torch.bfloat16 - assert config.virtual_pipeline_model_parallel_size == 8 - assert config.context_parallel_size == 16 - assert config.sequence_parallel is True - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Llama31ModelProvider8B) - - # Check training configuration - assert config.train.train_iters == 1_168_251 - assert config.train.global_batch_size == 512 - assert config.train.micro_batch_size == 1 - assert 
config.train.eval_interval == 2000 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.weight_decay == 0.1 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 8192 - assert config.model.seq_length == 8192 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=2, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 2 - assert config.dataset.sequence_length == 8192 # Always 8192 for Llama3.1 8B - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 # Note: fixed in scheduler config - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_custom_model_parameters(self): - """Test pretrain_config with custom model parameters.""" - config = pretrain_config( - tensor_parallelism=8, - pipeline_parallelism=4, - context_parallelism=2, - sequence_parallelism=False, - pipeline_parallelism_dtype=torch.float32, - virtual_pipeline_parallelism=10, - ) - - assert config.model.tensor_model_parallel_size == 8 - assert config.model.pipeline_model_parallel_size == 4 - assert config.model.context_parallel_size == 2 - assert config.model.sequence_parallel is False - assert config.model.pipeline_dtype == torch.float32 - assert 
config.model.virtual_pipeline_model_parallel_size == 10 - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Check that non-mock mode is configured - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are 
provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - data_paths=["/path/to/data1", "/path/to/data2"], - ) - - # Should prioritize blend over blend_per_split - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def test_pretrain_config_fallback_to_mock_when_no_weights(self, mock_get_blend): - """Test pretrain_config falls back to mock when no weights are returned.""" - # Mock function returns None for both weights - mock_get_blend.return_value = (None, None) - - config = pretrain_config(data_paths=["/some/path"]) - - # Should fall back to mock mode - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 2000 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - assert config.checkpoint.async_save is False - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True - assert config.ddp.overlap_grad_reduce is True # DP size > 1 with default config - assert config.ddp.overlap_param_gather is True # DP size > 1 with default config - # align_param_gather is set by comm_overlap config during setup, not in recipe - assert config.ddp.average_in_collective is True - assert config.ddp.use_distributed_optimizer is True - - def 
test_pretrain_config_manual_gc(self): - """Test manual garbage collection configuration.""" - config = pretrain_config() - - assert config.train.manual_gc is True - assert config.train.manual_gc_interval == 100 - assert config.train.manual_gc_eval == 100 - - def test_pretrain_config_default_comm_overlap(self): - """Test default CommOverlapConfig setup.""" - config = pretrain_config() - - # Default setup should have comm_overlap disabled (None) for memory efficiency - assert config.comm_overlap is None - - def test_pretrain_config_custom_comm_overlap(self): - """Test custom CommOverlapConfig.""" - custom_overlap = CommOverlapConfig( - tp_comm_overlap=True, - tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, - defer_embedding_wgrad_compute=True, - wgrad_deferral_limit=22, - data_parallel_size=2, - ) - config = pretrain_config(comm_overlap_config=custom_overlap) - - # Should use the custom config - assert config.comm_overlap is not None # TP size is 1 by default - - def test_pretrain_config_comm_overlap_with_tp(self): - """Test CommOverlapConfig with tensor parallelism enabled.""" - # Even with TP > 1, comm_overlap should be None by default for memory efficiency - config = pretrain_config( - tensor_parallelism=4, - context_parallelism=2, - sequence_parallelism=True, - ) - - # Comm overlap should be disabled by default regardless of parallelism settings - assert config.comm_overlap is None - - def test_pretrain_config_explicit_comm_overlap_enable(self): - """Test that communication overlap can still be enabled when explicitly provided.""" - # Create a custom comm overlap config to enable it explicitly - custom_overlap = CommOverlapConfig( - tp_comm_overlap=True, - defer_embedding_wgrad_compute=True, - wgrad_deferral_limit=25, - ) - - config = pretrain_config( - tensor_parallelism=4, context_parallelism=2, sequence_parallelism=True, comm_overlap_config=custom_overlap - ) - - # Should use the explicitly provided config - assert config.comm_overlap is 
not None - assert config.comm_overlap.tp_comm_overlap is True - assert config.comm_overlap.defer_embedding_wgrad_compute is True - assert config.comm_overlap.wgrad_deferral_limit == 25 - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 2000 - assert config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - assert config.scheduler.override_opt_param_scheduler is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - - def test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - assert config.dataset.num_workers == 8 - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize( - 
"tensor_parallelism,pipeline_parallelism,context_parallelism", - [ - (1, 1, 1), - (2, 1, 4), - (1, 4, 2), - (2, 2, 2), # Changed from 8 to 2 to fit in 8 GPUs - (4, 2, 1), # Changed from 4,4,16 to fit in 8 GPUs - ], - ) - def test_pretrain_config_parallelism_combinations( - self, tensor_parallelism, pipeline_parallelism, context_parallelism - ): - """Test various parallelism combinations.""" - config = pretrain_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - context_parallelism=context_parallelism, - pipeline_parallelism_dtype=torch.bfloat16, - ) - - assert config.model.tensor_model_parallel_size == tensor_parallelism - assert config.model.pipeline_model_parallel_size == pipeline_parallelism - assert config.model.context_parallel_size == context_parallelism - - @pytest.mark.parametrize( - "global_batch_size,micro_batch_size", - [ - (128, 1), - (512, 2), - (1024, 4), - (256, 8), - ], - ) - def test_pretrain_config_batch_size_combinations(self, global_batch_size, micro_batch_size): - """Test various batch size combinations.""" - config = pretrain_config(global_batch_size=global_batch_size, micro_batch_size=micro_batch_size) - - assert config.train.global_batch_size == global_batch_size - assert config.train.micro_batch_size == micro_batch_size - - def test_pretrain_config_llama31_defaults(self): - """Test that Llama3.1 8B specific defaults are applied correctly.""" - config = pretrain_config() - - # Check model defaults for Llama3.1 8B - assert config.model.tensor_model_parallel_size == 1 # Default for 8B - assert config.model.pipeline_model_parallel_size == 1 # Default for 8B - assert config.model.pipeline_dtype is None # Default - assert config.model.sequence_parallel is False # Default for 8B - assert config.model.context_parallel_size == 2 # Llama3.1 specific - assert config.model.virtual_pipeline_model_parallel_size is None # Default - - # Check dataset defaults - assert config.dataset.sequence_length == 8192 # 
Standard 8k sequence length - - @pytest.mark.parametrize("precision", ["fp16_mixed", "bf16_with_fp8_delayed_scaling_mixed"]) - def test_precision_recipes(self, precision): - cfg = pretrain_config(precision_config=precision) - assert cfg.mixed_precision == precision - - -@pytest.mark.unit -class TestGetCommOverlapConfig: - """Test cases for the get_comm_overlap_config function.""" - - def test_get_comm_overlap_config_default_values(self): - """Test get_comm_overlap_config returns the expected configuration.""" - config = get_comm_overlap_config() - - assert isinstance(config, CommOverlapConfig) - assert config.tp_comm_overlap is True - assert config.defer_embedding_wgrad_compute is True - assert config.wgrad_deferral_limit == 50 - assert config.overlap_param_gather_with_optimizer_step is False - assert config.align_param_gather is True diff --git a/tests/unit_tests/recipes/llama/test_llama32_1b.py b/tests/unit_tests/recipes/llama/test_llama32_1b.py deleted file mode 100644 index d253365717..0000000000 --- a/tests/unit_tests/recipes/llama/test_llama32_1b.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import tempfile -from unittest.mock import patch - -import pytest -import torch - -from megatron.bridge.models.llama import Llama32ModelProvider1B -from megatron.bridge.recipes.llama.llama32_1b import model_config, pretrain_config -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Llama32ModelProvider1B) - assert config.tensor_model_parallel_size == 1 - assert config.pipeline_model_parallel_size == 1 - assert config.pipeline_dtype is None - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.sequence_parallel is False - - def test_model_config_custom_tensor_parallelism(self): - """Test model_config with custom tensor parallelism.""" - config = model_config(tensor_parallelism=2) - - assert config.tensor_model_parallel_size == 2 - assert config.pipeline_model_parallel_size == 1 # default - assert config.context_parallel_size == 1 # default - - def test_model_config_custom_pipeline_parallelism(self): - """Test model_config with custom pipeline parallelism.""" - config = model_config(pipeline_parallelism=2, pipeline_parallelism_dtype=torch.float16) - - assert config.tensor_model_parallel_size == 1 # default - assert config.pipeline_model_parallel_size == 2 - assert config.pipeline_dtype is torch.float16 - - def test_model_config_with_pipeline_dtype(self): - """Test model_config with pipeline dtype specified.""" - config = model_config(pipeline_parallelism=2, pipeline_parallelism_dtype=torch.bfloat16) - - assert config.pipeline_model_parallel_size == 2 - assert 
config.pipeline_dtype == torch.bfloat16 - - def test_model_config_virtual_pipeline_parallelism(self): - """Test model_config with virtual pipeline parallelism.""" - config = model_config(virtual_pipeline_parallelism=2) - - assert config.virtual_pipeline_model_parallel_size == 2 - - def test_model_config_context_parallelism(self): - """Test model_config with custom context parallelism.""" - config = model_config(context_parallelism=2) - - assert config.context_parallel_size == 2 - - def test_model_config_sequence_parallelism_enabled(self): - """Test model_config with sequence parallelism enabled.""" - config = model_config(sequence_parallelism=True, tensor_parallelism=2) - - assert config.sequence_parallel is True - - def test_model_config_all_custom_parameters(self): - """Test model_config with all parameters customized.""" - config = model_config( - tensor_parallelism=2, - pipeline_parallelism=2, - pipeline_parallelism_dtype=torch.bfloat16, - virtual_pipeline_parallelism=2, - context_parallelism=2, - sequence_parallelism=True, - ) - - assert config.tensor_model_parallel_size == 2 - assert config.pipeline_model_parallel_size == 2 - assert config.pipeline_dtype == torch.bfloat16 - assert config.virtual_pipeline_model_parallel_size == 2 - assert config.context_parallel_size == 2 - assert config.sequence_parallel is True - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Llama32ModelProvider1B) - - # Check training configuration - assert config.train.train_iters == 1_168_251 - assert config.train.global_batch_size == 512 - assert config.train.micro_batch_size == 1 - assert config.train.eval_interval == 2000 - assert config.train.eval_iters == 32 - assert config.train.manual_gc is True - 
assert config.train.manual_gc_interval == 100 - assert config.train.manual_gc_eval == 100 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.weight_decay == 0.1 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 8192 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=2, - seq_length=4096, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 2 - assert config.dataset.sequence_length == 4096 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 # Note: fixed in scheduler config - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_custom_model_parameters(self): - """Test pretrain_config with custom model parameters.""" - config = pretrain_config( - tensor_parallelism=2, - pipeline_parallelism=2, - context_parallelism=2, - sequence_parallelism=True, - pipeline_parallelism_dtype=torch.bfloat16, - ) - - assert config.model.tensor_model_parallel_size == 2 - assert config.model.pipeline_model_parallel_size == 2 - assert config.model.context_parallel_size == 2 - assert config.model.sequence_parallel is True - assert config.model.pipeline_dtype == torch.bfloat16 - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with 
custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Check that non-mock mode is configured - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", 
"/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - data_paths=["/path/to/data1", "/path/to/data2"], - ) - - # Should prioritize blend over blend_per_split - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def test_pretrain_config_fallback_to_mock_when_no_weights(self, mock_get_blend): - """Test pretrain_config falls back to mock when no weights are returned.""" - # Mock function returns None for both weights - mock_get_blend.return_value = (None, None) - - config = pretrain_config(data_paths=["/some/path"]) - - # Should fall back to mock mode - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 2000 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - assert config.checkpoint.async_save is False - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True - assert config.ddp.overlap_grad_reduce is True # DP size > 1 with default config - assert config.ddp.overlap_param_gather is True # DP size > 1 with default config - assert config.ddp.average_in_collective is True - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert 
config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 2000 - assert config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - assert config.scheduler.override_opt_param_scheduler is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - - def test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - assert config.dataset.num_workers == 8 - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize( - "tensor_parallelism,pipeline_parallelism,context_parallelism", - [ - (1, 1, 1), - (2, 1, 1), - (1, 2, 1), - (2, 2, 1), - (2, 2, 2), - ], - ) - def test_pretrain_config_parallelism_combinations( - self, tensor_parallelism, pipeline_parallelism, context_parallelism - ): - """Test various parallelism combinations for 1B model.""" - config = pretrain_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - 
context_parallelism=context_parallelism, - pipeline_parallelism_dtype=torch.bfloat16, - ) - - assert config.model.tensor_model_parallel_size == tensor_parallelism - assert config.model.pipeline_model_parallel_size == pipeline_parallelism - assert config.model.context_parallel_size == context_parallelism - - @pytest.mark.parametrize( - "global_batch_size,micro_batch_size", - [ - (128, 1), - (256, 1), - (512, 2), - (1024, 4), - ], - ) - def test_pretrain_config_batch_size_combinations(self, global_batch_size, micro_batch_size): - """Test various batch size combinations.""" - config = pretrain_config(global_batch_size=global_batch_size, micro_batch_size=micro_batch_size) - - assert config.train.global_batch_size == global_batch_size - assert config.train.micro_batch_size == micro_batch_size - - def test_pretrain_config_llama32_1b_defaults(self): - """Test that Llama3.2 1B specific defaults are applied correctly.""" - config = pretrain_config() - - # Check model defaults for Llama3.2 1B (small model) - assert config.model.tensor_model_parallel_size == 1 # Minimal for 1B - assert config.model.pipeline_model_parallel_size == 1 # Minimal for 1B - assert config.model.pipeline_dtype is None # Default for small model - assert config.model.sequence_parallel is False # Default for 1B - assert config.model.context_parallel_size == 1 # Minimal for 1B - assert config.model.virtual_pipeline_model_parallel_size is None # Default - - # Check dataset defaults - assert config.dataset.sequence_length == 8192 # Hardcoded sequence length - - @pytest.mark.parametrize("precision", ["fp16_mixed", "bf16_with_fp8_delayed_scaling_mixed"]) - def test_precision_recipes(self, precision): - cfg = pretrain_config(precision_config=precision) - assert cfg.mixed_precision == precision - - def test_pretrain_config_manual_gc(self): - """Test manual garbage collection configuration.""" - config = pretrain_config() - - assert config.train.manual_gc is True - assert config.train.manual_gc_interval == 100 - 
assert config.train.manual_gc_eval == 100 - - def test_pretrain_config_default_comm_overlap(self): - """Test default CommOverlapConfig setup.""" - config = pretrain_config() - - # Default setup should have TP comm overlap disabled for 1B model - assert config.comm_overlap is not None - - def test_pretrain_config_custom_comm_overlap(self): - """Test custom CommOverlapConfig.""" - custom_overlap = CommOverlapConfig( - tp_comm_overlap=True, - defer_embedding_wgrad_compute=True, - wgrad_deferral_limit=50, - data_parallel_size=1, # Add this to avoid None - ) - config = pretrain_config(comm_overlap_config=custom_overlap) - - # Should use the custom config - # Since default TP size is 1, it should be disabled - assert config.comm_overlap is not None - - def test_pretrain_config_seq_length_parameter(self): - """Test seq_length parameter.""" - config = pretrain_config(seq_length=4096) - assert config.dataset.sequence_length == 4096 - - config = pretrain_config(seq_length=16384) - assert config.dataset.sequence_length == 16384 diff --git a/tests/unit_tests/recipes/llama/test_llama32_3b.py b/tests/unit_tests/recipes/llama/test_llama32_3b.py deleted file mode 100644 index 390e677ef1..0000000000 --- a/tests/unit_tests/recipes/llama/test_llama32_3b.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import tempfile -from unittest.mock import patch - -import pytest -import torch - -from megatron.bridge.models.llama import Llama32ModelProvider3B -from megatron.bridge.recipes.llama.llama32_3b import model_config, pretrain_config -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Llama32ModelProvider3B) - assert config.tensor_model_parallel_size == 1 - assert config.pipeline_model_parallel_size == 1 - assert config.pipeline_dtype is None - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.sequence_parallel is False - - def test_model_config_custom_tensor_parallelism(self): - """Test model_config with custom tensor parallelism.""" - config = model_config(tensor_parallelism=2) - - assert config.tensor_model_parallel_size == 2 - assert config.pipeline_model_parallel_size == 1 # default - assert config.context_parallel_size == 1 # default - - def test_model_config_custom_pipeline_parallelism(self): - """Test model_config with custom pipeline parallelism.""" - config = model_config(pipeline_parallelism=2, pipeline_parallelism_dtype=torch.float16) - - assert config.tensor_model_parallel_size == 1 # default - assert config.pipeline_model_parallel_size == 2 - assert config.pipeline_dtype is torch.float16 - - def test_model_config_with_pipeline_dtype(self): - """Test model_config with pipeline dtype specified.""" - config = model_config(pipeline_parallelism=2, pipeline_parallelism_dtype=torch.bfloat16) - - assert config.pipeline_model_parallel_size == 2 - assert 
config.pipeline_dtype == torch.bfloat16 - - def test_model_config_virtual_pipeline_parallelism(self): - """Test model_config with virtual pipeline parallelism.""" - config = model_config(virtual_pipeline_parallelism=2) - - assert config.virtual_pipeline_model_parallel_size == 2 - - def test_model_config_context_parallelism(self): - """Test model_config with custom context parallelism.""" - config = model_config(context_parallelism=2) - - assert config.context_parallel_size == 2 - - def test_model_config_sequence_parallelism_enabled(self): - """Test model_config with sequence parallelism enabled.""" - config = model_config(sequence_parallelism=True, tensor_parallelism=2) - - assert config.sequence_parallel is True - - def test_model_config_all_custom_parameters(self): - """Test model_config with all parameters customized.""" - config = model_config( - tensor_parallelism=2, - pipeline_parallelism=2, - pipeline_parallelism_dtype=torch.bfloat16, - virtual_pipeline_parallelism=2, - context_parallelism=2, - sequence_parallelism=True, - ) - - assert config.tensor_model_parallel_size == 2 - assert config.pipeline_model_parallel_size == 2 - assert config.pipeline_dtype == torch.bfloat16 - assert config.virtual_pipeline_model_parallel_size == 2 - assert config.context_parallel_size == 2 - assert config.sequence_parallel is True - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Llama32ModelProvider3B) - - # Check training configuration - assert config.train.train_iters == 1_168_251 - assert config.train.global_batch_size == 512 - assert config.train.micro_batch_size == 1 - assert config.train.eval_interval == 2000 - assert config.train.eval_iters == 32 - assert config.train.manual_gc is True - 
assert config.train.manual_gc_interval == 100 - assert config.train.manual_gc_eval == 100 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.weight_decay == 0.1 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 8192 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=2, - seq_length=4096, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 2 - assert config.dataset.sequence_length == 4096 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 # Note: fixed in scheduler config - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_custom_model_parameters(self): - """Test pretrain_config with custom model parameters.""" - config = pretrain_config( - tensor_parallelism=2, - pipeline_parallelism=2, - context_parallelism=2, - sequence_parallelism=True, - pipeline_parallelism_dtype=torch.bfloat16, - ) - - assert config.model.tensor_model_parallel_size == 2 - assert config.model.pipeline_model_parallel_size == 2 - assert config.model.context_parallel_size == 2 - assert config.model.sequence_parallel is True - assert config.model.pipeline_dtype == torch.bfloat16 - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with 
custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Check that non-mock mode is configured - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", 
"/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - data_paths=["/path/to/data1", "/path/to/data2"], - ) - - # Should prioritize blend over blend_per_split - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def test_pretrain_config_fallback_to_mock_when_no_weights(self, mock_get_blend): - """Test pretrain_config falls back to mock when no weights are returned.""" - # Mock function returns None for both weights - mock_get_blend.return_value = (None, None) - - config = pretrain_config(data_paths=["/some/path"]) - - # Should fall back to mock mode - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 2000 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - assert config.checkpoint.async_save is False - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True - assert config.ddp.overlap_grad_reduce is True # DP size > 1 with default config - assert config.ddp.overlap_param_gather is True # DP size > 1 with default config - assert config.ddp.average_in_collective is True - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert 
config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 2000 - assert config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - assert config.scheduler.override_opt_param_scheduler is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - - def test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - assert config.dataset.num_workers == 8 - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize( - "tensor_parallelism,pipeline_parallelism,context_parallelism", - [ - (1, 1, 1), - (2, 1, 1), - (1, 2, 1), - (2, 2, 1), - (2, 2, 2), - ], - ) - def test_pretrain_config_parallelism_combinations( - self, tensor_parallelism, pipeline_parallelism, context_parallelism - ): - """Test various parallelism combinations for 3B model.""" - config = pretrain_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - 
context_parallelism=context_parallelism, - pipeline_parallelism_dtype=torch.bfloat16, - ) - - assert config.model.tensor_model_parallel_size == tensor_parallelism - assert config.model.pipeline_model_parallel_size == pipeline_parallelism - assert config.model.context_parallel_size == context_parallelism - - @pytest.mark.parametrize( - "global_batch_size,micro_batch_size", - [ - (128, 1), - (256, 1), - (512, 2), - (1024, 4), - ], - ) - def test_pretrain_config_batch_size_combinations(self, global_batch_size, micro_batch_size): - """Test various batch size combinations.""" - config = pretrain_config(global_batch_size=global_batch_size, micro_batch_size=micro_batch_size) - - assert config.train.global_batch_size == global_batch_size - assert config.train.micro_batch_size == micro_batch_size - - def test_pretrain_config_llama32_3b_defaults(self): - """Test that Llama3.2 3B specific defaults are applied correctly.""" - config = pretrain_config() - - # Check model defaults for Llama3.2 3B (mid-size model) - assert config.model.tensor_model_parallel_size == 1 # Default for 3B - assert config.model.pipeline_model_parallel_size == 1 # Default for 3B - assert config.model.pipeline_dtype is None # Default for mid-size model - assert config.model.sequence_parallel is False # Default for 3B - assert config.model.context_parallel_size == 1 # Default for 3B - assert config.model.virtual_pipeline_model_parallel_size is None # Default - - # Check dataset defaults - assert config.dataset.sequence_length == 8192 # Hardcoded sequence length - - @pytest.mark.parametrize("precision", ["fp16_mixed", "bf16_with_fp8_delayed_scaling_mixed"]) - def test_precision_recipes(self, precision): - cfg = pretrain_config(precision_config=precision) - assert cfg.mixed_precision == precision - - def test_pretrain_config_manual_gc(self): - """Test manual garbage collection configuration.""" - config = pretrain_config() - - assert config.train.manual_gc is True - assert config.train.manual_gc_interval == 
100 - assert config.train.manual_gc_eval == 100 - - def test_pretrain_config_default_comm_overlap(self): - """Test default CommOverlapConfig setup.""" - config = pretrain_config() - - # Default setup should have TP comm overlap disabled for 3B model - assert config.comm_overlap is not None - - def test_pretrain_config_custom_comm_overlap(self): - """Test custom CommOverlapConfig.""" - custom_overlap = CommOverlapConfig( - tp_comm_overlap=True, - defer_embedding_wgrad_compute=True, - wgrad_deferral_limit=50, - data_parallel_size=1, # Add this to avoid None - ) - config = pretrain_config(comm_overlap_config=custom_overlap) - - # Should use the custom config - # Since default TP size is 1, it should be disabled - assert config.comm_overlap is not None - - def test_pretrain_config_seq_length_parameter(self): - """Test seq_length parameter.""" - config = pretrain_config(seq_length=4096) - assert config.dataset.sequence_length == 4096 - - config = pretrain_config(seq_length=16384) - assert config.dataset.sequence_length == 16384 diff --git a/tests/unit_tests/recipes/llama/test_llama3_70b.py b/tests/unit_tests/recipes/llama/test_llama3_70b.py deleted file mode 100644 index 71c36e96dd..0000000000 --- a/tests/unit_tests/recipes/llama/test_llama3_70b.py +++ /dev/null @@ -1,429 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import tempfile -from unittest.mock import patch - -import pytest -import torch - -from megatron.bridge.models.llama import Llama3ModelProvider70B -from megatron.bridge.recipes.llama.llama3_70b import model_config, pretrain_config -from megatron.bridge.training.comm_overlap import CommOverlapConfig, userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192 -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Llama3ModelProvider70B) - assert config.tensor_model_parallel_size == 4 - assert config.pipeline_model_parallel_size == 4 - assert config.pipeline_dtype == torch.bfloat16 - assert config.virtual_pipeline_model_parallel_size == 5 - assert config.context_parallel_size == 2 - assert config.sequence_parallel is True - - def test_model_config_custom_tensor_parallelism(self): - """Test model_config with custom tensor parallelism.""" - config = model_config(tensor_parallelism=8) - - assert config.tensor_model_parallel_size == 8 - assert config.pipeline_model_parallel_size == 4 # default - assert config.context_parallel_size == 2 # default - - def test_model_config_custom_pipeline_parallelism(self): - """Test model_config with custom pipeline parallelism.""" - config = model_config(pipeline_parallelism=8, pipeline_parallelism_dtype=torch.float16) - - assert config.tensor_model_parallel_size == 4 # default - assert config.pipeline_model_parallel_size == 8 - assert config.pipeline_dtype is torch.float16 - - def test_model_config_with_pipeline_dtype(self): - """Test model_config with pipeline dtype specified.""" - config = model_config(pipeline_parallelism=2, pipeline_parallelism_dtype=torch.float16) - - assert config.pipeline_model_parallel_size == 2 - assert config.pipeline_dtype == torch.float16 - - def 
test_model_config_virtual_pipeline_parallelism(self): - """Test model_config with virtual pipeline parallelism.""" - config = model_config(virtual_pipeline_parallelism=8) - - assert config.virtual_pipeline_model_parallel_size == 8 - - def test_model_config_context_parallelism(self): - """Test model_config with custom context parallelism.""" - config = model_config(context_parallelism=8) - - assert config.context_parallel_size == 8 - - def test_model_config_sequence_parallelism_disabled(self): - """Test model_config with sequence parallelism disabled.""" - config = model_config(sequence_parallelism=False) - - assert config.sequence_parallel is False - - def test_model_config_all_custom_parameters(self): - """Test model_config with all parameters customized.""" - config = model_config( - tensor_parallelism=8, - pipeline_parallelism=8, - pipeline_parallelism_dtype=torch.float32, - virtual_pipeline_parallelism=10, - context_parallelism=4, - sequence_parallelism=False, - ) - - assert config.tensor_model_parallel_size == 8 - assert config.pipeline_model_parallel_size == 8 - assert config.pipeline_dtype == torch.float32 - assert config.virtual_pipeline_model_parallel_size == 10 - assert config.context_parallel_size == 4 - assert config.sequence_parallel is False - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Llama3ModelProvider70B) - - # Check training configuration - assert config.train.train_iters == 1_168_251 - assert config.train.global_batch_size == 512 - assert config.train.micro_batch_size == 1 - assert config.train.eval_interval == 2000 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert 
config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.weight_decay == 0.1 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 8192 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=2, - seq_length=4096, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 2 - assert config.dataset.sequence_length == 4096 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 # Note: fixed in scheduler config - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_custom_model_parameters(self): - """Test pretrain_config with custom model parameters.""" - config = pretrain_config( - tensor_parallelism=8, - pipeline_parallelism=8, - context_parallelism=4, - sequence_parallelism=False, - pipeline_parallelism_dtype=torch.float32, - virtual_pipeline_parallelism=10, - ) - - assert config.model.tensor_model_parallel_size == 8 - assert config.model.pipeline_model_parallel_size == 8 - assert config.model.context_parallel_size == 4 - assert config.model.sequence_parallel is False - assert config.model.pipeline_dtype == torch.float32 - assert config.model.virtual_pipeline_model_parallel_size == 10 - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = 
pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Check that non-mock mode is configured - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - 
data_paths=["/path/to/data1", "/path/to/data2"], - ) - - # Should prioritize blend over blend_per_split - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def test_pretrain_config_fallback_to_mock_when_no_weights(self, mock_get_blend): - """Test pretrain_config falls back to mock when no weights are returned.""" - # Mock function returns None for both weights - mock_get_blend.return_value = (None, None) - - config = pretrain_config(data_paths=["/some/path"]) - - # Should fall back to mock mode - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 2000 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - assert config.checkpoint.async_save is False - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True - assert config.ddp.overlap_grad_reduce is True - assert config.ddp.overlap_param_gather is True - assert config.ddp.average_in_collective is True - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_manual_gc(self): - """Test manual garbage collection configuration.""" - config = pretrain_config() - - assert config.train.manual_gc is True - assert config.train.manual_gc_interval == 100 - assert config.train.manual_gc_eval == 100 - - def test_pretrain_config_default_comm_overlap(self): - """Test default CommOverlapConfig setup.""" - config = pretrain_config() - - # 
Default setup should have comm_overlap config - assert config.comm_overlap is not None - - def test_pretrain_config_custom_comm_overlap(self): - """Test custom CommOverlapConfig.""" - custom_overlap = CommOverlapConfig( - tp_comm_overlap=True, - tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, - defer_embedding_wgrad_compute=True, - wgrad_deferral_limit=22, - data_parallel_size=2, - ) - config = pretrain_config(comm_overlap_config=custom_overlap) - - # Should use the custom config - assert config.comm_overlap == custom_overlap - - def test_pretrain_config_comm_overlap_with_tp(self): - """Test CommOverlapConfig with tensor parallelism enabled.""" - # Mock HAVE_TE to True to simulate transformer engine being available - with patch("megatron.bridge.training.comm_overlap.HAVE_TE", True): - config = pretrain_config(tensor_parallelism=4, sequence_parallelism=True) - - # With TP > 1 and sequence parallelism, comm_overlap should be configured - assert config.comm_overlap is not None - assert config.comm_overlap.tp_comm_overlap is True - assert config.comm_overlap.defer_embedding_wgrad_compute is True - assert config.comm_overlap.wgrad_deferral_limit == 22 - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 2000 - assert config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - assert config.scheduler.override_opt_param_scheduler is True - - @pytest.mark.parametrize("vocab_size", [2048, 4096, 8192, 16384]) - def test_pretrain_config_tokenizer_configuration(self, vocab_size): - """Test tokenizer configuration.""" - config 
= pretrain_config(vocab_size=vocab_size) - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == vocab_size - - def test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - assert config.dataset.num_workers == 8 - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize( - "tensor_parallelism,pipeline_parallelism,context_parallelism", - [ - (2, 2, 1), - (4, 4, 2), - (8, 4, 4), - (4, 8, 2), - (8, 8, 4), - ], - ) - def test_pretrain_config_parallelism_combinations( - self, tensor_parallelism, pipeline_parallelism, context_parallelism - ): - """Test various parallelism combinations for 70B model.""" - config = pretrain_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - context_parallelism=context_parallelism, - pipeline_parallelism_dtype=torch.bfloat16, - ) - - assert config.model.tensor_model_parallel_size == tensor_parallelism - assert config.model.pipeline_model_parallel_size == pipeline_parallelism - assert config.model.context_parallel_size == context_parallelism - - @pytest.mark.parametrize( - "global_batch_size,micro_batch_size", - [ - (256, 1), - (512, 1), - (1024, 2), - (512, 4), - ], - ) - def 
test_pretrain_config_batch_size_combinations(self, global_batch_size, micro_batch_size): - """Test various batch size combinations.""" - config = pretrain_config(global_batch_size=global_batch_size, micro_batch_size=micro_batch_size) - - assert config.train.global_batch_size == global_batch_size - assert config.train.micro_batch_size == micro_batch_size - - @pytest.mark.parametrize("seq_length", [2048, 4096, 8192, 16384]) - def test_pretrain_config_sequence_lengths(self, seq_length): - """Test various sequence lengths.""" - config = pretrain_config(seq_length=seq_length) - - assert config.dataset.sequence_length == seq_length - - def test_pretrain_config_70b_optimized_defaults(self): - """Test that 70B specific optimizations are applied by default.""" - config = pretrain_config() - - # Check model defaults optimized for 70B - assert config.model.tensor_model_parallel_size == 4 # Higher than smaller models - assert config.model.pipeline_model_parallel_size == 4 # Higher than smaller models - assert config.model.pipeline_dtype == torch.bfloat16 # Optimized dtype - assert config.model.sequence_parallel is True # Enabled for efficiency - assert config.model.context_parallel_size == 2 # Context parallelism for efficiency - assert config.model.virtual_pipeline_model_parallel_size == 5 # Virtual PP for large model - - # Check dataset defaults - assert config.dataset.sequence_length == 8192 # Standard sequence length - - @pytest.mark.parametrize("virtual_pipeline_parallelism", [None, 3, 5, 7, 10]) - def test_pretrain_config_virtual_pipeline_parallelism(self, virtual_pipeline_parallelism): - """Test various virtual pipeline parallelism settings.""" - config = pretrain_config(virtual_pipeline_parallelism=virtual_pipeline_parallelism) - - assert config.model.virtual_pipeline_model_parallel_size == virtual_pipeline_parallelism - - @pytest.mark.parametrize("precision", ["fp16_mixed", "bf16_with_fp8_delayed_scaling_mixed"]) - def test_precision_recipes(self, precision): - 
"""Ensure precision recipes properly affect model/optimizer/ddp settings.""" - cfg = pretrain_config(precision_config=precision) - assert cfg.mixed_precision == precision diff --git a/tests/unit_tests/recipes/llama/test_llama3_70b_16k.py b/tests/unit_tests/recipes/llama/test_llama3_70b_16k.py deleted file mode 100644 index 08fa03f336..0000000000 --- a/tests/unit_tests/recipes/llama/test_llama3_70b_16k.py +++ /dev/null @@ -1,318 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import tempfile - -import pytest -import torch - -from megatron.bridge.models.llama import Llama3ModelProvider70B -from megatron.bridge.recipes.llama.llama3_70b_16k import SEQUENCE_LENGTH_16K, model_config, pretrain_config -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters_70b_16k_optimized(self): - """Test model_config with default parameters optimized for 70B with 16k sequences.""" - config = model_config() - - assert isinstance(config, Llama3ModelProvider70B) - # Verify 70B + 16k optimized defaults - assert config.tensor_model_parallel_size == 8 # High for 70B model - assert config.pipeline_model_parallel_size == 2 # Reasonable for 70B - assert config.pipeline_dtype == torch.bfloat16 # Specified for efficiency - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 2 # Appropriate for 16k - assert config.sequence_parallel is True # Enabled for 70B + 16k - # Verify model sequence length matches 16k - assert config.seq_length == SEQUENCE_LENGTH_16K # Model configured for 16k sequences - - def test_model_config_custom_parameters(self): - """Test model_config with custom parameters.""" - config = model_config( - tensor_parallelism=4, - pipeline_parallelism=4, - pipeline_parallelism_dtype=torch.float16, - virtual_pipeline_parallelism=2, - context_parallelism=4, - sequence_parallelism=False, - ) - - assert config.tensor_model_parallel_size == 4 - assert config.pipeline_model_parallel_size == 4 - assert config.pipeline_dtype == torch.float16 - assert config.virtual_pipeline_model_parallel_size == 2 - assert config.context_parallel_size == 4 - assert config.sequence_parallel is False - # Verify model sequence length is still 16k with custom parameters - assert 
config.seq_length == SEQUENCE_LENGTH_16K - - def test_model_config_sequence_length_consistency(self): - """Test that model_config always uses the 16k sequence length constant.""" - configs = [ - model_config(), - model_config(tensor_parallelism=4), - model_config(context_parallelism=4), - model_config(sequence_parallelism=False), - ] - - for config in configs: - assert config.seq_length == SEQUENCE_LENGTH_16K, "Model sequence length should always be 16k" - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters_70b_16k_optimized(self): - """Test pretrain_config with default parameters optimized for 70B with 16k sequences.""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Llama3ModelProvider70B) - - # Check that sequence length is set to 16k in both model and dataset - assert config.dataset.sequence_length == SEQUENCE_LENGTH_16K - assert config.model.seq_length == SEQUENCE_LENGTH_16K - - # Check that model uses 70B + 16k optimized defaults - assert config.model.tensor_model_parallel_size == 8 - assert config.model.pipeline_model_parallel_size == 2 - assert config.model.pipeline_dtype == torch.bfloat16 - assert config.model.context_parallel_size == 2 - assert config.model.sequence_parallel is True - - def test_pretrain_config_custom_parameters(self): - """Test pretrain_config with custom parameters.""" - config = pretrain_config( - dir="/custom/path", - name="custom_run", - tensor_parallelism=8, - pipeline_parallelism=4, - context_parallelism=2, - train_iters=10000, - global_batch_size=256, - micro_batch_size=2, - ) - - assert config.model.tensor_model_parallel_size == 8 - assert config.model.pipeline_model_parallel_size == 4 - assert config.model.context_parallel_size == 2 - assert config.dataset.sequence_length == SEQUENCE_LENGTH_16K # Should be 16k - assert config.train.train_iters == 10000 - assert 
config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 2 - - def test_pretrain_config_16k_sequence_length_override(self): - """Test that sequence length is always set to 16k.""" - # Test with various parameters, but sequence length should always be 16k - configs = [ - pretrain_config(), - pretrain_config(tensor_parallelism=4), - pretrain_config(train_iters=100000), - pretrain_config(global_batch_size=1024), - ] - - for config in configs: - assert config.dataset.sequence_length == SEQUENCE_LENGTH_16K, ( - "Dataset sequence length should always be 16k" - ) - assert config.model.seq_length == SEQUENCE_LENGTH_16K, "Model sequence length should always be 16k" - - def test_pretrain_config_model_dataset_sequence_length_match(self): - """Test that model and dataset sequence lengths always match.""" - config = pretrain_config() - assert config.model.seq_length == config.dataset.sequence_length, ( - "Model and dataset sequence lengths must match" - ) - assert config.model.seq_length == SEQUENCE_LENGTH_16K, "Both should be 16k" - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_70b_16k_run") - - expected_run_dir = os.path.join(temp_dir, "test_70b_16k_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - assert config.dataset.sequence_length == SEQUENCE_LENGTH_16K - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Should still have 16k sequence length - assert config.dataset.sequence_length == 
SEQUENCE_LENGTH_16K - # Should use non-mock data configuration - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - # Should still have 16k sequence length - assert config.dataset.sequence_length == SEQUENCE_LENGTH_16K - - @pytest.mark.parametrize( - "tensor_parallelism,pipeline_parallelism,context_parallelism,sequence_parallelism", - [ - (8, 2, 2, True), # Default 70B + 16k optimized - (4, 4, 2, True), # Different parallelism distribution - (8, 1, 4, True), # Higher context parallelism - (4, 2, 1, False), # Lower parallelism - ], - ) - def test_pretrain_config_70b_16k_parallelism_combinations( - self, tensor_parallelism, pipeline_parallelism, context_parallelism, sequence_parallelism - ): - """Test various parallelism combinations for 70B model with 16k sequences.""" - config = pretrain_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - - assert config.model.tensor_model_parallel_size == tensor_parallelism - assert config.model.pipeline_model_parallel_size == pipeline_parallelism - assert config.model.context_parallel_size == context_parallelism - assert config.model.sequence_parallel == sequence_parallelism - assert config.dataset.sequence_length == SEQUENCE_LENGTH_16K # Always 16k - - def 
test_pretrain_config_mock_mode_with_16k_sequence(self): - """Test pretrain_config in mock mode with 16k sequence length.""" - config = pretrain_config(mock=True) - - assert config.dataset.sequence_length == SEQUENCE_LENGTH_16K # Still 16k in mock mode - assert config.dataset.split == "1,1,1" # Mock mode split - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 2000 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - assert config.checkpoint.async_save is False - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True - # Note: overlap_grad_reduce and overlap_param_gather are now controlled by CommOverlapConfig - # and default to False when data_parallel_size is None or <= 1 - assert config.ddp.overlap_grad_reduce is True - assert config.ddp.overlap_param_gather is True - assert config.ddp.average_in_collective is True - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 2000 - assert config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - assert config.scheduler.override_opt_param_scheduler is True - - def 
test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - - def test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - assert config.dataset.num_workers == 8 - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize( - "global_batch_size,micro_batch_size", - [ - (256, 1), - (512, 1), - (1024, 2), - (2048, 4), - ], - ) - def test_pretrain_config_batch_size_combinations(self, global_batch_size, micro_batch_size): - """Test various batch size combinations for 70B model.""" - config = pretrain_config(global_batch_size=global_batch_size, micro_batch_size=micro_batch_size) - - assert config.train.global_batch_size == global_batch_size - assert config.train.micro_batch_size == micro_batch_size - # Sequence length should still be 16k regardless of batch size - assert config.dataset.sequence_length == SEQUENCE_LENGTH_16K - - @pytest.mark.parametrize("virtual_pipeline_parallelism", [None, 1, 2, 4]) - def test_pretrain_config_virtual_pipeline_parallelism(self, virtual_pipeline_parallelism): - """Test various virtual pipeline 
parallelism settings.""" - config = pretrain_config(virtual_pipeline_parallelism=virtual_pipeline_parallelism) - - assert config.model.virtual_pipeline_model_parallel_size == virtual_pipeline_parallelism - # Sequence length should still be 16k - assert config.dataset.sequence_length == SEQUENCE_LENGTH_16K - - @pytest.mark.parametrize("precision", ["fp16_mixed", "bf16_with_fp8_delayed_scaling_mixed"]) - def test_precision_recipes(self, precision): - cfg = pretrain_config(precision_config=precision) - assert cfg.mixed_precision == precision diff --git a/tests/unit_tests/recipes/llama/test_llama3_70b_64k.py b/tests/unit_tests/recipes/llama/test_llama3_70b_64k.py deleted file mode 100644 index 2b832a3c96..0000000000 --- a/tests/unit_tests/recipes/llama/test_llama3_70b_64k.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import tempfile - -import pytest -import torch - -from megatron.bridge.models.llama import Llama3ModelProvider70B -from megatron.bridge.recipes.llama.llama3_70b_64k import SEQUENCE_LENGTH_64K, model_config, pretrain_config -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters_70b_64k_optimized(self): - """Test model_config with default parameters optimized for 70B with 64k sequences.""" - config = model_config() - - assert isinstance(config, Llama3ModelProvider70B) - # Verify 70B + 64k optimized defaults - assert config.tensor_model_parallel_size == 8 # High for 70B model - assert config.pipeline_model_parallel_size == 4 # Moderate for 64k sequences - assert config.pipeline_dtype == torch.bfloat16 # Specified for efficiency - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 8 # High for 64k sequences - assert config.sequence_parallel is True # Enabled for 70B + 64k - # Verify model sequence length matches 64k - assert config.seq_length == SEQUENCE_LENGTH_64K # Model configured for 64k sequences - - def test_model_config_custom_parameters(self): - """Test model_config with custom parameters.""" - config = model_config( - tensor_parallelism=4, - pipeline_parallelism=4, - pipeline_parallelism_dtype=torch.float16, - virtual_pipeline_parallelism=2, - context_parallelism=4, - sequence_parallelism=False, - ) - - assert config.tensor_model_parallel_size == 4 - assert config.pipeline_model_parallel_size == 4 - assert config.pipeline_dtype == torch.float16 - assert config.virtual_pipeline_model_parallel_size == 2 - assert config.context_parallel_size == 4 - assert config.sequence_parallel is False - # Verify model sequence length is still 64k with custom parameters - assert 
config.seq_length == SEQUENCE_LENGTH_64K - - def test_model_config_sequence_length_consistency(self): - """Test that model_config always uses the 64k sequence length constant.""" - configs = [ - model_config(), - model_config(tensor_parallelism=4), - model_config(context_parallelism=4), - model_config(sequence_parallelism=False), - ] - - for config in configs: - assert config.seq_length == SEQUENCE_LENGTH_64K, "Model sequence length should always be 64k" - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters_70b_64k_optimized(self): - """Test pretrain_config with default parameters optimized for 70B with 64k sequences.""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Llama3ModelProvider70B) - - # Check that sequence length is set to 64k in both model and dataset - assert config.dataset.sequence_length == SEQUENCE_LENGTH_64K - assert config.model.seq_length == SEQUENCE_LENGTH_64K - - # Check that model uses 70B + 64k optimized defaults - assert config.model.tensor_model_parallel_size == 8 - assert config.model.pipeline_model_parallel_size == 4 - assert config.model.pipeline_dtype == torch.bfloat16 - assert config.model.context_parallel_size == 8 - assert config.model.sequence_parallel is True - - def test_pretrain_config_custom_parameters(self): - """Test pretrain_config with custom parameters.""" - config = pretrain_config( - tensor_parallelism=4, - pipeline_parallelism=4, - context_parallelism=4, - sequence_parallelism=False, - train_iters=10000, - global_batch_size=256, - micro_batch_size=2, - ) - - # Check that sequence length is still 64k in both model and dataset - assert config.dataset.sequence_length == SEQUENCE_LENGTH_64K - assert config.model.seq_length == SEQUENCE_LENGTH_64K - - # Check custom model parameters - assert config.model.tensor_model_parallel_size == 4 - assert 
config.model.pipeline_model_parallel_size == 4 - assert config.model.context_parallel_size == 4 - assert config.model.sequence_parallel is False - - # Check custom training parameters - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 2 - - def test_pretrain_config_64k_sequence_length_override(self): - """Test that sequence length is always set to 64k.""" - # Test with various parameters, but sequence length should always be 64k - configs = [ - pretrain_config(), - pretrain_config(tensor_parallelism=4), - pretrain_config(train_iters=100000), - pretrain_config(global_batch_size=1024), - ] - - for config in configs: - assert config.dataset.sequence_length == SEQUENCE_LENGTH_64K, ( - "Dataset sequence length should always be 64k" - ) - assert config.model.seq_length == SEQUENCE_LENGTH_64K, "Model sequence length should always be 64k" - - def test_pretrain_config_model_dataset_sequence_length_match(self): - """Test that model and dataset sequence lengths always match.""" - config = pretrain_config() - assert config.model.seq_length == config.dataset.sequence_length, ( - "Model and dataset sequence lengths must match" - ) - assert config.model.seq_length == SEQUENCE_LENGTH_64K, "Both should be 64k" - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_70b_64k_run") - - expected_run_dir = os.path.join(temp_dir, "test_70b_64k_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - assert config.dataset.sequence_length == SEQUENCE_LENGTH_64K - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config 
with data paths provided.""" - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Should still have 64k sequence length - assert config.dataset.sequence_length == SEQUENCE_LENGTH_64K - # Should use non-mock data configuration - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - # Should still have 64k sequence length - assert config.dataset.sequence_length == SEQUENCE_LENGTH_64K - - @pytest.mark.parametrize( - "tensor_parallelism,pipeline_parallelism,context_parallelism,sequence_parallelism", - [ - (8, 4, 8, True), # Default 70B + 64k optimized - (4, 4, 4, True), # Different parallelism distribution - (8, 2, 8, True), # Higher context parallelism - (4, 2, 4, False), # Lower parallelism - ], - ) - def test_pretrain_config_70b_64k_parallelism_combinations( - self, tensor_parallelism, pipeline_parallelism, context_parallelism, sequence_parallelism - ): - """Test various parallelism combinations for 70B model with 64k sequences.""" - config = pretrain_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - - assert config.model.tensor_model_parallel_size == tensor_parallelism - assert config.model.pipeline_model_parallel_size == pipeline_parallelism - assert 
config.model.context_parallel_size == context_parallelism - assert config.model.sequence_parallel == sequence_parallelism - assert config.dataset.sequence_length == SEQUENCE_LENGTH_64K # Always 64k - - def test_pretrain_config_mock_mode_with_64k_sequence(self): - """Test pretrain_config in mock mode with 64k sequence length.""" - config = pretrain_config(mock=True) - - assert config.dataset.sequence_length == SEQUENCE_LENGTH_64K # Still 64k in mock mode - assert config.dataset.split == "1,1,1" # Mock mode split - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 2000 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - assert config.checkpoint.async_save is False - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True - # Note: overlap_grad_reduce and overlap_param_gather are now controlled by CommOverlapConfig - # and default to False when data_parallel_size is None or <= 1 - assert config.ddp.overlap_grad_reduce is True - assert config.ddp.overlap_param_gather is True - assert config.ddp.average_in_collective is True - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 2000 - assert 
config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - assert config.scheduler.override_opt_param_scheduler is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - - def test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - assert config.dataset.num_workers == 8 - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize( - "global_batch_size,micro_batch_size", - [ - (256, 1), - (512, 1), - (1024, 2), - (2048, 4), - ], - ) - def test_pretrain_config_batch_size_combinations(self, global_batch_size, micro_batch_size): - """Test various batch size combinations for 70B model.""" - config = pretrain_config(global_batch_size=global_batch_size, micro_batch_size=micro_batch_size) - - assert config.train.global_batch_size == global_batch_size - assert config.train.micro_batch_size == micro_batch_size - # Sequence length should still be 64k regardless of batch size - assert config.dataset.sequence_length == SEQUENCE_LENGTH_64K - - 
@pytest.mark.parametrize("virtual_pipeline_parallelism", [None, 1, 2, 4]) - def test_pretrain_config_virtual_pipeline_parallelism(self, virtual_pipeline_parallelism): - """Test various virtual pipeline parallelism settings.""" - config = pretrain_config(virtual_pipeline_parallelism=virtual_pipeline_parallelism) - - assert config.model.virtual_pipeline_model_parallel_size == virtual_pipeline_parallelism - # Sequence length should still be 64k - assert config.dataset.sequence_length == SEQUENCE_LENGTH_64K - - @pytest.mark.parametrize("precision", ["fp16_mixed", "bf16_with_fp8_delayed_scaling_mixed"]) - def test_precision_recipes(self, precision): - cfg = pretrain_config(precision_config=precision) - assert cfg.mixed_precision == precision diff --git a/tests/unit_tests/recipes/llama/test_llama3_8b.py b/tests/unit_tests/recipes/llama/test_llama3_8b.py deleted file mode 100644 index bb44ba4de1..0000000000 --- a/tests/unit_tests/recipes/llama/test_llama3_8b.py +++ /dev/null @@ -1,392 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import tempfile -from unittest.mock import patch - -import pytest -import torch - -from megatron.bridge.models.llama import Llama3ModelProvider8B -from megatron.bridge.recipes.llama.llama3_8b import model_config, pretrain_config -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Llama3ModelProvider8B) - assert config.tensor_model_parallel_size == 1 - assert config.pipeline_model_parallel_size == 1 - assert config.pipeline_dtype is None - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 2 - assert config.sequence_parallel is False - - def test_model_config_custom_tensor_parallelism(self): - """Test model_config with custom tensor parallelism.""" - config = model_config(tensor_parallelism=4) - - assert config.tensor_model_parallel_size == 4 - assert config.pipeline_model_parallel_size == 1 # default - assert config.context_parallel_size == 2 # default - - def test_model_config_custom_pipeline_parallelism(self): - """Test model_config with custom pipeline parallelism.""" - config = model_config(pipeline_parallelism=8, pipeline_parallelism_dtype=torch.float16) - - assert config.tensor_model_parallel_size == 1 # default - assert config.pipeline_model_parallel_size == 8 - assert config.pipeline_dtype is torch.float16 - - def test_model_config_with_pipeline_dtype(self): - """Test model_config with pipeline dtype specified.""" - config = model_config(pipeline_parallelism=2, pipeline_parallelism_dtype=torch.float16) - - assert config.pipeline_model_parallel_size == 2 - assert config.pipeline_dtype == torch.float16 - - def test_model_config_virtual_pipeline_parallelism(self): - 
"""Test model_config with virtual pipeline parallelism.""" - config = model_config(virtual_pipeline_parallelism=4) - - assert config.virtual_pipeline_model_parallel_size == 4 - - def test_model_config_context_parallelism(self): - """Test model_config with custom context parallelism.""" - config = model_config(context_parallelism=8) - - assert config.context_parallel_size == 8 - - def test_model_config_sequence_parallelism_enabled(self): - """Test model_config with sequence parallelism enabled.""" - config = model_config(sequence_parallelism=True, tensor_parallelism=2) - - assert config.sequence_parallel is True - - def test_model_config_all_custom_parameters(self): - """Test model_config with all parameters customized.""" - config = model_config( - tensor_parallelism=2, - pipeline_parallelism=4, - pipeline_parallelism_dtype=torch.bfloat16, - virtual_pipeline_parallelism=8, - context_parallelism=16, - sequence_parallelism=True, - ) - - assert config.tensor_model_parallel_size == 2 - assert config.pipeline_model_parallel_size == 4 - assert config.pipeline_dtype == torch.bfloat16 - assert config.virtual_pipeline_model_parallel_size == 8 - assert config.context_parallel_size == 16 - assert config.sequence_parallel is True - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Llama3ModelProvider8B) - - # Check training configuration - assert config.train.train_iters == 1_168_251 - assert config.train.global_batch_size == 512 - assert config.train.micro_batch_size == 1 - assert config.train.eval_interval == 2000 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert 
config.optimizer.min_lr == 3e-5 - assert config.optimizer.weight_decay == 0.1 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 8192 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=2, - seq_length=4096, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 2 - assert config.dataset.sequence_length == 4096 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 # Note: fixed in scheduler config - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_custom_model_parameters(self): - """Test pretrain_config with custom model parameters.""" - config = pretrain_config( - tensor_parallelism=2, - pipeline_parallelism=4, - context_parallelism=8, - sequence_parallelism=True, - pipeline_parallelism_dtype=torch.bfloat16, - ) - - assert config.model.tensor_model_parallel_size == 2 - assert config.model.pipeline_model_parallel_size == 4 - assert config.model.context_parallel_size == 8 - assert config.model.sequence_parallel is True - assert config.model.pipeline_dtype == torch.bfloat16 - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = 
os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Check that non-mock mode is configured - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - data_paths=["/path/to/data1", "/path/to/data2"], - ) - - # Should prioritize blend over blend_per_split - assert config.dataset.split == "9999,8,2" - 
assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def test_pretrain_config_fallback_to_mock_when_no_weights(self, mock_get_blend): - """Test pretrain_config falls back to mock when no weights are returned.""" - # Mock function returns None for both weights - mock_get_blend.return_value = (None, None) - - config = pretrain_config(data_paths=["/some/path"]) - - # Should fall back to mock mode - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 2000 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - assert config.checkpoint.async_save is False - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True - assert config.ddp.overlap_grad_reduce is True - assert config.ddp.overlap_param_gather is True - assert config.ddp.average_in_collective is True - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_manual_gc(self): - """Test manual garbage collection configuration.""" - config = pretrain_config() - - assert config.train.manual_gc is True - assert config.train.manual_gc_interval == 100 - assert config.train.manual_gc_eval == 100 - - def test_pretrain_config_default_comm_overlap(self): - """Test default CommOverlapConfig setup.""" - config = pretrain_config() - - # Default setup should have TP comm overlap disabled for 8B model - assert config.comm_overlap is not None - - def 
test_pretrain_config_custom_comm_overlap(self): - """Test custom CommOverlapConfig.""" - custom_overlap = CommOverlapConfig( - tp_comm_overlap=True, - defer_embedding_wgrad_compute=True, - wgrad_deferral_limit=50, - data_parallel_size=1, - ) - config = pretrain_config(comm_overlap_config=custom_overlap) - - # Should use the custom config - # Since default TP size is 1, it should be disabled - assert config.comm_overlap is not None - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 2000 - assert config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - assert config.scheduler.override_opt_param_scheduler is True - - @pytest.mark.parametrize("vocab_size", [2048, 4096, 8192, 16384]) - def test_pretrain_config_tokenizer_configuration(self, vocab_size): - """Test tokenizer configuration.""" - config = pretrain_config(vocab_size=vocab_size) - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == vocab_size - - def test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert 
config.dataset.dataloader_type == "single" - assert config.dataset.num_workers == 8 - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize( - "tensor_parallelism,pipeline_parallelism,context_parallelism", - [ - (1, 1, 1), - (2, 1, 4), - (1, 4, 2), - (2, 2, 8), - (4, 4, 16), - ], - ) - def test_pretrain_config_parallelism_combinations( - self, tensor_parallelism, pipeline_parallelism, context_parallelism - ): - """Test various parallelism combinations.""" - config = pretrain_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - context_parallelism=context_parallelism, - pipeline_parallelism_dtype=torch.bfloat16, - ) - - assert config.model.tensor_model_parallel_size == tensor_parallelism - assert config.model.pipeline_model_parallel_size == pipeline_parallelism - assert config.model.context_parallel_size == context_parallelism - - @pytest.mark.parametrize( - "global_batch_size,micro_batch_size", - [ - (128, 1), - (512, 2), - (1024, 4), - (256, 8), - ], - ) - def test_pretrain_config_batch_size_combinations(self, global_batch_size, micro_batch_size): - """Test various batch size combinations.""" - config = pretrain_config(global_batch_size=global_batch_size, micro_batch_size=micro_batch_size) - - assert config.train.global_batch_size == global_batch_size - assert config.train.micro_batch_size == micro_batch_size - - @pytest.mark.parametrize("seq_length", [1024, 2048, 4096, 8192, 16384]) - def test_pretrain_config_sequence_lengths(self, seq_length): - """Test various sequence lengths.""" - config = pretrain_config(seq_length=seq_length) - - assert config.dataset.sequence_length == seq_length - - @pytest.mark.parametrize("precision", ["fp16_mixed", "bf16_with_fp8_delayed_scaling_mixed"]) - def test_precision_recipes(self, precision): - 
config = pretrain_config(precision_config=precision) - assert config.mixed_precision == precision diff --git a/tests/unit_tests/recipes/llama/test_llama3_8b_128k.py b/tests/unit_tests/recipes/llama/test_llama3_8b_128k.py deleted file mode 100644 index ecddc1f763..0000000000 --- a/tests/unit_tests/recipes/llama/test_llama3_8b_128k.py +++ /dev/null @@ -1,212 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile - -import pytest -import torch - -from megatron.bridge.models.llama import Llama3ModelProvider8B -from megatron.bridge.recipes.llama.llama3_8b_128k import SEQUENCE_LENGTH_128K, model_config, pretrain_config -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters_128k_optimized(self): - """Test model_config with default parameters optimized for 128k sequences.""" - config = model_config() - - assert isinstance(config, Llama3ModelProvider8B) - # Verify 128k-optimized defaults - assert config.tensor_model_parallel_size == 4 # Same as 64k - assert config.pipeline_model_parallel_size == 2 # Same as 64k - assert config.pipeline_dtype == torch.bfloat16 # Specified for 128k - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 8 # Higher than 64k version (4) - assert 
config.sequence_parallel is True # Enabled for 128k - # Verify model sequence length matches 128k - assert config.seq_length == SEQUENCE_LENGTH_128K # Model configured for 128k sequences - - def test_model_config_custom_parameters(self): - """Test model_config with custom parameters.""" - config = model_config( - tensor_parallelism=8, - pipeline_parallelism=4, - pipeline_parallelism_dtype=torch.float16, - virtual_pipeline_parallelism=2, - context_parallelism=16, - sequence_parallelism=False, - ) - - assert config.tensor_model_parallel_size == 8 - assert config.pipeline_model_parallel_size == 4 - assert config.pipeline_dtype == torch.float16 - assert config.virtual_pipeline_model_parallel_size == 2 - assert config.context_parallel_size == 16 - assert config.sequence_parallel is False - # Verify model sequence length is still 128k with custom parameters - assert config.seq_length == SEQUENCE_LENGTH_128K - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters_128k_optimized(self): - """Test pretrain_config with default parameters optimized for 128k sequences.""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Llama3ModelProvider8B) - - # Check that sequence length is set to 128k in both model and dataset - assert config.dataset.sequence_length == SEQUENCE_LENGTH_128K - assert config.model.seq_length == SEQUENCE_LENGTH_128K - - # Check that model uses 128k-optimized defaults - assert config.model.tensor_model_parallel_size == 4 - assert config.model.pipeline_model_parallel_size == 2 - assert config.model.pipeline_dtype == torch.bfloat16 - assert config.model.context_parallel_size == 8 # Higher than 64k (4) - assert config.model.sequence_parallel is True - - def test_pretrain_config_custom_parameters(self): - """Test pretrain_config with custom parameters.""" - config = pretrain_config( - tensor_parallelism=8, - 
pipeline_parallelism=8, - context_parallelism=8, - train_iters=10000, - global_batch_size=256, - micro_batch_size=2, - ) - - # Sequence length should be 128k from recipe - assert config.dataset.sequence_length == SEQUENCE_LENGTH_128K - assert config.model.seq_length == SEQUENCE_LENGTH_128K - - # Check custom model parameters - assert config.model.tensor_model_parallel_size == 8 - assert config.model.pipeline_model_parallel_size == 8 - assert config.model.context_parallel_size == 8 - assert config.model.sequence_parallel is True - - # Check custom training parameters - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 2 - - def test_pretrain_config_128k_sequence_length_override(self): - """Test that sequence length is hardcoded to 128k and cannot be overridden.""" - config = pretrain_config( - tensor_parallelism=4, - pipeline_parallelism=4, - context_parallelism=8, - ) - - # Sequence length should always be 128k - assert config.dataset.sequence_length == SEQUENCE_LENGTH_128K - assert config.model.seq_length == SEQUENCE_LENGTH_128K - - def test_pretrain_config_model_dataset_sequence_length_match(self): - """Test that model and dataset sequence lengths always match.""" - config = pretrain_config() - assert config.model.seq_length == config.dataset.sequence_length, ( - "Model and dataset sequence lengths must match" - ) - assert config.model.seq_length == SEQUENCE_LENGTH_128K, "Both should be 128k" - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_128k_run") - - expected_run_dir = os.path.join(temp_dir, "test_128k_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert 
config.logger.tensorboard_dir == expected_tensorboard_dir - assert config.dataset.sequence_length == SEQUENCE_LENGTH_128K - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Should still have 128k sequence length - assert config.dataset.sequence_length == SEQUENCE_LENGTH_128K - # Should use non-mock data configuration - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - - @pytest.mark.parametrize( - "tensor_parallelism,pipeline_parallelism,context_parallelism,sequence_parallelism", - [ - (4, 2, 8, True), # Default 128k-optimized - (8, 2, 8, True), # Higher tensor parallelism - (4, 4, 16, True), # Higher pipeline and context parallelism - (2, 1, 4, False), # Lower parallelism - ], - ) - def test_pretrain_config_128k_parallelism_combinations( - self, tensor_parallelism, pipeline_parallelism, context_parallelism, sequence_parallelism - ): - """Test various parallelism combinations for 128k sequences.""" - config = pretrain_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - - assert config.model.tensor_model_parallel_size == tensor_parallelism - assert config.model.pipeline_model_parallel_size == pipeline_parallelism - assert config.model.context_parallel_size == context_parallelism - assert config.model.sequence_parallel == sequence_parallelism - assert config.dataset.sequence_length == SEQUENCE_LENGTH_128K # Always 128k - - def test_pretrain_config_mock_mode_with_128k_sequence(self): - """Test pretrain_config in mock mode with 128k sequence length.""" - config = pretrain_config(mock=True) - - assert config.dataset.sequence_length == SEQUENCE_LENGTH_128K # Still 128k in mock mode - assert config.dataset.split == "1,1,1" # 
Mock mode split - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_128k_optimized_parallelism(self): - """Test 128k-optimized parallelism configuration.""" - # Test a realistic configuration for 128k sequences - config = pretrain_config( - tensor_parallelism=4, - pipeline_parallelism=2, - context_parallelism=8, # Key difference from 64k (4) and 8k (2) - sequence_parallelism=True, - ) - - assert config.model.tensor_model_parallel_size == 4 - assert config.model.pipeline_model_parallel_size == 2 - assert config.model.context_parallel_size == 8 # Optimized for 128k - assert config.model.sequence_parallel is True - assert config.dataset.sequence_length == SEQUENCE_LENGTH_128K - - @pytest.mark.parametrize("precision", ["fp16_mixed", "bf16_with_fp8_delayed_scaling_mixed"]) - def test_precision_recipes(self, precision): - cfg = pretrain_config(precision_config=precision) - assert cfg.mixed_precision == precision diff --git a/tests/unit_tests/recipes/llama/test_llama3_8b_16k.py b/tests/unit_tests/recipes/llama/test_llama3_8b_16k.py deleted file mode 100644 index 080906aa3a..0000000000 --- a/tests/unit_tests/recipes/llama/test_llama3_8b_16k.py +++ /dev/null @@ -1,372 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import tempfile -from unittest.mock import patch - -import pytest -import torch - -from megatron.bridge.models.llama import Llama3ModelProvider8B -from megatron.bridge.recipes.llama.llama3_8b_16k import model_config, pretrain_config -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Llama3ModelProvider8B) - assert config.tensor_model_parallel_size == 4 - assert config.pipeline_model_parallel_size == 2 - assert config.pipeline_dtype == torch.bfloat16 - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 2 - assert config.sequence_parallel is True - - def test_model_config_custom_tensor_parallelism(self): - """Test model_config with custom tensor parallelism.""" - config = model_config(tensor_parallelism=8) - - assert config.tensor_model_parallel_size == 8 - assert config.pipeline_model_parallel_size == 2 # default - assert config.context_parallel_size == 2 # default - - def test_model_config_custom_pipeline_parallelism(self): - """Test model_config with custom pipeline parallelism.""" - config = model_config(pipeline_parallelism=4, pipeline_parallelism_dtype=torch.float16) - - assert config.tensor_model_parallel_size == 4 # default - assert config.pipeline_model_parallel_size == 4 - assert config.pipeline_dtype is torch.float16 - - def test_model_config_with_pipeline_dtype(self): - """Test model_config with pipeline dtype specified.""" - config = model_config(pipeline_parallelism=4, pipeline_parallelism_dtype=torch.float32) - - assert config.pipeline_model_parallel_size == 4 - assert config.pipeline_dtype == torch.float32 - - def 
test_model_config_virtual_pipeline_parallelism(self): - """Test model_config with virtual pipeline parallelism.""" - config = model_config(virtual_pipeline_parallelism=8) - - assert config.virtual_pipeline_model_parallel_size == 8 - - def test_model_config_context_parallelism(self): - """Test model_config with custom context parallelism.""" - config = model_config(context_parallelism=4) - - assert config.context_parallel_size == 4 - - def test_model_config_sequence_parallelism_disabled(self): - """Test model_config with sequence parallelism disabled.""" - config = model_config(sequence_parallelism=False) - - assert config.sequence_parallel is False - - def test_model_config_all_custom_parameters(self): - """Test model_config with all parameters customized.""" - config = model_config( - tensor_parallelism=8, - pipeline_parallelism=4, - pipeline_parallelism_dtype=torch.float32, - virtual_pipeline_parallelism=16, - context_parallelism=8, - sequence_parallelism=False, - ) - - assert config.tensor_model_parallel_size == 8 - assert config.pipeline_model_parallel_size == 4 - assert config.pipeline_dtype == torch.float32 - assert config.virtual_pipeline_model_parallel_size == 16 - assert config.context_parallel_size == 8 - assert config.sequence_parallel is False - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Llama3ModelProvider8B) - - # Check training configuration - assert config.train.train_iters == 1_168_251 - assert config.train.global_batch_size == 512 - assert config.train.micro_batch_size == 1 - assert config.train.eval_interval == 2000 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr 
== 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.weight_decay == 0.1 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 16384 # 16k default - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=2, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 2 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 # Note: fixed in scheduler config - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_custom_model_parameters(self): - """Test pretrain_config with custom model parameters.""" - config = pretrain_config( - tensor_parallelism=8, - pipeline_parallelism=8, - context_parallelism=2, - sequence_parallelism=False, - pipeline_parallelism_dtype=torch.float32, - virtual_pipeline_parallelism=10, - ) - - assert config.model.tensor_model_parallel_size == 8 - assert config.model.pipeline_model_parallel_size == 8 - assert config.model.context_parallel_size == 2 - assert config.model.sequence_parallel is False - assert config.model.pipeline_dtype == torch.float32 - assert config.model.virtual_pipeline_model_parallel_size == 10 - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = 
os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Check that non-mock mode is configured - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - data_paths=["/path/to/data1", "/path/to/data2"], - ) - - # Should prioritize blend over 
blend_per_split - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def test_pretrain_config_fallback_to_mock_when_no_weights(self, mock_get_blend): - """Test pretrain_config falls back to mock when no weights are returned.""" - # Mock function returns None for both weights - mock_get_blend.return_value = (None, None) - - config = pretrain_config(data_paths=["/some/path"]) - - # Should fall back to mock mode - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 2000 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - assert config.checkpoint.async_save is False - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True - # Note: overlap_grad_reduce and overlap_param_gather are now controlled by CommOverlapConfig - # and default to False when data_parallel_size is None or <= 1 - assert config.ddp.overlap_grad_reduce is True - assert config.ddp.overlap_param_gather is True - assert config.ddp.average_in_collective is True - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert config.scheduler.weight_decay_incr_style == "constant" - assert 
config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 2000 - assert config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - assert config.scheduler.override_opt_param_scheduler is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - - def test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - assert config.dataset.num_workers == 8 - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize( - "tensor_parallelism,pipeline_parallelism,context_parallelism", - [ - (2, 1, 1), - (4, 2, 2), - (8, 2, 4), - (4, 4, 2), - (8, 4, 8), - ], - ) - def test_pretrain_config_parallelism_combinations( - self, tensor_parallelism, pipeline_parallelism, context_parallelism - ): - """Test various parallelism combinations optimized for 16k.""" - config = pretrain_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - context_parallelism=context_parallelism, - 
pipeline_parallelism_dtype=torch.bfloat16, - ) - - assert config.model.tensor_model_parallel_size == tensor_parallelism - assert config.model.pipeline_model_parallel_size == pipeline_parallelism - assert config.model.context_parallel_size == context_parallelism - - @pytest.mark.parametrize( - "global_batch_size,micro_batch_size", - [ - (256, 1), - (512, 1), - (1024, 2), - (512, 4), - ], - ) - def test_pretrain_config_batch_size_combinations(self, global_batch_size, micro_batch_size): - """Test various batch size combinations.""" - config = pretrain_config(global_batch_size=global_batch_size, micro_batch_size=micro_batch_size) - - assert config.train.global_batch_size == global_batch_size - assert config.train.micro_batch_size == micro_batch_size - - def test_pretrain_config_16k_optimized_defaults(self): - """Test that 16k specific optimizations are applied by default.""" - config = pretrain_config() - - # Check model defaults optimized for 16k - assert config.model.tensor_model_parallel_size == 4 # Higher than 8k version - assert config.model.pipeline_model_parallel_size == 2 # Higher than 8k version - assert config.model.pipeline_dtype == torch.bfloat16 # Optimized dtype - assert config.model.sequence_parallel is True # Enabled for long sequences - assert config.model.context_parallel_size == 2 # Context parallelism for efficiency - - # Check dataset defaults - assert config.dataset.sequence_length == 16384 # 16k sequence length - - @pytest.mark.parametrize("precision", ["fp16_mixed", "bf16_with_fp8_delayed_scaling_mixed"]) - def test_precision_recipes(self, precision): - """Ensure precision recipes properly update configs for 8B 16k recipe.""" - cfg = pretrain_config(precision_config=precision) - assert cfg.mixed_precision == precision diff --git a/tests/unit_tests/recipes/llama/test_llama3_8b_64k.py b/tests/unit_tests/recipes/llama/test_llama3_8b_64k.py deleted file mode 100644 index 0536cdaaf8..0000000000 --- 
a/tests/unit_tests/recipes/llama/test_llama3_8b_64k.py +++ /dev/null @@ -1,217 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile -from unittest.mock import patch - -import pytest -import torch - -from megatron.bridge.models.llama import Llama3ModelProvider8B -from megatron.bridge.recipes.llama.llama3_8b_64k import SEQUENCE_LENGTH_64K, model_config, pretrain_config -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters_64k_optimized(self): - """Test model_config with default parameters optimized for 64k sequences.""" - config = model_config() - - assert isinstance(config, Llama3ModelProvider8B) - # Verify 64k-optimized defaults - assert config.tensor_model_parallel_size == 4 # Higher than 8k version (1) - assert config.pipeline_model_parallel_size == 2 # Higher than 8k version (1) - assert config.pipeline_dtype == torch.bfloat16 # Specified for 64k - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 4 # Higher than 8k version (2) - assert config.sequence_parallel is True # Enabled for 64k (False for 8k) - # Verify model sequence length matches 64k - assert config.seq_length == SEQUENCE_LENGTH_64K # Model configured for 64k sequences - - def 
test_model_config_custom_parameters(self): - """Test model_config with custom parameters.""" - config = model_config( - tensor_parallelism=8, - pipeline_parallelism=4, - pipeline_parallelism_dtype=torch.float16, - virtual_pipeline_parallelism=2, - context_parallelism=8, - sequence_parallelism=False, - ) - - assert config.tensor_model_parallel_size == 8 - assert config.pipeline_model_parallel_size == 4 - assert config.pipeline_dtype == torch.float16 - assert config.virtual_pipeline_model_parallel_size == 2 - assert config.context_parallel_size == 8 - assert config.sequence_parallel is False - # Verify model sequence length is still 64k with custom parameters - assert config.seq_length == SEQUENCE_LENGTH_64K - - def test_model_config_inheritance_from_llama3_8b(self): - """Test that model_config correctly delegates to llama3_8b.model_config.""" - with patch("megatron.bridge.recipes.llama.llama3_8b.model_config") as mock_base_config: - mock_base_config.return_value = Llama3ModelProvider8B( - tensor_model_parallel_size=4, - pipeline_model_parallel_size=2, - pipeline_dtype=torch.bfloat16, - context_parallel_size=4, - sequence_parallel=True, - ) - - config = model_config() - - # Verify the base function was called with correct parameters - mock_base_config.assert_called_once_with( - tensor_parallelism=4, - pipeline_parallelism=2, - pipeline_parallelism_dtype=torch.bfloat16, - virtual_pipeline_parallelism=None, - context_parallelism=4, - sequence_parallelism=True, - ) - assert isinstance(config, Llama3ModelProvider8B) - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters_64k_optimized(self): - """Test pretrain_config with default parameters optimized for 64k sequences.""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Llama3ModelProvider8B) - - # Check that sequence length is set to 64k in both model and dataset - 
assert config.dataset.sequence_length == SEQUENCE_LENGTH_64K - assert config.model.seq_length == SEQUENCE_LENGTH_64K - - # Check that model uses 64k-optimized defaults - assert config.model.tensor_model_parallel_size == 4 - assert config.model.pipeline_model_parallel_size == 2 - assert config.model.pipeline_dtype == torch.bfloat16 - assert config.model.context_parallel_size == 4 - assert config.model.sequence_parallel is True - - def test_pretrain_config_custom_parameters(self): - """Test pretrain_config with custom parameters.""" - config = pretrain_config( - tensor_parallelism=8, - pipeline_parallelism=4, - context_parallelism=8, - sequence_parallelism=False, - train_iters=10000, - global_batch_size=256, - micro_batch_size=2, - ) - - # Check that sequence length is still 64k in both model and dataset - assert config.dataset.sequence_length == SEQUENCE_LENGTH_64K - assert config.model.seq_length == SEQUENCE_LENGTH_64K - - # Check custom model parameters - assert config.model.tensor_model_parallel_size == 8 - assert config.model.pipeline_model_parallel_size == 4 - assert config.model.context_parallel_size == 8 - assert config.model.sequence_parallel is False - - # Check custom training parameters - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 2 - - def test_pretrain_config_64k_sequence_length_override(self): - """Test that sequence length is always overridden to 64k.""" - # Test with various parameters, but sequence length should always be 64k - configs = [ - pretrain_config(), - pretrain_config(tensor_parallelism=8), - pretrain_config(train_iters=100000), - pretrain_config(global_batch_size=1024), - ] - - for config in configs: - assert config.dataset.sequence_length == SEQUENCE_LENGTH_64K, ( - "Dataset sequence length should always be 64k" - ) - assert config.model.seq_length == SEQUENCE_LENGTH_64K, "Model sequence length should always be 64k" - - def 
test_pretrain_config_model_dataset_sequence_length_match(self): - """Test that model and dataset sequence lengths always match.""" - config = pretrain_config() - assert config.model.seq_length == config.dataset.sequence_length, ( - "Model and dataset sequence lengths must match" - ) - assert config.model.seq_length == SEQUENCE_LENGTH_64K, "Both should be 64k" - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_64k_run") - - expected_run_dir = os.path.join(temp_dir, "test_64k_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - assert config.dataset.sequence_length == SEQUENCE_LENGTH_64K - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Should still have 64k sequence length - assert config.dataset.sequence_length == SEQUENCE_LENGTH_64K - # Should use non-mock data configuration - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - - @pytest.mark.parametrize( - "tensor_parallelism,pipeline_parallelism,context_parallelism,sequence_parallelism", - [ - (4, 2, 4, True), # Default 64k-optimized - (8, 2, 4, True), # Higher tensor parallelism - (4, 4, 8, True), # Higher pipeline and context parallelism - (2, 1, 2, False), # Lower parallelism - ], - ) - def test_pretrain_config_64k_parallelism_combinations( - self, tensor_parallelism, pipeline_parallelism, context_parallelism, sequence_parallelism - ): - """Test various parallelism combinations for 64k sequences.""" - config = 
pretrain_config( - tensor_parallelism=tensor_parallelism, - pipeline_parallelism=pipeline_parallelism, - context_parallelism=context_parallelism, - sequence_parallelism=sequence_parallelism, - ) - - assert config.model.tensor_model_parallel_size == tensor_parallelism - assert config.model.pipeline_model_parallel_size == pipeline_parallelism - assert config.model.context_parallel_size == context_parallelism - assert config.model.sequence_parallel == sequence_parallelism - assert config.dataset.sequence_length == SEQUENCE_LENGTH_64K # Always 64k - - @pytest.mark.parametrize("precision", ["fp16_mixed", "bf16_with_fp8_delayed_scaling_mixed"]) - def test_precision_recipes(self, precision): - cfg = pretrain_config(precision_config=precision) - assert cfg.mixed_precision == precision diff --git a/tests/unit_tests/recipes/llama/test_llama4_e128.py b/tests/unit_tests/recipes/llama/test_llama4_e128.py deleted file mode 100644 index 180560abb3..0000000000 --- a/tests/unit_tests/recipes/llama/test_llama4_e128.py +++ /dev/null @@ -1,338 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest -import torch -from megatron.core.distributed import DistributedDataParallelConfig - -from megatron.bridge.models.llama import Llama4Experts128ModelProvider -from megatron.bridge.recipes.llama.llama4_e128 import model_config, pretrain_config -from megatron.bridge.training.config import ConfigContainer, TrainingConfig -from megatron.bridge.training.mixed_precision import get_mixed_precision_config - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Llama4Experts128ModelProvider) - assert config.tensor_model_parallel_size == 4 - assert config.pipeline_model_parallel_size == 1 - assert config.pipeline_dtype is None - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.sequence_parallel is True - assert config.expert_tensor_parallel_size == 4 - assert config.expert_model_parallel_size == 128 - - def test_model_config_custom_tensor_parallelism(self): - """Test model_config with custom tensor parallelism.""" - config = model_config(tensor_parallelism=8) - - assert config.tensor_model_parallel_size == 8 - assert config.pipeline_model_parallel_size == 1 # default - assert config.context_parallel_size == 1 # default - assert config.expert_tensor_parallel_size == 4 # default - assert config.expert_model_parallel_size == 128 # default - - def test_model_config_with_pipeline_dtype(self): - """Test model_config with pipeline dtype specified.""" - config = model_config(pipeline_parallelism=4, pipeline_parallelism_dtype=torch.bfloat16) - - assert config.pipeline_model_parallel_size == 4 - assert config.pipeline_dtype == torch.bfloat16 - - def test_model_config_virtual_pipeline_parallelism(self): - """Test model_config with virtual pipeline parallelism.""" - config = 
model_config(virtual_pipeline_parallelism=2) - - assert config.virtual_pipeline_model_parallel_size == 2 - - def test_model_config_context_parallelism(self): - """Test model_config with custom context parallelism.""" - config = model_config(context_parallelism=4) - - assert config.context_parallel_size == 4 - - def test_model_config_expert_parallelism(self): - """Test model_config with custom expert parallelism settings.""" - config = model_config(expert_tensor_parallelism=8, expert_model_parallelism=256) - - assert config.expert_tensor_parallel_size == 8 - assert config.expert_model_parallel_size == 256 - - def test_model_config_all_custom_parameters(self): - """Test model_config with all custom parameters.""" - config = model_config( - tensor_parallelism=8, - pipeline_parallelism=4, - pipeline_parallelism_dtype=torch.bfloat16, - virtual_pipeline_parallelism=2, - context_parallelism=2, - expert_tensor_parallelism=8, - expert_model_parallelism=256, - ) - - assert config.tensor_model_parallel_size == 8 - assert config.pipeline_model_parallel_size == 4 - assert config.pipeline_dtype == torch.bfloat16 - assert config.virtual_pipeline_model_parallel_size == 2 - assert config.context_parallel_size == 2 - assert config.expert_tensor_parallel_size == 8 - assert config.expert_model_parallel_size == 256 - - def test_model_config_expert_count(self): - """Test model_config with large expert count typical for 128-expert model.""" - config = model_config(expert_model_parallelism=128) - - assert config.expert_model_parallel_size == 128 - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters.""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Llama4Experts128ModelProvider) - assert isinstance(config.train, TrainingConfig) - assert isinstance(config.ddp, 
DistributedDataParallelConfig) - - # Check default training settings - assert config.train.train_iters == 1_168_251 - assert config.train.global_batch_size == 512 - assert config.train.micro_batch_size == 1 - assert config.train.eval_interval == 2000 - assert config.train.eval_iters == 32 - - # Check default model settings - assert config.model.tensor_model_parallel_size == 4 - assert config.model.pipeline_model_parallel_size == 1 - assert config.model.context_parallel_size == 1 - assert config.model.sequence_parallel is True - assert config.model.expert_tensor_parallel_size == 4 - assert config.model.expert_model_parallel_size == 128 - - # Check dataset settings - assert config.dataset.sequence_length == 8192 - assert config.dataset.random_seed == 1234 - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - - # Check DDP settings - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True - assert config.ddp.overlap_grad_reduce is True - assert config.ddp.overlap_param_gather is True - assert config.ddp.average_in_collective is True - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=500_000, - global_batch_size=1024, - micro_batch_size=2, - lr=1e-4, - min_lr=1e-6, - lr_warmup_iters=5000, - ) - - assert config.train.train_iters == 500_000 - assert config.train.global_batch_size == 1024 - assert config.train.micro_batch_size == 2 - - def test_pretrain_config_custom_model_parameters(self): - """Test pretrain_config with custom model parameters.""" - config = pretrain_config( - tensor_parallelism=8, - pipeline_parallelism=2, - pipeline_parallelism_dtype=torch.bfloat16, - virtual_pipeline_parallelism=2, - context_parallelism=2, - expert_tensor_parallelism=8, - 
expert_model_parallelism=256, - ) - - assert config.model.tensor_model_parallel_size == 8 - assert config.model.pipeline_model_parallel_size == 2 - assert config.model.pipeline_dtype == torch.bfloat16 - assert config.model.virtual_pipeline_model_parallel_size == 2 - assert config.model.context_parallel_size == 2 - assert config.model.expert_tensor_parallel_size == 8 - assert config.model.expert_model_parallel_size == 256 - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with custom data paths.""" - config = pretrain_config( - data_paths=["/path/to/data1", "/path/to/data2"], - train_data_path=["/path/to/train"], - valid_data_path=["/path/to/valid"], - ) - - # Should have blend configuration from data paths - assert config.dataset.blend is not None - - def test_pretrain_config_with_mock_data(self): - """Test pretrain_config with mock data enabled.""" - config = pretrain_config(mock=True) - - # Should still create proper configuration - assert isinstance(config, ConfigContainer) - assert config.dataset.sequence_length == 8192 - - def test_pretrain_config_with_custom_dir_and_name(self): - """Test pretrain_config with custom directory and name.""" - config = pretrain_config(dir="/custom/path", name="test_run") - - # Should still create proper configuration - assert isinstance(config, ConfigContainer) - assert config.checkpoint.save.endswith("test_run/checkpoints") - assert config.logger.tensorboard_dir.endswith("test_run/tb_logs") - - @pytest.mark.parametrize( - "global_batch_size,micro_batch_size", - [ - (256, 1), - (512, 2), - (1024, 4), - (2048, 8), - ], - ) - def test_pretrain_config_batch_size_combinations(self, global_batch_size, micro_batch_size): - """Test various batch size combinations.""" - config = pretrain_config(global_batch_size=global_batch_size, micro_batch_size=micro_batch_size) - - assert config.train.global_batch_size == global_batch_size - assert config.train.micro_batch_size == micro_batch_size - - 
@pytest.mark.parametrize("train_iters", [50_000, 100_000, 500_000, 1_000_000]) - def test_pretrain_config_train_iters(self, train_iters): - """Test various training iteration counts.""" - config = pretrain_config(train_iters=train_iters) - - assert config.train.train_iters == train_iters - - @pytest.mark.parametrize("precision", ["fp16_mixed", "bf16_mixed", "bf16_with_fp8_delayed_scaling_mixed"]) - def test_pretrain_config_precision_string(self, precision): - """Test precision configuration with string values.""" - config = pretrain_config(precision_config=precision) - - assert isinstance(config, ConfigContainer) - assert config.mixed_precision == precision - - @pytest.mark.parametrize("precision", ["fp16_mixed", "bf16_mixed", "bf16_with_fp8_delayed_scaling_mixed"]) - def test_pretrain_config_precision_object(self, precision): - """Test precision configuration with MixedPrecisionConfig object.""" - precision_config = get_mixed_precision_config(precision) - config = pretrain_config(precision_config=precision_config) - - assert isinstance(config, ConfigContainer) - assert config.mixed_precision == precision_config - - def test_pretrain_config_llama4_e128_defaults(self): - """Test that Llama4 128-Experts specific defaults are applied correctly.""" - config = pretrain_config() - - # Check model defaults for Llama4 128-Experts - assert config.model.tensor_model_parallel_size == 4 - assert config.model.pipeline_model_parallel_size == 1 - assert config.model.context_parallel_size == 1 - assert config.model.sequence_parallel is True - assert config.model.expert_tensor_parallel_size == 4 - assert config.model.expert_model_parallel_size == 128 - - # Check dataset defaults - assert config.dataset.sequence_length == 8192 - - @pytest.mark.parametrize("expert_tensor_parallelism", [1, 2, 4, 8]) - def test_pretrain_config_expert_tensor_parallelism(self, expert_tensor_parallelism): - """Test various expert tensor parallelism settings.""" - config = 
pretrain_config(expert_tensor_parallelism=expert_tensor_parallelism) - - assert config.model.expert_tensor_parallel_size == expert_tensor_parallelism - - @pytest.mark.parametrize("expert_model_parallelism", [32, 64, 128, 256]) - def test_pretrain_config_expert_model_parallelism(self, expert_model_parallelism): - """Test various expert model parallelism settings.""" - config = pretrain_config(expert_model_parallelism=expert_model_parallelism) - - assert config.model.expert_model_parallel_size == expert_model_parallelism - - def test_pretrain_config_expert_parallelism_combination(self): - """Test combination of expert parallelism settings.""" - config = pretrain_config(expert_tensor_parallelism=8, expert_model_parallelism=256) - - assert config.model.expert_tensor_parallel_size == 8 - assert config.model.expert_model_parallel_size == 256 - - def test_pretrain_config_128_experts(self): - """Test configuration typical for large-scale 128-expert model.""" - config = pretrain_config( - tensor_parallelism=8, - pipeline_parallelism=4, - pipeline_parallelism_dtype=torch.bfloat16, - context_parallelism=2, - sequence_parallelism=True, - expert_tensor_parallelism=8, - expert_model_parallelism=128, - global_batch_size=2048, - micro_batch_size=4, - ) - - assert config.model.tensor_model_parallel_size == 8 - assert config.model.pipeline_model_parallel_size == 4 - assert config.model.pipeline_dtype == torch.bfloat16 - assert config.model.context_parallel_size == 2 - assert config.model.sequence_parallel is True - assert config.model.expert_tensor_parallel_size == 8 - assert config.model.expert_model_parallel_size == 128 - assert config.train.global_batch_size == 2048 - assert config.train.micro_batch_size == 4 - - def test_pretrain_config_expert_model_parallelism(self): - """Test configuration behavior with specific expert parallelism.""" - # Test high expert parallelism typical for 128-expert model - config = pretrain_config(expert_model_parallelism=128) - assert 
config.model.expert_model_parallel_size == 128 - - @pytest.mark.parametrize("context_parallelism", [1, 2, 4, 8]) - def test_pretrain_config_context_parallelism_scaling(self, context_parallelism): - """Test context parallelism scaling for 128-expert model.""" - config = pretrain_config(context_parallelism=context_parallelism) - - assert config.model.context_parallel_size == context_parallelism - - def test_pretrain_config_expert_tensor_combinations(self): - """Test various expert tensor parallelism combinations.""" - # Test common combinations for 128-expert model - combinations = [ - (1, 128), - (2, 64), - (4, 32), - (8, 16), - ] - - for expert_tp, expert_mp in combinations: - config = pretrain_config(expert_tensor_parallelism=expert_tp, expert_model_parallelism=expert_mp) - assert config.model.expert_tensor_parallel_size == expert_tp - assert config.model.expert_model_parallel_size == expert_mp diff --git a/tests/unit_tests/recipes/llama/test_llama4_e16.py b/tests/unit_tests/recipes/llama/test_llama4_e16.py deleted file mode 100644 index db823f494a..0000000000 --- a/tests/unit_tests/recipes/llama/test_llama4_e16.py +++ /dev/null @@ -1,297 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest -import torch -from megatron.core.distributed import DistributedDataParallelConfig - -from megatron.bridge.models.llama import Llama4Experts16ModelProvider -from megatron.bridge.recipes.llama.llama4_e16 import model_config, pretrain_config -from megatron.bridge.training.config import ConfigContainer, TrainingConfig -from megatron.bridge.training.mixed_precision import get_mixed_precision_config - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Llama4Experts16ModelProvider) - assert config.tensor_model_parallel_size == 4 - assert config.pipeline_model_parallel_size == 1 - assert config.pipeline_dtype is None - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.sequence_parallel is True - assert config.expert_tensor_parallel_size == 4 - assert config.expert_model_parallel_size == 16 - - def test_model_config_custom_tensor_parallelism(self): - """Test model_config with custom tensor parallelism.""" - config = model_config(tensor_parallelism=8) - - assert config.tensor_model_parallel_size == 8 - assert config.pipeline_model_parallel_size == 1 # default - assert config.context_parallel_size == 1 # default - assert config.expert_tensor_parallel_size == 4 # default - assert config.expert_model_parallel_size == 16 # default - - def test_model_config_custom_pipeline_parallelism(self): - """Test model_config with custom pipeline parallelism.""" - config = model_config(pipeline_parallelism=2, pipeline_parallelism_dtype=torch.float16) - - assert config.tensor_model_parallel_size == 4 # default - assert config.pipeline_model_parallel_size == 2 - assert config.pipeline_dtype is torch.float16 - - def test_model_config_with_pipeline_dtype(self): - """Test model_config with pipeline dtype 
specified.""" - config = model_config(pipeline_parallelism=4, pipeline_parallelism_dtype=torch.bfloat16) - - assert config.pipeline_model_parallel_size == 4 - assert config.pipeline_dtype == torch.bfloat16 - - def test_model_config_virtual_pipeline_parallelism(self): - """Test model_config with virtual pipeline parallelism.""" - config = model_config(virtual_pipeline_parallelism=2) - - assert config.virtual_pipeline_model_parallel_size == 2 - - def test_model_config_context_parallelism(self): - """Test model_config with custom context parallelism.""" - config = model_config(context_parallelism=4) - - assert config.context_parallel_size == 4 - - def test_model_config_expert_parallelism(self): - """Test model_config with custom expert parallelism settings.""" - config = model_config(expert_tensor_parallelism=8, expert_model_parallelism=32) - - assert config.expert_tensor_parallel_size == 8 - assert config.expert_model_parallel_size == 32 - - def test_model_config_all_custom_parameters(self): - """Test model_config with all custom parameters.""" - config = model_config( - tensor_parallelism=8, - pipeline_parallelism=4, - pipeline_parallelism_dtype=torch.float16, - virtual_pipeline_parallelism=2, - context_parallelism=2, - expert_tensor_parallelism=8, - expert_model_parallelism=32, - ) - - assert config.tensor_model_parallel_size == 8 - assert config.pipeline_model_parallel_size == 4 - assert config.pipeline_dtype == torch.float16 - assert config.virtual_pipeline_model_parallel_size == 2 - assert config.context_parallel_size == 2 - assert config.expert_tensor_parallel_size == 8 - assert config.expert_model_parallel_size == 32 - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters.""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, 
Llama4Experts16ModelProvider) - assert isinstance(config.train, TrainingConfig) - assert isinstance(config.ddp, DistributedDataParallelConfig) - - # Check default training settings - assert config.train.train_iters == 1_168_251 - assert config.train.global_batch_size == 512 - assert config.train.micro_batch_size == 1 - assert config.train.eval_interval == 2000 - assert config.train.eval_iters == 32 - - # Check default model settings - assert config.model.tensor_model_parallel_size == 4 - assert config.model.pipeline_model_parallel_size == 1 - assert config.model.context_parallel_size == 1 - assert config.model.sequence_parallel is True - assert config.model.expert_tensor_parallel_size == 4 - assert config.model.expert_model_parallel_size == 16 - - # Check dataset settings - assert config.dataset.sequence_length == 8192 - assert config.dataset.random_seed == 1234 - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - - # Check DDP settings - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True - assert config.ddp.overlap_grad_reduce is True - assert config.ddp.overlap_param_gather is True - assert config.ddp.average_in_collective is True - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=500_000, - global_batch_size=1024, - micro_batch_size=2, - lr=1e-4, - min_lr=1e-6, - lr_warmup_iters=5000, - ) - - assert config.train.train_iters == 500_000 - assert config.train.global_batch_size == 1024 - assert config.train.micro_batch_size == 2 - - def test_pretrain_config_custom_model_parameters(self): - """Test pretrain_config with custom model parameters.""" - config = pretrain_config( - tensor_parallelism=8, - pipeline_parallelism=2, - 
pipeline_parallelism_dtype=torch.bfloat16, - virtual_pipeline_parallelism=2, - context_parallelism=2, - expert_tensor_parallelism=8, - expert_model_parallelism=32, - ) - - assert config.model.tensor_model_parallel_size == 8 - assert config.model.pipeline_model_parallel_size == 2 - assert config.model.virtual_pipeline_model_parallel_size == 2 - assert config.model.context_parallel_size == 2 - assert config.model.expert_tensor_parallel_size == 8 - assert config.model.expert_model_parallel_size == 32 - - def test_pretrain_config_with_fp16_precision_and_pipeline_dtype(self): - """Test pretrain_config with fp16 precision and compatible pipeline dtype.""" - config = pretrain_config( - pipeline_parallelism=2, pipeline_parallelism_dtype=torch.float16, precision_config="fp16_mixed" - ) - - assert config.model.pipeline_model_parallel_size == 2 - # With fp16_mixed precision, pipeline dtype should be compatible - assert config.mixed_precision == "fp16_mixed" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with custom data paths.""" - config = pretrain_config( - data_paths=["/path/to/data1", "/path/to/data2"], - train_data_path=["/path/to/train"], - valid_data_path=["/path/to/valid"], - ) - - # Should have blend configuration from data paths - assert config.dataset.blend is not None - - def test_pretrain_config_with_mock_data(self): - """Test pretrain_config with mock data enabled.""" - config = pretrain_config(mock=True) - - # Should still create proper configuration - assert isinstance(config, ConfigContainer) - assert config.dataset.sequence_length == 8192 - - def test_pretrain_config_with_custom_dir_and_name(self): - """Test pretrain_config with custom directory and name.""" - config = pretrain_config(dir="/custom/path", name="test_run") - - # Should still create proper configuration - assert isinstance(config, ConfigContainer) - assert config.checkpoint.save.endswith("test_run/checkpoints") - assert 
config.logger.tensorboard_dir.endswith("test_run/tb_logs") - - @pytest.mark.parametrize( - "global_batch_size,micro_batch_size", - [ - (256, 1), - (512, 2), - (1024, 4), - (2048, 8), - ], - ) - def test_pretrain_config_batch_size_combinations(self, global_batch_size, micro_batch_size): - """Test various batch size combinations.""" - config = pretrain_config(global_batch_size=global_batch_size, micro_batch_size=micro_batch_size) - - assert config.train.global_batch_size == global_batch_size - assert config.train.micro_batch_size == micro_batch_size - - @pytest.mark.parametrize("train_iters", [50_000, 100_000, 500_000, 1_000_000]) - def test_pretrain_config_train_iters(self, train_iters): - """Test various training iteration counts.""" - config = pretrain_config(train_iters=train_iters) - - assert config.train.train_iters == train_iters - - @pytest.mark.parametrize("precision", ["fp16_mixed", "bf16_mixed", "bf16_with_fp8_delayed_scaling_mixed"]) - def test_pretrain_config_precision_string(self, precision): - """Test precision configuration with string values.""" - config = pretrain_config(precision_config=precision) - - assert isinstance(config, ConfigContainer) - assert config.mixed_precision == precision - - @pytest.mark.parametrize("precision", ["fp16_mixed", "bf16_mixed", "bf16_with_fp8_delayed_scaling_mixed"]) - def test_pretrain_config_precision_object(self, precision): - """Test precision configuration with MixedPrecisionConfig object.""" - precision_config = get_mixed_precision_config(precision) - config = pretrain_config(precision_config=precision_config) - - assert isinstance(config, ConfigContainer) - assert config.mixed_precision == precision_config - - def test_pretrain_config_llama4_e16_defaults(self): - """Test that Llama4 16-Experts specific defaults are applied correctly.""" - config = pretrain_config() - - # Check model defaults for Llama4 16-Experts - assert config.model.tensor_model_parallel_size == 4 - assert 
config.model.pipeline_model_parallel_size == 1 - assert config.model.context_parallel_size == 1 - assert config.model.sequence_parallel is True - assert config.model.expert_tensor_parallel_size == 4 - assert config.model.expert_model_parallel_size == 16 - - # Check dataset defaults - assert config.dataset.sequence_length == 8192 - - @pytest.mark.parametrize("expert_tensor_parallelism", [1, 2, 4, 8]) - def test_pretrain_config_expert_tensor_parallelism(self, expert_tensor_parallelism): - """Test various expert tensor parallelism settings.""" - config = pretrain_config(expert_tensor_parallelism=expert_tensor_parallelism) - - assert config.model.expert_tensor_parallel_size == expert_tensor_parallelism - - @pytest.mark.parametrize("expert_model_parallelism", [8, 16, 32, 64]) - def test_pretrain_config_expert_model_parallelism(self, expert_model_parallelism): - """Test various expert model parallelism settings.""" - config = pretrain_config(expert_model_parallelism=expert_model_parallelism) - - assert config.model.expert_model_parallel_size == expert_model_parallelism - - def test_pretrain_config_expert_parallelism_combination(self): - """Test combination of expert parallelism settings.""" - config = pretrain_config(expert_tensor_parallelism=8, expert_model_parallelism=64) - - assert config.model.expert_tensor_parallel_size == 8 - assert config.model.expert_model_parallel_size == 64 diff --git a/tests/unit_tests/recipes/qwen/test_qwen25_14b.py b/tests/unit_tests/recipes/qwen/test_qwen25_14b.py deleted file mode 100644 index 9770ffe591..0000000000 --- a/tests/unit_tests/recipes/qwen/test_qwen25_14b.py +++ /dev/null @@ -1,249 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile -from unittest.mock import patch - -import pytest - -from megatron.bridge.models.qwen import Qwen25ModelProvider14B -from megatron.bridge.recipes.qwen.qwen25_14b import model_config, pretrain_config -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Qwen25ModelProvider14B) - assert config.tensor_model_parallel_size == 4 # Default for 14B model - assert config.pipeline_model_parallel_size == 1 - assert config.pipeline_dtype is None - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.sequence_parallel is False - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Qwen25ModelProvider14B) - - # Check training configuration - assert config.train.train_iters == 300000 - assert config.train.global_batch_size == 32 - assert config.train.micro_batch_size == 2 - assert config.train.eval_interval == 500 - assert config.train.eval_iters == 32 - - 
# Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check model configuration (14B specific defaults) - assert config.model.tensor_model_parallel_size == 4 # Default for 14B model - assert config.model.pipeline_model_parallel_size == 1 # No PP by default - assert config.model.pipeline_dtype is None # No pipeline dtype by default - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 4096 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=4, - seq_length=2048, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 4 - assert config.dataset.sequence_length == 2048 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.checkpoint.load == expected_checkpoint_dir - 
assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Check that non-mock mode is configured - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - data_paths=["/path/to/data1", "/path/to/data2"], - ) - - # Should prioritize blend over blend_per_split - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def 
test_pretrain_config_fallback_to_mock_when_no_weights(self, mock_get_blend): - """Test pretrain_config falls back to mock when no weights are returned.""" - # Mock function returns None for both weights - mock_get_blend.return_value = (None, None) - - config = pretrain_config(data_paths=["/some/path"]) - - # Should fall back to mock mode - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 500 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.use_distributed_optimizer is True - assert config.ddp.check_for_nan_in_grad is True - - def test_pretrain_config_manual_gc(self): - """Test manual garbage collection configuration.""" - config = pretrain_config() - - assert config.train.manual_gc is True - assert config.train.manual_gc_interval == 100 - assert config.train.manual_gc_eval == 100 - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 500 # default for qwen25_14b - assert config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - assert config.scheduler.override_opt_param_scheduler is True - - def test_pretrain_config_tokenizer_configuration(self): - 
"""Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - - def test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert config.logger.log_timers_to_tensorboard is True - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize("seq_length", [1024, 2048, 4096, 8192, 16384]) - def test_pretrain_config_sequence_lengths(self, seq_length): - """Test various sequence lengths.""" - config = pretrain_config(seq_length=seq_length) - - assert config.dataset.sequence_length == seq_length - assert config.model.seq_length == seq_length diff --git a/tests/unit_tests/recipes/qwen/test_qwen25_1p5b.py b/tests/unit_tests/recipes/qwen/test_qwen25_1p5b.py deleted file mode 100644 index 086e2e685a..0000000000 --- a/tests/unit_tests/recipes/qwen/test_qwen25_1p5b.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile -from unittest.mock import patch - -import pytest - -from megatron.bridge.models.qwen import Qwen25ModelProvider1P5B -from megatron.bridge.recipes.qwen.qwen25_1p5b import model_config, pretrain_config -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Qwen25ModelProvider1P5B) - assert config.tensor_model_parallel_size == 1 # Default for 1.5B model - assert config.pipeline_model_parallel_size == 1 - assert config.pipeline_dtype is None - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.sequence_parallel is False - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Qwen25ModelProvider1P5B) - - # Check training configuration - assert config.train.train_iters == 300000 - assert config.train.global_batch_size == 32 - assert config.train.micro_batch_size == 2 - assert config.train.eval_interval == 500 - assert config.train.eval_iters == 
32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check model configuration (1.5B specific defaults) - assert config.model.tensor_model_parallel_size == 1 # Default for 1.5B model - assert config.model.pipeline_model_parallel_size == 1 - assert config.model.pipeline_dtype is None - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 4096 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=4, - seq_length=2048, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 4 - assert config.dataset.sequence_length == 2048 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.checkpoint.load == expected_checkpoint_dir - assert config.logger.tensorboard_dir == 
expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Check that non-mock mode is configured - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - data_paths=["/path/to/data1", "/path/to/data2"], - ) - - # Should prioritize blend over blend_per_split - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def 
test_pretrain_config_fallback_to_mock_when_no_weights(self, mock_get_blend): - """Test pretrain_config falls back to mock when no weights are returned.""" - # Mock function returns None for both weights - mock_get_blend.return_value = (None, None) - - config = pretrain_config(data_paths=["/some/path"]) - - # Should fall back to mock mode - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 500 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_manual_gc(self): - """Test manual garbage collection configuration.""" - config = pretrain_config() - - assert config.train.manual_gc is True - assert config.train.manual_gc_interval == 100 - assert config.train.manual_gc_eval == 100 - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 500 # default for qwen25_1p5b - assert config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - assert config.scheduler.override_opt_param_scheduler is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = 
pretrain_config() - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - - def test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert config.logger.log_timers_to_tensorboard is True - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize("seq_length", [1024, 2048, 4096, 8192, 16384]) - def test_pretrain_config_sequence_lengths(self, seq_length): - """Test various sequence lengths.""" - config = pretrain_config(seq_length=seq_length) - - assert config.dataset.sequence_length == seq_length - assert config.model.seq_length == seq_length diff --git a/tests/unit_tests/recipes/qwen/test_qwen25_32b.py b/tests/unit_tests/recipes/qwen/test_qwen25_32b.py deleted file mode 100644 index 2a566dbb6c..0000000000 --- a/tests/unit_tests/recipes/qwen/test_qwen25_32b.py +++ /dev/null @@ -1,250 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile -from unittest.mock import patch - -import pytest -import torch - -from megatron.bridge.models.qwen import Qwen25ModelProvider32B -from megatron.bridge.recipes.qwen.qwen25_32b import model_config, pretrain_config -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Qwen25ModelProvider32B) - assert config.tensor_model_parallel_size == 8 # Default for 32B model - assert config.pipeline_model_parallel_size == 2 # Default for 32B model - assert config.pipeline_dtype == torch.bfloat16 # Default for 32B model - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.sequence_parallel is False - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Qwen25ModelProvider32B) - - # Check training configuration - assert config.train.train_iters == 300000 - assert config.train.global_batch_size == 32 - assert config.train.micro_batch_size == 2 - assert 
config.train.eval_interval == 500 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check model configuration (32B specific defaults) - assert config.model.tensor_model_parallel_size == 8 # Default for 32B model - assert config.model.pipeline_model_parallel_size == 2 # Default for 32B model - assert config.model.pipeline_dtype == torch.bfloat16 # Default for 32B model - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 4096 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=4, - seq_length=2048, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 4 - assert config.dataset.sequence_length == 2048 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == 
expected_checkpoint_dir - assert config.checkpoint.load == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Check that non-mock mode is configured - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - data_paths=["/path/to/data1", "/path/to/data2"], - ) - - # Should prioritize blend over blend_per_split - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - 
@patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def test_pretrain_config_fallback_to_mock_when_no_weights(self, mock_get_blend): - """Test pretrain_config falls back to mock when no weights are returned.""" - # Mock function returns None for both weights - mock_get_blend.return_value = (None, None) - - config = pretrain_config(data_paths=["/some/path"]) - - # Should fall back to mock mode - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 500 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.use_distributed_optimizer is True - assert config.ddp.check_for_nan_in_grad is True - - def test_pretrain_config_manual_gc(self): - """Test manual garbage collection configuration.""" - config = pretrain_config() - - assert config.train.manual_gc is True - assert config.train.manual_gc_interval == 100 - assert config.train.manual_gc_eval == 100 - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 500 # default for qwen25_32b - assert config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - assert 
config.scheduler.override_opt_param_scheduler is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - - def test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert config.logger.log_timers_to_tensorboard is True - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize("seq_length", [1024, 2048, 4096, 8192, 16384]) - def test_pretrain_config_sequence_lengths(self, seq_length): - """Test various sequence lengths.""" - config = pretrain_config(seq_length=seq_length) - - assert config.dataset.sequence_length == seq_length - assert config.model.seq_length == seq_length diff --git a/tests/unit_tests/recipes/qwen/test_qwen25_500m.py b/tests/unit_tests/recipes/qwen/test_qwen25_500m.py deleted file mode 100644 index 5e1f7d028b..0000000000 --- a/tests/unit_tests/recipes/qwen/test_qwen25_500m.py +++ /dev/null @@ -1,249 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile -from unittest.mock import patch - -import pytest - -from megatron.bridge.models.qwen import Qwen25ModelProvider500M -from megatron.bridge.recipes.qwen.qwen25_500m import model_config, pretrain_config -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Qwen25ModelProvider500M) - assert config.tensor_model_parallel_size == 1 # Default for 500M model - assert config.pipeline_model_parallel_size == 1 - assert config.pipeline_dtype is None - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.sequence_parallel is False - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Qwen25ModelProvider500M) - - # Check training configuration - assert config.train.train_iters == 300000 - assert 
config.train.global_batch_size == 32 - assert config.train.micro_batch_size == 2 - assert config.train.eval_interval == 500 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check model configuration (500M specific defaults) - assert config.model.tensor_model_parallel_size == 1 # Default for 500M model - assert config.model.pipeline_model_parallel_size == 1 - assert config.model.pipeline_dtype is None - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 4096 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=4, - seq_length=2048, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 4 - assert config.dataset.sequence_length == 2048 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - 
assert config.checkpoint.save == expected_checkpoint_dir - assert config.checkpoint.load == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Check that non-mock mode is configured - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - data_paths=["/path/to/data1", "/path/to/data2"], - ) - - # Should prioritize blend over blend_per_split - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert 
config.dataset.blend_per_split is None - - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def test_pretrain_config_fallback_to_mock_when_no_weights(self, mock_get_blend): - """Test pretrain_config falls back to mock when no weights are returned.""" - # Mock function returns None for both weights - mock_get_blend.return_value = (None, None) - - config = pretrain_config(data_paths=["/some/path"]) - - # Should fall back to mock mode - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 500 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.use_distributed_optimizer is True - assert config.ddp.check_for_nan_in_grad is True - - def test_pretrain_config_manual_gc(self): - """Test manual garbage collection configuration.""" - config = pretrain_config() - - assert config.train.manual_gc is True - assert config.train.manual_gc_interval == 100 - assert config.train.manual_gc_eval == 100 - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 500 # default for qwen25_500m - assert config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during 
validation - assert config.scheduler.override_opt_param_scheduler is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - - def test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert config.logger.log_timers_to_tensorboard is True - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize("seq_length", [1024, 2048, 4096, 8192, 16384]) - def test_pretrain_config_sequence_lengths(self, seq_length): - """Test various sequence lengths.""" - config = pretrain_config(seq_length=seq_length) - - assert config.dataset.sequence_length == seq_length - assert config.model.seq_length == seq_length diff --git a/tests/unit_tests/recipes/qwen/test_qwen25_72b.py b/tests/unit_tests/recipes/qwen/test_qwen25_72b.py deleted file mode 100644 index 33bacce41c..0000000000 --- a/tests/unit_tests/recipes/qwen/test_qwen25_72b.py +++ /dev/null @@ -1,250 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile -from unittest.mock import patch - -import pytest -import torch - -from megatron.bridge.models.qwen import Qwen25ModelProvider72B -from megatron.bridge.recipes.qwen.qwen25_72b import model_config, pretrain_config -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Qwen25ModelProvider72B) - assert config.tensor_model_parallel_size == 8 # Default for 72B model - assert config.pipeline_model_parallel_size == 4 # Default for 72B model - assert config.pipeline_dtype == torch.bfloat16 # Default for 72B model - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.sequence_parallel is False - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Qwen25ModelProvider72B) - - # Check training configuration - assert 
config.train.train_iters == 300000 - assert config.train.global_batch_size == 32 - assert config.train.micro_batch_size == 2 - assert config.train.eval_interval == 500 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check model configuration (72B specific defaults) - assert config.model.tensor_model_parallel_size == 8 # Default for 72B model - assert config.model.pipeline_model_parallel_size == 4 # Default for 72B model - assert config.model.pipeline_dtype == torch.bfloat16 # Default for 72B model - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 4096 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=4, - seq_length=2048, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 4 - assert config.dataset.sequence_length == 2048 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = 
os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.checkpoint.load == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Check that non-mock mode is configured - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - data_paths=["/path/to/data1", "/path/to/data2"], - ) - - # Should prioritize blend over 
blend_per_split - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def test_pretrain_config_fallback_to_mock_when_no_weights(self, mock_get_blend): - """Test pretrain_config falls back to mock when no weights are returned.""" - # Mock function returns None for both weights - mock_get_blend.return_value = (None, None) - - config = pretrain_config(data_paths=["/some/path"]) - - # Should fall back to mock mode - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 500 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.use_distributed_optimizer is True - assert config.ddp.check_for_nan_in_grad is True - - def test_pretrain_config_manual_gc(self): - """Test manual garbage collection configuration.""" - config = pretrain_config() - - assert config.train.manual_gc is True - assert config.train.manual_gc_interval == 100 - assert config.train.manual_gc_eval == 100 - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 500 # default for qwen25_72b - assert 
config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - assert config.scheduler.override_opt_param_scheduler is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - - def test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert config.logger.log_timers_to_tensorboard is True - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize("seq_length", [1024, 2048, 4096, 8192, 16384]) - def test_pretrain_config_sequence_lengths(self, seq_length): - """Test various sequence lengths.""" - config = pretrain_config(seq_length=seq_length) - - assert config.dataset.sequence_length == seq_length - assert config.model.seq_length == seq_length diff --git a/tests/unit_tests/recipes/qwen/test_qwen25_7b.py b/tests/unit_tests/recipes/qwen/test_qwen25_7b.py deleted file mode 100644 index 35214f8b0a..0000000000 --- a/tests/unit_tests/recipes/qwen/test_qwen25_7b.py +++ /dev/null @@ -1,249 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. 
All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile -from unittest.mock import patch - -import pytest - -from megatron.bridge.models.qwen import Qwen25ModelProvider7B -from megatron.bridge.recipes.qwen.qwen25_7b import model_config, pretrain_config -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Qwen25ModelProvider7B) - assert config.tensor_model_parallel_size == 2 # Default for 7B model - assert config.pipeline_model_parallel_size == 1 - assert config.pipeline_dtype is None - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.sequence_parallel is False - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Qwen25ModelProvider7B) - - # Check training configuration - assert config.train.train_iters == 300000 - assert 
config.train.global_batch_size == 32 - assert config.train.micro_batch_size == 2 - assert config.train.eval_interval == 500 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check model configuration (7B specific defaults) - assert config.model.tensor_model_parallel_size == 2 # Default for 7B model - assert config.model.pipeline_model_parallel_size == 1 - assert config.model.pipeline_dtype is None - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 4096 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=4, - seq_length=2048, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 4 - assert config.dataset.sequence_length == 2048 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - 
assert config.checkpoint.save == expected_checkpoint_dir - assert config.checkpoint.load == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Check that non-mock mode is configured - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - data_paths=["/path/to/data1", "/path/to/data2"], - ) - - # Should prioritize blend over blend_per_split - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert 
config.dataset.blend_per_split is None - - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def test_pretrain_config_fallback_to_mock_when_no_weights(self, mock_get_blend): - """Test pretrain_config falls back to mock when no weights are returned.""" - # Mock function returns None for both weights - mock_get_blend.return_value = (None, None) - - config = pretrain_config(data_paths=["/some/path"]) - - # Should fall back to mock mode - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 500 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.use_distributed_optimizer is True - assert config.ddp.check_for_nan_in_grad is True - - def test_pretrain_config_manual_gc(self): - """Test manual garbage collection configuration.""" - config = pretrain_config() - - assert config.train.manual_gc is True - assert config.train.manual_gc_interval == 100 - assert config.train.manual_gc_eval == 100 - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 500 # default for qwen25_7b - assert config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation 
- assert config.scheduler.override_opt_param_scheduler is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - - def test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert config.logger.log_timers_to_tensorboard is True - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize("seq_length", [1024, 2048, 4096, 8192, 16384]) - def test_pretrain_config_sequence_lengths(self, seq_length): - """Test various sequence lengths.""" - config = pretrain_config(seq_length=seq_length) - - assert config.dataset.sequence_length == seq_length - assert config.model.seq_length == seq_length diff --git a/tests/unit_tests/recipes/qwen/test_qwen2_1p5b.py b/tests/unit_tests/recipes/qwen/test_qwen2_1p5b.py deleted file mode 100644 index 8708a21491..0000000000 --- a/tests/unit_tests/recipes/qwen/test_qwen2_1p5b.py +++ /dev/null @@ -1,243 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile -from unittest.mock import patch - -import pytest - -from megatron.bridge.models.qwen import Qwen2ModelProvider1P5B -from megatron.bridge.recipes.qwen.qwen2_1p5b import model_config, pretrain_config -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Qwen2ModelProvider1P5B) - assert config.tensor_model_parallel_size == 1 - assert config.pipeline_model_parallel_size == 1 - assert config.pipeline_dtype is None - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.sequence_parallel is False - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Qwen2ModelProvider1P5B) - - # Check training configuration - assert config.train.train_iters == 300000 - assert config.train.global_batch_size == 32 - assert 
config.train.micro_batch_size == 2 - assert config.train.eval_interval == 500 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 4096 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=4, - seq_length=2048, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 4 - assert config.dataset.sequence_length == 2048 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.checkpoint.load == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit 
mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Check that non-mock mode is configured - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - data_paths=["/path/to/data1", "/path/to/data2"], - ) - - # Should prioritize blend over blend_per_split - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def test_pretrain_config_fallback_to_mock_when_no_weights(self, mock_get_blend): - """Test pretrain_config falls back to mock when no weights are returned.""" - # 
Mock function returns None for both weights - mock_get_blend.return_value = (None, None) - - config = pretrain_config(data_paths=["/some/path"]) - - # Should fall back to mock mode - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 500 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_manual_gc(self): - """Test manual garbage collection configuration.""" - config = pretrain_config() - - assert config.train.manual_gc is True - assert config.train.manual_gc_interval == 100 - assert config.train.manual_gc_eval == 100 - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 500 # default for qwen2_1p5b - assert config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - assert config.scheduler.override_opt_param_scheduler is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - - def 
test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert config.logger.log_timers_to_tensorboard is True - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize("seq_length", [1024, 2048, 4096, 8192, 16384]) - def test_pretrain_config_sequence_lengths(self, seq_length): - """Test various sequence lengths.""" - config = pretrain_config(seq_length=seq_length) - - assert config.dataset.sequence_length == seq_length - assert config.model.seq_length == seq_length diff --git a/tests/unit_tests/recipes/qwen/test_qwen2_500m.py b/tests/unit_tests/recipes/qwen/test_qwen2_500m.py deleted file mode 100644 index 47b2dff12b..0000000000 --- a/tests/unit_tests/recipes/qwen/test_qwen2_500m.py +++ /dev/null @@ -1,243 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile -from unittest.mock import patch - -import pytest - -from megatron.bridge.models.qwen import Qwen2ModelProvider500M -from megatron.bridge.recipes.qwen.qwen2_500m import model_config, pretrain_config -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Qwen2ModelProvider500M) - assert config.tensor_model_parallel_size == 1 - assert config.pipeline_model_parallel_size == 1 - assert config.pipeline_dtype is None - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.sequence_parallel is False - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Qwen2ModelProvider500M) - - # Check training configuration - assert config.train.train_iters == 300000 - assert config.train.global_batch_size == 32 - assert config.train.micro_batch_size == 2 - assert config.train.eval_interval == 500 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length 
== 4096 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=4, - seq_length=2048, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 4 - assert config.dataset.sequence_length == 2048 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.checkpoint.load == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Check that non-mock mode is configured - 
assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - data_paths=["/path/to/data1", "/path/to/data2"], - ) - - # Should prioritize blend over blend_per_split - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def test_pretrain_config_fallback_to_mock_when_no_weights(self, mock_get_blend): - """Test pretrain_config falls back to mock when no weights are returned.""" - # Mock function returns None for both weights - mock_get_blend.return_value = (None, None) - - config = pretrain_config(data_paths=["/some/path"]) - - # Should fall back to mock mode - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - 
assert config.checkpoint.save_interval == 500 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_manual_gc(self): - """Test manual garbage collection configuration.""" - config = pretrain_config() - - assert config.train.manual_gc is True - assert config.train.manual_gc_interval == 100 - assert config.train.manual_gc_eval == 100 - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 500 # default for qwen2_500m - assert config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - assert config.scheduler.override_opt_param_scheduler is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - - def test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is 
False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert config.logger.log_timers_to_tensorboard is True - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize("seq_length", [1024, 2048, 4096, 8192, 16384]) - def test_pretrain_config_sequence_lengths(self, seq_length): - """Test various sequence lengths.""" - config = pretrain_config(seq_length=seq_length) - - assert config.dataset.sequence_length == seq_length - assert config.model.seq_length == seq_length diff --git a/tests/unit_tests/recipes/qwen/test_qwen2_72b.py b/tests/unit_tests/recipes/qwen/test_qwen2_72b.py deleted file mode 100644 index b20cc2d9fc..0000000000 --- a/tests/unit_tests/recipes/qwen/test_qwen2_72b.py +++ /dev/null @@ -1,258 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import tempfile -from unittest.mock import patch - -import pytest -import torch - -from megatron.bridge.models.qwen import Qwen2ModelProvider72B -from megatron.bridge.recipes.qwen.qwen2_72b import model_config, pretrain_config -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Qwen2ModelProvider72B) - assert config.tensor_model_parallel_size == 8 # Default for 72B model - assert config.pipeline_model_parallel_size == 4 # Default for 72B model - assert config.pipeline_dtype == torch.bfloat16 # Default for 72B model - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.sequence_parallel is False - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Qwen2ModelProvider72B) - - # Check training configuration - assert config.train.train_iters == 300000 - assert config.train.global_batch_size == 32 - assert config.train.micro_batch_size == 2 - assert config.train.eval_interval == 500 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check model configuration (72B specific defaults) - assert config.model.tensor_model_parallel_size == 8 # 
Default for 72B model - assert config.model.pipeline_model_parallel_size == 4 # Default for 72B model - assert config.model.pipeline_dtype == torch.bfloat16 # Default for 72B model - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 4096 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=4, - seq_length=2048, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 4 - assert config.dataset.sequence_length == 2048 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.checkpoint.load == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == 
"1,1,1" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Check that non-mock mode is configured - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - data_paths=["/path/to/data1", "/path/to/data2"], - ) - - # Should prioritize blend over blend_per_split - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def test_pretrain_config_fallback_to_mock_when_no_weights(self, mock_get_blend): - """Test pretrain_config falls back to mock when no weights are returned.""" - # Mock function returns None for both weights - mock_get_blend.return_value = (None, None) - - config = pretrain_config(data_paths=["/some/path"]) - - # Should fall back to mock 
mode - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 500 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_manual_gc(self): - """Test manual garbage collection configuration.""" - config = pretrain_config() - - assert config.train.manual_gc is True - assert config.train.manual_gc_interval == 100 - assert config.train.manual_gc_eval == 100 - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 500 # default for qwen2_72b - assert config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - assert config.scheduler.override_opt_param_scheduler is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - - def test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 
1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert config.logger.log_timers_to_tensorboard is True - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize("seq_length", [1024, 2048, 4096, 8192, 16384]) - def test_pretrain_config_sequence_lengths(self, seq_length): - """Test various sequence lengths.""" - config = pretrain_config(seq_length=seq_length) - - assert config.dataset.sequence_length == seq_length - assert config.model.seq_length == seq_length - - def test_pretrain_config_72b_specific_defaults(self): - """Test that 72B model has appropriate defaults for its size.""" - config = pretrain_config() - - # 72B model should default to high parallelism for efficiency - assert config.model.tensor_model_parallel_size == 8 - assert config.model.pipeline_model_parallel_size == 4 - assert config.model.pipeline_dtype == torch.bfloat16 diff --git a/tests/unit_tests/recipes/qwen/test_qwen2_7b.py b/tests/unit_tests/recipes/qwen/test_qwen2_7b.py deleted file mode 100644 index 80c851b003..0000000000 --- a/tests/unit_tests/recipes/qwen/test_qwen2_7b.py +++ /dev/null @@ -1,261 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile -from unittest.mock import patch - -import pytest - -from megatron.bridge.models.qwen import Qwen2ModelProvider7B -from megatron.bridge.recipes.qwen.qwen2_7b import model_config, pretrain_config -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Qwen2ModelProvider7B) - assert config.tensor_model_parallel_size == 2 # Default for 7B model - assert config.pipeline_model_parallel_size == 1 - assert config.pipeline_dtype is None - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.sequence_parallel is False - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Qwen2ModelProvider7B) - - # Check training configuration - assert config.train.train_iters == 300000 - assert config.train.global_batch_size == 32 - assert config.train.micro_batch_size == 2 - assert config.train.eval_interval == 500 - assert config.train.eval_iters == 32 - - # Check 
optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check model configuration (7B specific defaults) - assert config.model.tensor_model_parallel_size == 2 # Default for 7B model - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 4096 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=4, - seq_length=2048, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 4 - assert config.dataset.sequence_length == 2048 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.checkpoint.load == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit 
mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Check that non-mock mode is configured - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - data_paths=["/path/to/data1", "/path/to/data2"], - ) - - # Should prioritize blend over blend_per_split - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def test_pretrain_config_fallback_to_mock_when_no_weights(self, mock_get_blend): - """Test pretrain_config falls back to mock when no weights are returned.""" - # 
Mock function returns None for both weights - mock_get_blend.return_value = (None, None) - - config = pretrain_config(data_paths=["/some/path"]) - - # Should fall back to mock mode - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 500 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_manual_gc(self): - """Test manual garbage collection configuration.""" - config = pretrain_config() - - assert config.train.manual_gc is True - assert config.train.manual_gc_interval == 100 - assert config.train.manual_gc_eval == 100 - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 500 # default for qwen2_7b - assert config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - assert config.scheduler.override_opt_param_scheduler is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - - def 
test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert config.logger.log_timers_to_tensorboard is True - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize("seq_length", [1024, 2048, 4096, 8192, 16384]) - def test_pretrain_config_sequence_lengths(self, seq_length): - """Test various sequence lengths.""" - config = pretrain_config(seq_length=seq_length) - - assert config.dataset.sequence_length == seq_length - assert config.model.seq_length == seq_length - - def test_pretrain_config_custom_tensor_parallelism(self): - """Test pretrain_config with custom tensor parallelism.""" - config = pretrain_config(tensor_parallelism=4) - - assert config.model.tensor_model_parallel_size == 4 - assert config.model.pipeline_model_parallel_size == 1 # default - assert config.model.context_parallel_size == 1 # default - - def test_pretrain_config_7b_specific_defaults(self): - """Test that 7B model has appropriate defaults for its size.""" - config = pretrain_config() - - # 7B model should default to tensor parallelism of 2 for efficiency - assert config.model.tensor_model_parallel_size == 2 diff --git a/tests/unit_tests/recipes/qwen/test_qwen3_14b.py b/tests/unit_tests/recipes/qwen/test_qwen3_14b.py deleted file mode 100644 
index b63623e39f..0000000000 --- a/tests/unit_tests/recipes/qwen/test_qwen3_14b.py +++ /dev/null @@ -1,253 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile -from unittest.mock import patch - -import pytest - -from megatron.bridge.models.qwen import Qwen3ModelProvider14B -from megatron.bridge.recipes.qwen.qwen3_14b import model_config, pretrain_config -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Qwen3ModelProvider14B) - assert config.tensor_model_parallel_size == 8 # Default for Qwen3 14B model - assert config.pipeline_model_parallel_size == 1 - assert config.pipeline_dtype is None - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.sequence_parallel is False - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Qwen3ModelProvider14B) - - # Check training 
configuration - assert config.train.train_iters == 300000 - assert config.train.global_batch_size == 32 - assert config.train.micro_batch_size == 2 - assert config.train.eval_interval == 500 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check model configuration (Qwen3 14B specific defaults) - assert config.model.tensor_model_parallel_size == 8 # Default for Qwen3 14B model - assert config.model.pipeline_model_parallel_size == 1 - assert config.model.pipeline_dtype is None - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 4096 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=4, - seq_length=2048, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 4 - assert config.dataset.sequence_length == 2048 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, 
"checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.checkpoint.load == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Check that non-mock mode is configured - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - data_paths=["/path/to/data1", "/path/to/data2"], - ) - - # Should prioritize blend over blend_per_split - assert 
config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def test_pretrain_config_fallback_to_mock_when_no_weights(self, mock_get_blend): - """Test pretrain_config falls back to mock when no weights are returned.""" - # Mock function returns None for both weights - mock_get_blend.return_value = (None, None) - - config = pretrain_config(data_paths=["/some/path"]) - - # Should fall back to mock mode - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 500 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True - assert config.ddp.overlap_grad_reduce is True - assert config.ddp.overlap_param_gather is True - assert config.ddp.average_in_collective is True - assert config.ddp.data_parallel_sharding_strategy == "optim_grads_params" - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_manual_gc(self): - """Test manual garbage collection configuration.""" - config = pretrain_config() - - assert config.train.manual_gc is True - assert config.train.manual_gc_interval == 100 - assert config.train.manual_gc_eval == 100 - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert config.scheduler.start_weight_decay == 0.033 - assert 
config.scheduler.end_weight_decay == 0.033 - assert config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 500 # default for qwen3_14b - assert config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - assert config.scheduler.override_opt_param_scheduler is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "HuggingFaceTokenizer" - assert config.tokenizer.tokenizer_model == "Qwen/Qwen3-14B" - - def test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert config.logger.log_timers_to_tensorboard is True - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize("seq_length", [1024, 2048, 4096, 8192, 16384]) - def test_pretrain_config_sequence_lengths(self, seq_length): - """Test various sequence lengths.""" - config = pretrain_config(seq_length=seq_length) - - assert config.dataset.sequence_length == seq_length - assert config.model.seq_length == seq_length diff --git a/tests/unit_tests/recipes/qwen/test_qwen3_1p7b.py 
b/tests/unit_tests/recipes/qwen/test_qwen3_1p7b.py deleted file mode 100644 index e45f8403e5..0000000000 --- a/tests/unit_tests/recipes/qwen/test_qwen3_1p7b.py +++ /dev/null @@ -1,253 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile -from unittest.mock import patch - -import pytest - -from megatron.bridge.models.qwen import Qwen3ModelProvider1P7B -from megatron.bridge.recipes.qwen.qwen3_1p7b import model_config, pretrain_config -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Qwen3ModelProvider1P7B) - assert config.tensor_model_parallel_size == 1 # Default for Qwen3 1.7B model - assert config.pipeline_model_parallel_size == 1 - assert config.pipeline_dtype is None - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.sequence_parallel is False - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - 
assert isinstance(config.model, Qwen3ModelProvider1P7B) - - # Check training configuration - assert config.train.train_iters == 300000 - assert config.train.global_batch_size == 32 - assert config.train.micro_batch_size == 2 - assert config.train.eval_interval == 500 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check model configuration (Qwen3 1.7B specific defaults) - assert config.model.tensor_model_parallel_size == 1 # Default for Qwen3 1.7B model - assert config.model.pipeline_model_parallel_size == 1 - assert config.model.pipeline_dtype is None - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 4096 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=4, - seq_length=2048, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 4 - assert config.dataset.sequence_length == 2048 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, 
"test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.checkpoint.load == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_with_data_paths(self): - """Test pretrain_config with data paths provided.""" - - data_paths = ["/path/to/data1", "/path/to/data2", "/path/to/data3"] - config = pretrain_config(data_paths=data_paths) - - # Check that non-mock mode is configured - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_with_train_valid_test_paths(self): - """Test pretrain_config with separate train/valid/test paths.""" - - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2", "/path/to/train3"], - valid_data_path=["/path/to/valid1", "/path/to/valid2", "/path/to/valid3"], - test_data_path=["/path/to/test1", "/path/to/test2", "/path/to/test3"], - ) - - # When blend_per_split is used, split should be None - assert config.dataset.split is None - assert config.dataset.blend is None - assert config.dataset.blend_per_split is not None - - def test_pretrain_config_prioritizes_blend(self): - """Test that blend takes priority over blend_per_split when both are provided.""" - config = pretrain_config( - train_data_path=["/path/to/train1", "/path/to/train2"], - valid_data_path=["/path/to/valid1", "/path/to/valid2"], - test_data_path=["/path/to/test1", "/path/to/test2"], - data_paths=["/path/to/data1", "/path/to/data2"], - ) - 
- # Should prioritize blend over blend_per_split - assert config.dataset.split == "9999,8,2" - assert config.dataset.blend is not None - assert config.dataset.blend_per_split is None - - @patch("megatron.bridge.recipes.utils.dataset_utils.get_blend_and_blend_per_split") - def test_pretrain_config_fallback_to_mock_when_no_weights(self, mock_get_blend): - """Test pretrain_config falls back to mock when no weights are returned.""" - # Mock function returns None for both weights - mock_get_blend.return_value = (None, None) - - config = pretrain_config(data_paths=["/some/path"]) - - # Should fall back to mock mode - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_checkpoint_configuration(self): - """Test checkpoint configuration in pretrain_config.""" - config = pretrain_config() - - assert config.checkpoint.save_interval == 500 - assert config.checkpoint.ckpt_format == "torch_dist" - assert config.checkpoint.fully_parallel_save is True - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True - assert config.ddp.overlap_grad_reduce is True - assert config.ddp.overlap_param_gather is True - assert config.ddp.average_in_collective is True - assert config.ddp.data_parallel_sharding_strategy == "optim_grads_params" - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_manual_gc(self): - """Test manual garbage collection configuration.""" - config = pretrain_config() - - assert config.train.manual_gc is True - assert config.train.manual_gc_interval == 100 - assert config.train.manual_gc_eval == 100 - - def test_pretrain_config_scheduler_configuration(self): - """Test scheduler configuration.""" - config = pretrain_config(train_iters=50000) - - assert 
config.scheduler.start_weight_decay == 0.033 - assert config.scheduler.end_weight_decay == 0.033 - assert config.scheduler.weight_decay_incr_style == "constant" - assert config.scheduler.lr_decay_style == "cosine" - assert config.scheduler.lr_warmup_iters == 500 # default for qwen3_1p7b - assert config.scheduler.lr_warmup_init == 0.0 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - assert config.scheduler.override_opt_param_scheduler is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "HuggingFaceTokenizer" - assert config.tokenizer.tokenizer_model == "Qwen/Qwen3-1.7B" - - def test_pretrain_config_rng_configuration(self): - """Test RNG configuration.""" - config = pretrain_config() - - assert config.rng.seed == 1234 - assert config.dataset.random_seed == 1234 - - def test_pretrain_config_dataset_configuration(self): - """Test dataset configuration details.""" - config = pretrain_config() - - assert config.dataset.reset_attention_mask is False - assert config.dataset.reset_position_ids is False - assert config.dataset.eod_mask_loss is False - assert config.dataset.num_dataset_builder_threads == 1 - assert config.dataset.data_sharding is True - assert config.dataset.dataloader_type == "single" - - def test_pretrain_config_logger_configuration(self): - """Test logger configuration.""" - config = pretrain_config() - - assert config.logger.log_interval == 10 - assert config.logger.log_timers_to_tensorboard is True - assert "tb_logs" in config.logger.tensorboard_dir - - @pytest.mark.parametrize("seq_length", [1024, 2048, 4096, 8192, 16384]) - def test_pretrain_config_sequence_lengths(self, seq_length): - """Test various sequence lengths.""" - config = pretrain_config(seq_length=seq_length) - - assert config.dataset.sequence_length == seq_length - assert config.model.seq_length == seq_length diff 
--git a/tests/unit_tests/recipes/qwen/test_qwen3_235b_a22b.py b/tests/unit_tests/recipes/qwen/test_qwen3_235b_a22b.py deleted file mode 100644 index 6c681ed261..0000000000 --- a/tests/unit_tests/recipes/qwen/test_qwen3_235b_a22b.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile - -import pytest -import torch - -from megatron.bridge.models.qwen import Qwen3MoEModelProvider235B_A22B -from megatron.bridge.recipes.qwen.qwen3_235b_a22b import model_config, pretrain_config -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Qwen3MoEModelProvider235B_A22B) - assert config.tensor_model_parallel_size == 4 # Default for Qwen3 235B-A22B MoE - assert config.pipeline_model_parallel_size == 16 # Default for Qwen3 235B-A22B MoE - assert config.pipeline_dtype == torch.bfloat16 - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 2 # Default context parallelism for massive model - assert config.expert_model_parallel_size == 8 # Default expert parallelism - assert config.sequence_parallel is True # Enabled by default for MoE - - # Check pipeline split 
configuration for massive model - assert config.account_for_embedding_in_pipeline_split is True - assert config.account_for_loss_in_pipeline_split is True - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Qwen3MoEModelProvider235B_A22B) - - # Check training configuration - assert config.train.train_iters == 300000 - assert config.train.global_batch_size == 32 - assert config.train.micro_batch_size == 1 # Reduced for very large model - assert config.train.eval_interval == 500 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check model configuration (Qwen3 235B-A22B MoE specific defaults) - assert config.model.tensor_model_parallel_size == 4 # Default for Qwen3 235B-A22B MoE - assert config.model.pipeline_model_parallel_size == 16 # Default for Qwen3 235B-A22B MoE - assert config.model.pipeline_dtype == torch.bfloat16 - assert config.model.context_parallel_size == 2 # Default context parallelism for massive model - assert config.model.expert_model_parallel_size == 8 # Default expert parallelism - assert config.model.sequence_parallel is True # Enabled by default for MoE - - # Check pipeline split configuration for massive model - assert config.model.account_for_embedding_in_pipeline_split is True - assert config.model.account_for_loss_in_pipeline_split is True - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 4096 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert 
config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=2, - seq_length=2048, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 2 - assert config.dataset.sequence_length == 2048 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.checkpoint.load == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True - assert config.ddp.overlap_grad_reduce is True - assert config.ddp.overlap_param_gather is True - assert 
config.ddp.average_in_collective is True - assert config.ddp.data_parallel_sharding_strategy == "optim_grads_params" - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "HuggingFaceTokenizer" - assert config.tokenizer.tokenizer_model == "Qwen/Qwen3-235B-A22B" - - @pytest.mark.parametrize("seq_length", [1024, 2048, 4096, 8192, 16384]) - def test_pretrain_config_sequence_lengths(self, seq_length): - """Test various sequence lengths.""" - config = pretrain_config(seq_length=seq_length) - - assert config.dataset.sequence_length == seq_length - assert config.model.seq_length == seq_length diff --git a/tests/unit_tests/recipes/qwen/test_qwen3_30b_a3b.py b/tests/unit_tests/recipes/qwen/test_qwen3_30b_a3b.py deleted file mode 100644 index 625cb3143d..0000000000 --- a/tests/unit_tests/recipes/qwen/test_qwen3_30b_a3b.py +++ /dev/null @@ -1,153 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import tempfile - -import pytest -import torch - -from megatron.bridge.models.qwen import Qwen3MoEModelProvider30B_A3B -from megatron.bridge.recipes.qwen.qwen3_30b_a3b import model_config, pretrain_config -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Qwen3MoEModelProvider30B_A3B) - assert config.tensor_model_parallel_size == 4 # Default for Qwen3 30B-A3B MoE - assert config.pipeline_model_parallel_size == 2 # Default for Qwen3 30B-A3B MoE - assert config.pipeline_dtype == torch.bfloat16 - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.expert_model_parallel_size == 4 # Default expert parallelism - assert config.sequence_parallel is True # Enabled by default for MoE - - # Check recompute settings - assert config.recompute_granularity == "full" - assert config.recompute_method == "uniform" - assert config.recompute_num_layers == 1 - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Qwen3MoEModelProvider30B_A3B) - - # Check training configuration - assert config.train.train_iters == 300000 - assert config.train.global_batch_size == 32 - assert config.train.micro_batch_size == 2 - assert config.train.eval_interval == 500 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert 
config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check model configuration (Qwen3 30B-A3B MoE specific defaults) - assert config.model.tensor_model_parallel_size == 4 # Default for Qwen3 30B-A3B MoE - assert config.model.pipeline_model_parallel_size == 2 # Default for Qwen3 30B-A3B MoE - assert config.model.pipeline_dtype == torch.bfloat16 - assert config.model.expert_model_parallel_size == 4 # Default expert parallelism - assert config.model.sequence_parallel is True # Enabled by default for MoE - - # Check recompute settings - assert config.model.recompute_granularity == "full" - assert config.model.recompute_method == "uniform" - assert config.model.recompute_num_layers == 1 - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 4096 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - # Check tokenizer configuration - assert config.tokenizer.tokenizer_type == "HuggingFaceTokenizer" - assert config.tokenizer.tokenizer_model == "Qwen/Qwen3-30B-A3B" - - # Check DDP configuration - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True - assert config.ddp.overlap_grad_reduce is True - assert config.ddp.overlap_param_gather is True - assert config.ddp.average_in_collective is True - assert config.ddp.data_parallel_sharding_strategy == "optim_grads_params" - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=4, - seq_length=2048, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 4 - assert config.dataset.sequence_length == 2048 
- assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.checkpoint.load == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - @pytest.mark.parametrize("seq_length", [1024, 2048, 4096, 8192, 16384]) - def test_pretrain_config_sequence_lengths(self, seq_length): - """Test various sequence lengths.""" - config = pretrain_config(seq_length=seq_length) - - assert config.dataset.sequence_length == seq_length - assert config.model.seq_length == seq_length diff --git a/tests/unit_tests/recipes/qwen/test_qwen3_32b.py b/tests/unit_tests/recipes/qwen/test_qwen3_32b.py deleted file mode 100644 index 85dc352b26..0000000000 --- a/tests/unit_tests/recipes/qwen/test_qwen3_32b.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile - -import pytest -import torch - -from megatron.bridge.models.qwen import Qwen3ModelProvider32B -from megatron.bridge.recipes.qwen.qwen3_32b import model_config, pretrain_config -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Qwen3ModelProvider32B) - assert config.tensor_model_parallel_size == 8 # Default for Qwen3 32B model - assert config.pipeline_model_parallel_size == 2 # Default for Qwen3 32B model - assert config.pipeline_dtype == torch.bfloat16 # Default pipeline dtype for PP > 1 - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.sequence_parallel is False - - # Check recompute settings - assert config.recompute_granularity == "full" - assert config.recompute_method == "uniform" - assert config.recompute_num_layers == 1 - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Qwen3ModelProvider32B) - - # Check training configuration - assert config.train.train_iters == 300000 - assert config.train.global_batch_size == 
32 - assert config.train.micro_batch_size == 2 - assert config.train.eval_interval == 500 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check model configuration (Qwen3 32B specific defaults) - assert config.model.tensor_model_parallel_size == 8 # Default for Qwen3 32B model - assert config.model.pipeline_model_parallel_size == 2 # Default for Qwen3 32B model - assert config.model.pipeline_dtype == torch.bfloat16 # Default pipeline dtype for PP > 1 - - # Check recompute settings - assert config.model.recompute_granularity == "full" - assert config.model.recompute_method == "uniform" - assert config.model.recompute_num_layers == 1 - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 4096 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=4, - seq_length=2048, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 4 - assert config.dataset.sequence_length == 2048 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = 
pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.checkpoint.load == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True - assert config.ddp.overlap_grad_reduce is True - assert config.ddp.overlap_param_gather is True - assert config.ddp.average_in_collective is True - assert config.ddp.data_parallel_sharding_strategy == "optim_grads_params" - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "HuggingFaceTokenizer" - assert config.tokenizer.tokenizer_model == "Qwen/Qwen3-32B" - - @pytest.mark.parametrize("seq_length", [1024, 2048, 4096, 8192, 16384]) - def test_pretrain_config_sequence_lengths(self, seq_length): - """Test various sequence lengths.""" - config = pretrain_config(seq_length=seq_length) - - assert config.dataset.sequence_length == seq_length - assert config.model.seq_length == seq_length diff --git a/tests/unit_tests/recipes/qwen/test_qwen3_4b.py b/tests/unit_tests/recipes/qwen/test_qwen3_4b.py deleted file mode 100644 index 9f668fe700..0000000000 --- 
a/tests/unit_tests/recipes/qwen/test_qwen3_4b.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile - -import pytest - -from megatron.bridge.models.qwen import Qwen3ModelProvider4B -from megatron.bridge.recipes.qwen.qwen3_4b import model_config, pretrain_config -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Qwen3ModelProvider4B) - assert config.tensor_model_parallel_size == 2 # Default for Qwen3 4B model - assert config.pipeline_model_parallel_size == 1 - assert config.pipeline_dtype is None - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.sequence_parallel is False - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Qwen3ModelProvider4B) - - # Check training configuration - assert config.train.train_iters == 300000 - assert 
config.train.global_batch_size == 32 - assert config.train.micro_batch_size == 2 - assert config.train.eval_interval == 500 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check model configuration (Qwen3 4B specific defaults) - assert config.model.tensor_model_parallel_size == 2 # Default for Qwen3 4B model - assert config.model.pipeline_model_parallel_size == 1 - assert config.model.pipeline_dtype is None - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 4096 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=4, - seq_length=2048, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 4 - assert config.dataset.sequence_length == 2048 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, 
"tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.checkpoint.load == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True - assert config.ddp.overlap_grad_reduce is True - assert config.ddp.overlap_param_gather is True - assert config.ddp.average_in_collective is True - assert config.ddp.data_parallel_sharding_strategy == "optim_grads_params" - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "HuggingFaceTokenizer" - assert config.tokenizer.tokenizer_model == "Qwen/Qwen3-4B" - - @pytest.mark.parametrize("seq_length", [1024, 2048, 4096, 8192, 16384]) - def test_pretrain_config_sequence_lengths(self, seq_length): - """Test various sequence lengths.""" - config = pretrain_config(seq_length=seq_length) - - assert config.dataset.sequence_length == seq_length - assert config.model.seq_length == seq_length diff --git a/tests/unit_tests/recipes/qwen/test_qwen3_600m.py b/tests/unit_tests/recipes/qwen/test_qwen3_600m.py deleted file mode 100644 index 677dd1c97f..0000000000 --- a/tests/unit_tests/recipes/qwen/test_qwen3_600m.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile - -import pytest - -from megatron.bridge.models.qwen import Qwen3ModelProvider600M -from megatron.bridge.recipes.qwen.qwen3_600m import model_config, pretrain_config -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Qwen3ModelProvider600M) - assert config.tensor_model_parallel_size == 1 # Default for Qwen3 600M model - assert config.pipeline_model_parallel_size == 1 - assert config.pipeline_dtype is None - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.sequence_parallel is False - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Qwen3ModelProvider600M) - - # Check training configuration - assert config.train.train_iters == 300000 - assert config.train.global_batch_size == 32 - assert 
config.train.micro_batch_size == 2 - assert config.train.eval_interval == 500 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert config.optimizer.min_lr == 3e-5 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check model configuration (Qwen3 600M specific defaults) - assert config.model.tensor_model_parallel_size == 1 # Default for Qwen3 600M model - assert config.model.pipeline_model_parallel_size == 1 - assert config.model.pipeline_dtype is None - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 4096 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=4, - seq_length=2048, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 4 - assert config.dataset.sequence_length == 2048 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == 
expected_checkpoint_dir - assert config.checkpoint.load == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True - assert config.ddp.overlap_grad_reduce is True - assert config.ddp.overlap_param_gather is True - assert config.ddp.average_in_collective is True - assert config.ddp.data_parallel_sharding_strategy == "optim_grads_params" - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "NullTokenizer" - assert config.tokenizer.vocab_size == DEFAULT_NULL_TOKENIZER_VOCAB_SIZE - - @pytest.mark.parametrize("seq_length", [1024, 2048, 4096, 8192, 16384]) - def test_pretrain_config_sequence_lengths(self, seq_length): - """Test various sequence lengths.""" - config = pretrain_config(seq_length=seq_length) - - assert config.dataset.sequence_length == seq_length - assert config.model.seq_length == seq_length diff --git a/tests/unit_tests/recipes/qwen/test_qwen3_8b.py b/tests/unit_tests/recipes/qwen/test_qwen3_8b.py deleted file mode 100644 index 8b7edc37d7..0000000000 --- a/tests/unit_tests/recipes/qwen/test_qwen3_8b.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile - -import pytest - -from megatron.bridge.models.qwen import Qwen3ModelProvider8B -from megatron.bridge.recipes.qwen.qwen3_8b import model_config, pretrain_config -from megatron.bridge.training.config import ConfigContainer - - -@pytest.mark.unit -class TestModelConfig: - """Test cases for the model_config function.""" - - def test_model_config_default_parameters(self): - """Test model_config with default parameters.""" - config = model_config() - - assert isinstance(config, Qwen3ModelProvider8B) - assert config.tensor_model_parallel_size == 4 # Default for Qwen3 8B model - assert config.pipeline_model_parallel_size == 1 - assert config.pipeline_dtype is None - assert config.virtual_pipeline_model_parallel_size is None - assert config.context_parallel_size == 1 - assert config.sequence_parallel is False - - -@pytest.mark.unit -class TestPretrainConfig: - """Test cases for the pretrain_config function.""" - - def test_pretrain_config_default_parameters(self): - """Test pretrain_config with default parameters (mock mode).""" - config = pretrain_config() - - assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Qwen3ModelProvider8B) - - # Check training configuration - assert config.train.train_iters == 300000 - assert config.train.global_batch_size == 32 - assert config.train.micro_batch_size == 2 - assert config.train.eval_interval == 500 - assert config.train.eval_iters == 32 - - # Check optimizer configuration - assert config.optimizer.optimizer == "adam" - assert config.optimizer.lr == 3e-4 - assert 
config.optimizer.min_lr == 3e-5 - assert config.optimizer.bf16 is True - assert config.optimizer.fp16 is False - - # Check model configuration (Qwen3 8B specific defaults) - assert config.model.tensor_model_parallel_size == 4 # Default for Qwen3 8B model - assert config.model.pipeline_model_parallel_size == 1 - assert config.model.pipeline_dtype is None - - # Check dataset configuration (should be in mock mode) - assert config.dataset.sequence_length == 4096 - assert config.dataset.split == "1,1,1" - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - - def test_pretrain_config_custom_training_parameters(self): - """Test pretrain_config with custom training parameters.""" - config = pretrain_config( - train_iters=10000, - global_batch_size=256, - micro_batch_size=4, - seq_length=2048, - lr=1e-4, - min_lr=1e-5, - lr_warmup_iters=1000, - ) - - assert config.train.train_iters == 10000 - assert config.train.global_batch_size == 256 - assert config.train.micro_batch_size == 4 - assert config.dataset.sequence_length == 2048 - assert config.optimizer.lr == 1e-4 - assert config.optimizer.min_lr == 1e-5 - assert config.scheduler.lr_warmup_iters == 1000 - assert config.scheduler.lr_decay_iters is None # Will be set to train_iters during validation - - def test_pretrain_config_with_custom_directory(self): - """Test pretrain_config with custom directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - config = pretrain_config(dir=temp_dir, name="test_run") - - expected_run_dir = os.path.join(temp_dir, "test_run") - expected_checkpoint_dir = os.path.join(expected_run_dir, "checkpoints") - expected_tensorboard_dir = os.path.join(expected_run_dir, "tb_logs") - - assert config.checkpoint.save == expected_checkpoint_dir - assert config.checkpoint.load == expected_checkpoint_dir - assert config.logger.tensorboard_dir == expected_tensorboard_dir - - def test_pretrain_config_explicit_mock_mode(self): - """Test pretrain_config with explicit 
mock=True.""" - config = pretrain_config(mock=True) - - assert config.dataset.blend is None - assert config.dataset.blend_per_split is None - assert config.dataset.split == "1,1,1" - - def test_pretrain_config_ddp_configuration(self): - """Test distributed data parallel configuration.""" - config = pretrain_config() - - assert config.ddp.check_for_nan_in_grad is True - assert config.ddp.grad_reduce_in_fp32 is True - assert config.ddp.overlap_grad_reduce is True - assert config.ddp.overlap_param_gather is True - assert config.ddp.average_in_collective is True - assert config.ddp.data_parallel_sharding_strategy == "optim_grads_params" - assert config.ddp.use_distributed_optimizer is True - - def test_pretrain_config_tokenizer_configuration(self): - """Test tokenizer configuration.""" - config = pretrain_config() - - assert config.tokenizer.tokenizer_type == "HuggingFaceTokenizer" - assert config.tokenizer.tokenizer_model == "Qwen/Qwen3-8B" - - @pytest.mark.parametrize("seq_length", [1024, 2048, 4096, 8192, 16384]) - def test_pretrain_config_sequence_lengths(self, seq_length): - """Test various sequence lengths.""" - config = pretrain_config(seq_length=seq_length) - - assert config.dataset.sequence_length == seq_length - assert config.model.seq_length == seq_length diff --git a/tests/unit_tests/recipes/test_llama_recipes.py b/tests/unit_tests/recipes/test_llama_recipes.py new file mode 100644 index 0000000000..7308209524 --- /dev/null +++ b/tests/unit_tests/recipes/test_llama_recipes.py @@ -0,0 +1,120 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# Test purpose: +# - Parametrize over all exported Qwen recipe functions in `megatron.bridge.recipes.qwen`. +# - For each recipe, monkeypatch `AutoBridge` with a lightweight fake to avoid I/O. +# - Build a config with small, safe overrides and assert it forms a valid `ConfigContainer`. +# - Verify tokenizer selection honors `use_null_tokenizer`, and sanity-check parallelism fields. +# + +import importlib +from typing import Callable + +import pytest + + +_llama_module = importlib.import_module("megatron.bridge.recipes.llama") +_LLAMA_RECIPE_FUNCS = [ + getattr(_llama_module, name) + for name in getattr(_llama_module, "__all__", []) + if callable(getattr(_llama_module, name, None)) +] + + +def _safe_overrides_for(name: str) -> dict: + overrides = { + "name": f"unit_{name}", + "dir": ".", + "mock": True, + "train_iters": 10, + "global_batch_size": 2, + "micro_batch_size": 1, + "seq_length": 64, + "lr": 1e-4, + "min_lr": 1e-5, + "lr_warmup_iters": 2, + "tensor_parallelism": 1, + "pipeline_parallelism": 1, + "context_parallelism": 1, + "use_null_tokenizer": True, + } + + # Large models/variants may set additional flags in recipes; keep harmless defaults + lname = name.lower() + if "70b" in lname or "405b" in lname: + overrides.update( + { + "virtual_pipeline_parallelism": None, + "sequence_parallelism": True, + } + ) + + return overrides + + +class _FakeModelCfg: + def finalize(self): + return None + + +class _FakeBridge: + def __init__(self): + pass + + def to_megatron_provider(self, load_weights: bool = False): + return _FakeModelCfg() + + @staticmethod 
+ def from_hf_pretrained(hf_path: str): + return _FakeBridge() + + +def _assert_basic_config(cfg): + from megatron.bridge.training.config import ConfigContainer + + assert isinstance(cfg, ConfigContainer) + assert cfg.model is not None + assert cfg.train is not None + assert cfg.optimizer is not None + assert cfg.scheduler is not None + assert cfg.dataset is not None + assert cfg.logger is not None + assert cfg.tokenizer is not None + assert cfg.checkpoint is not None + assert cfg.rng is not None + + assert cfg.train.global_batch_size >= 1 + assert cfg.train.micro_batch_size >= 1 + assert cfg.dataset.sequence_length >= 1 + + +@pytest.mark.parametrize("recipe_func", _LLAMA_RECIPE_FUNCS) +def test_each_llama_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + module_name = recipe_func.__module__ + mod = importlib.import_module(module_name) + monkeypatch.setattr(mod, "AutoBridge", _FakeBridge) + + overrides = _safe_overrides_for(recipe_func.__name__) + + cfg = recipe_func(**overrides) + + _assert_basic_config(cfg) + + if overrides.get("use_null_tokenizer") and hasattr(cfg, "tokenizer") and hasattr(cfg.tokenizer, "tokenizer_type"): + assert cfg.tokenizer.tokenizer_type == "NullTokenizer" + + assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 + assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 diff --git a/tests/unit_tests/recipes/test_qwen_recipes.py b/tests/unit_tests/recipes/test_qwen_recipes.py new file mode 100644 index 0000000000..c077209302 --- /dev/null +++ b/tests/unit_tests/recipes/test_qwen_recipes.py @@ -0,0 +1,136 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# Test purpose: +# - Parametrize over all exported Qwen recipe functions in `megatron.bridge.recipes.qwen`. +# - For each recipe, monkeypatch `AutoBridge` with a lightweight fake to avoid I/O. +# - Build a config with small, safe overrides and assert it forms a valid `ConfigContainer`. +# - Verify tokenizer selection honors `use_null_tokenizer`, and sanity-check parallelism fields. +# + +import importlib +from typing import Callable + +import pytest + + +_qwen_module = importlib.import_module("megatron.bridge.recipes.qwen") +_QWEN_RECIPE_FUNCS = [ + getattr(_qwen_module, name) + for name in getattr(_qwen_module, "__all__", []) + if callable(getattr(_qwen_module, name, None)) +] + + +def _safe_overrides_for(name: str) -> dict: + # Minimal, dependency-light overrides for fast unit testing + overrides = { + "name": f"unit_{name}", + "dir": ".", # keep paths local + "mock": True, # use mock data paths + "train_iters": 10, + "global_batch_size": 2, + "micro_batch_size": 1, + "seq_length": 64, + "lr": 1e-4, + "min_lr": 1e-5, + "lr_warmup_iters": 2, + # Keep parallelism tiny so provider shaping is trivial + "tensor_parallelism": 1, + "pipeline_parallelism": 1, + "context_parallelism": 1, + # Prefer NullTokenizer in tests to avoid HF tokenizer I/O + "use_null_tokenizer": True, + } + + # For MoE recipes, ensure expert settings are small/valid + lname = name.lower() + if "a3b" in lname or "a22b" in lname or "moe" in lname: + overrides.update( + { + "expert_parallelism": 2, + "expert_tensor_parallelism": 1, + "sequence_parallelism": True, + } + ) + + return 
overrides + + +class _FakeModelCfg: + # Minimal provider to accept attribute assignments used in recipes + def finalize(self): + # qwen3 recipe may call finalize(); make it a no-op + return None + + +class _FakeBridge: + def __init__(self): + pass + + def to_megatron_provider(self, load_weights: bool = False): + return _FakeModelCfg() + + @staticmethod + def from_hf_pretrained(hf_path: str): + # Ignore hf_path; return a bridge that yields a fake provider + return _FakeBridge() + + +def _assert_basic_config(cfg): + from megatron.bridge.training.config import ConfigContainer + + assert isinstance(cfg, ConfigContainer) + # Required top-level sections + assert cfg.model is not None + assert cfg.train is not None + assert cfg.optimizer is not None + assert cfg.scheduler is not None + assert cfg.dataset is not None + assert cfg.logger is not None + assert cfg.tokenizer is not None + assert cfg.checkpoint is not None + assert cfg.rng is not None + + # A few critical fields + assert cfg.train.global_batch_size >= 1 + assert cfg.train.micro_batch_size >= 1 + assert cfg.dataset.sequence_length >= 1 + + +@pytest.mark.parametrize("recipe_func", _QWEN_RECIPE_FUNCS) +def test_each_qwen_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + # Monkeypatch AutoBridge in the specific module where the recipe function is defined + module_name = recipe_func.__module__ + mod = importlib.import_module(module_name) + monkeypatch.setattr(mod, "AutoBridge", _FakeBridge) + + overrides = _safe_overrides_for(recipe_func.__name__) + + cfg = recipe_func(**overrides) + + _assert_basic_config(cfg) + + # Ensure tokenizer choice matches override + if overrides.get("use_null_tokenizer"): + assert cfg.tokenizer.tokenizer_type == "NullTokenizer" + assert cfg.tokenizer.vocab_size is not None + else: + assert cfg.tokenizer.tokenizer_type == "HuggingFaceTokenizer" + assert cfg.tokenizer.tokenizer_model is not None + + # Parallelism and shaping + assert getattr(cfg.model, 
"tensor_model_parallel_size", 1) >= 1 + assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 diff --git a/tests/unit_tests/recipes/utils/test_nemo_run_utils.py b/tests/unit_tests/recipes/utils/test_nemo_run_utils.py index 91fcc62866..f01156e356 100644 --- a/tests/unit_tests/recipes/utils/test_nemo_run_utils.py +++ b/tests/unit_tests/recipes/utils/test_nemo_run_utils.py @@ -246,11 +246,22 @@ def test_mixed_partial_and_non_partial(self): def test_with_real_gpt_config(self): """Test with a real GPTConfig to ensure compatibility.""" - # Import actual configs for realistic testing - from megatron.bridge.recipes.llama.llama3_8b import model_config + # Import actual configs for realistic testing, but avoid HF downloads by mocking AutoBridge + from unittest import mock as _mock - # Get a real model config - model_cfg = model_config() + from megatron.bridge.recipes.llama.llama3 import llama3_8b_pretrain_config as pretrain_config + + with _mock.patch("megatron.bridge.recipes.llama.llama3.AutoBridge.from_hf_pretrained") as mock_from: + + class _DummyBridge: + def to_megatron_provider(self, load_weights: bool = False): + from megatron.bridge.models.llama.llama_provider import Llama3ModelProvider + + return Llama3ModelProvider() + + mock_from.return_value = _DummyBridge() + # Get a real model config (provider) without contacting HF + model_cfg = pretrain_config().model # Create a minimal ConfigContainer with required fields config = ConfigContainer( diff --git a/tests/unit_tests/training/test_checkpointing.py b/tests/unit_tests/training/test_checkpointing.py index 0e636821a7..02211d4e29 100644 --- a/tests/unit_tests/training/test_checkpointing.py +++ b/tests/unit_tests/training/test_checkpointing.py @@ -16,7 +16,7 @@ import os import tempfile from pathlib import Path -from unittest.mock import Mock, patch +from unittest.mock import Mock, mock_open, patch import pytest import torch @@ -28,8 +28,10 @@ _get_checkpoint_format, _get_non_persistent_iteration, 
_load_base_checkpoint, + _load_model_state_dict, checkpoint_exists, cleanup_old_non_persistent_checkpoint, + delete_extra_state, ensure_directory_exists, find_checkpoint_rank_0, get_checkpoint_name, @@ -115,6 +117,31 @@ def test_get_checkpoint_tracker_filename(self): expected = "/checkpoints/latest_checkpointed_iteration.txt" assert result == expected + @patch("torch.distributed.is_initialized") + @patch("torch.distributed.get_rank") + @patch("torch.distributed.all_reduce") + @patch("megatron.bridge.training.checkpointing.print_rank_0") + @patch("builtins.open", create=True) + def test_read_metadata_mismatch_warns( + self, mock_open, mock_print_rank_0, mock_all_reduce, mock_get_rank, mock_dist_init + ): + """When iterations differ across ranks, a warning should be printed via print_rank_0.""" + mock_dist_init.return_value = True + mock_get_rank.return_value = 0 + mock_file = mock_open.return_value.__enter__.return_value + mock_file.read.return_value = "10" + + # Mock tensor semantics: iters_cuda[0].item() -> 20 + mock_tensor_item = Mock() + mock_tensor_item.item.return_value = 20 + mock_tensor = Mock() + mock_tensor.__getitem__ = Mock(return_value=mock_tensor_item) + + with patch("torch.tensor", return_value=mock_tensor): + _ = read_metadata("/path/to/tracker") + + assert mock_print_rank_0.called + @patch("torch.distributed.is_initialized") @patch("torch.distributed.get_rank") @patch("torch.distributed.all_reduce") @@ -261,6 +288,32 @@ def test_get_rng_state(self, mock_random, mock_np, mock_torch, mock_cuda, mock_d assert rng_state["rng_tracker_states"] == "tracker_states" +class TestDeleteExtraState: + """Tests for delete_extra_state utility added for cleanup of extraneous keys.""" + + def test_delete_extra_state_with_model_section(self): + sd = {"model": {"layer.weight": 1, "te_extra_state": 2, "_extra_state.foo": 3}} + result = delete_extra_state(sd) + assert "te_extra_state" not in result["model"] + assert "_extra_state.foo" not in result["model"] + assert 
result["model"]["layer.weight"] == 1 + + def test_delete_extra_state_direct_model_state(self): + sd = {"layer.weight": 1, "something_extra_state": 2} + result = delete_extra_state(sd) + assert "something_extra_state" not in result + assert result["layer.weight"] == 1 + + def test_delete_extra_state_non_mapping_noop(self): + class NotMapping: + pass + + # Should not throw and should return the original object wrapper + sd = {"model": NotMapping()} + result = delete_extra_state(sd) + assert result is sd + + @pytest.fixture def save_checkpoint_fixtures(): """Fixture for save checkpoint tests.""" @@ -321,6 +374,7 @@ class TestSaveCheckpoint: @patch("megatron.bridge.training.checkpointing.wandb_utils") @patch("megatron.bridge.training.checkpointing.is_last_rank") + @patch("builtins.open", new_callable=mock_open) @patch("torch.save") @patch("shutil.copy") @_patch_modelopt_state_saver() @@ -361,6 +415,7 @@ def test_save_checkpoint_global( mock_save_modelopt, mock_shutil_copy, mock_torch_save, + mock_file_open, mock_is_last_rank, mock_wandb, save_checkpoint_fixtures, @@ -407,6 +462,22 @@ def test_save_checkpoint_global( mock_gen_state.assert_called_once() mock_dist_ckpt.save.assert_called_once() + # Verify that the tracker file was written with the correct iteration + tracker_calls = [ + call + for call in mock_file_open.call_args_list + if len(call[0]) > 0 and "latest_checkpointed_iteration.txt" in call[0][0] + ] + assert len(tracker_calls) > 0, "Tracker file should be written" + + # Verify the iteration was written to the file + mock_file_handle = mock_file_open() + write_calls = [call for call in mock_file_handle.write.call_args_list] + assert len(write_calls) > 0, "Should write iteration to tracker file" + # Check that the iteration (1000) was written + written_content = "".join([str(call[0][0]) for call in write_calls if len(call[0]) > 0]) + assert "1000" in written_content, f"Expected '1000' in written content, got: {written_content}" + 
@patch("megatron.bridge.training.checkpointing.print_rank_0") def test_save_checkpoint_invalid_non_persistent_type(self, mock_print_rank_0, save_checkpoint_fixtures): """Test error handling for invalid non_persistent_ckpt_type.""" @@ -969,6 +1040,43 @@ def test_load_model_weights_single_model_success( mock_get_strategy.assert_called_once_with("/test/checkpoint") mock_load_state_dict.assert_called_once_with(mock_model[0], mock_full_state_dict["model"], True) + @patch("megatron.bridge.training.checkpointing.delete_extra_state") + @patch("megatron.bridge.training.checkpointing.dist_checkpointing") + @patch("megatron.bridge.training.checkpointing.unwrap_model") + @patch("megatron.bridge.training.checkpointing._generate_model_state_dict") + @patch("megatron.bridge.training.checkpointing.get_default_load_sharded_strategy") + def test_load_model_weights_calls_delete_extra_state( + self, + mock_get_strategy, + mock_generate_state_dict, + mock_unwrap_model, + mock_dist_ckpt, + mock_delete_extra_state, + mock_model, + mock_common_state_dict, + mock_full_state_dict, + mock_metadata, + ): + """Ensure extra state cleanup is invoked on the loaded state dict.""" + mock_dist_ckpt.load_common_state_dict.return_value = mock_common_state_dict + mock_dist_ckpt.load_content_metadata.return_value = mock_metadata + mock_dist_ckpt.load.return_value = mock_full_state_dict + mock_get_strategy.return_value = Mock() + mock_generate_state_dict.return_value = {"model": {"weight": torch.randn(1)}} + mock_unwrap_model.return_value = mock_model + + from megatron.bridge.training.checkpointing import _load_model_weights_from_checkpoint + + _load_model_weights_from_checkpoint( + checkpoint_path="/ckpt", + model=mock_model, + fully_parallel_load=False, + dist_ckpt_strictness="assume_ok_unexpected", + strict=True, + ) + + mock_delete_extra_state.assert_called_once_with(mock_full_state_dict) + @patch("megatron.bridge.training.checkpointing.dist_checkpointing") 
@patch("megatron.bridge.training.checkpointing.unwrap_model") @patch("megatron.bridge.training.checkpointing._generate_model_state_dict") @@ -1160,6 +1268,33 @@ def test_return_state_dict( mock_load_state_dict.assert_not_called() +class TestLoadModelStateDictHelper: + """Tests for _load_model_state_dict strict fallback behavior and logging.""" + + @patch("megatron.bridge.training.checkpointing.print_rank_0") + def test_load_model_state_dict_strict_fallback(self, mock_print_rank_0): + module = Mock() + # First call raises, second (non-strict) call succeeds + module.load_state_dict.side_effect = [Exception("boom"), "ok"] + + _load_model_state_dict(module, {"w": 1}, strict=True) + + # Should have been called twice: strict=True then strict=False + assert module.load_state_dict.call_count == 2 + first_args, first_kwargs = module.load_state_dict.call_args_list[0] + second_args, second_kwargs = module.load_state_dict.call_args_list[1] + assert first_kwargs.get("strict") is True + assert second_kwargs.get("strict") is False + assert mock_print_rank_0.called + + def test_load_model_state_dict_non_strict_raises(self): + module = Mock() + module.load_state_dict.side_effect = Exception("fail") + + with pytest.raises(Exception): + _load_model_state_dict(module, {"w": 1}, strict=False) + + class TestMegatronLMCompatibility: """Test Megatron-LM checkpoint compatibility features.""" diff --git a/tests/unit_tests/training/test_config.py b/tests/unit_tests/training/test_config.py index ea38abddc4..3af6e44cdb 100644 --- a/tests/unit_tests/training/test_config.py +++ b/tests/unit_tests/training/test_config.py @@ -659,6 +659,34 @@ def test_profiling_config_instantiation_validation( finally: restore_get_world_size_safe(og_ws, cfg_mod) + @pytest.mark.parametrize( + "profile_step_start, profile_step_end, expect_assertion_error, expected_error_match", + [ + (10, 20, False, None), # Valid: end > start + (10, 10, False, None), # Valid: end == start (single step) + (0, 5, False, None), # 
Valid: start at 0 + (20, 10, True, "profile_step_end .* must be >= profile_step_start"), # Invalid: end < start + (-1, 10, True, "profile_step_start must be >= 0"), # Invalid: start < 0 + (10, -1, True, "profile_step_end must be >= 0"), # Invalid: end < 0 + (-5, -1, True, "profile_step_start must be >= 0"), # Invalid: both < 0 + ], + ) + def test_profiling_config_step_range_validation( + self, profile_step_start, profile_step_end, expect_assertion_error, expected_error_match + ): + """Test ProfilingConfig validation for profile step ranges.""" + prof_cfg = create_test_profiling_config( + use_pytorch_profiler=True, + profile_step_start=profile_step_start, + profile_step_end=profile_step_end, + ) + + if expect_assertion_error: + with pytest.raises(AssertionError, match=expected_error_match): + prof_cfg.finalize() + else: + prof_cfg.finalize() # Should pass without error + def test_packed_sequence_micro_batch_size_validation_error(self, monkeypatch): """Test validation error when micro_batch_size > 1 with packed sequences.""" from megatron.bridge.data.datasets.packed_sequence import PackedSequenceSpecs diff --git a/tests/unit_tests/training/test_functor_support.py b/tests/unit_tests/training/test_functor_support.py new file mode 100644 index 0000000000..80cbb4d3fe --- /dev/null +++ b/tests/unit_tests/training/test_functor_support.py @@ -0,0 +1,453 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for functor support in forward step functions.""" + +import inspect +from functools import partial +from typing import Iterable, Optional +from unittest.mock import MagicMock, Mock, patch + +import torch +from megatron.core.models.gpt import GPTModel + +from megatron.bridge.training.pretrain import pretrain +from megatron.bridge.training.state import GlobalState +from megatron.bridge.training.utils.train_utils import ( + maybe_inject_state, + needs_global_state_injection, +) +from tests.unit_tests.training.test_config import ( + create_test_checkpoint_config, + create_test_config_container, + create_test_gpt_config, + create_test_training_config, + restore_get_world_size_safe, +) + + +class TwoArgForwardFunctor: + """Functor with 2 arguments: (data_iterator, model).""" + + def __init__(self): + self.call_count = 0 + self.last_args = None + self.last_kwargs = None + + def __call__(self, data_iterator: Iterable, model: GPTModel) -> tuple[torch.Tensor, partial]: + self.call_count += 1 + self.last_args = (data_iterator, model) + self.last_kwargs = {} + # Return mock tensor and loss function + return torch.tensor([1.0]), partial(lambda x: x) + + +class ThreeArgForwardFunctor: + """Functor with 3 arguments: (data_iterator, model, return_schedule_plan).""" + + def __init__(self): + self.call_count = 0 + self.last_args = None + self.last_kwargs = None + + def __call__( + self, data_iterator: Iterable, model: GPTModel, return_schedule_plan: bool = False + ) -> tuple[torch.Tensor, partial]: + self.call_count += 1 + self.last_args = (data_iterator, model, return_schedule_plan) + self.last_kwargs = {} + # Return mock tensor and loss function + return torch.tensor([1.0]), partial(lambda x: x) + + +class FourArgForwardFunctor: + """Functor with 4 arguments: (state, data_iterator, model, return_schedule_plan).""" + + def __init__(self): + self.call_count = 0 + self.last_args = None + self.last_kwargs = None + + def __call__( + self, + state: GlobalState, + 
data_iterator: Iterable, + model: GPTModel, + return_schedule_plan: bool = False, + ) -> tuple[torch.Tensor, partial]: + self.call_count += 1 + self.last_args = (state, data_iterator, model, return_schedule_plan) + self.last_kwargs = {} + # Return mock tensor and loss function + return torch.tensor([1.0]), partial(lambda x: x) + + +class StatefulForwardFunctor: + """Functor that maintains state across calls.""" + + def __init__(self, initial_loss: float = 1.0): + self.initial_loss = initial_loss + self.call_count = 0 + self.loss_history = [] + self.state_received = None + + def __call__( + self, + state: GlobalState, + data_iterator: Iterable, + model: GPTModel, + return_schedule_plan: bool = False, + ) -> tuple[torch.Tensor, partial]: + self.call_count += 1 + self.state_received = state + + # Simulate decreasing loss over time + current_loss = self.initial_loss * (0.9**self.call_count) + self.loss_history.append(current_loss) + + loss_tensor = torch.tensor([current_loss]) + loss_function = partial(lambda x: loss_tensor) + + return loss_tensor, loss_function + + def get_average_loss(self) -> Optional[float]: + """Return average loss across all calls.""" + if not self.loss_history: + return None + return sum(self.loss_history) / len(self.loss_history) + + +class TestFunctorStateInjectionDetection: + """Test that functors are correctly inspected for state injection needs.""" + + def test_two_arg_functor_inspection(self): + """Test that 2-arg functor doesn't need state injection.""" + functor = TwoArgForwardFunctor() + needs_injection = needs_global_state_injection(functor) + assert needs_injection is False # No state parameter + + def test_three_arg_functor_inspection(self): + """Test that 3-arg functor without state doesn't need injection.""" + functor = ThreeArgForwardFunctor() + needs_injection = needs_global_state_injection(functor) + assert needs_injection is False # No state parameter + + def test_four_arg_functor_inspection(self): + """Test that 4-arg functor 
with state needs injection.""" + functor = FourArgForwardFunctor() + needs_injection = needs_global_state_injection(functor) + assert needs_injection is True # Has 'state' parameter name + + def test_functor_signature_inspection_works(self): + """Test that inspect.signature works correctly on functors.""" + functor = FourArgForwardFunctor() + signature = inspect.signature(functor) + params = list(signature.parameters.keys()) + assert params == ["state", "data_iterator", "model", "return_schedule_plan"] + + +class TestFunctorStateInjection: + """Test that state injection works correctly with functors.""" + + def test_four_arg_functor_gets_state_injected(self): + """Test that 4-arg functor gets state injected via partial.""" + functor = FourArgForwardFunctor() + mock_state = Mock(spec=GlobalState) + + wrapped_functor = maybe_inject_state(functor, mock_state) + + # Should return a partial function + assert isinstance(wrapped_functor, partial) + assert wrapped_functor.func is functor + assert wrapped_functor.args == (mock_state,) + + def test_three_arg_functor_no_state_injection(self): + """Test that 3-arg functor doesn't get state injected.""" + functor = ThreeArgForwardFunctor() + mock_state = Mock(spec=GlobalState) + + wrapped_functor = maybe_inject_state(functor, mock_state) + + # Should return the original functor unchanged + assert wrapped_functor is functor + + def test_two_arg_functor_no_state_injection(self): + """Test that 2-arg functor doesn't get state injected.""" + functor = TwoArgForwardFunctor() + mock_state = Mock(spec=GlobalState) + + wrapped_functor = maybe_inject_state(functor, mock_state) + + # Should return the original functor unchanged + assert wrapped_functor is functor + + +class TestFunctorWithPretrain: + """Integration tests for functors with the pretrain function.""" + + @patch("megatron.bridge.training.pretrain.setup") + @patch("megatron.bridge.training.pretrain.get_dataset_provider") + 
@patch("megatron.bridge.training.pretrain.runtime_config_update") + @patch("megatron.bridge.training.pretrain.train") + def test_pretrain_with_four_arg_functor( + self, mock_train, mock_runtime_update, mock_get_dataset_provider, mock_setup + ): + """Test pretrain works with a 4-arg functor.""" + gpt_model_cfg = create_test_gpt_config() + checkpoint_cfg = create_test_checkpoint_config(save=None) + train_cfg = create_test_training_config(train_iters=100, skip_train=False) + + container, og_ws, cfg_mod = create_test_config_container( + world_size_override=1, + model_config=gpt_model_cfg, + checkpoint_config=checkpoint_cfg, + train_config=train_cfg, + ) + + functor = FourArgForwardFunctor() + + # Mock setup return + setup_output = MagicMock() + setup_output.state = MagicMock() + setup_output.state.cfg = container + setup_output.state.train_state.do_train = True + setup_output.state.train_state.step = 0 + setup_output.state.train_state.do_valid = False + setup_output.state.train_state.do_test = False + + # Mock fault tolerance state to avoid comparison issues + setup_output.state.fault_tolerance_state.seen_tr_iters_cnt = 0 + setup_output.state.fault_tolerance_state.is_calculating_timeouts = False + setup_output.state.fault_tolerance_state.is_persistent_chkpt_loaded = False + setup_output.state.rank_monitor_client = None + + setup_output.model = MagicMock() + setup_output.optimizer = MagicMock() + setup_output.scheduler = MagicMock() + setup_output.train_data_iterator = MagicMock() + setup_output.valid_data_iterator = None + setup_output.test_data_iterator = None + setup_output.checkpointing_context = {} + mock_setup.return_value = setup_output + + try: + pretrain(container, functor) + finally: + restore_get_world_size_safe(og_ws, cfg_mod) + + # Verify the functor was passed to train + mock_train.assert_called_once() + assert mock_train.call_args[0][0] is functor + + @patch("megatron.bridge.training.pretrain.setup") + 
@patch("megatron.bridge.training.pretrain.get_dataset_provider") + @patch("megatron.bridge.training.pretrain.runtime_config_update") + @patch("megatron.bridge.training.pretrain.train") + def test_pretrain_with_stateful_functor( + self, mock_train, mock_runtime_update, mock_get_dataset_provider, mock_setup + ): + """Test pretrain works with a stateful functor that tracks calls.""" + gpt_model_cfg = create_test_gpt_config() + checkpoint_cfg = create_test_checkpoint_config(save=None) + train_cfg = create_test_training_config(train_iters=100, skip_train=False) + + container, og_ws, cfg_mod = create_test_config_container( + world_size_override=1, + model_config=gpt_model_cfg, + checkpoint_config=checkpoint_cfg, + train_config=train_cfg, + ) + + functor = StatefulForwardFunctor(initial_loss=2.0) + assert functor.call_count == 0 + assert functor.loss_history == [] + + # Mock setup return + setup_output = MagicMock() + setup_output.state = MagicMock() + setup_output.state.cfg = container + setup_output.state.train_state.do_train = True + setup_output.state.train_state.step = 0 + setup_output.state.train_state.do_valid = False + setup_output.state.train_state.do_test = False + + # Mock fault tolerance state to avoid comparison issues + setup_output.state.fault_tolerance_state.seen_tr_iters_cnt = 0 + setup_output.state.fault_tolerance_state.is_calculating_timeouts = False + setup_output.state.fault_tolerance_state.is_persistent_chkpt_loaded = False + setup_output.state.rank_monitor_client = None + + setup_output.model = MagicMock() + setup_output.optimizer = MagicMock() + setup_output.scheduler = MagicMock() + setup_output.train_data_iterator = MagicMock() + setup_output.valid_data_iterator = None + setup_output.test_data_iterator = None + setup_output.checkpointing_context = {} + mock_setup.return_value = setup_output + + try: + pretrain(container, functor) + finally: + restore_get_world_size_safe(og_ws, cfg_mod) + + # Verify the functor was passed to train and maintains 
its identity + mock_train.assert_called_once() + assert mock_train.call_args[0][0] is functor + # Functor state should be preserved + assert functor.initial_loss == 2.0 + + +class TestFunctorStateDetectionEdgeCases: + """Test edge cases in functor state detection.""" + + def test_functor_with_typed_state_parameter(self): + """Test that functors with GlobalState type hints are detected correctly.""" + + class TypedStateFunctor: + def __call__(self, state: GlobalState, data_iterator, model): + return "typed state" + + functor = TypedStateFunctor() + needs_injection = needs_global_state_injection(functor) + assert needs_injection is True # Has GlobalState type hint + + def test_functor_with_mixed_parameters(self): + """Test functor with mixed typed and untyped parameters.""" + + class MixedFunctor: + def __call__(self, data_iterator, state: GlobalState, model): + return "mixed" + + functor = MixedFunctor() + needs_injection = needs_global_state_injection(functor) + assert needs_injection is True # Has GlobalState type hint (not first param) + + +class TestFunctorVsFunctionEquivalence: + """Test that functors behave equivalently to regular functions.""" + + def test_functor_vs_function_state_injection(self): + """Test that functors and functions get the same state injection treatment.""" + + def four_arg_function(state, data_iterator, model, return_schedule_plan=False): + return torch.tensor([1.0]), partial(lambda x: x) + + functor = FourArgForwardFunctor() + mock_state = Mock(spec=GlobalState) + + wrapped_function = maybe_inject_state(four_arg_function, mock_state) + wrapped_functor = maybe_inject_state(functor, mock_state) + + # Both should be wrapped with partial + assert isinstance(wrapped_function, partial) + assert isinstance(wrapped_functor, partial) + + # Both should have the same state injected + assert wrapped_function.args == (mock_state,) + assert wrapped_functor.args == (mock_state,) + + def test_functor_vs_function_state_detection(self): + """Test that 
functors and functions are inspected the same way for state injection.""" + + def three_arg_function(data_iterator, model, return_schedule_plan=False): + return torch.tensor([1.0]), partial(lambda x: x) + + functor = ThreeArgForwardFunctor() + + func_needs_injection = needs_global_state_injection(three_arg_function) + functor_needs_injection = needs_global_state_injection(functor) + + assert func_needs_injection == functor_needs_injection == False # Neither has state + + +class TestComplexFunctorScenarios: + """Test complex scenarios with functors.""" + + def test_functor_with_inheritance(self): + """Test that functors work correctly with inheritance.""" + + class BaseFunctor: + def __init__(self): + self.base_calls = 0 + + def __call__(self, state, data_iterator, model, return_schedule_plan=False): + self.base_calls += 1 + return self._forward(state, data_iterator, model, return_schedule_plan) + + def _forward(self, state, data_iterator, model, return_schedule_plan): + return torch.tensor([1.0]), partial(lambda x: x) + + class DerivedFunctor(BaseFunctor): + def __init__(self): + super().__init__() + self.derived_calls = 0 + + def _forward(self, state, data_iterator, model, return_schedule_plan): + self.derived_calls += 1 + # Override with different behavior + return torch.tensor([0.5]), partial(lambda x: x * 0.5) + + functor = DerivedFunctor() + needs_injection = needs_global_state_injection(functor) + assert needs_injection is True + + # Test that inheritance works + mock_state = Mock() + mock_iterator = Mock() + mock_model = Mock() + + result = functor(mock_state, mock_iterator, mock_model) + assert functor.base_calls == 1 + assert functor.derived_calls == 1 + assert result[0].item() == 0.5 + + def test_functor_with_decorator(self): + """Test that functors work with decorators.""" + + import functools + + def call_counter(cls): + """Decorator that adds call counting to a functor while preserving signature.""" + original_call = cls.__call__ + + 
@functools.wraps(original_call) + def wrapped_call(self, *args, **kwargs): + if not hasattr(self, "_decorator_calls"): + self._decorator_calls = 0 + self._decorator_calls += 1 + return original_call(self, *args, **kwargs) + + cls.__call__ = wrapped_call + return cls + + @call_counter + class DecoratedFunctor: + def __call__(self, state, data_iterator, model, return_schedule_plan=False): + return torch.tensor([1.0]), partial(lambda x: x) + + functor = DecoratedFunctor() + needs_injection = needs_global_state_injection(functor) + assert needs_injection is True + + # Test that decorator works + mock_state = Mock() + mock_iterator = Mock() + mock_model = Mock() + + functor(mock_state, mock_iterator, mock_model) + assert functor._decorator_calls == 1 + + functor(mock_state, mock_iterator, mock_model) + assert functor._decorator_calls == 2 diff --git a/tests/unit_tests/training/test_model_load_save.py b/tests/unit_tests/training/test_model_load_save.py index bd6f85de45..bf8984d5f3 100644 --- a/tests/unit_tests/training/test_model_load_save.py +++ b/tests/unit_tests/training/test_model_load_save.py @@ -373,6 +373,75 @@ def test_load_megatron_model_skip_temp_dist_context( assert result == mock_model mock_temp_dist.assert_not_called() + @patch("megatron.bridge.training.model_load_save.build_and_load_model") + @patch("megatron.bridge.training.model_load_save.load_model_config") + def test_load_megatron_model_resets_defaults(self, mock_load_model_config, mock_build_and_load): + """Verify single-GPU default resets are applied before building the model.""" + # Prepare a config object with non-default values that should be reset + cfg = Mock() + cfg.tensor_model_parallel_size = 8 + cfg.pipeline_model_parallel_size = 4 + cfg.context_parallel_size = 2 + cfg.expert_model_parallel_size = 2 + cfg.expert_tensor_parallel_size = 2 + cfg.moe_extended_tp = True + cfg.sequence_parallel = True + cfg.virtual_pipeline_model_parallel_size = 2 + cfg.hierarchical_context_parallel_sizes = [2, 2] 
+ + mock_load_model_config.return_value = (cfg, None) + sentinel = object() + mock_build_and_load.return_value = sentinel + + result = load_megatron_model("/ckpt", model_type=None, return_state_dict=False, use_cpu_init=True) + + # Ensure build_and_load_model was called and returned + assert result is sentinel + + # After resets (no overrides), the following should hold + assert cfg.tensor_model_parallel_size == 1 + assert cfg.pipeline_model_parallel_size == 1 + assert cfg.context_parallel_size == 1 + assert cfg.expert_model_parallel_size == 1 + assert cfg.expert_tensor_parallel_size == 1 + assert cfg.moe_extended_tp is False + assert cfg.sequence_parallel is False + assert cfg.virtual_pipeline_model_parallel_size is None + assert cfg.hierarchical_context_parallel_sizes is None + + @patch("megatron.bridge.training.model_load_save.build_and_load_model") + @patch("megatron.bridge.training.model_load_save.load_model_config") + def test_load_megatron_model_applies_overrides(self, mock_load_model_config, mock_build_and_load): + """Verify mp_overrides entries are applied to the config.""" + cfg = Mock() + # Start with defaults to make verification straightforward + cfg.tensor_model_parallel_size = 1 + cfg.pipeline_model_parallel_size = 1 + cfg.context_parallel_size = 1 + cfg.expert_model_parallel_size = 1 + cfg.expert_tensor_parallel_size = 1 + cfg.moe_extended_tp = False + cfg.sequence_parallel = False + cfg.virtual_pipeline_model_parallel_size = None + cfg.hierarchical_context_parallel_sizes = None + + mock_load_model_config.return_value = (cfg, None) + mock_build_and_load.return_value = Mock() + + overrides = { + "tensor_model_parallel_size": 2, + "pipeline_model_parallel_size": 3, + "sequence_parallel": True, + "virtual_pipeline_model_parallel_size": 4, + } + + _ = load_megatron_model("/ckpt", mp_overrides=overrides) + + assert cfg.tensor_model_parallel_size == 2 + assert cfg.pipeline_model_parallel_size == 3 + assert cfg.sequence_parallel is True + assert 
cfg.virtual_pipeline_model_parallel_size == 4 + class TestSaveMegatronModel: """Test save_megatron_model function.""" @@ -461,7 +530,7 @@ def provide(self, pre_process=None, post_process=None, vp_stage=None): # Test with tokenizer path with tempfile.TemporaryDirectory() as temp_dir: save_megatron_model( - [mock_model], temp_dir, ckpt_format="torch_dist", hf_tokenizer_path="meta-llama/Llama-3-8B" + [mock_model], temp_dir, ckpt_format="torch_dist", hf_tokenizer_path="meta-llama/Meta-Llama-3-8B" ) # Assertions @@ -474,7 +543,7 @@ def provide(self, pre_process=None, post_process=None, vp_stage=None): assert "tokenizer" in call_kwargs tokenizer_config = call_kwargs["tokenizer"] assert tokenizer_config.tokenizer_type == "HuggingFaceTokenizer" - assert tokenizer_config.tokenizer_model == "meta-llama/Llama-3-8B" + assert tokenizer_config.tokenizer_model == "meta-llama/Meta-Llama-3-8B" assert tokenizer_config.vocab_size is None mock_save_checkpoint.assert_called_once_with( diff --git a/tests/unit_tests/training/test_profiling.py b/tests/unit_tests/training/test_profiling.py new file mode 100644 index 0000000000..47f217296c --- /dev/null +++ b/tests/unit_tests/training/test_profiling.py @@ -0,0 +1,532 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Unit tests for profiling utility functions.""" + +from unittest.mock import MagicMock, Mock, patch + +from megatron.bridge.training.config import ProfilingConfig +from megatron.bridge.training.profiling import ( + handle_profiling_step, + handle_profiling_stop, + initialize_pytorch_profiler, + should_profile_rank, + start_nsys_profiler, + stop_nsys_profiler, +) + + +class TestShouldProfileRank: + """Tests for should_profile_rank function.""" + + def test_should_profile_rank_with_no_config(self): + """Test that profiling is disabled when config is None.""" + assert should_profile_rank(None, 0) is False + assert should_profile_rank(None, 1) is False + + def test_should_profile_rank_with_matching_rank(self): + """Test that profiling is enabled for ranks in profile_ranks.""" + config = ProfilingConfig(use_pytorch_profiler=True, profile_ranks=[0, 2]) + assert should_profile_rank(config, 0) is True + assert should_profile_rank(config, 2) is True + + def test_should_profile_rank_with_non_matching_rank(self): + """Test that profiling is disabled for ranks not in profile_ranks.""" + config = ProfilingConfig(use_pytorch_profiler=True, profile_ranks=[0, 2]) + assert should_profile_rank(config, 1) is False + assert should_profile_rank(config, 3) is False + + def test_should_profile_rank_empty_list(self): + """Test that profiling is disabled when profile_ranks is empty.""" + config = ProfilingConfig(use_pytorch_profiler=True, profile_ranks=[]) + assert should_profile_rank(config, 0) is False + + +class TestInitializePytorchProfiler: + """Tests for initialize_pytorch_profiler function.""" + + @patch("torch.profiler.profile") + @patch("torch.profiler.tensorboard_trace_handler") + @patch("torch.profiler.schedule") + def test_initialize_pytorch_profiler_basic(self, mock_schedule, mock_handler, mock_profile): + """Test PyTorch profiler initialization with basic parameters.""" + config = ProfilingConfig( + use_pytorch_profiler=True, + profile_step_start=5, + 
profile_step_end=10, + record_shapes=False, + ) + + mock_schedule_instance = Mock() + mock_schedule.return_value = mock_schedule_instance + mock_handler_instance = Mock() + mock_handler.return_value = mock_handler_instance + mock_profiler = Mock() + mock_profile.return_value = mock_profiler + + prof = initialize_pytorch_profiler(config, "/tmp/tensorboard") + + # Verify schedule was created with correct parameters + mock_schedule.assert_called_once_with( + wait=4, # max(5-1, 0) + warmup=1, # 1 if start > 0 + active=5, # end - start + repeat=1, + ) + + # Verify handler was called with correct directory + mock_handler.assert_called_once_with("/tmp/tensorboard") + + # Verify profiler was created with correct kwargs + mock_profile.assert_called_once() + call_kwargs = mock_profile.call_args.kwargs + assert call_kwargs["schedule"] == mock_schedule_instance + assert call_kwargs["on_trace_ready"] == mock_handler_instance + assert call_kwargs["record_shapes"] is False + assert call_kwargs["with_stack"] is True + + # Verify returned profiler + assert prof == mock_profiler + + @patch("torch.profiler.profile") + @patch("torch.profiler.tensorboard_trace_handler") + @patch("torch.profiler.schedule") + def test_initialize_pytorch_profiler_with_shapes(self, mock_schedule, mock_handler, mock_profile): + """Test profiler initialization with shape recording enabled.""" + config = ProfilingConfig( + use_pytorch_profiler=True, + profile_step_start=3, + profile_step_end=8, + record_shapes=True, + ) + + initialize_pytorch_profiler(config, "/tmp/tb") + + # Verify record_shapes is True + call_kwargs = mock_profile.call_args.kwargs + assert call_kwargs["record_shapes"] is True + + @patch("torch.profiler.profile") + @patch("torch.profiler.tensorboard_trace_handler") + @patch("torch.profiler.schedule") + def test_initialize_pytorch_profiler_start_at_zero(self, mock_schedule, mock_handler, mock_profile): + """Test profiler initialization when starting at iteration 0.""" + config = 
ProfilingConfig( + use_pytorch_profiler=True, + profile_step_start=0, + profile_step_end=3, + ) + + initialize_pytorch_profiler(config, "/tmp/tb") + + # When start=0, wait should be 0 and warmup should be 0 + mock_schedule.assert_called_once_with( + wait=0, # max(0-1, 0) = 0 + warmup=0, # 0 if start == 0 + active=3, + repeat=1, + ) + + @patch("torch.profiler.profile") + @patch("torch.profiler.tensorboard_trace_handler") + @patch("torch.profiler.schedule") + def test_initialize_pytorch_profiler_start_at_one(self, mock_schedule, mock_handler, mock_profile): + """Test profiler initialization when starting at iteration 1.""" + config = ProfilingConfig( + use_pytorch_profiler=True, + profile_step_start=1, + profile_step_end=4, + ) + + initialize_pytorch_profiler(config, "/tmp/tb") + + # When start=1, wait should be 0, warmup should be 1 + mock_schedule.assert_called_once_with( + wait=0, # max(1-1, 0) = 0 + warmup=1, # 1 if start > 0 + active=3, + repeat=1, + ) + + +class TestStartNsysProfiler: + """Tests for start_nsys_profiler function.""" + + @patch("torch.cuda.cudart") + @patch("torch.autograd.profiler.emit_nvtx") + @patch("torch.cuda.check_error") + def test_start_nsys_profiler_without_shapes(self, mock_check_error, mock_nvtx, mock_cudart): + """Test nsys profiler start without shape recording.""" + mock_cudart_instance = Mock() + mock_cudart_instance.cudaProfilerStart.return_value = (0,) + mock_cudart.return_value = mock_cudart_instance + + mock_nvtx_context = MagicMock() + mock_nvtx.return_value = mock_nvtx_context + + config = ProfilingConfig( + use_nsys_profiler=True, + record_shapes=False, + ) + + result = start_nsys_profiler(config) + + # Verify CUDA profiler was started + mock_cudart_instance.cudaProfilerStart.assert_called_once() + mock_check_error.assert_called_once_with((0,)) + + # Verify NVTX was called without record_shapes + mock_nvtx.assert_called_once_with() + mock_nvtx_context.__enter__.assert_called_once() + + # Verify context is returned + assert 
result == mock_nvtx_context + + @patch("torch.cuda.cudart") + @patch("torch.autograd.profiler.emit_nvtx") + @patch("torch.cuda.check_error") + def test_start_nsys_profiler_with_shapes(self, mock_check_error, mock_nvtx, mock_cudart): + """Test nsys profiler start with shape recording.""" + mock_cudart_instance = Mock() + mock_cudart_instance.cudaProfilerStart.return_value = (0,) + mock_cudart.return_value = mock_cudart_instance + + mock_nvtx_context = MagicMock() + mock_nvtx.return_value = mock_nvtx_context + + config = ProfilingConfig( + use_nsys_profiler=True, + record_shapes=True, + ) + + result = start_nsys_profiler(config) + + # Verify NVTX was called WITH record_shapes + mock_nvtx.assert_called_once_with(record_shapes=True) + mock_nvtx_context.__enter__.assert_called_once() + + # Verify context is returned + assert result == mock_nvtx_context + + +class TestStopNsysProfiler: + """Tests for stop_nsys_profiler function.""" + + @patch("torch.cuda.cudart") + @patch("torch.cuda.check_error") + def test_stop_nsys_profiler(self, mock_check_error, mock_cudart): + """Test nsys profiler stop.""" + mock_cudart_instance = Mock() + mock_cudart_instance.cudaProfilerStop.return_value = (0,) + mock_cudart.return_value = mock_cudart_instance + + mock_nvtx_context = MagicMock() + + stop_nsys_profiler(mock_nvtx_context) + + # Verify CUDA profiler was stopped + mock_cudart_instance.cudaProfilerStop.assert_called_once() + mock_check_error.assert_called_once_with((0,)) + + # Verify NVTX context was exited + mock_nvtx_context.__exit__.assert_called_once_with(None, None, None) + + @patch("torch.cuda.cudart") + @patch("torch.cuda.check_error") + def test_stop_nsys_profiler_with_none_context(self, mock_check_error, mock_cudart): + """Test nsys profiler stop handles None context gracefully.""" + mock_cudart_instance = Mock() + mock_cudart_instance.cudaProfilerStop.return_value = (0,) + mock_cudart.return_value = mock_cudart_instance + + # Should not raise exception + 
stop_nsys_profiler(None) + + # Verify CUDA profiler was still stopped + mock_cudart_instance.cudaProfilerStop.assert_called_once() + + +class TestHandleProfilingStep: + """Tests for handle_profiling_step function.""" + + def test_handle_profiling_step_with_no_config(self): + """Test that profiling step does nothing when config is None.""" + mock_prof = Mock() + + handle_profiling_step(None, iteration=5, rank=0, pytorch_prof=mock_prof) + + # Profiler should not be called + mock_prof.step.assert_not_called() + + def test_handle_profiling_step_skips_non_profiled_rank(self): + """Test that profiling step is skipped for non-profiled ranks.""" + config = ProfilingConfig( + use_pytorch_profiler=True, + profile_ranks=[0], + ) + mock_prof = Mock() + + # Rank 1 should not profile + handle_profiling_step(config, iteration=5, rank=1, pytorch_prof=mock_prof) + + # PyTorch profiler step should NOT be called + mock_prof.step.assert_not_called() + + def test_handle_profiling_step_pytorch_profiler(self): + """Test profiling step calls PyTorch profiler.step().""" + config = ProfilingConfig( + use_pytorch_profiler=True, + profile_ranks=[0], + ) + mock_prof = Mock() + + handle_profiling_step(config, iteration=5, rank=0, pytorch_prof=mock_prof) + + # PyTorch profiler step should be called + mock_prof.step.assert_called_once() + + def test_handle_profiling_step_pytorch_profiler_none(self): + """Test profiling step handles None profiler gracefully.""" + config = ProfilingConfig( + use_pytorch_profiler=True, + profile_ranks=[0], + ) + + # Should not raise exception + handle_profiling_step(config, iteration=5, rank=0, pytorch_prof=None) + + @patch("megatron.bridge.training.profiling.start_nsys_profiler") + def test_handle_profiling_step_nsys_before_start(self, mock_start_nsys): + """Test nsys profiler does not start before profile_step_start.""" + config = ProfilingConfig( + use_nsys_profiler=True, + profile_step_start=10, + profile_step_end=15, + profile_ranks=[0], + ) + + # Before start 
iteration - should not start + handle_profiling_step(config, iteration=9, rank=0, pytorch_prof=None) + mock_start_nsys.assert_not_called() + + @patch("megatron.bridge.training.profiling.start_nsys_profiler") + def test_handle_profiling_step_nsys_at_start_iteration(self, mock_start_nsys): + """Test nsys profiler starts at profile_step_start.""" + mock_nvtx_context = Mock() + mock_start_nsys.return_value = mock_nvtx_context + + config = ProfilingConfig( + use_nsys_profiler=True, + profile_step_start=10, + profile_step_end=15, + profile_ranks=[0], + ) + + # At start iteration - should start and return context + result = handle_profiling_step(config, iteration=10, rank=0, pytorch_prof=None) + mock_start_nsys.assert_called_once_with(config) + assert result == mock_nvtx_context + + @patch("megatron.bridge.training.profiling.start_nsys_profiler") + def test_handle_profiling_step_nsys_after_start(self, mock_start_nsys): + """Test nsys profiler does not restart after profile_step_start.""" + config = ProfilingConfig( + use_nsys_profiler=True, + profile_step_start=10, + profile_step_end=15, + profile_ranks=[0], + ) + + # After start iteration - should not start again + handle_profiling_step(config, iteration=11, rank=0, pytorch_prof=None) + mock_start_nsys.assert_not_called() + + @patch("megatron.bridge.training.profiling.start_nsys_profiler") + def test_handle_profiling_step_nsys_rank_filtering(self, mock_start_nsys): + """Test nsys profiler respects rank filtering.""" + config = ProfilingConfig( + use_nsys_profiler=True, + profile_step_start=10, + profile_step_end=15, + profile_ranks=[0, 2], + ) + + # Rank 1 should not start profiler + handle_profiling_step(config, iteration=10, rank=1, pytorch_prof=None) + mock_start_nsys.assert_not_called() + + # Rank 0 should start profiler + handle_profiling_step(config, iteration=10, rank=0, pytorch_prof=None) + mock_start_nsys.assert_called_once_with(config) + + +class TestHandleProfilingStop: + """Tests for handle_profiling_stop 
function.""" + + def test_handle_profiling_stop_with_no_config(self): + """Test that profiling stop does nothing when config is None.""" + mock_prof = Mock() + + handle_profiling_stop(None, iteration=10, rank=0, pytorch_prof=mock_prof) + + # Profiler should not be stopped + mock_prof.stop.assert_not_called() + + def test_handle_profiling_stop_skips_non_profiled_rank(self): + """Test that profiling stop is skipped for non-profiled ranks.""" + config = ProfilingConfig( + use_pytorch_profiler=True, + profile_step_end=10, + profile_ranks=[0], + ) + mock_prof = Mock() + + # Rank 1 should not stop profiler + handle_profiling_stop(config, iteration=10, rank=1, pytorch_prof=mock_prof) + + # PyTorch profiler stop should NOT be called + mock_prof.stop.assert_not_called() + + def test_handle_profiling_stop_skips_wrong_iteration(self): + """Test that profiling stop is skipped for iterations other than profile_step_end.""" + config = ProfilingConfig( + use_pytorch_profiler=True, + profile_step_end=10, + profile_ranks=[0], + ) + mock_prof = Mock() + + # Wrong iteration - should not stop + handle_profiling_stop(config, iteration=9, rank=0, pytorch_prof=mock_prof) + mock_prof.stop.assert_not_called() + + # Also test after end iteration + handle_profiling_stop(config, iteration=11, rank=0, pytorch_prof=mock_prof) + mock_prof.stop.assert_not_called() + + def test_handle_profiling_stop_pytorch_profiler(self): + """Test profiling stop calls PyTorch profiler.stop().""" + config = ProfilingConfig( + use_pytorch_profiler=True, + profile_step_end=10, + profile_ranks=[0], + ) + mock_prof = Mock() + + handle_profiling_stop(config, iteration=10, rank=0, pytorch_prof=mock_prof) + + # PyTorch profiler stop should be called + mock_prof.stop.assert_called_once() + + def test_handle_profiling_stop_pytorch_profiler_none(self): + """Test profiling stop handles None profiler gracefully.""" + config = ProfilingConfig( + use_pytorch_profiler=True, + profile_step_end=10, + profile_ranks=[0], + ) + + # 
Should not raise exception + handle_profiling_stop(config, iteration=10, rank=0, pytorch_prof=None) + + @patch("megatron.bridge.training.profiling.stop_nsys_profiler") + def test_handle_profiling_stop_nsys_at_end_iteration(self, mock_stop_nsys): + """Test nsys profiler stops at profile_step_end.""" + mock_nvtx_context = Mock() + + config = ProfilingConfig( + use_nsys_profiler=True, + profile_step_end=10, + profile_ranks=[0], + ) + + handle_profiling_stop(config, iteration=10, rank=0, pytorch_prof=None, nsys_nvtx_context=mock_nvtx_context) + + # Nsys stop should be called with the context + mock_stop_nsys.assert_called_once_with(mock_nvtx_context) + + @patch("megatron.bridge.training.profiling.stop_nsys_profiler") + def test_handle_profiling_stop_nsys_wrong_iteration(self, mock_stop_nsys): + """Test nsys profiler does not stop at wrong iteration.""" + config = ProfilingConfig( + use_nsys_profiler=True, + profile_step_end=10, + profile_ranks=[0], + ) + + # Wrong iteration - should not stop + handle_profiling_stop(config, iteration=9, rank=0, pytorch_prof=None) + mock_stop_nsys.assert_not_called() + + @patch("megatron.bridge.training.profiling.stop_nsys_profiler") + def test_handle_profiling_stop_nsys_rank_filtering(self, mock_stop_nsys): + """Test nsys profiler stop respects rank filtering.""" + mock_nvtx_context = Mock() + + config = ProfilingConfig( + use_nsys_profiler=True, + profile_step_end=10, + profile_ranks=[0, 2], + ) + + # Rank 1 should not stop profiler + handle_profiling_stop(config, iteration=10, rank=1, pytorch_prof=None, nsys_nvtx_context=mock_nvtx_context) + mock_stop_nsys.assert_not_called() + + # Rank 0 should stop profiler + handle_profiling_stop(config, iteration=10, rank=0, pytorch_prof=None, nsys_nvtx_context=mock_nvtx_context) + mock_stop_nsys.assert_called_once_with(mock_nvtx_context) + + +class TestProfilingEdgeCases: + """Tests for edge cases and combinations.""" + + def test_handle_profiling_step_both_profilers_disabled(self): + """Test 
that nothing happens when both profilers are disabled.""" + config = ProfilingConfig( + use_pytorch_profiler=False, + use_nsys_profiler=False, + profile_ranks=[0], + ) + mock_prof = Mock() + + handle_profiling_step(config, iteration=5, rank=0, pytorch_prof=mock_prof) + + # Nothing should be called + mock_prof.step.assert_not_called() + + def test_multiple_ranks_profiling(self): + """Test that multiple ranks can be profiled.""" + config = ProfilingConfig( + use_pytorch_profiler=True, + profile_ranks=[0, 1, 3], + ) + + assert should_profile_rank(config, 0) is True + assert should_profile_rank(config, 1) is True + assert should_profile_rank(config, 2) is False + assert should_profile_rank(config, 3) is True + + @patch("megatron.bridge.training.profiling.start_nsys_profiler") + def test_handle_profiling_step_nsys_at_iteration_zero(self, mock_start_nsys): + """Test nsys profiler can start at iteration 0.""" + config = ProfilingConfig( + use_nsys_profiler=True, + profile_step_start=0, + profile_step_end=5, + profile_ranks=[0], + ) + + handle_profiling_step(config, iteration=0, rank=0, pytorch_prof=None) + mock_start_nsys.assert_called_once_with(config) diff --git a/tests/unit_tests/training/test_state_injection_logic.py b/tests/unit_tests/training/test_state_injection_logic.py new file mode 100644 index 0000000000..81496d6a81 --- /dev/null +++ b/tests/unit_tests/training/test_state_injection_logic.py @@ -0,0 +1,290 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for state injection logic with type hint detection.""" + +from functools import partial +from typing import Iterable +from unittest.mock import Mock + +import torch +from megatron.core.models.gpt import GPTModel + +from megatron.bridge.training.state import GlobalState +from megatron.bridge.training.utils.train_utils import maybe_inject_state + + +class TestTypeHintBasedStateInjection: + """Test state injection based on type hints.""" + + def test_inject_with_globalstate_type_hint_first_param(self): + """Test state injection when first parameter has GlobalState type hint.""" + + def forward_step(state: GlobalState, data_iterator, model, return_schedule_plan=False): + return f"state: {state.name}" + + mock_state = Mock() + mock_state.name = "test_state" + + wrapped = maybe_inject_state(forward_step, mock_state) + + assert isinstance(wrapped, partial) + assert wrapped.args == (mock_state,) + + # Test calling the wrapped function + result = wrapped(Mock(), Mock(), True) + assert result == "state: test_state" + + def test_inject_with_globalstate_type_hint_middle_param(self): + """Test state injection when GlobalState type hint is in middle parameter.""" + + def forward_step(data_iterator, state: GlobalState, model): + return f"state: {state.name}" + + mock_state = Mock() + mock_state.name = "test_state" + + wrapped = maybe_inject_state(forward_step, mock_state) + + # Should inject state because GlobalState type hint was found + assert isinstance(wrapped, partial) + assert wrapped.args == (mock_state,) + + def test_inject_with_string_type_annotation(self): + """Test state injection with string type annotation (forward reference).""" + + def forward_step(state: "GlobalState", data_iterator, model): + return f"state: {state.name}" + + mock_state = Mock() + mock_state.name = "test_state" + + wrapped = maybe_inject_state(forward_step, mock_state) + + assert 
isinstance(wrapped, partial) + assert wrapped.args == (mock_state,) + + def test_no_injection_without_globalstate_type_hint(self): + """Test no state injection when no GlobalState type hint is present.""" + + def forward_step(data_iterator: Iterable, model: GPTModel, return_schedule_plan: bool = False): + return "no state needed" + + mock_state = Mock() + + wrapped = maybe_inject_state(forward_step, mock_state) + + # Should return original function unchanged + assert wrapped is forward_step + assert not isinstance(wrapped, partial) + + def test_fallback_to_name_based_detection(self): + """Test fallback to name-based detection when no type hints are present.""" + + def forward_step(state, data_iterator, model, return_schedule_plan=False): + return f"state: {state.name}" + + mock_state = Mock() + mock_state.name = "test_state" + + wrapped = maybe_inject_state(forward_step, mock_state) + + # Should inject based on parameter name 'state' + assert isinstance(wrapped, partial) + assert wrapped.args == (mock_state,) + + def test_no_injection_when_first_param_not_state(self): + """Test no injection when first parameter is not named 'state' and has no GlobalState type.""" + + def forward_step(data_iterator, model, return_schedule_plan=False): + return "no state" + + mock_state = Mock() + + wrapped = maybe_inject_state(forward_step, mock_state) + + assert wrapped is forward_step + assert not isinstance(wrapped, partial) + + +class TestFunctorTypeHintStateInjection: + """Test state injection with functors using type hints.""" + + def test_functor_with_globalstate_type_hint(self): + """Test functor with GlobalState type hint gets state injected.""" + + class TypedForwardFunctor: + def __init__(self): + self.seen_state = None + + def __call__(self, state: GlobalState, data_iterator: Iterable, model: GPTModel): + self.seen_state = state + return torch.tensor([1.0]), partial(lambda x: x) + + functor = TypedForwardFunctor() + mock_state = Mock() + mock_state.name = "test_state" + 
+ wrapped = maybe_inject_state(functor, mock_state) + + assert isinstance(wrapped, partial) + assert wrapped.args == (mock_state,) + + # Test calling the wrapped functor + wrapped(Mock(), Mock()) + assert functor.seen_state is mock_state + + def test_functor_without_type_hints_name_fallback(self): + """Test functor without type hints falls back to name-based detection.""" + + class NameBasedFunctor: + def __init__(self): + self.seen_state = None + + def __call__(self, state, data_iterator, model): + self.seen_state = state + return torch.tensor([1.0]), partial(lambda x: x) + + functor = NameBasedFunctor() + mock_state = Mock() + + wrapped = maybe_inject_state(functor, mock_state) + + assert isinstance(wrapped, partial) + assert wrapped.args == (mock_state,) + + def test_functor_no_injection_without_state(self): + """Test functor without state parameter gets no injection.""" + + class NoStateFunctor: + def __call__(self, data_iterator: Iterable, model: GPTModel, return_schedule_plan: bool = False): + return torch.tensor([1.0]), partial(lambda x: x) + + functor = NoStateFunctor() + mock_state = Mock() + + wrapped = maybe_inject_state(functor, mock_state) + + assert wrapped is functor + assert not isinstance(wrapped, partial) + + +class TestAmbiguousSignatureResolution: + """Test resolution of ambiguous signatures using type hints.""" + + def test_three_args_with_state_type_hint_injects(self): + """Test that (state: GlobalState, data_iterator, model) correctly injects state.""" + + def forward_step(state: GlobalState, data_iterator, model): + return f"received state: {state.name}" + + mock_state = Mock() + mock_state.name = "injected" + + wrapped = maybe_inject_state(forward_step, mock_state) + + # Should inject state because of type hint + assert isinstance(wrapped, partial) + + result = wrapped(Mock(), Mock()) + assert result == "received state: injected" + + def test_three_args_without_state_type_hint_no_injection(self): + """Test that (data_iterator, model, 
return_schedule_plan) doesn't inject state.""" + + def forward_step(data_iterator: Iterable, model: GPTModel, return_schedule_plan: bool = False): + return f"no state, schedule_plan: {return_schedule_plan}" + + mock_state = Mock() + + wrapped = maybe_inject_state(forward_step, mock_state) + + # Should NOT inject state because no GlobalState type hint + assert wrapped is forward_step + assert not isinstance(wrapped, partial) + + result = wrapped(Mock(), Mock(), True) + assert result == "no state, schedule_plan: True" + + def test_ambiguous_three_args_resolved_by_type_hint(self): + """Test that type hints resolve the ambiguity between different 3-arg patterns.""" + + # Pattern 1: State injection expected + def state_forward_step(state: GlobalState, data_iterator, model): + return "with state" + + # Pattern 2: No state injection expected + def schedule_forward_step(data_iterator, model, return_schedule_plan=False): + return "with schedule" + + mock_state = Mock() + + wrapped_state = maybe_inject_state(state_forward_step, mock_state) + wrapped_schedule = maybe_inject_state(schedule_forward_step, mock_state) + + # State function should be wrapped + assert isinstance(wrapped_state, partial) + + # Schedule function should not be wrapped + assert wrapped_schedule is schedule_forward_step + assert not isinstance(wrapped_schedule, partial) + + +class TestEdgeCases: + """Test edge cases in type hint detection.""" + + def test_mixed_type_hints_first_param_wins(self): + """Test that when multiple params have types, first GlobalState param wins.""" + + def forward_step(data_iterator: Iterable, state: GlobalState, model: GPTModel): + return f"state: {state.name}" + + mock_state = Mock() + mock_state.name = "test" + + wrapped = maybe_inject_state(forward_step, mock_state) + + # Should inject because GlobalState type hint was found (even though not first param) + assert isinstance(wrapped, partial) + + def test_no_type_hints_fallback_to_name(self): + """Test fallback to name-based 
detection when no type hints are present.""" + + def forward_step(state, data_iterator, model): + return f"state: {state.name}" + + mock_state = Mock() + mock_state.name = "fallback" + + wrapped = maybe_inject_state(forward_step, mock_state) + + # Should inject based on parameter name + assert isinstance(wrapped, partial) + + result = wrapped(Mock(), Mock()) + assert result == "state: fallback" + + def test_wrong_parameter_name_no_injection(self): + """Test that wrong parameter name with no type hints doesn't inject.""" + + def forward_step(wrong_name, data_iterator, model): # Wrong name + return "should not inject" + + mock_state = Mock() + + wrapped = maybe_inject_state(forward_step, mock_state) + + # Should NOT inject because first param is not named 'state' + assert wrapped is forward_step + assert not isinstance(wrapped, partial) diff --git a/tests/unit_tests/training/test_train.py b/tests/unit_tests/training/test_train.py index 8771a1aa95..802684df83 100644 --- a/tests/unit_tests/training/test_train.py +++ b/tests/unit_tests/training/test_train.py @@ -24,6 +24,7 @@ checkpoint_and_decide_exit, should_disable_forward_pre_hook, ) +from megatron.bridge.training.utils.train_utils import maybe_inject_state class TestMxfp8ParamBufferCopy: @@ -150,6 +151,30 @@ def test_keep_enabled_with_megatron_fsdp(self): ) assert result is False + def test_callable_class_state_injection_integration(self): + """Integration test ensuring state injection works with functors in training context.""" + + class ForwardFunctor: + def __init__(self): + self.state_seen = None + + def __call__(self, state, data_iterator, model, return_schedule_plan=False): + self.state_seen = state + return "ok" + + mock_state = Mock() + functor = ForwardFunctor() + + wrapped = maybe_inject_state(functor, mock_state) + assert callable(wrapped) + + data_iterator = Mock() + model = Mock() + result = wrapped(data_iterator, model, return_schedule_plan=True) + + assert result == "ok" + assert functor.state_seen 
is mock_state + def test_keep_enabled_without_distributed_optimizer(self): """Test that pre-hook stays enabled when not using distributed optimizer.""" result = should_disable_forward_pre_hook( diff --git a/tests/unit_tests/training/utils/test_train_utils.py b/tests/unit_tests/training/utils/test_train_utils.py index 2704c275af..d48e247820 100644 --- a/tests/unit_tests/training/utils/test_train_utils.py +++ b/tests/unit_tests/training/utils/test_train_utils.py @@ -18,9 +18,11 @@ import pytest import torch +from megatron.bridge.training.state import GlobalState from megatron.bridge.training.utils.train_utils import ( - check_forward_step_func_num_args, maybe_inject_state, + needs_global_state_injection, + prepare_forward_step_func, training_log, ) @@ -850,102 +852,99 @@ def test_memory_tensorboard_logging( writer.add_scalar.assert_any_call("mem-allocated-count", 5000, 10) -class TestCheckForwardStepFuncNumArgs: - """Test suite for the check_forward_step_func_num_args function.""" +class TestNeedsGlobalStateInjection: + """Test suite for the needs_global_state_injection function.""" - def test_two_args_function(self): - """Test function with 2 arguments.""" + def test_function_with_globalstate_type_hint_needs_injection(self): + """Test function with GlobalState type hint needs injection.""" + from megatron.bridge.training.state import GlobalState - def forward_step_func_2_args(data_iterator, model): + def forward_step_func(state: GlobalState, data_iterator, model): return None - result = check_forward_step_func_num_args(forward_step_func_2_args) - assert result == 2 + result = needs_global_state_injection(forward_step_func) + assert result is True - def test_three_args_function(self): - """Test function with 3 arguments.""" + def test_function_with_string_globalstate_annotation_needs_injection(self): + """Test function with string GlobalState annotation needs injection.""" - def forward_step_func_3_args(data_iterator, model, return_schedule_plan=False): + def 
forward_step_func(state: "GlobalState", data_iterator, model): return None - result = check_forward_step_func_num_args(forward_step_func_3_args) - assert result == 3 + result = needs_global_state_injection(forward_step_func) + assert result is True - def test_four_args_function(self): - """Test function with 4 arguments.""" + def test_function_with_state_name_needs_injection(self): + """Test function with 'state' parameter name needs injection.""" - def forward_step_func_4_args(state, data_iterator, model, return_schedule_plan=False): + def forward_step_func(state, data_iterator, model): return None - result = check_forward_step_func_num_args(forward_step_func_4_args) - assert result == 4 + result = needs_global_state_injection(forward_step_func) + assert result is True - def test_one_arg_function_raises_assertion_error(self): - """Test function with 1 argument raises AssertionError.""" + def test_function_with_global_state_name_needs_injection(self): + """Test function with 'global_state' parameter name needs injection.""" - def forward_step_func_1_arg(data_iterator): + def forward_step_func(global_state, data_iterator, model): return None - with pytest.raises(AssertionError) as exc_info: - check_forward_step_func_num_args(forward_step_func_1_arg) + result = needs_global_state_injection(forward_step_func) + assert result is True - error_message = str(exc_info.value) - assert "forward_step_func has 1 arguments" in error_message - assert "Only the following signatures are supported" in error_message - assert "2 args:" in error_message - assert "3 args:" in error_message - assert "4 args:" in error_message + def test_function_without_state_no_injection(self): + """Test function without state parameter doesn't need injection.""" - def test_five_args_function_raises_assertion_error(self): - """Test function with 5 arguments raises AssertionError.""" - - def forward_step_func_5_args(state, data_iterator, model, return_schedule_plan, extra_arg): + def 
forward_step_func(data_iterator, model, return_schedule_plan=False): return None - with pytest.raises(AssertionError) as exc_info: - check_forward_step_func_num_args(forward_step_func_5_args) + result = needs_global_state_injection(forward_step_func) + assert result is False - error_message = str(exc_info.value) - assert "forward_step_func has 5 arguments" in error_message - assert "Only the following signatures are supported" in error_message + def test_lambda_function_with_state_name(self): + """Test lambda function with state parameter name.""" + forward_step_func = lambda state, data_iterator, model: None - def test_zero_args_function_raises_assertion_error(self): - """Test function with 0 arguments raises AssertionError.""" + result = needs_global_state_injection(forward_step_func) + assert result is True - def forward_step_func_0_args(): - return None + def test_lambda_function_without_state(self): + """Test lambda function without state parameter.""" + forward_step_func = lambda data_iterator, model: None - with pytest.raises(AssertionError) as exc_info: - check_forward_step_func_num_args(forward_step_func_0_args) + result = needs_global_state_injection(forward_step_func) + assert result is False - error_message = str(exc_info.value) - assert "forward_step_func has 0 arguments" in error_message + def test_callable_class_with_globalstate_type_hint(self): + """Test callable class with GlobalState type hint.""" + from megatron.bridge.training.state import GlobalState - def test_lambda_function_two_args(self): - """Test lambda function with 2 arguments.""" - forward_step_func = lambda data_iterator, model: None + class ForwardFunctor: + def __call__(self, state: GlobalState, data_iterator, model): + return None - result = check_forward_step_func_num_args(forward_step_func) - assert result == 2 + result = needs_global_state_injection(ForwardFunctor()) + assert result is True - def test_lambda_function_four_args(self): - """Test lambda function with 4 
arguments.""" - forward_step_func = lambda state, data_iterator, model, return_schedule_plan=False: None + def test_callable_class_with_state_name(self): + """Test callable class with state parameter name.""" - result = check_forward_step_func_num_args(forward_step_func) - assert result == 4 + class ForwardFunctor: + def __call__(self, state, data_iterator, model, return_schedule_plan=False): + return None - def test_partial_function(self): - """Test partial function (should count remaining parameters).""" + result = needs_global_state_injection(ForwardFunctor()) + assert result is True - def original_func(state, data_iterator, model, return_schedule_plan=False): - return None + def test_callable_class_without_state(self): + """Test callable class without state parameter.""" - # Create partial function with state bound - partial_func = partial(original_func, mock.MagicMock()) + class ForwardFunctor: + def __call__(self, data_iterator, model, return_schedule_plan=False): + return None - result = check_forward_step_func_num_args(partial_func) - assert result == 3 # 4 original args - 1 bound arg = 3 remaining + result = needs_global_state_injection(ForwardFunctor()) + assert result is False class TestMaybeInjectState: @@ -981,7 +980,7 @@ def forward_step_func_4_args(state, data_iterator, model, return_schedule_plan=F mock_state = mock.MagicMock() mock_state.name = "test_state" - result_func = maybe_inject_state(forward_step_func_4_args, mock_state, num_fw_args=4) + result_func = maybe_inject_state(forward_step_func_4_args, mock_state, needs_injection=True) # Result should be a partial function assert isinstance(result_func, partial) @@ -1022,7 +1021,7 @@ def forward_step_func_3_args(data_iterator, model, return_schedule_plan=False): mock_state = mock.MagicMock() - result_func = maybe_inject_state(forward_step_func_3_args, mock_state, num_fw_args=3) + result_func = maybe_inject_state(forward_step_func_3_args, mock_state, needs_injection=False) # Result should be the 
original function assert result_func is forward_step_func_3_args @@ -1035,7 +1034,7 @@ def forward_step_func_2_args(data_iterator, model): mock_state = mock.MagicMock() - result_func = maybe_inject_state(forward_step_func_2_args, mock_state, num_fw_args=2) + result_func = maybe_inject_state(forward_step_func_2_args, mock_state, needs_injection=False) # Result should be the original function assert result_func is forward_step_func_2_args @@ -1055,3 +1054,153 @@ def original_func(arg1, arg2, data_iterator, model): # Should return original partial since it has 2 remaining args assert result_func is partial_func + + def test_callable_class_four_args_injects_state(self): + """Test state injection for callable class with 4 arguments.""" + + class ForwardFunctor: + def __init__(self): + self.seen_state = None + + def __call__(self, state, data_iterator, model, return_schedule_plan=False): + self.seen_state = state + return "called" + + functor = ForwardFunctor() + mock_state = mock.MagicMock() + + result_func = maybe_inject_state(functor, mock_state) + + assert isinstance(result_func, partial) + + mock_data_iterator = mock.MagicMock() + mock_model = mock.MagicMock() + result = result_func(mock_data_iterator, mock_model, return_schedule_plan=True) + + assert result == "called" + assert functor.seen_state is mock_state + + def test_callable_class_three_args_no_injection(self): + """Test callable class with 3 arguments does not inject state.""" + + class ForwardFunctor: + def __call__(self, data_iterator, model, return_schedule_plan=False): + return "no state" + + functor = ForwardFunctor() + mock_state = mock.MagicMock() + + result_func = maybe_inject_state(functor, mock_state) + + assert result_func is functor + assert not isinstance(result_func, partial) + + +class TestPrepareForwardStepFunc: + """Tests for prepare_forward_step_func convenience function.""" + + def test_prepare_with_state_parameter_injects(self): + """Test prepare_forward_step_func with function that 
needs state injection.""" + + def forward_with_state(state: GlobalState, data_iterator, model): + return state.train_state.step + + mock_state = mock.MagicMock() + mock_state.train_state.step = 42 + + result = prepare_forward_step_func(forward_with_state, mock_state) + + # Should be wrapped + assert isinstance(result, partial) + # Should work correctly + assert result(None, None) == 42 + + def test_prepare_without_state_parameter_returns_original(self): + """Test prepare_forward_step_func with function that doesn't need state injection.""" + + def forward_no_state(data_iterator, model): + return "no state needed" + + mock_state = mock.MagicMock() + + result = prepare_forward_step_func(forward_no_state, mock_state) + + # Should return original function + assert result is forward_no_state + assert not isinstance(result, partial) + + def test_prepare_with_functor_needing_state(self): + """Test prepare_forward_step_func with functor that needs state injection.""" + + class ForwardFunctor: + def __init__(self): + self.call_count = 0 + + def __call__(self, state: GlobalState, data_iterator, model): + self.call_count += 1 + return state.train_state.step + self.call_count + + functor = ForwardFunctor() + mock_state = mock.MagicMock() + mock_state.train_state.step = 10 + + result = prepare_forward_step_func(functor, mock_state) + + # Should be wrapped + assert isinstance(result, partial) + + # Call multiple times - verify functor's internal state still works + assert result(None, None) == 11 # step=10 + call_count=1 + assert result(None, None) == 12 # step=10 + call_count=2 + assert functor.call_count == 2 + + def test_prepare_with_functor_not_needing_state(self): + """Test prepare_forward_step_func with functor that doesn't need state.""" + + class ForwardFunctor: + def __init__(self): + self.call_count = 0 + + def __call__(self, data_iterator, model): + self.call_count += 1 + return self.call_count + + functor = ForwardFunctor() + mock_state = mock.MagicMock() + + result 
= prepare_forward_step_func(functor, mock_state) + + # Should return original functor + assert result is functor + assert not isinstance(result, partial) + + # Functor should still work + assert result(None, None) == 1 + assert result(None, None) == 2 + + def test_prepare_sees_state_mutations(self): + """Test that prepared function sees mutations to GlobalState.""" + + def forward_with_state(state: GlobalState, data_iterator, model): + return state.train_state.step + + mock_state = mock.MagicMock() + mock_state.train_state.step = 10 + + # Prepare once + wrapped = prepare_forward_step_func(forward_with_state, mock_state) + + # Call with initial state + assert wrapped(None, None) == 10 + + # Mutate state (simulates training loop incrementing step) + mock_state.train_state.step = 20 + + # Call again - should see mutated value + assert wrapped(None, None) == 20 + + # Further mutation + mock_state.train_state.step = 100 + + # Still sees current value + assert wrapped(None, None) == 100 diff --git a/vace.sh b/vace.sh new file mode 100644 index 0000000000..a9647374f2 --- /dev/null +++ b/vace.sh @@ -0,0 +1,51 @@ +export CUDA_VISIBLE_DEVICES=0,1 +export MBRIDGE_PATH=/workspace/vace/Megatron-Bridge +export PYTHONPATH="${MBRIDGE_PATH}/.:${MBRIDGE_PATH}/src/.:/opt/NeMo-Framework-Launcher/launcher_scripts" + +### Inferencing +# Download T5 weights and VAE weights from "https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B/tree/main" +# T5: models_t5_umt5-xxl-enc-bf16.pth, google +# VAE: Wan2.1_VAE.pth + +# CHECKPOINT_DIR=/opt/megatron_checkpoint_VACE +# CHECKPOINT_STEP=0000 +CHECKPOINT_DIR="/workspace/checkpoints_vace_ft_I2V" +CHECKPOINT_STEP=1000 +T5_DIR="/workspace/checkpoints/T5" +VAE_DIR="/workspace/checkpoints/" + +NVTE_FUSED_ATTN=1 torchrun --nproc_per_node=2 --rdzv-backend=c10d --rdzv-endpoint=localhost:0 examples/recipes/wan/inference_vace.py \ + --model_name vace-1.3B \ + --sizes 832*480 \ + --save_file "test" \ + --src_video "src_video-frameref.mp4" \ + --checkpoint_dir 
${CHECKPOINT_DIR} \
+    --checkpoint_step ${CHECKPOINT_STEP} \
+    --t5_checkpoint_dir ${T5_DIR} \
+    --vae_checkpoint_dir ${VAE_DIR} \
+    --prompts "Cat jumps from the cabinet." \
+    --frame_nums 81 \
+    --tensor_parallel_size 1 \
+    --context_parallel_size 2 \
+    --pipeline_parallel_size 1 \
+    --sequence_parallel False \
+    --base_seed 42 \
+    --sample_steps 50
+
+# NVTE_FUSED_ATTN=1 torchrun --nproc_per_node=2 --rdzv-backend=c10d --rdzv-endpoint=localhost:0 examples/recipes/wan/inference_vace.py \
+#     --model_name vace-1.3B \
+#     --sizes 832*480 832*480 832*480 \
+#     --save_file "test" \
+#     --src_video "src_video_depth.mp4" "src_video_flow.mp4" "src_video_pose.mp4" \
+#     --checkpoint_dir ${CHECKPOINT_DIR} \
+#     --checkpoint_step 0000 \
+#     --t5_checkpoint_dir ${T5_DIR} \
+#     --vae_checkpoint_dir ${VAE_DIR} \
+#     --prompts "Two dogs hit each other during boxing." "Two dogs hit each other during boxing." "Two dogs hit each other during boxing." \
+#     --frame_nums 81 81 81 \
+#     --tensor_parallel_size 1 \
+#     --context_parallel_size 2 \
+#     --pipeline_parallel_size 1 \
+#     --sequence_parallel False \
+#     --base_seed 42 \
+#     --sample_steps 50
\ No newline at end of file