From 0f854431795a3cb18c7cc17dce8bbfefba1857e1 Mon Sep 17 00:00:00 2001
From: Dennis Liu <denliu@nvidia.com>
Date: Mon, 2 Mar 2026 20:02:42 -0800
Subject: [PATCH] fix: skip FSDP DTensor boundary validation under fake process
 group
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The `validate_uneven_dtensor` function uses `all_reduce(MAX)` across
all ranks to verify that local shards collectively cover the full
global tensor. Under a fake process group (backend='fake'), all
collective operations are no-ops, so only rank 0's boundaries are
visible — the end-boundary check always fails.

Skip the boundary validation when the distributed backend is 'fake',
since the fake process group is only used for memory profiling, where
numerical correctness is irrelevant.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py     | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py
index 5df9c2e95c0..f18a21df6c1 100644
--- a/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py
+++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py
@@ -175,6 +175,11 @@ def validate_uneven_dtensor(dtensor: DTensor) -> None:
     )
 
     # Check that all boundaries (start and end) are touched.
+    # Skip under fake process group — all_reduce is a no-op so only rank 0's
+    # boundaries are visible, which makes the end-boundary check always fail.
+    if torch.distributed.is_initialized() and torch.distributed.get_backend() == 'fake':
+        return
+
     boundary_checks = torch.tensor(
         [
             [offset == 0, offset + size == dtensor.shape[dim]]