From 0f854431795a3cb18c7cc17dce8bbfefba1857e1 Mon Sep 17 00:00:00 2001 From: Dennis Liu Date: Mon, 2 Mar 2026 20:02:42 -0800 Subject: [PATCH] fix: skip FSDP DTensor boundary validation under fake process group MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `validate_uneven_dtensor` function uses `all_reduce(MAX)` across all ranks to verify that local shards collectively cover the full global tensor. Under a fake process group (backend='fake'), all collective operations are no-ops, so only rank 0's boundaries are visible — the end-boundary check always fails. Skip the boundary validation when the distributed backend is 'fake', since the fake process group is only used for memory profiling, where numerical correctness is irrelevant. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py index 5df9c2e95c0..f18a21df6c1 100644 --- a/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py @@ -175,6 +175,11 @@ def validate_uneven_dtensor(dtensor: DTensor) -> None: ) # Check that all boundaries (start and end) are touched. + # Skip under fake process group — all_reduce is a no-op so only rank 0's + # boundaries are visible, which makes the end-boundary check always fail. + if torch.distributed.is_initialized() and torch.distributed.get_backend() == 'fake': + return + boundary_checks = torch.tensor( [ [offset == 0, offset + size == dtensor.shape[dim]]