From 099370630ce82ec9350c187dee8df4ba84026e62 Mon Sep 17 00:00:00 2001
From: Aaron Gokaslan
Date: Mon, 25 Aug 2025 18:19:08 +0000
Subject: [PATCH] Fixes torch_dist checkpointing ETP replica_id

---
 megatron/core/transformer/moe/experts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/megatron/core/transformer/moe/experts.py b/megatron/core/transformer/moe/experts.py
index 98bc8912292..b4146d99576 100644
--- a/megatron/core/transformer/moe/experts.py
+++ b/megatron/core/transformer/moe/experts.py
@@ -327,7 +327,7 @@ def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None):
         local_expert_indices_offset = ep_rank * self.num_local_experts
 
         prepend_axis_num = len(sharded_offsets)
-        replica_id = (0, 0, dp_rank)
+        replica_id = (0, tp_rank, dp_rank)
 
         local_ffn_dim_size = (
             self.weight2.numel() // self.num_local_experts // self.config.hidden_size