From b814559241e55fe43c68b400141bc826d6a64d2c Mon Sep 17 00:00:00 2001
From: Jinliang Li <jinliangl@nvidia.com>
Date: Wed, 4 Mar 2026 01:31:41 -0800
Subject: [PATCH] Fix DDP stream sync (wait on current stream, not default
 stream) when --overlap-grad-reduce is used with
 --num-distributed-optimizer-instances > 1

---
 megatron/core/distributed/param_and_grad_buffer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/megatron/core/distributed/param_and_grad_buffer.py b/megatron/core/distributed/param_and_grad_buffer.py
index 088374fbf13..85b9d98a3be 100644
--- a/megatron/core/distributed/param_and_grad_buffer.py
+++ b/megatron/core/distributed/param_and_grad_buffer.py
@@ -419,10 +419,10 @@ def start_grad_sync(self, force_all_reduce: Optional[bool] = False):
             # need to overlap communication.
             stream_context = torch.cuda.stream(self.communication_stream)
 
-            # The RS/AR communication stream needs to wait for the default stream
+            # The RS/AR communication stream needs to wait for the current stream
             # to complete its gradient computation before launching the next
             # gradient reduction collective.
-            self.communication_stream.wait_stream(torch.cuda.default_stream())
+            self.communication_stream.wait_stream(torch.cuda.current_stream())
         else:
             stream_context = nullcontext()
 
@@ -529,7 +529,7 @@ def finish_grad_sync(self, force_all_reduce: Optional[bool] = False):
         # When using multiple DistOpt instances, we don't need to sync here as we launch
         # communications on a separate communication stream.
         if self.ddp_config.num_distributed_optimizer_instances > 1:
-            torch.cuda.default_stream().wait_stream(self.communication_stream)
+            torch.cuda.current_stream().wait_stream(self.communication_stream)
             return
         assert self.grad_reduce_handle is not None, (
             f"Communication call has not been issued for this bucket "