From fb6f4e49ab697e0e8a1702891456154d18ffecbf Mon Sep 17 00:00:00 2001 From: Hexin Wang Date: Tue, 3 Mar 2026 18:28:21 -0800 Subject: [PATCH] Fix: Defensively close GPU device FDs in dataloader worker processes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This ensures workers do not keep references into NVIDIA memory space after fork. This helps ensure GPU memory can be reclaimed even if a dataloader worker is delayed or fails to exit. How to Reproduce / Validate: 1. Force a long-running dataloader worker Modify GPTDataset.__getitem__ to insert: time.sleep(3600) This simulates a stuck dataloader worker (e.g., blocked in I/O). 2. Start training Launch a 1-node Megatron-LM job with: --num-workers > 0 3. Verify dataloader workers no longer hold GPU device FDs On the GPU node: sudo fuser -v /dev/nvidia* Without this patch, the dataloader worker processes are listed. With this patch, they should not appear, because they no longer retain active /dev/nvidia* file descriptors even though they are still running. 4. Trigger a rank failure Send SIGTERM to one of the training ranks: kill -15 <rank_pid> 5. Observe GPU memory reclaim Run: nvidia-smi The corresponding rank’s GPU memory usage should return to 0 immediately (assuming no other GPU-holding child processes such as async checkpoint workers are present in this test). 6. Baseline (without this patch) Repeat the same steps without this change. 
--- megatron/training/datasets/data_samplers.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/megatron/training/datasets/data_samplers.py b/megatron/training/datasets/data_samplers.py index ca4cc1b36a3..2ab25094735 100644 --- a/megatron/training/datasets/data_samplers.py +++ b/megatron/training/datasets/data_samplers.py @@ -73,10 +73,26 @@ def build_pretraining_data_loader(dataset, consumed_samples): raise Exception('{} dataloader type is not supported.'.format(args.dataloader_type)) def worker_init_fn(_): - DistributedSignalHandler(args.exit_signal).__enter__() + import os + + # Defensively close GPU device FDs in worker processes so workers do not + # keep references into NVIDIA memory space. This helps ensure GPU memory + # can be reclaimed even if a dataloader worker is delayed or fails to exit. + def close_nvidia_fds(): + for fd in os.listdir("/proc/self/fd"): + try: + path = os.readlink(f"/proc/self/fd/{fd}") + if path.startswith("/dev/nvidia"): + os.close(int(fd)) + except Exception: + pass + + close_nvidia_fds() + if args.exit_signal_handler: + DistributedSignalHandler(args.exit_signal).__enter__() maybe_worker_init_fn = ( - worker_init_fn if args.exit_signal_handler and args.num_workers > 0 else None + worker_init_fn if args.num_workers > 0 else None ) # Torch dataloader. if args.hybrid_context_parallel: