From fb6f4e49ab697e0e8a1702891456154d18ffecbf Mon Sep 17 00:00:00 2001 From: Hexin Wang Date: Tue, 3 Mar 2026 18:28:21 -0800 Subject: [PATCH] Fix: Defensively close GPU device FDs in dataloader worker processes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This ensures workers do not keep references into NVIDIA memory space after fork. This helps ensure GPU memory can be reclaimed even if a dataloader worker is delayed or fails to exit. How to Reproduce / Validate: 1. Force a long-running dataloader worker Modify GPTDataset.__getitem__ to insert: time.sleep(3600) This simulates a stuck dataloader worker (e.g., blocked in I/O). 2. Start training Launch a 1-node Megatron-LM job with: --num-workers > 0 3. Verify dataloader workers no longer hold GPU device FDs On the GPU node: sudo fuser -v /dev/nvidia* Without this patch, the dataloader worker processes are listed. With this patch, they should not appear, because they no longer retain active /dev/nvidia* file descriptors even though they are still running. 4. Trigger a rank failure Send SIGTERM to one of the training ranks: kill -15 <rank_pid> 5. Observe GPU memory reclaim Run: nvidia-smi The corresponding rank’s GPU memory usage should return to 0 immediately (assuming no other GPU-holding child processes such as async checkpoint workers are present in this test). 6. Baseline (without this patch) Repeat the same steps without this change. 
--- megatron/training/datasets/data_samplers.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/megatron/training/datasets/data_samplers.py b/megatron/training/datasets/data_samplers.py index ca4cc1b36a3..2ab25094735 100644 --- a/megatron/training/datasets/data_samplers.py +++ b/megatron/training/datasets/data_samplers.py @@ -73,10 +73,26 @@ def build_pretraining_data_loader(dataset, consumed_samples): raise Exception('{} dataloader type is not supported.'.format(args.dataloader_type)) def worker_init_fn(_): - DistributedSignalHandler(args.exit_signal).__enter__() + import os + + # Defensively close GPU device FDs in worker processes so workers do not + # keep references into NVIDIA memory space. This helps ensure GPU memory + # can be reclaimed even if a dataloader worker is delayed or fails to exit. + def close_nvidia_fds(): + for fd in os.listdir("/proc/self/fd"): + try: + path = os.readlink(f"/proc/self/fd/{fd}") + if path.startswith("/dev/nvidia"): + os.close(int(fd)) + except Exception: + pass + + close_nvidia_fds() + if args.exit_signal_handler: + DistributedSignalHandler(args.exit_signal).__enter__() maybe_worker_init_fn = ( - worker_init_fn if args.exit_signal_handler and args.num_workers > 0 else None + worker_init_fn if args.num_workers > 0 else None ) # Torch dataloader. if args.hybrid_context_parallel: