diff --git a/megatron/training/datasets/data_samplers.py b/megatron/training/datasets/data_samplers.py
index ca4cc1b36a3..2ab25094735 100644
--- a/megatron/training/datasets/data_samplers.py
+++ b/megatron/training/datasets/data_samplers.py
@@ -73,10 +73,26 @@ def build_pretraining_data_loader(dataset, consumed_samples):
         raise Exception('{} dataloader type is not supported.'.format(args.dataloader_type))
 
     def worker_init_fn(_):
-        DistributedSignalHandler(args.exit_signal).__enter__()
+        import os
+
+        # Defensively close GPU device FDs in worker processes so workers do not
+        # keep references into NVIDIA memory space. This helps ensure GPU memory
+        # can be reclaimed even if a dataloader worker is delayed or fails to exit.
+        def close_nvidia_fds():
+            for fd in os.listdir("/proc/self/fd"):
+                try:
+                    path = os.readlink(f"/proc/self/fd/{fd}")
+                    if path.startswith("/dev/nvidia"):
+                        os.close(int(fd))
+                except Exception:
+                    pass
+
+        close_nvidia_fds()
+        if args.exit_signal_handler:
+            DistributedSignalHandler(args.exit_signal).__enter__()
 
     maybe_worker_init_fn = (
-        worker_init_fn if args.exit_signal_handler and args.num_workers > 0 else None
+        worker_init_fn if args.num_workers > 0 else None
     )
     # Torch dataloader.
     if args.hybrid_context_parallel: