From 00a24d59192f23c3d1a859676788ddc3118095e6 Mon Sep 17 00:00:00 2001
From: yezhengmao
Date: Fri, 27 Jun 2025 19:28:37 +0800
Subject: [PATCH] bug fixed: wandb artifact requires the tracker file

---
 megatron/training/checkpointing.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py
index 5de0e85abd7..e8201901730 100644
--- a/megatron/training/checkpointing.py
+++ b/megatron/training/checkpointing.py
@@ -588,9 +588,11 @@ def onelogger_finalize_fn():
         else:
             onelogger_finalize_fn()
 
-    # Additional callback for wandb (last rank)
+    # Additional callback for wandb
+    # The wandb artifact requires the tracker file to be present, so we need to ensure
+    # that rank 0 has already saved it before proceeding with wandb operations
     if not torch.distributed.is_initialized() \
-       or is_last_rank():
+       or torch.distributed.get_rank() == 0:
         def wandb_finalize_fn():
             wandb_utils.on_save_checkpoint_success(checkpoint_name, get_checkpoint_tracker_filename(save_dir), save_dir, iteration)
         if args.async_save: