gpt3/Megatron-LM.patch: 24 changes (16 additions, 8 deletions)
@@ -1,5 +1,5 @@
diff --git a/megatron/arguments.py b/megatron/arguments.py
index ae42b83..f427bc5 100644
index ae42b83e..df50f67f 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -38,6 +38,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
@@ -10,7 +10,7 @@ index ae42b83..f427bc5 100644

# Custom arguments.
if extra_args_provider is not None:
@@ -1306,3 +1307,10 @@ def _add_vision_args(parser):
@@ -1306,3 +1307,19 @@ def _add_vision_args(parser):
help='warmup teacher temperaure epochs')

return parser
@@ -20,10 +20,18 @@ index ae42b83..f427bc5 100644
+ group = parser.add_argument_group(title="msamp")
+ group.add_argument('--msamp', action='store_true', default=False,
+ help='whether to enable msamp')
+ # Auto scaling factor tuning for FP8 collective communication.
+ group.add_argument('--wgrad-auto-scaling', action='store_true', default=False,
+ help='whether to enable auto scaling factor tuning on weight-gradient reduction')
+ group.add_argument('--wgrad-auto-scaling-freq', type=int, default=10,
+ help='how often (in iterations) to check the reduced weight gradients for overflow')
+ group.add_argument('--wgrad-auto-scaling-ratio', type=float, default=1e-3,
+ help='overflow-ratio threshold for auto scaling factor tuning on weight-gradient reduction')
+ group.add_argument('--wgrad-auto-scaling-window', type=int, default=100,
+ help='window size for auto scaling factor tuning on weight-gradient reduction')
+ return parser
\ No newline at end of file
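
Taken together, the four --wgrad-auto-scaling* flags describe a dynamic-scaling loop for the FP8 all-reduce of weight gradients: every `freq` iterations the reduced gradient is checked for overflow, and the overflow ratio is compared against `ratio` over a sliding `window`. The sketch below reconstructs that loop from the flag names alone; the class, the update rule, and the e4m3 saturation check are assumptions, not MS-AMP's actual implementation.

import torch

# Minimal sketch of auto scaling-factor tuning for the FP8 weight-gradient
# all-reduce, reconstructed from the flags above. The update rule mirrors
# conventional dynamic loss scaling and is an assumption, not MS-AMP code.
class WgradAutoScaler:
    def __init__(self, freq=10, ratio=1e-3, window=100, init_scale=1.0):
        self.freq = freq        # --wgrad-auto-scaling-freq
        self.ratio = ratio      # --wgrad-auto-scaling-ratio
        self.window = window    # --wgrad-auto-scaling-window
        self.scale = init_scale
        self.step = 0
        self.clean_checks = 0   # consecutive overflow-free checks

    def overflow_ratio(self, grad: torch.Tensor) -> float:
        # Fraction of elements saturated at the FP8 e4m3 maximum (448).
        return (grad.abs() >= 448.0).float().mean().item()

    def update(self, reduced_grad: torch.Tensor) -> None:
        self.step += 1
        if self.step % self.freq != 0:
            return              # only check every `freq` iterations
        if self.overflow_ratio(reduced_grad) > self.ratio:
            self.scale *= 0.5   # too many saturated values: back off
            self.clean_checks = 0
        else:
            self.clean_checks += 1
            if self.clean_checks >= self.window:
                self.scale *= 2.0  # a full window without overflow: grow
                self.clean_checks = 0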
diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py
index e88b585..320f2b2 100644
index e88b5851..320f2b24 100644
--- a/megatron/checkpointing.py
+++ b/megatron/checkpointing.py
@@ -106,6 +106,59 @@ def get_checkpoint_name(checkpoints_path, iteration, release=False,
@@ -406,7 +414,7 @@ index e88b585..320f2b2 100644
only_context_model=False, custom_load_path=None):
"""
diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py
index a86444c..600f49d 100644
index a86444cc..600f49d8 100644
--- a/megatron/core/tensor_parallel/layers.py
+++ b/megatron/core/tensor_parallel/layers.py
@@ -439,7 +439,9 @@ def linear_with_grad_accumulation_and_async_allreduce(
@@ -532,7 +540,7 @@ index a86444c..600f49d 100644
"""Forward of RowParallelLinear

diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py
index 484e9b3..e85984d 100644
index 484e9b32..e85984d7 100644
--- a/megatron/optimizer/__init__.py
+++ b/megatron/optimizer/__init__.py
@@ -5,10 +5,12 @@ from apex.optimizers import FusedSGD as SGD
@@ -577,7 +585,7 @@ index 484e9b3..e85984d 100644
optimizer = SGD(param_groups,
lr=args.lr,
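
The optimizer/__init__.py hunk swaps an MS-AMP optimizer into Megatron's selection logic when --msamp is set. Paraphrased below, assuming MS-AMP's `LBAdamW` (its low-bit AdamW) as the drop-in; the surrounding argument plumbing follows Megatron's existing Adam branch:

# Paraphrase of the optimizer selection; the exact branching in the
# patch is collapsed above, so treat this as a sketch, not the diff.
def build_optimizer(param_groups, args):
    if args.msamp:
        from msamp.optim import LBAdamW  # MS-AMP's low-bit AdamW
        return LBAdamW(param_groups, lr=args.lr,
                       weight_decay=args.weight_decay,
                       betas=(args.adam_beta1, args.adam_beta2),
                       eps=args.adam_eps)
    from apex.optimizers import FusedAdam as Adam
    return Adam(param_groups, lr=args.lr,
                weight_decay=args.weight_decay,
                betas=(args.adam_beta1, args.adam_beta2),
                eps=args.adam_eps)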
diff --git a/megatron/optimizer/optimizer.py b/megatron/optimizer/optimizer.py
index da9cd70..414fd88 100644
index da9cd70f..414fd887 100644
--- a/megatron/optimizer/optimizer.py
+++ b/megatron/optimizer/optimizer.py
@@ -13,13 +13,15 @@ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
@@ -599,7 +607,7 @@ index da9cd70..414fd88 100644
def _zero_grad_group_helper(group, set_to_none):
"""Zero out the gradient for a group of parameters.
diff --git a/megatron/training.py b/megatron/training.py
index b821ae7..99a7fad 100644
index b821ae7b..99a7fadb 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -33,7 +33,7 @@ from megatron.initialize import initialize_megatron
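
Downstream of `parse_args`, the new flags surface on Megatron's global args object, so training code can gate the MS-AMP paths on them. A hypothetical example, using Megatron's real `get_args` and the WgradAutoScaler sketched earlier:

# Hypothetical wiring in training code; get_args is Megatron's real API,
# WgradAutoScaler is the sketch from the arguments.py section above.
from megatron import get_args

def maybe_build_wgrad_scaler():
    args = get_args()
    if not (args.msamp and args.wgrad_auto_scaling):
        return None
    return WgradAutoScaler(freq=args.wgrad_auto_scaling_freq,
                           ratio=args.wgrad_auto_scaling_ratio,
                           window=args.wgrad_auto_scaling_window)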