From 97e234adedc2ca94ac026ec9fc0e70f4bf866b16 Mon Sep 17 00:00:00 2001 From: wkcn Date: Fri, 8 Dec 2023 10:30:19 +0800 Subject: [PATCH 1/3] auto scaling for megatron-lm --- gpt3/Megatron-LM.patch | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/gpt3/Megatron-LM.patch b/gpt3/Megatron-LM.patch index 45be7231..37e57243 100644 --- a/gpt3/Megatron-LM.patch +++ b/gpt3/Megatron-LM.patch @@ -1,5 +1,5 @@ diff --git a/megatron/arguments.py b/megatron/arguments.py -index ae42b83..f427bc5 100644 +index ae42b83e..5c4b615f 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -38,6 +38,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False): @@ -10,7 +10,7 @@ index ae42b83..f427bc5 100644 # Custom arguments. if extra_args_provider is not None: -@@ -1306,3 +1307,10 @@ def _add_vision_args(parser): +@@ -1306,3 +1307,17 @@ def _add_vision_args(parser): help='warmup teacher temperaure epochs') return parser @@ -20,10 +20,16 @@ index ae42b83..f427bc5 100644 + group = parser.add_argument_group(title="msamp") + group.add_argument('--msamp', action='store_true', default=False, + help='whether to enable msamp') ++ # Auto scaling factor tuning for FP8 collective communication ++ group.add_argument('--wgrad-auto-scaling', action='store_true', default=False, ++ help='whether to enable auto scaling factor tuning on weight gradients reduction') ++ group.add_argument('--wgrad-auto-scaling-ratio', type=float, default=1e-5, ++ help='the threshold of overflow ratio for auto scaling factor tuning on weight gradients reduction') ++ group.add_argument('--wgrad-auto-scaling-window', type=int, default=1000, ++ help='the window size for auto scaling factor tuning on weight gradients reduction') + return parser -\ No newline at end of file diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py -index e88b585..320f2b2 100644 +index e88b5851..320f2b24 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -106,6 +106,59 @@ def get_checkpoint_name(checkpoints_path, iteration, release=False, @@ -406,7 +412,7 @@ index e88b585..320f2b2 100644 only_context_model=False, custom_load_path=None): """ diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py -index a86444c..600f49d 100644 +index a86444cc..600f49d8 100644 --- a/megatron/core/tensor_parallel/layers.py +++ b/megatron/core/tensor_parallel/layers.py @@ -439,7 +439,9 @@ def linear_with_grad_accumulation_and_async_allreduce( @@ -532,7 +538,7 @@ index a86444c..600f49d 100644 """Forward of RowParallelLinear diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py -index 484e9b3..e85984d 100644 +index 484e9b32..e85984d7 100644 --- a/megatron/optimizer/__init__.py +++ b/megatron/optimizer/__init__.py @@ -5,10 +5,12 @@ from apex.optimizers import FusedSGD as SGD @@ -577,7 +583,7 @@ index 484e9b3..e85984d 100644 optimizer = SGD(param_groups, lr=args.lr, diff --git a/megatron/optimizer/optimizer.py b/megatron/optimizer/optimizer.py -index da9cd70..414fd88 100644 +index da9cd70f..414fd887 100644 --- a/megatron/optimizer/optimizer.py +++ b/megatron/optimizer/optimizer.py @@ -13,13 +13,15 @@ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors @@ -599,7 +605,7 @@ index da9cd70..414fd88 100644 def _zero_grad_group_helper(group, set_to_none): """Zero out the gradient for a group of parameters. 
diff --git a/megatron/training.py b/megatron/training.py -index b821ae7..99a7fad 100644 +index b821ae7b..99a7fadb 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -33,7 +33,7 @@ from megatron.initialize import initialize_megatron From e16023a6c6664ac54515ddc3eaf9a55b23db87f1 Mon Sep 17 00:00:00 2001 From: wkcn Date: Sun, 10 Dec 2023 21:29:02 +0800 Subject: [PATCH 2/3] auto scaling freq --- gpt3/Megatron-LM.patch | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/gpt3/Megatron-LM.patch b/gpt3/Megatron-LM.patch index 37e57243..46268ed0 100644 --- a/gpt3/Megatron-LM.patch +++ b/gpt3/Megatron-LM.patch @@ -1,5 +1,5 @@ diff --git a/megatron/arguments.py b/megatron/arguments.py -index ae42b83e..5c4b615f 100644 +index ae42b83e..167edbfc 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -38,6 +38,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False): @@ -10,7 +10,7 @@ index ae42b83e..5c4b615f 100644 # Custom arguments. if extra_args_provider is not None: -@@ -1306,3 +1307,17 @@ def _add_vision_args(parser): +@@ -1306,3 +1307,19 @@ def _add_vision_args(parser): help='warmup teacher temperaure epochs') return parser @@ -23,9 +23,11 @@ index ae42b83e..5c4b615f 100644 + # Auto scaling factor tuning for FP8 collective communication + group.add_argument('--wgrad-auto-scaling', action='store_true', default=False, + help='whether to enable auto scaling factor tuning on weight gradients reduction') ++ group.add_argument('--wgrad-auto-scaling-freq', type=int, default=10, ++ help='the frequency of checking whether overflow exists in the result of weight gradients reduction') + group.add_argument('--wgrad-auto-scaling-ratio', type=float, default=1e-5, + help='the threshold of overflow ratio for auto scaling factor tuning on weight gradients reduction') -+ group.add_argument('--wgrad-auto-scaling-window', type=int, default=1000, ++ group.add_argument('--wgrad-auto-scaling-window', type=int, default=100, + help='the window size for auto scaling factor tuning on weight gradients reduction') + return parser diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py From e30155c1b7ba562554f67e65f20db8e9c6ee8f4c Mon Sep 17 00:00:00 2001 From: wkcn Date: Tue, 12 Dec 2023 14:41:55 +0800 Subject: [PATCH 3/3] wgrad-auto-scaling-ratio 1e-3 --- gpt3/Megatron-LM.patch | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gpt3/Megatron-LM.patch b/gpt3/Megatron-LM.patch index 46268ed0..c33be44b 100644 --- a/gpt3/Megatron-LM.patch +++ b/gpt3/Megatron-LM.patch @@ -1,5 +1,5 @@ diff --git a/megatron/arguments.py b/megatron/arguments.py -index ae42b83e..167edbfc 100644 +index ae42b83e..df50f67f 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -38,6 +38,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False): @@ -25,7 +25,7 @@ index ae42b83e..167edbfc 100644 + help='whether to enable auto scaling factor tuning on weight gradients reduction') + group.add_argument('--wgrad-auto-scaling-freq', type=int, default=10, + help='the frequency of checking whether overflow exists in the result of weight gradients reduction') -+ group.add_argument('--wgrad-auto-scaling-ratio', type=float, default=1e-5, ++ group.add_argument('--wgrad-auto-scaling-ratio', type=float, default=1e-3, + help='the threshold of overflow ratio for auto scaling factor tuning on weight gradients reduction') + group.add_argument('--wgrad-auto-scaling-window', type=int, default=100, + help='the window size for auto scaling factor tuning on 
weight gradients reduction')
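
The defaults that survive the series are --wgrad-auto-scaling-freq 10, --wgrad-auto-scaling-ratio 1e-3 and --wgrad-auto-scaling-window 100. As a rough illustration of how those three knobs could interact, here is a minimal Python sketch; it is not part of the patches, and every name in it (WgradAutoScaler, FP8_E4M3_MAX, maybe_update) is a hypothetical stand-in rather than MS-AMP's actual implementation. Every `freq` steps it samples the overflow ratio of the reduced FP8 weight gradients, backs the scaling factor off when that ratio exceeds `ratio`, and grows it again after a clean window of `window` checks.

    # Illustrative sketch only -- not part of the patch series above.
    from collections import deque

    import torch

    FP8_E4M3_MAX = 448.0  # largest magnitude representable in FP8 E4M3

    class WgradAutoScaler:
        """Hypothetical auto-tuner for the pre-reduction scaling factor
        used in FP8 weight-gradient all-reduce."""

        def __init__(self, freq: int = 10, ratio: float = 1e-3, window: int = 100):
            self.freq = freq        # --wgrad-auto-scaling-freq
            self.ratio = ratio      # --wgrad-auto-scaling-ratio
            self.window = window    # --wgrad-auto-scaling-window
            self.scale = 1.0        # factor applied to wgrads before reduction
            self.history = deque(maxlen=window)
            self.step = 0

        def maybe_update(self, reduced_wgrad: torch.Tensor) -> None:
            self.step += 1
            if self.step % self.freq != 0:
                return
            # Fraction of elements that saturated during the FP8 reduction.
            overflow = (reduced_wgrad.abs() >= FP8_E4M3_MAX).float().mean().item()
            self.history.append(overflow)
            if overflow > self.ratio:
                # Too many saturated values: scale gradients down harder.
                self.scale *= 0.5
                self.history.clear()
            elif len(self.history) == self.window and max(self.history) == 0.0:
                # A full window with no overflow: reclaim dynamic range.
                self.scale *= 2.0
                self.history.clear()

Under this reading, a small ratio such as 1e-3 keeps saturation rare without reacting to single outlier elements, and the freq/window pair trades tuning latency against the cost of scanning the reduced gradient on every check.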