From 97e234adedc2ca94ac026ec9fc0e70f4bf866b16 Mon Sep 17 00:00:00 2001 From: wkcn Date: Fri, 8 Dec 2023 10:30:19 +0800 Subject: [PATCH 1/3] auto scaling for megatron-lm --- gpt3/Megatron-LM.patch | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/gpt3/Megatron-LM.patch b/gpt3/Megatron-LM.patch index 45be7231..37e57243 100644 --- a/gpt3/Megatron-LM.patch +++ b/gpt3/Megatron-LM.patch @@ -1,5 +1,5 @@ diff --git a/megatron/arguments.py b/megatron/arguments.py -index ae42b83..f427bc5 100644 +index ae42b83e..5c4b615f 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -38,6 +38,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False): @@ -10,7 +10,7 @@ index ae42b83..f427bc5 100644 # Custom arguments. if extra_args_provider is not None: -@@ -1306,3 +1307,10 @@ def _add_vision_args(parser): +@@ -1306,3 +1307,17 @@ def _add_vision_args(parser): help='warmup teacher temperaure epochs') return parser @@ -20,10 +20,16 @@ index ae42b83..f427bc5 100644 + group = parser.add_argument_group(title="msamp") + group.add_argument('--msamp', action='store_true', default=False, + help='whether to enable msamp') ++ # Auto scaling factor tuning for FP8 collective communication ++ group.add_argument('--wgrad-auto-scaling', action='store_true', default=False, ++ help='whether to enable auto scaling factor tuning on weight gradients reduction') ++ group.add_argument('--wgrad-auto-scaling-ratio', type=float, default=1e-5, ++ help='the threshold of overflow ratio for auto scaling factor tuning on weight gradients reduction') ++ group.add_argument('--wgrad-auto-scaling-window', type=int, default=1000, ++ help='the window size for auto scaling factor tuning on weight gradients reduction') + return parser -\ No newline at end of file diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py -index e88b585..320f2b2 100644 +index e88b5851..320f2b24 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -106,6 +106,59 @@ def get_checkpoint_name(checkpoints_path, iteration, release=False, @@ -406,7 +412,7 @@ index e88b585..320f2b2 100644 only_context_model=False, custom_load_path=None): """ diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py -index a86444c..600f49d 100644 +index a86444cc..600f49d8 100644 --- a/megatron/core/tensor_parallel/layers.py +++ b/megatron/core/tensor_parallel/layers.py @@ -439,7 +439,9 @@ def linear_with_grad_accumulation_and_async_allreduce( @@ -532,7 +538,7 @@ index a86444c..600f49d 100644 """Forward of RowParallelLinear diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py -index 484e9b3..e85984d 100644 +index 484e9b32..e85984d7 100644 --- a/megatron/optimizer/__init__.py +++ b/megatron/optimizer/__init__.py @@ -5,10 +5,12 @@ from apex.optimizers import FusedSGD as SGD @@ -577,7 +583,7 @@ index 484e9b3..e85984d 100644 optimizer = SGD(param_groups, lr=args.lr, diff --git a/megatron/optimizer/optimizer.py b/megatron/optimizer/optimizer.py -index da9cd70..414fd88 100644 +index da9cd70f..414fd887 100644 --- a/megatron/optimizer/optimizer.py +++ b/megatron/optimizer/optimizer.py @@ -13,13 +13,15 @@ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors @@ -599,7 +605,7 @@ index da9cd70..414fd88 100644 def _zero_grad_group_helper(group, set_to_none): """Zero out the gradient for a group of parameters. 
diff --git a/megatron/training.py b/megatron/training.py -index b821ae7..99a7fad 100644 +index b821ae7b..99a7fadb 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -33,7 +33,7 @@ from megatron.initialize import initialize_megatron From e16023a6c6664ac54515ddc3eaf9a55b23db87f1 Mon Sep 17 00:00:00 2001 From: wkcn Date: Sun, 10 Dec 2023 21:29:02 +0800 Subject: [PATCH 2/3] auto scaling freq --- gpt3/Megatron-LM.patch | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/gpt3/Megatron-LM.patch b/gpt3/Megatron-LM.patch index 37e57243..46268ed0 100644 --- a/gpt3/Megatron-LM.patch +++ b/gpt3/Megatron-LM.patch @@ -1,5 +1,5 @@ diff --git a/megatron/arguments.py b/megatron/arguments.py -index ae42b83e..5c4b615f 100644 +index ae42b83e..167edbfc 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -38,6 +38,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False): @@ -10,7 +10,7 @@ index ae42b83e..5c4b615f 100644 # Custom arguments. if extra_args_provider is not None: -@@ -1306,3 +1307,17 @@ def _add_vision_args(parser): +@@ -1306,3 +1307,19 @@ def _add_vision_args(parser): help='warmup teacher temperaure epochs') return parser @@ -23,9 +23,11 @@ index ae42b83e..5c4b615f 100644 + # Auto scaling factor tuning for FP8 collective communication + group.add_argument('--wgrad-auto-scaling', action='store_true', default=False, + help='whether to enable auto scaling factor tuning on weight gradients reduction') ++ group.add_argument('--wgrad-auto-scaling-freq', type=int, default=10, ++ help='the frequency of checking whether overflow exists in the result of weight gradients reduction') + group.add_argument('--wgrad-auto-scaling-ratio', type=float, default=1e-5, + help='the threshold of overflow ratio for auto scaling factor tuning on weight gradients reduction') -+ group.add_argument('--wgrad-auto-scaling-window', type=int, default=1000, ++ group.add_argument('--wgrad-auto-scaling-window', type=int, default=100, + help='the window size for auto scaling factor tuning on weight gradients reduction') + return parser diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py From e30155c1b7ba562554f67e65f20db8e9c6ee8f4c Mon Sep 17 00:00:00 2001 From: wkcn Date: Tue, 12 Dec 2023 14:41:55 +0800 Subject: [PATCH 3/3] wgrad-auto-scaling-ratio 1e-3 --- gpt3/Megatron-LM.patch | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gpt3/Megatron-LM.patch b/gpt3/Megatron-LM.patch index 46268ed0..c33be44b 100644 --- a/gpt3/Megatron-LM.patch +++ b/gpt3/Megatron-LM.patch @@ -1,5 +1,5 @@ diff --git a/megatron/arguments.py b/megatron/arguments.py -index ae42b83e..167edbfc 100644 +index ae42b83e..df50f67f 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -38,6 +38,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False): @@ -25,7 +25,7 @@ index ae42b83e..167edbfc 100644 + help='whether to enable auto scaling factor tuning on weight gradients reduction') + group.add_argument('--wgrad-auto-scaling-freq', type=int, default=10, + help='the frequency of checking whether overflow exists in the result of weight gradients reduction') -+ group.add_argument('--wgrad-auto-scaling-ratio', type=float, default=1e-5, ++ group.add_argument('--wgrad-auto-scaling-ratio', type=float, default=1e-3, + help='the threshold of overflow ratio for auto scaling factor tuning on weight gradients reduction') + group.add_argument('--wgrad-auto-scaling-window', type=int, default=100, + help='the window size for auto scaling factor tuning on 
weight gradients reduction')
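
The defaults that survive the series are --wgrad-auto-scaling-freq 10, --wgrad-auto-scaling-ratio 1e-3 and --wgrad-auto-scaling-window 100. As a rough illustration of how those three knobs could interact, here is a minimal Python sketch; it is not part of the patches, and every name in it (WgradAutoScaler, FP8_E4M3_MAX, maybe_update) is a hypothetical stand-in rather than MS-AMP's actual implementation. Every `freq` steps it samples the overflow ratio of the reduced FP8 weight gradients, backs the scaling factor off when that ratio exceeds `ratio`, and grows it again after a clean window of `window` checks.

    # Illustrative sketch only -- not part of the patch series above.
    from collections import deque

    import torch

    FP8_E4M3_MAX = 448.0  # largest magnitude representable in FP8 E4M3

    class WgradAutoScaler:
        """Hypothetical auto-tuner for the pre-reduction scaling factor
        used in FP8 weight-gradient all-reduce."""

        def __init__(self, freq: int = 10, ratio: float = 1e-3, window: int = 100):
            self.freq = freq        # --wgrad-auto-scaling-freq
            self.ratio = ratio      # --wgrad-auto-scaling-ratio
            self.window = window    # --wgrad-auto-scaling-window
            self.scale = 1.0        # factor applied to wgrads before reduction
            self.history = deque(maxlen=window)
            self.step = 0

        def maybe_update(self, reduced_wgrad: torch.Tensor) -> None:
            self.step += 1
            if self.step % self.freq != 0:
                return
            # Fraction of elements that saturated during the FP8 reduction.
            overflow = (reduced_wgrad.abs() >= FP8_E4M3_MAX).float().mean().item()
            self.history.append(overflow)
            if overflow > self.ratio:
                # Too many saturated values: scale gradients down harder.
                self.scale *= 0.5
                self.history.clear()
            elif len(self.history) == self.window and max(self.history) == 0.0:
                # A full window with no overflow: reclaim dynamic range.
                self.scale *= 2.0
                self.history.clear()

Under this reading, a small ratio such as 1e-3 keeps saturation rare without reacting to single outlier elements, and the freq/window pair trades tuning latency against the cost of scanning the reduced gradient on every check.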