gpt3/Megatron-LM.patch: 24 changes (16 additions, 8 deletions)
@@ -1,5 +1,5 @@
diff --git a/megatron/arguments.py b/megatron/arguments.py
index ae42b83..f427bc5 100644
index ae42b83e..df50f67f 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -38,6 +38,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
@@ -10,7 +10,7 @@ index ae42b83..f427bc5 100644

# Custom arguments.
if extra_args_provider is not None:
@@ -1306,3 +1307,10 @@ def _add_vision_args(parser):
@@ -1306,3 +1307,19 @@ def _add_vision_args(parser):
help='warmup teacher temperaure epochs')

return parser
@@ -20,10 +20,18 @@ index ae42b83..f427bc5 100644
+ group = parser.add_argument_group(title="msamp")
+ group.add_argument('--msamp', action='store_true', default=False,
+ help='whether to enable msamp')
+ # Auto scaling factor tuning for FP8 collective communication.
+ group.add_argument('--wgrad-auto-scaling', action='store_true', default=False,
+ help='whether to enable auto scaling factor tuning on weight-gradient reduction')
+ group.add_argument('--wgrad-auto-scaling-freq', type=int, default=10,
+ help='how often (in iterations) to check the reduced weight gradients for overflow')
+ group.add_argument('--wgrad-auto-scaling-ratio', type=float, default=1e-3,
+ help='overflow-ratio threshold for auto scaling factor tuning on weight-gradient reduction')
+ group.add_argument('--wgrad-auto-scaling-window', type=int, default=100,
+ help='window size for auto scaling factor tuning on weight-gradient reduction')
+ return parser
\ No newline at end of file
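
Taken together, the four --wgrad-auto-scaling* flags describe a dynamic-scaling loop for the FP8 all-reduce of weight gradients: every `freq` iterations the reduced gradient is checked for overflow, and the overflow ratio is compared against `ratio` over a sliding `window`. The sketch below reconstructs that loop from the flag names alone; the class, the update rule, and the e4m3 saturation check are assumptions, not MS-AMP's actual implementation.

import torch

# Minimal sketch of auto scaling-factor tuning for the FP8 weight-gradient
# all-reduce, reconstructed from the flags above. The update rule mirrors
# conventional dynamic loss scaling and is an assumption, not MS-AMP code.
class WgradAutoScaler:
    def __init__(self, freq=10, ratio=1e-3, window=100, init_scale=1.0):
        self.freq = freq        # --wgrad-auto-scaling-freq
        self.ratio = ratio      # --wgrad-auto-scaling-ratio
        self.window = window    # --wgrad-auto-scaling-window
        self.scale = init_scale
        self.step = 0
        self.clean_checks = 0   # consecutive overflow-free checks

    def overflow_ratio(self, grad: torch.Tensor) -> float:
        # Fraction of elements saturated at the FP8 e4m3 maximum (448).
        return (grad.abs() >= 448.0).float().mean().item()

    def update(self, reduced_grad: torch.Tensor) -> None:
        self.step += 1
        if self.step % self.freq != 0:
            return              # only check every `freq` iterations
        if self.overflow_ratio(reduced_grad) > self.ratio:
            self.scale *= 0.5   # too many saturated values: back off
            self.clean_checks = 0
        else:
            self.clean_checks += 1
            if self.clean_checks >= self.window:
                self.scale *= 2.0  # a full window without overflow: grow
                self.clean_checks = 0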
diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py
index e88b585..320f2b2 100644
index e88b5851..320f2b24 100644
--- a/megatron/checkpointing.py
+++ b/megatron/checkpointing.py
@@ -106,6 +106,59 @@ def get_checkpoint_name(checkpoints_path, iteration, release=False,
@@ -406,7 +414,7 @@ index e88b585..320f2b2 100644
only_context_model=False, custom_load_path=None):
"""
diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py
index a86444c..600f49d 100644
index a86444cc..600f49d8 100644
--- a/megatron/core/tensor_parallel/layers.py
+++ b/megatron/core/tensor_parallel/layers.py
@@ -439,7 +439,9 @@ def linear_with_grad_accumulation_and_async_allreduce(
@@ -532,7 +540,7 @@ index a86444c..600f49d 100644
"""Forward of RowParallelLinear

diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py
index 484e9b3..e85984d 100644
index 484e9b32..e85984d7 100644
--- a/megatron/optimizer/__init__.py
+++ b/megatron/optimizer/__init__.py
@@ -5,10 +5,12 @@ from apex.optimizers import FusedSGD as SGD
@@ -577,7 +585,7 @@ index 484e9b3..e85984d 100644
optimizer = SGD(param_groups,
lr=args.lr,
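
The optimizer/__init__.py hunk swaps an MS-AMP optimizer into Megatron's selection logic when --msamp is set. Paraphrased below, assuming MS-AMP's `LBAdamW` (its low-bit AdamW) as the drop-in; the surrounding argument plumbing follows Megatron's existing Adam branch:

# Paraphrase of the optimizer selection; the exact branching in the
# patch is collapsed above, so treat this as a sketch, not the diff.
def build_optimizer(param_groups, args):
    if args.msamp:
        from msamp.optim import LBAdamW  # MS-AMP's low-bit AdamW
        return LBAdamW(param_groups, lr=args.lr,
                       weight_decay=args.weight_decay,
                       betas=(args.adam_beta1, args.adam_beta2),
                       eps=args.adam_eps)
    from apex.optimizers import FusedAdam as Adam
    return Adam(param_groups, lr=args.lr,
                weight_decay=args.weight_decay,
                betas=(args.adam_beta1, args.adam_beta2),
                eps=args.adam_eps)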
diff --git a/megatron/optimizer/optimizer.py b/megatron/optimizer/optimizer.py
index da9cd70..414fd88 100644
index da9cd70f..414fd887 100644
--- a/megatron/optimizer/optimizer.py
+++ b/megatron/optimizer/optimizer.py
@@ -13,13 +13,15 @@ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
@@ -599,7 +607,7 @@ index da9cd70..414fd88 100644
def _zero_grad_group_helper(group, set_to_none):
"""Zero out the gradient for a group of parameters.
diff --git a/megatron/training.py b/megatron/training.py
index b821ae7..99a7fad 100644
index b821ae7b..99a7fadb 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -33,7 +33,7 @@ from megatron.initialize import initialize_megatron
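
Downstream of `parse_args`, the new flags surface on Megatron's global args object, so training code can gate the MS-AMP paths on them. A hypothetical example, using Megatron's real `get_args` and the WgradAutoScaler sketched earlier:

# Hypothetical wiring in training code; get_args is Megatron's real API,
# WgradAutoScaler is the sketch from the arguments.py section above.
from megatron import get_args

def maybe_build_wgrad_scaler():
    args = get_args()
    if not (args.msamp and args.wgrad_auto_scaling):
        return None
    return WgradAutoScaler(freq=args.wgrad_auto_scaling_freq,
                           ratio=args.wgrad_auto_scaling_ratio,
                           window=args.wgrad_auto_scaling_window)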