From d1d81fa860038aa691c6256cec78cf7bfb30251d Mon Sep 17 00:00:00 2001
From: Prasanth Nunna
Date: Fri, 27 Oct 2023 16:11:44 +0000
Subject: [PATCH 01/13] Remove module for trainer.model

---
 examples/deepspeed/moe_e/user/train.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/deepspeed/moe_e/user/train.py b/examples/deepspeed/moe_e/user/train.py
index fb86941403..6201fd6f08 100644
--- a/examples/deepspeed/moe_e/user/train.py
+++ b/examples/deepspeed/moe_e/user/train.py
@@ -178,7 +178,7 @@ def main(cfg: FairseqConfig) -> None:
         # ckpt_path = f"{cfg.checkpoint.save_dir}/deepspeed_moe"
         trainer.ds_module = load_deepspeed_state_(
             cfg=cfg,
-            model=trainer.model.module.module,
+            model=trainer.model,
             weights_path=None)
     if cfg.model.ep_world_size > cfg.model.num_experts:
         raise ValueError(
@@ -438,7 +438,7 @@ def validate_and_save(
         from user.ds_utils import save_deepspeed_state_
         save_deepspeed_state_(
             cfg,
-            model=trainer.model.module.module,
+            model=trainer.model,
             trainer=trainer,
             ds_module=trainer.ds_module,
             ckpt_tag=None,

From 4b3414657ab2d715c718f109bb018eee63602bdd Mon Sep 17 00:00:00 2001
From: Prasanth Nunna
Date: Mon, 26 Jun 2023 03:39:02 +0000
Subject: [PATCH 02/13] Changes for torchrun

---
 examples/deepspeed/moe_e/run-distributed.sh | 2 +-
 fairseq/dataclass/configs.py                | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/examples/deepspeed/moe_e/run-distributed.sh b/examples/deepspeed/moe_e/run-distributed.sh
index 57ca45908e..cd40ddd89c 100644
--- a/examples/deepspeed/moe_e/run-distributed.sh
+++ b/examples/deepspeed/moe_e/run-distributed.sh
@@ -67,7 +67,7 @@ train() {
     SaveDir="${OUT_DIR?}/checkpoints/${ARCH}-${RUN_NAME}"
     mkdir -p $SaveDir
 
-    python -m torch.distributed.launch \
+    torchrun \
        --nproc_per_node=${NUM_GPUS} \
        --node_rank=${NODE_RANK:-0} \
        --nnodes=${NODE_COUNT:-1} \
diff --git a/fairseq/dataclass/configs.py b/fairseq/dataclass/configs.py
index 80caa0f2da..f6c52ef676 100644
--- a/fairseq/dataclass/configs.py
+++ b/fairseq/dataclass/configs.py
@@ -3,6 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
+import os
 import sys
 from dataclasses import _MISSING_TYPE, dataclass, field
 from typing import Any, List, Optional
@@ -271,9 +272,9 @@ class DistributedTrainingConfig(FairseqDataclass):
         },
     )
     device_id: int = field(
-        default=0,
+        default=os.getenv("LOCAL_RANK", 0),
         metadata={
-            "help": "which GPU to use (usually configured automatically)",
+            "help": "which GPU to use (by default looks for $LOCAL_RANK, usually configured automatically)",
             "argparse_alias": "--local_rank",
         },
     )

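Note on PATCH 02: torchrun publishes each worker's rank through the
LOCAL_RANK environment variable instead of injecting a --local_rank
argument, which is what the new device_id default reads. One caveat:
os.getenv returns a string whenever the variable is set, while
device_id is annotated as int. A minimal defensive sketch (the helper
name is illustrative, not part of the patch):

```python
import os

def local_rank_or_zero() -> int:
    # torchrun exports LOCAL_RANK as a string ("0", "1", ...); casting
    # keeps the default a real int whether or not the variable is set.
    return int(os.getenv("LOCAL_RANK", 0))
```
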
From ff091616220b5242f7b4432c7ad983ae27b4c964 Mon Sep 17 00:00:00 2001
From: Jagadish Krishnamoorthy
Date: Fri, 13 Sep 2024 15:58:46 -0700
Subject: [PATCH 03/13] transformer_moe_layer: Fix Runtime error

Fix the issue "[rank0]: RuntimeError: Expected all tensors to be on the
same device, but found at least two devices, cuda:0 and cpu!"

On DeepSpeed library 0.15.0, commit
7260890452eb89185f9ab1e09550938f78ea91db changed the returned output
tensor exp_counts from 'cpu' to the accelerator device when calling
deepspeed.moe.layer.MoE(). This change reduces CPU host overhead when
using MoE.

The device type of the self.expert_counts tensor in the fairseq
transformer_moe_layer module therefore needs to change from cpu to
cuda for DeepSpeed library versions >= 0.15.0.

Signed-off-by: Jagadish Krishnamoorthy
---
 .../moe_e/user/modules/transformer_moe_layer.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/examples/deepspeed/moe_e/user/modules/transformer_moe_layer.py b/examples/deepspeed/moe_e/user/modules/transformer_moe_layer.py
index 575bf99dfe..6f290106ce 100644
--- a/examples/deepspeed/moe_e/user/modules/transformer_moe_layer.py
+++ b/examples/deepspeed/moe_e/user/modules/transformer_moe_layer.py
@@ -49,6 +49,14 @@ DEFAULT_MIN_PARAMS_TO_WRAP = int(1e8)
 
+def get_device():
+    import deepspeed
+
+    def _get_version(_version):
+        return tuple(map(int, _version.split('.')))
+    if _get_version(deepspeed.__version__) >= _get_version("0.15.0"):
+        return "cuda"
+    return "cpu"
 
 class TransformerEncoderLayer_MOE(nn.Module):
     """Encoder layer block.
@@ -93,7 +101,8 @@ def __init__(self, args, index=-1):
         self.experts = None
         if (index + 1) % 2 == 0:
             from deepspeed.moe.layer import MoE
-            self.expert_counts = torch.zeros(1, args.num_experts, dtype=torch.int64).to('cpu')
+            dev = get_device()
+            self.expert_counts = torch.zeros(1, args.num_experts, dtype=torch.int64).to(dev)
             self.fc1 = self.build_fc1(
                 self.embed_dim,
                 args.encoder_ffn_embed_dim,
@@ -340,7 +349,8 @@ def __init__(
         self.experts = None
         if (index + 1) % 2 == 0:
             from deepspeed.moe.layer import MoE
-            self.expert_counts = torch.zeros(1, args.num_experts, dtype=torch.int64).to('cpu')
+            dev = get_device()
+            self.expert_counts = torch.zeros(1, args.num_experts, dtype=torch.int64).to(dev)
             self.fc1 = self.build_fc1(
                 self.embed_dim,
                 args.encoder_ffn_embed_dim,

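Note on PATCH 03: the _get_version helper above compares releases as
tuples of ints, which raises ValueError on suffixed versions such as
"0.15.1+rocm" or "0.16.0rc1". A more tolerant sketch of the same gate,
assuming the packaging package is importable in the environment:

```python
import deepspeed
from packaging import version

def get_device():
    # DeepSpeed >= 0.15.0 returns exp_counts on the accelerator device,
    # so expert_counts must be allocated there as well.
    if version.parse(deepspeed.__version__) >= version.parse("0.15.0"):
        return "cuda"
    return "cpu"
```
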
From e8ee21ac64c2ac347b126fb374d1f9bdec3beedc Mon Sep 17 00:00:00 2001
From: hakankiymaz-amd
Date: Fri, 21 Mar 2025 10:33:44 -0500
Subject: [PATCH 04/13] change version for fambench

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 4bf1e60dd1..03d6d283b3 100644
--- a/setup.py
+++ b/setup.py
@@ -214,8 +214,8 @@ def do_setup(package_data):
             "cffi",
             "cython",
             'dataclasses; python_version<"3.7"',
-            "hydra-core>=1.0.7,<1.1",
-            "omegaconf<2.1",
+            "hydra-core==1.1.2",
+            "omegaconf==2.1.2",
             'numpy<1.20.0; python_version<"3.7"',
             'numpy; python_version>="3.7"',
             "regex",

From cdfd2201c87d793644311b01e923e5b092a7fdd3 Mon Sep 17 00:00:00 2001
From: hakankiymaz-amd
Date: Mon, 24 Mar 2025 04:10:00 -0500
Subject: [PATCH 05/13] upd data class

---
 fairseq/dataclass/configs.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/fairseq/dataclass/configs.py b/fairseq/dataclass/configs.py
index f6c52ef676..4ea73ba792 100644
--- a/fairseq/dataclass/configs.py
+++ b/fairseq/dataclass/configs.py
@@ -1024,16 +1024,16 @@ class EMAConfig(FairseqDataclass):
 
 @dataclass
 class FairseqConfig(FairseqDataclass):
-    common: CommonConfig = CommonConfig()
-    common_eval: CommonEvalConfig = CommonEvalConfig()
-    distributed_training: DistributedTrainingConfig = DistributedTrainingConfig()
-    dataset: DatasetConfig = DatasetConfig()
-    optimization: OptimizationConfig = OptimizationConfig()
-    checkpoint: CheckpointConfig = CheckpointConfig()
-    bmuf: FairseqBMUFConfig = FairseqBMUFConfig()
-    generation: GenerationConfig = GenerationConfig()
-    eval_lm: EvalLMConfig = EvalLMConfig()
-    interactive: InteractiveConfig = InteractiveConfig()
+    common: CommonConfig = field(default=CommonConfig)
+    common_eval: CommonEvalConfig = field(default=CommonEvalConfig)
+    distributed_training: DistributedTrainingConfig = field(default=DistributedTrainingConfig)
+    dataset: DatasetConfig = field(default=DatasetConfig)
+    optimization: OptimizationConfig = field(default=OptimizationConfig)
+    checkpoint: CheckpointConfig = field(default=CheckpointConfig)
+    bmuf: FairseqBMUFConfig = field(default=FairseqBMUFConfig)
+    generation: GenerationConfig = field(default=GenerationConfig)
+    eval_lm: EvalLMConfig = field(default=EvalLMConfig)
+    interactive: InteractiveConfig = field(default=InteractiveConfig)
     model: Any = MISSING
     task: Any = None
     criterion: Any = None

From d47d475e80fc652e7a56bd8fbcdff538a10427f3 Mon Sep 17 00:00:00 2001
From: hakankiymaz-amd
Date: Mon, 24 Mar 2025 04:15:17 -0500
Subject: [PATCH 06/13] upd data class

---
 fairseq/dataclass/configs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fairseq/dataclass/configs.py b/fairseq/dataclass/configs.py
index 4ea73ba792..72aa52041d 100644
--- a/fairseq/dataclass/configs.py
+++ b/fairseq/dataclass/configs.py
@@ -1042,4 +1042,4 @@ class FairseqConfig(FairseqDataclass):
     scoring: Any = None
     bpe: Any = None
     tokenizer: Any = None
-    ema: EMAConfig = EMAConfig()
+    ema: EMAConfig = EMAConfig(default=EMAConfig)

From eb0d5ec870e0738ba43b203bde63ecc4d6a01ed8 Mon Sep 17 00:00:00 2001
From: hakankiymaz-amd
Date: Mon, 24 Mar 2025 04:16:57 -0500
Subject: [PATCH 07/13] remove np float

---
 fairseq/data/indexed_dataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fairseq/data/indexed_dataset.py b/fairseq/data/indexed_dataset.py
index 23afb43356..8d5798ba51 100644
--- a/fairseq/data/indexed_dataset.py
+++ b/fairseq/data/indexed_dataset.py
@@ -118,7 +118,7 @@ def write_longs(f, a):
     3: np.int16,
     4: np.int32,
     5: np.int64,
-    6: np.float,
+    6: float,
     7: np.double,
     8: np.uint16,
     9: np.uint32,
@@ -325,7 +325,7 @@ class IndexedDatasetBuilder:
         np.int16: 2,
         np.int32: 4,
         np.int64: 8,
-        np.float: 4,
+        float: 4,
         np.double: 8,
     }

From 1c6774953704c7c3125151f1242b3858ab17947d Mon Sep 17 00:00:00 2001
From: hakankiymaz-amd
Date: Mon, 24 Mar 2025 04:21:23 -0500
Subject: [PATCH 08/13] upd data class

---
 fairseq/dataclass/configs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fairseq/dataclass/configs.py b/fairseq/dataclass/configs.py
index 72aa52041d..bdc573a065 100644
--- a/fairseq/dataclass/configs.py
+++ b/fairseq/dataclass/configs.py
@@ -1042,4 +1042,4 @@ class FairseqConfig(FairseqDataclass):
     scoring: Any = None
     bpe: Any = None
     tokenizer: Any = None
-    ema: EMAConfig = EMAConfig(default=EMAConfig)
+    ema: EMAConfig = field(default=EMAConfig)

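Note on PATCHES 05-08: this churn (including the intermediate
EMAConfig(default=EMAConfig) slip that PATCH 08 corrects) exists
because Python 3.11+ rejects dataclass fields whose default is a
mutable instance such as CommonConfig() ("mutable default ... is not
allowed: use default_factory"). The series settles on storing the
config class itself as the default, presumably relying on omegaconf's
structured-config handling of dataclass types. For reference, the
stock-dataclasses idiom for the same constraint is a factory; a
minimal sketch with an illustrative field:

```python
from dataclasses import dataclass, field

@dataclass
class CommonConfig:
    log_interval: int = 100  # illustrative field, not the real config

@dataclass
class FairseqConfigSketch:
    # default_factory builds a fresh CommonConfig per instance, which
    # satisfies Python 3.11's mutable-default check.
    common: CommonConfig = field(default_factory=CommonConfig)
```
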
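Note on PATCH 07: np.float was a deprecated alias for the builtin
float (64-bit), not a 32-bit type, and NumPy 1.24 removed it, so the
builtin is a drop-in replacement in these dtype tables. A quick check,
assuming only numpy:

```python
import numpy as np

# The builtin float maps to the same dtype the old np.float alias did.
assert np.dtype(float) == np.float64
```
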
From e5de02ad7697782ab38a8610592b23267740d399 Mon Sep 17 00:00:00 2001
From: hakankiymaz-amd
Date: Mon, 24 Mar 2025 04:48:20 -0500
Subject: [PATCH 09/13] update requirement and config

---
 fairseq/models/transformer/transformer_config.py | 6 +++---
 setup.py                                         | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/fairseq/models/transformer/transformer_config.py b/fairseq/models/transformer/transformer_config.py
index 4ebd292b0d..e46ea78de8 100644
--- a/fairseq/models/transformer/transformer_config.py
+++ b/fairseq/models/transformer/transformer_config.py
@@ -104,13 +104,13 @@ class TransformerConfig(FairseqDataclass):
         },
     )
     adaptive_input: bool = False
-    encoder: EncDecBaseConfig = EncDecBaseConfig()
+    encoder: EncDecBaseConfig = field(default=EncDecBaseConfig)
     # TODO should really be in the encoder config
     max_source_positions: int = field(
         default=DEFAULT_MAX_SOURCE_POSITIONS,
         metadata={"help": "Maximum input length supported by the encoder"},
     )
-    decoder: DecoderConfig = DecoderConfig()
+    decoder: DecoderConfig = field(default=DecoderConfig)
     # TODO should really be in the decoder config
     max_target_positions: int = field(
         default=DEFAULT_MAX_TARGET_POSITIONS,
@@ -182,7 +182,7 @@ class TransformerConfig(FairseqDataclass):
         default=False, metadata={"help": "perform cross+self-attention"}
     )
     # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
-    quant_noise: QuantNoiseConfig = field(default=QuantNoiseConfig())
+    quant_noise: QuantNoiseConfig = field(default=QuantNoiseConfig)
     min_params_to_wrap: int = field(
         default=DEFAULT_MIN_PARAMS_TO_WRAP,
         metadata={
diff --git a/setup.py b/setup.py
index 03d6d283b3..9d40807294 100644
--- a/setup.py
+++ b/setup.py
@@ -214,8 +214,8 @@ def do_setup(package_data):
             "cffi",
             "cython",
             'dataclasses; python_version<"3.7"',
-            "hydra-core==1.1.2",
-            "omegaconf==2.1.2",
+            "hydra-core>=1.3.2",
+            "omegaconf",
             'numpy<1.20.0; python_version<"3.7"',
             'numpy; python_version>="3.7"',
             "regex",

From 033b00f5efbd07dd89c59f9e40d24c4b3beb55ac Mon Sep 17 00:00:00 2001
From: hakankiymaz-amd
Date: Mon, 24 Mar 2025 07:07:09 -0500
Subject: [PATCH 10/13] update requirement

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 9d40807294..081c7babe2 100644
--- a/setup.py
+++ b/setup.py
@@ -214,7 +214,7 @@ def do_setup(package_data):
             "cffi",
             "cython",
             'dataclasses; python_version<"3.7"',
-            "hydra-core>=1.3.2",
+            "hydra-core==1.1.2",
             "omegaconf",
             'numpy<1.20.0; python_version<"3.7"',
             'numpy; python_version>="3.7"',

From a0262f8ddd6ff9b79326852196181fa3602c8aee Mon Sep 17 00:00:00 2001
From: hakankiymaz-amd
Date: Mon, 24 Mar 2025 07:14:55 -0500
Subject: [PATCH 11/13] update requirement

---
 setup.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 081c7babe2..5d935a95ab 100644
--- a/setup.py
+++ b/setup.py
@@ -214,7 +214,7 @@ def do_setup(package_data):
             "cffi",
             "cython",
             'dataclasses; python_version<"3.7"',
-            "hydra-core==1.1.2",
+            "hydra-core>=1.3.2",
             "omegaconf",
             'numpy<1.20.0; python_version<"3.7"',
             'numpy; python_version>="3.7"',
@@ -223,7 +223,6 @@ def do_setup(package_data):
             "torch",
             "tqdm",
             "bitarray",
-            "torchaudio>=0.8.0",
         ],
         dependency_links=dependency_links,
         packages=find_packages(

From 80268357883e3b07794368a4cde50343be40eda1 Mon Sep 17 00:00:00 2001
From: hakankiymaz-amd
Date: Tue, 25 Mar 2025 06:37:07 -0500
Subject: [PATCH 12/13] omegaconf utils update

---
 fairseq/dataclass/utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fairseq/dataclass/utils.py b/fairseq/dataclass/utils.py
index 1320ec4737..6d53077c36 100644
--- a/fairseq/dataclass/utils.py
+++ b/fairseq/dataclass/utils.py
@@ -364,13 +364,13 @@ def override_module_args(args: Namespace) -> Tuple[List[str], List[str]]:
 
 class omegaconf_no_object_check:
     def __init__(self):
-        self.old_is_primitive = _utils.is_primitive_type
+        self.old_is_primitive = _utils.is_primitive_type_annotation
 
     def __enter__(self):
-        _utils.is_primitive_type = lambda _: True
+        _utils.is_primitive_type_annotation = lambda _: True
 
     def __exit__(self, type, value, traceback):
-        _utils.is_primitive_type = self.old_is_primitive
+        _utils.is_primitive_type_annotation = self.old_is_primitive

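Note on PATCH 12: newer omegaconf releases renamed the private helper
_utils.is_primitive_type to _utils.is_primitive_type_annotation, so
the monkeypatch has to track the new name. If the code must run against
both old and new omegaconf versions, a name-agnostic sketch is possible
(assuming only that one of the two names exists; _utils is private API,
so this remains best-effort):

```python
from omegaconf import _utils

# Resolve whichever private helper this omegaconf release exposes.
_CHECK = (
    "is_primitive_type_annotation"
    if hasattr(_utils, "is_primitive_type_annotation")
    else "is_primitive_type"
)

class omegaconf_no_object_check:
    def __init__(self):
        self.old_is_primitive = getattr(_utils, _CHECK)

    def __enter__(self):
        # Temporarily disable the primitive-type check so arbitrary
        # objects survive conversion into a DictConfig.
        setattr(_utils, _CHECK, lambda _: True)

    def __exit__(self, type, value, traceback):
        setattr(_utils, _CHECK, self.old_is_primitive)
```
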
From 5f7a7d3fe137d53a8a932df2223349771819340a Mon Sep 17 00:00:00 2001
From: hakankiymaz-amd
Date: Tue, 1 Apr 2025 13:59:30 -0500
Subject: [PATCH 13/13] add torchaudio

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 5d935a95ab..9d40807294 100644
--- a/setup.py
+++ b/setup.py
@@ -223,6 +223,7 @@ def do_setup(package_data):
             "torch",
             "tqdm",
             "bitarray",
+            "torchaudio>=0.8.0",
         ],
         dependency_links=dependency_links,
         packages=find_packages(