diff --git a/examples/deepspeed/moe_e/run-distributed.sh b/examples/deepspeed/moe_e/run-distributed.sh
index 57ca45908e..cd40ddd89c 100644
--- a/examples/deepspeed/moe_e/run-distributed.sh
+++ b/examples/deepspeed/moe_e/run-distributed.sh
@@ -67,7 +67,7 @@ train() {
     SaveDir="${OUT_DIR?}/checkpoints/${ARCH}-${RUN_NAME}"
     mkdir -p $SaveDir
 
-    python -m torch.distributed.launch \
+    torchrun \
         --nproc_per_node=${NUM_GPUS} \
         --node_rank=${NODE_RANK:-0} \
         --nnodes=${NODE_COUNT:-1} \
diff --git a/examples/deepspeed/moe_e/user/modules/transformer_moe_layer.py b/examples/deepspeed/moe_e/user/modules/transformer_moe_layer.py
index 575bf99dfe..6f290106ce 100644
--- a/examples/deepspeed/moe_e/user/modules/transformer_moe_layer.py
+++ b/examples/deepspeed/moe_e/user/modules/transformer_moe_layer.py
@@ -49,6 +49,14 @@
 
 DEFAULT_MIN_PARAMS_TO_WRAP = int(1e8)
 
+def get_device():
+    import deepspeed
+
+    def _get_version(_version):
+        return tuple(map(int, _version.split('.')))
+    if _get_version(deepspeed.__version__) >= _get_version("0.15.0"):
+        return "cuda"
+    return "cpu"
 
 class TransformerEncoderLayer_MOE(nn.Module):
     """Encoder layer block.
@@ -93,7 +101,8 @@ def __init__(self, args, index=-1):
         self.experts = None
         if (index + 1) % 2 == 0:
             from deepspeed.moe.layer import MoE
-            self.expert_counts = torch.zeros(1, args.num_experts, dtype=torch.int64).to('cpu')
+            dev = get_device()
+            self.expert_counts = torch.zeros(1, args.num_experts, dtype=torch.int64).to(dev)
         self.fc1 = self.build_fc1(
             self.embed_dim,
             args.encoder_ffn_embed_dim,
@@ -340,7 +349,8 @@ def __init__(
         self.experts = None
         if (index + 1) % 2 == 0:
             from deepspeed.moe.layer import MoE
-            self.expert_counts = torch.zeros(1, args.num_experts, dtype=torch.int64).to('cpu')
+            dev = get_device()
+            self.expert_counts = torch.zeros(1, args.num_experts, dtype=torch.int64).to(dev)
         self.fc1 = self.build_fc1(
             self.embed_dim,
             args.encoder_ffn_embed_dim,
diff --git a/examples/deepspeed/moe_e/user/train.py b/examples/deepspeed/moe_e/user/train.py
index fb86941403..6201fd6f08 100644
--- a/examples/deepspeed/moe_e/user/train.py
+++ b/examples/deepspeed/moe_e/user/train.py
@@ -178,7 +178,7 @@ def main(cfg: FairseqConfig) -> None:
         # ckpt_path = f"{cfg.checkpoint.save_dir}/deepspeed_moe"
         trainer.ds_module = load_deepspeed_state_(
             cfg=cfg,
-            model=trainer.model.module.module,
+            model=trainer.model,
             weights_path=None)
         if cfg.model.ep_world_size > cfg.model.num_experts:
             raise ValueError(
@@ -438,7 +438,7 @@ def validate_and_save(
         from user.ds_utils import save_deepspeed_state_
         save_deepspeed_state_(
             cfg,
-            model=trainer.model.module.module,
+            model=trainer.model,
             trainer=trainer,
             ds_module=trainer.ds_module,
             ckpt_tag=None,
diff --git a/fairseq/data/indexed_dataset.py b/fairseq/data/indexed_dataset.py
index 23afb43356..8d5798ba51 100644
--- a/fairseq/data/indexed_dataset.py
+++ b/fairseq/data/indexed_dataset.py
@@ -118,7 +118,7 @@ def write_longs(f, a):
     3: np.int16,
     4: np.int32,
     5: np.int64,
-    6: np.float,
+    6: float,
     7: np.double,
     8: np.uint16,
     9: np.uint32,
@@ -325,7 +325,7 @@ class IndexedDatasetBuilder:
         np.int16: 2,
         np.int32: 4,
         np.int64: 8,
-        np.float: 4,
+        float: 4,
         np.double: 8,
     }
 
diff --git a/fairseq/dataclass/configs.py b/fairseq/dataclass/configs.py
index 80caa0f2da..bdc573a065 100644
--- a/fairseq/dataclass/configs.py
+++ b/fairseq/dataclass/configs.py
@@ -3,6 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
+import os
 import sys
 from dataclasses import _MISSING_TYPE, dataclass, field
 from typing import Any, List, Optional
@@ -271,9 +272,9 @@ class DistributedTrainingConfig(FairseqDataclass):
         },
     )
     device_id: int = field(
-        default=0,
+        default=int(os.getenv("LOCAL_RANK", 0)),
         metadata={
-            "help": "which GPU to use (usually configured automatically)",
+            "help": "which GPU to use (by default looks for $LOCAL_RANK, usually configured automatically)",
             "argparse_alias": "--local_rank",
         },
     )
@@ -1023,16 +1024,16 @@ class EMAConfig(FairseqDataclass):
 
 @dataclass
 class FairseqConfig(FairseqDataclass):
-    common: CommonConfig = CommonConfig()
-    common_eval: CommonEvalConfig = CommonEvalConfig()
-    distributed_training: DistributedTrainingConfig = DistributedTrainingConfig()
-    dataset: DatasetConfig = DatasetConfig()
-    optimization: OptimizationConfig = OptimizationConfig()
-    checkpoint: CheckpointConfig = CheckpointConfig()
-    bmuf: FairseqBMUFConfig = FairseqBMUFConfig()
-    generation: GenerationConfig = GenerationConfig()
-    eval_lm: EvalLMConfig = EvalLMConfig()
-    interactive: InteractiveConfig = InteractiveConfig()
+    common: CommonConfig = field(default=CommonConfig)
+    common_eval: CommonEvalConfig = field(default=CommonEvalConfig)
+    distributed_training: DistributedTrainingConfig = field(default=DistributedTrainingConfig)
+    dataset: DatasetConfig = field(default=DatasetConfig)
+    optimization: OptimizationConfig = field(default=OptimizationConfig)
+    checkpoint: CheckpointConfig = field(default=CheckpointConfig)
+    bmuf: FairseqBMUFConfig = field(default=FairseqBMUFConfig)
+    generation: GenerationConfig = field(default=GenerationConfig)
+    eval_lm: EvalLMConfig = field(default=EvalLMConfig)
+    interactive: InteractiveConfig = field(default=InteractiveConfig)
     model: Any = MISSING
     task: Any = None
     criterion: Any = None
@@ -1041,4 +1042,4 @@ class FairseqConfig(FairseqDataclass):
     scoring: Any = None
     bpe: Any = None
     tokenizer: Any = None
-    ema: EMAConfig = EMAConfig()
+    ema: EMAConfig = field(default=EMAConfig)
diff --git a/fairseq/dataclass/utils.py b/fairseq/dataclass/utils.py
index 1320ec4737..6d53077c36 100644
--- a/fairseq/dataclass/utils.py
+++ b/fairseq/dataclass/utils.py
@@ -364,13 +364,13 @@ def override_module_args(args: Namespace) -> Tuple[List[str], List[str]]:
 
 class omegaconf_no_object_check:
     def __init__(self):
-        self.old_is_primitive = _utils.is_primitive_type
+        self.old_is_primitive = _utils.is_primitive_type_annotation
 
     def __enter__(self):
-        _utils.is_primitive_type = lambda _: True
+        _utils.is_primitive_type_annotation = lambda _: True
 
     def __exit__(self, type, value, traceback):
-        _utils.is_primitive_type = self.old_is_primitive
+        _utils.is_primitive_type_annotation = self.old_is_primitive
 
 
 def convert_namespace_to_omegaconf(args: Namespace) -> DictConfig:
diff --git a/fairseq/models/transformer/transformer_config.py b/fairseq/models/transformer/transformer_config.py
index 4ebd292b0d..e46ea78de8 100644
--- a/fairseq/models/transformer/transformer_config.py
+++ b/fairseq/models/transformer/transformer_config.py
@@ -104,13 +104,13 @@ class TransformerConfig(FairseqDataclass):
         },
     )
     adaptive_input: bool = False
-    encoder: EncDecBaseConfig = EncDecBaseConfig()
+    encoder: EncDecBaseConfig = field(default=EncDecBaseConfig)
     # TODO should really be in the encoder config
     max_source_positions: int = field(
         default=DEFAULT_MAX_SOURCE_POSITIONS,
         metadata={"help": "Maximum input length supported by the encoder"},
     )
-    decoder: DecoderConfig = DecoderConfig()
+    decoder: DecoderConfig = field(default=DecoderConfig)
     # TODO should really be in the decoder config
     max_target_positions: int = field(
         default=DEFAULT_MAX_TARGET_POSITIONS,
@@ -182,7 +182,7 @@ class TransformerConfig(FairseqDataclass):
         default=False, metadata={"help": "perform cross+self-attention"}
     )
     # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
-    quant_noise: QuantNoiseConfig = field(default=QuantNoiseConfig())
+    quant_noise: QuantNoiseConfig = field(default=QuantNoiseConfig)
     min_params_to_wrap: int = field(
         default=DEFAULT_MIN_PARAMS_TO_WRAP,
         metadata={
diff --git a/setup.py b/setup.py
index 4bf1e60dd1..9d40807294 100644
--- a/setup.py
+++ b/setup.py
@@ -214,8 +214,8 @@ def do_setup(package_data):
             "cffi",
             "cython",
             'dataclasses; python_version<"3.7"',
-            "hydra-core>=1.0.7,<1.1",
-            "omegaconf<2.1",
+            "hydra-core>=1.3.2",
+            "omegaconf",
             'numpy<1.20.0; python_version<"3.7"',
             'numpy; python_version>="3.7"',
             "regex",
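
Notes on the changes above, with small self-contained Python sketches. Any helper or class names that do not appear in the patch are illustrative assumptions.

torch.distributed.launch passed the worker index to each process as a --local_rank argument, whereas torchrun exports it as the LOCAL_RANK environment variable; that is why device_id now falls back to the environment. A minimal per-process read, assuming the process was started by torchrun:

import os

# torchrun sets LOCAL_RANK (along with RANK and WORLD_SIZE) for every worker
# it spawns; environment variables are strings, hence the int() cast.
local_rank = int(os.getenv("LOCAL_RANK", 0))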
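
get_device() compares version tuples rather than raw strings because lexicographic string comparison mis-orders multi-digit components ("0.9.2" sorts after "0.15.0"). A standalone sketch of the same comparison; the stripping of local-version suffixes such as "0.15.1+cu121" is an extra assumption, not part of the patch:

def version_tuple(version: str):
    # Drop a possible local suffix ("0.15.1+cu121" -> "0.15.1") before
    # converting each component to an integer.
    return tuple(map(int, version.split("+")[0].split(".")))

assert "0.9.2" > "0.15.0"                                  # string compare: wrong order
assert version_tuple("0.9.2") < version_tuple("0.15.0")    # tuple compare: correct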
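
The np.float alias was deprecated in NumPy 1.20 and removed in 1.24; it was only an alias for the Python builtin, so substituting float keeps the dtype codes in indexed_dataset.py unchanged. A quick check:

import numpy as np

# The builtin resolves to the same dtype the old alias did.
assert np.dtype(float) == np.float64
codes = {5: np.int64, 6: float, 7: np.double}   # subset of the patched mapping
assert np.dtype(codes[6]) == np.float64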
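
The FairseqConfig and TransformerConfig changes sidestep newer Python (3.11+) rejecting unhashable dataclass instances as field defaults ("mutable default ... use default_factory"). The patch passes the dataclass type itself and relies on OmegaConf accepting a class as well as an instance when building structured configs; the more conventional pure-dataclass spelling is default_factory, sketched here with a stripped-down stand-in rather than fairseq's real CommonConfig:

from dataclasses import dataclass, field

@dataclass
class CommonConfigStandIn:   # stand-in, not fairseq's CommonConfig
    seed: int = 1

@dataclass
class ConfigWithFactory:
    # Same effect as the patch's field(default=CommonConfig), expressed with
    # default_factory so it also behaves as a plain dataclass default.
    common: CommonConfigStandIn = field(default_factory=CommonConfigStandIn)

assert ConfigWithFactory().common.seed == 1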