From 6155a65fa275539ac1ff8299e493aa03254fba8b Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Sun, 13 Apr 2025 04:26:03 -0700 Subject: [PATCH 01/19] use mcore config_converter and model_initializer for more types of models --- verl/models/mcore/__init__.py | 5 +- verl/models/mcore/config_converter.py | 84 ++++++++++++++++++++++++ verl/models/mcore/model_initializer.py | 88 ++++++++++++++++++++++++++ verl/models/mcore/registry.py | 65 +++++++++++++++++++ verl/workers/megatron_workers.py | 28 ++++---- 5 files changed, 256 insertions(+), 14 deletions(-) create mode 100644 verl/models/mcore/config_converter.py create mode 100644 verl/models/mcore/model_initializer.py create mode 100644 verl/models/mcore/registry.py diff --git a/verl/models/mcore/__init__.py b/verl/models/mcore/__init__.py index e4e2a3861a8..fbc26864c92 100644 --- a/verl/models/mcore/__init__.py +++ b/verl/models/mcore/__init__.py @@ -13,4 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .gpt_model import gptmodel_forward \ No newline at end of file +from .gpt_model import gptmodel_forward +from .registry import init_mcore_model, hf_to_mcore_config + +__all__ = ['init_mcore_model', 'hf_to_mcore_config', 'gptmodel_forward'] \ No newline at end of file diff --git a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py new file mode 100644 index 00000000000..d8359ed2284 --- /dev/null +++ b/verl/models/mcore/config_converter.py @@ -0,0 +1,84 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# convert huggingface config to mcore transformer config + +from transformers import PretrainedConfig +from megatron.core.transformer import TransformerConfig +import torch +import torch.nn.functional as F +from megatron.core.enums import AttnBackend + + +def hf_to_mcore_config_dense(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: + # for LlamaForCausalLM or Qwen2ForCausalLM + from megatron.core import parallel_state as mpu + if "Qwen2ForCausalLM" in hf_config.architectures: + qkv_bias = True + else: + qkv_bias = getattr(hf_config, 'attention_bias', False) + overlap_p2p_comm = mpu.get_virtual_pipeline_model_parallel_world_size( + ) is not None and mpu.get_virtual_pipeline_model_parallel_world_size() > 1 + batch_p2p_comm = False + transformer_config = TransformerConfig( + num_layers=hf_config.num_hidden_layers, + hidden_size=hf_config.hidden_size, + num_attention_heads=hf_config.num_attention_heads, + num_query_groups=hf_config.num_key_value_heads, + ffn_hidden_size=hf_config.intermediate_size, + activation_func=F.silu, + normalization='RMSNorm', + gated_linear_unit=True, + use_cpu_initialization=True, + add_bias_linear=False, + tensor_model_parallel_size=mpu.get_tensor_model_parallel_world_size(), + pipeline_model_parallel_size=mpu.get_pipeline_model_parallel_world_size(), + virtual_pipeline_model_parallel_size=mpu.get_virtual_pipeline_model_parallel_world_size(), + context_parallel_size=mpu.get_context_parallel_world_size(), + overlap_p2p_comm=overlap_p2p_comm, + batch_p2p_comm=batch_p2p_comm, + pipeline_dtype=dtype, + params_dtype=dtype, + sequence_parallel=mpu.get_tensor_model_parallel_world_size() > 1, + variable_seq_lengths=True, + masked_softmax_fusion=True, + moe_token_dispatcher_type="alltoall", + attention_dropout=hf_config.attention_dropout, + hidden_dropout=getattr(hf_config, 'hidden_dropout', 0.0), + add_qkv_bias=qkv_bias, + attention_backend=AttnBackend.flash, + bf16=dtype is torch.bfloat16) + + return transformer_config + + +def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: + # Qwen2MoeForCausalLM + raise NotImplementedError("Qwen2MoeForCausalLM is not supported yet") + + +def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: + # DeepseekV3ForCausalLM + raise NotImplementedError("DeepseekV3ForCausalLM is not supported yet") + + +def hf_to_mcore_config_qwen2_5_vl(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: + # Qwen2_5_VLForConditionalGeneration + raise NotImplementedError("Qwen2_5_VLForConditionalGeneration is not supported yet") + + +def hf_to_mcore_config_llama4(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: + # Llama4ForConditionalGeneration + raise NotImplementedError("Llama4ForConditionalGeneration is not supported yet") diff --git a/verl/models/mcore/model_initializer.py b/verl/models/mcore/model_initializer.py new file mode 100644 index 00000000000..4ae2fe4e5d4 --- /dev/null +++ b/verl/models/mcore/model_initializer.py @@ -0,0 +1,88 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# use mcore transformer config to initialize the model + + +def init_mcore_model_dense(tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False): + # for LlamaForCausalLM, Qwen2ForCausalLM + from megatron.core.models.gpt.gpt_model import GPTModel + from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec + use_te = True + assert tfconfig.normalization == "RMSNorm", 'only RMSNorm is supported for now' + transformer_layer_spec = get_gpt_decoder_block_spec(tfconfig, use_transformer_engine=use_te) + rope_scaling_args = {} + if hf_config.rope_scaling is not None: + assert hf_config.rope_scaling['type'] == 'linear', "only linear scaling is supported for now" + rope_scaling_args['seq_len_interpolation_factor'] = hf_config.rope_scaling['factor'] + model = GPTModel(config=tfconfig, + transformer_layer_spec=transformer_layer_spec, + vocab_size=hf_config.vocab_size, + max_sequence_length=hf_config.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + share_embeddings_and_output_weights=share_embeddings_and_output_weights, + position_embedding_type='rope', + rotary_base=hf_config.rope_theta, + **rope_scaling_args) + if post_process and value: + from verl.models.llama.megatron.layers.parallel_linear import LinearForLastLayer + model.output_layer = LinearForLastLayer(input_size=tfconfig.hidden_size, output_size=1, config=tfconfig) + return model + + +def init_mcore_model_qwen2_moe(tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False): + return init_mcore_model_dense(tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, + value) + + +def init_mcore_model_llama4(tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False): + return init_mcore_model_dense(tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, + value) + + +def init_mcore_model_dpskv3(tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False): + return init_mcore_model_dense(tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, + value) + + +def init_mcore_model_qwen2_5_vl(tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False): + # Qwen2_5_VLForConditionalGeneration + raise NotImplementedError("VLM is not supported yet") diff --git a/verl/models/mcore/registry.py b/verl/models/mcore/registry.py new file mode 100644 index 00000000000..a2a92924b00 --- /dev/null +++ b/verl/models/mcore/registry.py @@ -0,0 +1,65 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .config_converter import hf_to_mcore_config_dense, hf_to_mcore_config_qwen2moe, hf_to_mcore_config_dpskv3, hf_to_mcore_config_qwen2_5_vl, hf_to_mcore_config_llama4 +from .config_converter import PretrainedConfig, TransformerConfig +import torch +import torch.nn as nn + + +def hf_to_mcore_config(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: + MODEL_CONFIG_CONVERTER_REGISTRY = { + "LlamaForCausalLM": hf_to_mcore_config_dense, + "Qwen2ForCausalLM": hf_to_mcore_config_dense, + "Qwen2MoeForCausalLM": hf_to_mcore_config_qwen2moe, + "DeepseekV3ForCausalLM": hf_to_mcore_config_dpskv3, + "Qwen2_5_VLForConditionalGeneration": hf_to_mcore_config_qwen2_5_vl, + "Llama4ForConditionalGeneration": hf_to_mcore_config_llama4, + } + assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" + arch = hf_config.architectures[0] + if arch not in MODEL_CONFIG_CONVERTER_REGISTRY: + raise ValueError(f"Model architecture {arch} converter is not supported for now. " + f"Supported architectures: {MODEL_CONFIG_CONVERTER_REGISTRY.keys()}") + return MODEL_CONFIG_CONVERTER_REGISTRY[arch](hf_config, dtype) + + +from .model_initializer import init_mcore_model_dense, init_mcore_model_qwen2_moe, init_mcore_model_dpskv3, init_mcore_model_qwen2_5_vl, init_mcore_model_llama4 + + +def init_mcore_model( + tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False, + **extra_kwargs # may be used for vlm +) -> nn.Module: + MODEL_INITIALIZER_REGISTRY = { + "LlamaForCausalLM": init_mcore_model_dense, + "Qwen2ForCausalLM": init_mcore_model_dense, + "Qwen2MoeForCausalLM": init_mcore_model_qwen2_moe, + "DeepseekV3ForCausalLM": init_mcore_model_dpskv3, + "Qwen2_5_VLForConditionalGeneration": init_mcore_model_qwen2_5_vl, + "Llama4ForConditionalGeneration": init_mcore_model_llama4, + } + assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" + arch = hf_config.architectures[0] + if arch not in MODEL_INITIALIZER_REGISTRY: + raise ValueError(f"Model architecture {arch} initializer is not supported for now. 
" + f"Supported architectures: {MODEL_INITIALIZER_REGISTRY.keys()}") + return MODEL_INITIALIZER_REGISTRY[arch](tfconfig, hf_config, pre_process, post_process, + share_embeddings_and_output_weights, value, **extra_kwargs) diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py index ffd7404fa0d..0b441372235 100644 --- a/verl/workers/megatron_workers.py +++ b/verl/workers/megatron_workers.py @@ -143,7 +143,8 @@ def _build_model_optimizer(self, from verl.utils.megatron.optimizer import get_megatron_optimizer from megatron.core.models.gpt.gpt_model import ModelType from verl.utils.model import print_model_size, update_model_config, get_generation_config - from verl.utils.megatron_utils import get_model, init_megatron_optim_config, convert_config + from verl.utils.megatron_utils import get_model, init_megatron_optim_config + from verl.models.mcore import hf_to_mcore_config from transformers import AutoConfig # Step 1: initialize the tokenizer @@ -169,7 +170,7 @@ def _build_model_optimizer(self, self.share_embeddings_and_output_weights = getattr(actor_model_config, "tie_word_embeddings", False) self.architectures = getattr(actor_model_config, "architectures", None) - tfconfig = convert_config(actor_model_config, megatron_config) + tfconfig = hf_to_mcore_config(actor_model_config, megatron_config.dtype) if enable_gradient_checkpointing: gradient_checkpointing_cfg = dict(self.config.model.get('gradient_checkpointing_kwargs', dict())) tfconfig.recompute_method = gradient_checkpointing_cfg['activations_checkpoint_method'] @@ -179,8 +180,8 @@ def _build_model_optimizer(self, self.hf_config = actor_model_config def megatron_actor_model_provider(pre_process, post_process): - from verl.utils.model import get_parallel_gptmodel_from_config - parallel_model = get_parallel_gptmodel_from_config( + from verl.models.mcore import init_mcore_model + parallel_model = init_mcore_model( tfconfig, actor_model_config, pre_process, @@ -531,8 +532,9 @@ def _build_critic_model_optimizer(self, from megatron.core.models.gpt.gpt_model import ModelType from verl.utils.model import print_model_size, update_model_config from verl.utils.megatron.optimizer import get_megatron_optimizer - from verl.utils.megatron_utils import get_model, init_megatron_optim_config, convert_config + from verl.utils.megatron_utils import get_model, init_megatron_optim_config from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig + from verl.models.mcore import hf_to_mcore_config # Step 1: initialize the tokenizer local_path = copy_to_local(model_path) @@ -552,7 +554,7 @@ def _build_critic_model_optimizer(self, self.architectures = getattr(critic_model_config, "architectures", None) if self.rank == 0: print(f'Model config after override: {critic_model_config}') - tfconfig = convert_config(critic_model_config, megatron_config) + tfconfig = hf_to_mcore_config(critic_model_config, megatron_config.dtype) if enable_gradient_checkpointing: gradient_checkpointing_cfg = dict(self.config.model.get('gradient_checkpointing_kwargs', dict())) tfconfig.recompute_method = gradient_checkpointing_cfg['activations_checkpoint_method'] @@ -562,13 +564,13 @@ def _build_critic_model_optimizer(self, self.hf_config = critic_model_config def megatron_critic_model_provider(pre_process, post_process): - from verl.utils.model import get_parallel_gptmodel_from_config - parallel_model = get_parallel_gptmodel_from_config(tfconfig, - critic_model_config, - pre_process, - post_process, - share_embeddings_and_output_weights=False, - 
value=True) + from verl.models.mcore import init_mcore_model + parallel_model = init_mcore_model(tfconfig, + critic_model_config, + pre_process, + post_process, + share_embeddings_and_output_weights=False, + value=True) parallel_model.cuda() return parallel_model From 8869168babef06257f29a7bfd3a919df99356672 Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Sun, 13 Apr 2025 04:40:45 -0700 Subject: [PATCH 02/19] remove megatron_config from actor/critic --- verl/utils/megatron_utils.py | 5 +++ verl/workers/actor/megatron_actor.py | 30 +++++-------- verl/workers/critic/megatron_critic.py | 11 ++--- verl/workers/megatron_workers.py | 60 +++++++++++++------------- 4 files changed, 50 insertions(+), 56 deletions(-) diff --git a/verl/utils/megatron_utils.py b/verl/utils/megatron_utils.py index e84f154622a..ffeb114f6d2 100644 --- a/verl/utils/megatron_utils.py +++ b/verl/utils/megatron_utils.py @@ -217,6 +217,11 @@ def mcore_model_parallel_config( sequence_parallel: bool, params_dtype: torch.dtype, ) -> ModelParallelConfig: + # WARNING: Code should not reach this point. This function is deprecated and will be removed. + # Please use hf_to_mcore_config_dense() from verl.models.mcore.config_converter instead. + warnings.warn("Code should not reach this point. This function is deprecated and will be removed. " + "Please use hf_to_mcore_config_dense() from verl.models.mcore.config_converter instead.", + DeprecationWarning, stacklevel=2) return ModelParallelConfig( tensor_model_parallel_size=mpu.get_tensor_model_parallel_world_size(), pipeline_model_parallel_size=mpu.get_pipeline_model_parallel_world_size(), diff --git a/verl/workers/actor/megatron_actor.py b/verl/workers/actor/megatron_actor.py index 2d0adc807f7..9fd73fff137 100644 --- a/verl/workers/actor/megatron_actor.py +++ b/verl/workers/actor/megatron_actor.py @@ -52,7 +52,7 @@ class MegatronPPOActor(BasePPOActor): - def __init__(self, config, model_config, megatron_config: ModelParallelConfig, actor_module: nn.ModuleList, + def __init__(self, config, model_config, hf_config, tf_config, actor_module: nn.ModuleList, actor_optimizer: DistributedOptimizer, actor_optimizer_config: OptimizerConfig): """MeagtronPPOActor class. This class implements the simple PPO logics when the model is built with Megatron. @@ -72,13 +72,8 @@ def __init__(self, config, model_config, megatron_config: ModelParallelConfig, a ``entropy_coeff``: entropy coefficient of the PPO loss. See https://arxiv.org/abs/1707.06347. model_config (OmegaConf): model configuration. It must contains ``model_config.vocab_size`` and ``model_config.hidden_size`` - megatron_config (OmegaConf): megatron configuration. It must contains - - ``sequence_parallel_enabled``: whether the sequence parallel is enabled. - - ``param_dtype``: the dtype of the parameters. - - ``virtual_pipeline_model_parallel_size``: virtual pipeline model parallel size. a.k.a number of chunks in each pp stage. + hf_config (PretrainedConfig): huggingface config + tf_config (TransformerConfig): mcore transformer config actor_module (nn.ModuleList): actor module is a ModuleList that contains a list of nn.Module in this pp stage. each nn.Module in this rank holds a vpp module chunk. See https://arxiv.org/pdf/2104.04473.pdf for more details. 
The actor module has some constraints to follow in order to use the updating logics implemented here @@ -93,13 +88,6 @@ def __init__(self, config, model_config, megatron_config: ModelParallelConfig, a actor_optimizer (DistributedOptimizer): currently, we only support DistributedOptimizer in Megatron. It implements zero1 optimizer that shards the optimizer state across dp ranks. - >>> def megatron_actor_model_provider(pre_process, post_process): - >>> vpp_rank = mpu.get_virtual_pipeline_model_parallel_rank() - >>> parallel_model = ParallelMistralForCausalLMRmPadPP(config=actor_model_config, - >>> megatron_config=megatron_config, - >>> pre_process=pre_process, - >>> post_process=post_process).cuda() - >>> return parallel_model >>> from megatron.training import get_model >>> from megatron.optimizer import get_megatron_optimizer >>> actor_module = get_model(megatron_actor_model_provider, wrap_with_ddp=True) @@ -107,14 +95,16 @@ def __init__(self, config, model_config, megatron_config: ModelParallelConfig, a >>> actor_optimizer = get_megatron_optimizer(actor_module) >>> actor = MegatronPPOActor(config=config, >>> model_config=actor_model_config, - >>> megatron_config=megatron_config, + >>> hf_config=hf_config, + >>> tf_config=tf_config, >>> actor_module=actor_module, >>> actor_optimizer=actor_optimizer) """ super().__init__(config) self._validate_config(config) self.model_config = model_config - self.megatron_config = megatron_config + self.hf_config = hf_config + self.tf_config = tf_config self.actor_module = actor_module self.actor_optimizer: DistributedOptimizer = actor_optimizer self.actor_optimizer_config = actor_optimizer_config @@ -124,7 +114,7 @@ def __init__(self, config, model_config, megatron_config: ModelParallelConfig, a 'overlap_dp_param_comm': False, 'overlap_dp_grad_comm': False, 'gradient_accumulation_steps': 1, - 'sequence_parallel': self.megatron_config.sequence_parallel, + 'sequence_parallel': self.tf_config.sequence_parallel, 'DDP_impl': 'local', 'layernorm_allreduce_bucket_threshold': 0, 'pipeline_model_parallel_split_rank': None, @@ -253,7 +243,7 @@ def forward_backward_batch(self, data: DataProto, forward_only=False, post_proce input_shapes = compute_transformers_input_shapes( batches, meta_info={ - 'sequence_parallel': self.megatron_config.sequence_parallel, + 'sequence_parallel': self.tf_config.sequence_parallel, 'hidden_size': self.model_config.hidden_size }) n_micro_batch = len(batches) @@ -334,7 +324,7 @@ def forward_step(batch_iter, model): input_ids, attention_mask, position_ids, - sequence_parallel=self.megatron_config.sequence_parallel) + sequence_parallel=self.tf_config.sequence_parallel) if forward_only: meta_info = None else: diff --git a/verl/workers/critic/megatron_critic.py b/verl/workers/critic/megatron_critic.py index 55076075c5a..af1cbcc4db7 100644 --- a/verl/workers/critic/megatron_critic.py +++ b/verl/workers/critic/megatron_critic.py @@ -42,12 +42,13 @@ class MegatronPPOCritic(BasePPOCritic): - def __init__(self, config, model_config, megatron_config, critic_module: nn.ModuleList, + def __init__(self, config, model_config, hf_config, tf_config, critic_module: nn.ModuleList, critic_optimizer: DistributedOptimizer, critic_optimizer_config: OptimizerConfig): super().__init__(config=config) self._validate_config(config) self.model_config = model_config - self.megatron_config = megatron_config + self.hf_config = hf_config # huggingface config + self.tf_config = tf_config # mcore transformer config self.critic_module = critic_module self.critic_optimizer = 
critic_optimizer @@ -59,7 +60,7 @@ def __init__(self, config, model_config, megatron_config, critic_module: nn.Modu 'overlap_dp_param_comm': False, 'overlap_dp_grad_comm': False, 'gradient_accumulation_steps': 1, - 'sequence_parallel': self.megatron_config.sequence_parallel, + 'sequence_parallel': self.tf_config.sequence_parallel, 'DDP_impl': 'local', 'layernorm_allreduce_bucket_threshold': 0, 'pipeline_model_parallel_split_rank': None, @@ -122,7 +123,7 @@ def forward_backward_batch(self, data: DataProto, forward_only=False): input_shapes = compute_transformers_input_shapes( batches, meta_info={ - 'sequence_parallel': self.megatron_config.sequence_parallel, + 'sequence_parallel': self.tf_config.sequence_parallel, 'hidden_size': self.model_config.hidden_size }) @@ -169,7 +170,7 @@ def forward_step(batch_iter, model): input_ids, attention_mask, position_ids, - sequence_parallel=self.megatron_config.sequence_parallel, + sequence_parallel=self.tf_config.sequence_parallel, value_model=True) return output, partial(loss_func, data=batch, meta_info={}) diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py index 0b441372235..178327c375f 100644 --- a/verl/workers/megatron_workers.py +++ b/verl/workers/megatron_workers.py @@ -136,7 +136,7 @@ def __init__(self, config: DictConfig, role: str): def _build_model_optimizer(self, model_path, - megatron_config: ModelParallelConfig, + dtype, optim_config, override_model_config, enable_gradient_checkpointing=False): @@ -170,19 +170,19 @@ def _build_model_optimizer(self, self.share_embeddings_and_output_weights = getattr(actor_model_config, "tie_word_embeddings", False) self.architectures = getattr(actor_model_config, "architectures", None) - tfconfig = hf_to_mcore_config(actor_model_config, megatron_config.dtype) + tf_config = hf_to_mcore_config(actor_model_config, dtype) if enable_gradient_checkpointing: gradient_checkpointing_cfg = dict(self.config.model.get('gradient_checkpointing_kwargs', dict())) - tfconfig.recompute_method = gradient_checkpointing_cfg['activations_checkpoint_method'] - tfconfig.recompute_granularity = gradient_checkpointing_cfg['activations_checkpoint_granularity'] - tfconfig.recompute_num_layers = gradient_checkpointing_cfg['activations_checkpoint_num_layers'] - print(f'TF config: {tfconfig}') + tf_config.recompute_method = gradient_checkpointing_cfg['activations_checkpoint_method'] + tf_config.recompute_granularity = gradient_checkpointing_cfg['activations_checkpoint_granularity'] + tf_config.recompute_num_layers = gradient_checkpointing_cfg['activations_checkpoint_num_layers'] + print(f'TF config: {tf_config}') self.hf_config = actor_model_config def megatron_actor_model_provider(pre_process, post_process): from verl.models.mcore import init_mcore_model parallel_model = init_mcore_model( - tfconfig, + tf_config, actor_model_config, pre_process, post_process, @@ -214,7 +214,7 @@ def megatron_actor_model_provider(pre_process, post_process): load_megatron_gptmodel_weights(self.config, actor_model_config, actor_module, - params_dtype=megatron_config.params_dtype, + params_dtype=dtype, is_value_model=False) if self.rank == 0: @@ -239,7 +239,7 @@ def megatron_actor_model_provider(pre_process, post_process): load_megatron_gptmodel_weights(self.config, actor_model_config, ref_module, - params_dtype=megatron_config.params_dtype, + params_dtype=dtype, is_value_model=False) log_gpu_memory_usage('After ref module init', logger=logger) return ref_module, actor_model_config @@ -311,10 +311,7 @@ def init_model(self): 
override_model_config = OmegaConf.to_container(self.config.model.get('override_config', OmegaConf.create())) self.param_dtype = torch.bfloat16 - megatron_config = mcore_model_parallel_config(sequence_parallel=self.config.actor.megatron.get( - 'sequence_parallel', True), - params_dtype=PrecisionType.to_dtype(self.param_dtype)) - + self.dtype = PrecisionType.to_dtype(self.param_dtype) if self._is_actor or self._is_rollout: # we need the model for actor and rollout if self._is_actor: @@ -324,7 +321,7 @@ def init_model(self): self.actor_module, self.hybrid_engine, self.actor_optimizer, \ self.actor_model_config, self.actor_optim_config = self._build_model_optimizer( model_path=self.config.model.path, - megatron_config=megatron_config, + dtype=self.dtype, optim_config=optim_config, override_model_config=override_model_config, enable_gradient_checkpointing=self.config.model.get('enable_gradient_checkpointing', False) @@ -333,7 +330,8 @@ def init_model(self): if self._is_actor: self.actor = MegatronPPOActor(config=self.config.actor, model_config=self.actor_model_config, - megatron_config=megatron_config, + hf_config=self.hf_config, + tf_config=self.tf_config, actor_module=self.actor_module, actor_optimizer=self.actor_optimizer, actor_optimizer_config=self.actor_optim_config) @@ -344,13 +342,14 @@ def init_model(self): if self._is_ref: self.ref_module, self.ref_model_config = self._build_model_optimizer( model_path=self.config.model.path, - megatron_config=megatron_config, + dtype=self.dtype, optim_config=None, override_model_config=override_model_config, enable_gradient_checkpointing=self.config.model.get('enable_gradient_checkpointing', False)) self.ref_policy = MegatronPPOActor(config=self.config.ref, model_config=self.ref_model_config, - megatron_config=megatron_config, + hf_config=self.hf_config, + tf_config=self.tf_config, actor_module=self.ref_module, actor_optimizer=None, actor_optimizer_config=None) @@ -525,7 +524,7 @@ def __init__(self, config): def _build_critic_model_optimizer(self, model_path, - megatron_config: ModelParallelConfig, + dtype, optim_config, override_model_config, enable_gradient_checkpointing=False): @@ -554,18 +553,19 @@ def _build_critic_model_optimizer(self, self.architectures = getattr(critic_model_config, "architectures", None) if self.rank == 0: print(f'Model config after override: {critic_model_config}') - tfconfig = hf_to_mcore_config(critic_model_config, megatron_config.dtype) + tf_config = hf_to_mcore_config(critic_model_config, dtype) if enable_gradient_checkpointing: gradient_checkpointing_cfg = dict(self.config.model.get('gradient_checkpointing_kwargs', dict())) - tfconfig.recompute_method = gradient_checkpointing_cfg['activations_checkpoint_method'] - tfconfig.recompute_granularity = gradient_checkpointing_cfg['activations_checkpoint_granularity'] - tfconfig.recompute_num_layers = gradient_checkpointing_cfg['activations_checkpoint_num_layers'] - print(f'Critic TF config: {tfconfig}') + tf_config.recompute_method = gradient_checkpointing_cfg['activations_checkpoint_method'] + tf_config.recompute_granularity = gradient_checkpointing_cfg['activations_checkpoint_granularity'] + tf_config.recompute_num_layers = gradient_checkpointing_cfg['activations_checkpoint_num_layers'] + print(f'Critic TF config: {tf_config}') self.hf_config = critic_model_config + self.tf_config = tf_config def megatron_critic_model_provider(pre_process, post_process): from verl.models.mcore import init_mcore_model - parallel_model = init_mcore_model(tfconfig, + parallel_model = 
init_mcore_model(tf_config, critic_model_config, pre_process, post_process, @@ -593,7 +593,7 @@ def megatron_critic_model_provider(pre_process, post_process): load_megatron_gptmodel_weights(self.config, critic_model_config, critic_module, - params_dtype=megatron_config.params_dtype, + params_dtype=dtype, is_value_model=True) t1 = time.time() if torch.distributed.get_rank() == 0: @@ -619,19 +619,17 @@ def init_model(self): importlib.import_module(self.config.model.external_lib) override_model_config = OmegaConf.to_container(self.config.model.get('override_config', OmegaConf.create())) self.param_dtype = torch.bfloat16 - - megatron_config = mcore_model_parallel_config(sequence_parallel=self.config.megatron.get( - 'sequence_parallel', True), - params_dtype=PrecisionType.to_dtype(self.param_dtype)) + self.dtype = PrecisionType.to_dtype(self.param_dtype) self.critic_module, self.critic_optimizer, self.critic_model_config, critic_optimizer_config = self._build_critic_model_optimizer( model_path=self.config.model.path, - megatron_config=megatron_config, + dtype=self.dtype, optim_config=self.config.optim, override_model_config=override_model_config, enable_gradient_checkpointing=self.config.model.get('enable_gradient_checkpointing', False)) self.critic = MegatronPPOCritic(config=self.config, model_config=self.critic_model_config, - megatron_config=megatron_config, + hf_config=self.hf_config, + tf_config=self.tf_config, critic_module=self.critic_module, critic_optimizer=self.critic_optimizer, critic_optimizer_config=critic_optimizer_config) From 9216811f2a1f24bdff3e1eeb9f3bbfeba322201e Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Sun, 13 Apr 2025 09:00:51 -0700 Subject: [PATCH 03/19] reward model use gptmodel api, clean megatron_worker --- .../single_controller/base/megatron/worker.py | 47 ++++- verl/workers/actor/megatron_actor.py | 3 +- verl/workers/megatron_workers.py | 193 +++++------------- .../reward_model/megatron/reward_model.py | 20 +- 4 files changed, 113 insertions(+), 150 deletions(-) diff --git a/verl/single_controller/base/megatron/worker.py b/verl/single_controller/base/megatron/worker.py index 3adce5cce4a..c6594ee4618 100644 --- a/verl/single_controller/base/megatron/worker.py +++ b/verl/single_controller/base/megatron/worker.py @@ -36,4 +36,49 @@ def get_megatron_rank_info(self): pp_rank = mpu.get_pipeline_model_parallel_rank() cp_rank = mpu.get_context_parallel_rank() info = DistRankInfo(tp_rank=tp_rank, dp_rank=dp_rank, pp_rank=pp_rank, cp_rank=cp_rank) - return info \ No newline at end of file + return info + + def _init_hf_config_and_tf_config(self, + model_path, + dtype, + override_model_config): + from verl.utils.model import print_model_size, update_model_config + from verl.utils.fs import copy_to_local + from verl.utils import hf_tokenizer + from transformers import AutoConfig + from verl.models.mcore import hf_to_mcore_config + + # Step 1: initialize the tokenizer + self.local_path = copy_to_local(model_path) + self.tokenizer = hf_tokenizer(self.local_path) + + # Step 2: get the hf + hf_config = AutoConfig.from_pretrained(self.local_path) + + # Step 3: override the hf config + override_config_kwargs = { + 'bos_token_id': self.tokenizer.bos_token_id, + 'eos_token_id': self.tokenizer.eos_token_id, + 'pad_token_id': self.tokenizer.pad_token_id, + } + override_config_kwargs.update(override_model_config) + self.share_embeddings_and_output_weights = getattr(hf_config, "tie_word_embeddings", False) + update_model_config(hf_config, override_config_kwargs=override_config_kwargs) + 
self.architectures = getattr(hf_config, "architectures", None) + if self.rank == 0: + print(f'Model config after override: {hf_config}') + tf_config = hf_to_mcore_config(hf_config, dtype) + + def add_optimization_config_to_tf_config(tf_config, verl_model_config): + # add optimization config to tf_config, e.g. checkpointing + if verl_model_config.get('enable_gradient_checkpointing', False): + gradient_checkpointing_cfg = dict(verl_model_config.get('gradient_checkpointing_kwargs', dict())) + tf_config.recompute_method = gradient_checkpointing_cfg.get('activations_checkpoint_method', 'full') + tf_config.recompute_granularity = gradient_checkpointing_cfg.get('activations_checkpoint_granularity', 'full') + tf_config.recompute_num_layers = gradient_checkpointing_cfg.get('activations_checkpoint_num_layers', -1) + + add_optimization_config_to_tf_config(tf_config, self.config.model) + + print(f'TF config: {tf_config}') + self.hf_config = hf_config + self.tf_config = tf_config \ No newline at end of file diff --git a/verl/workers/actor/megatron_actor.py b/verl/workers/actor/megatron_actor.py index 9fd73fff137..39fe6aa2d51 100644 --- a/verl/workers/actor/megatron_actor.py +++ b/verl/workers/actor/megatron_actor.py @@ -53,7 +53,7 @@ class MegatronPPOActor(BasePPOActor): def __init__(self, config, model_config, hf_config, tf_config, actor_module: nn.ModuleList, - actor_optimizer: DistributedOptimizer, actor_optimizer_config: OptimizerConfig): + actor_optimizer: DistributedOptimizer): """MeagtronPPOActor class. This class implements the simple PPO logics when the model is built with Megatron. Args: @@ -107,7 +107,6 @@ def __init__(self, config, model_config, hf_config, tf_config, actor_module: nn. self.tf_config = tf_config self.actor_module = actor_module self.actor_optimizer: DistributedOptimizer = actor_optimizer - self.actor_optimizer_config = actor_optimizer_config self.optimizer_step_args = OmegaConf.create({ 'skip_grad': None, diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py index 178327c375f..76572de2026 100644 --- a/verl/workers/megatron_workers.py +++ b/verl/workers/megatron_workers.py @@ -37,7 +37,6 @@ from verl.utils.model import load_megatron_model_weights, load_megatron_gptmodel_weights, load_mcore_dist_weights from verl.utils.flops_counter import FlopsCounter from verl.utils.checkpoint.megatron_checkpoint_manager import MegatronCheckpointManager -from verl.utils.megatron_utils import mcore_model_parallel_config from verl.utils.megatron_utils import offload_megatron_param_and_grad, load_megatron_param_and_grad from verl.utils import hf_tokenizer @@ -136,54 +135,21 @@ def __init__(self, config: DictConfig, role: str): def _build_model_optimizer(self, model_path, - dtype, optim_config, - override_model_config, - enable_gradient_checkpointing=False): + override_model_config): from verl.utils.megatron.optimizer import get_megatron_optimizer from megatron.core.models.gpt.gpt_model import ModelType - from verl.utils.model import print_model_size, update_model_config, get_generation_config + from verl.utils.model import print_model_size, get_generation_config from verl.utils.megatron_utils import get_model, init_megatron_optim_config - from verl.models.mcore import hf_to_mcore_config - from transformers import AutoConfig - # Step 1: initialize the tokenizer - local_path = copy_to_local(model_path) - self.tokenizer = hf_tokenizer(local_path) - - # Step 2: get the actor_model_config - actor_model_config = AutoConfig.from_pretrained(local_path) - - 
self.generation_config = get_generation_config(local_path) - - override_config_kwargs = { - 'bos_token_id': self.tokenizer.bos_token_id, - 'eos_token_id': self.tokenizer.eos_token_id, - 'pad_token_id': self.tokenizer.pad_token_id, - } - override_config_kwargs.update(override_model_config) - update_model_config(actor_model_config, override_config_kwargs=override_config_kwargs) - - if self.rank == 0: - print(f'Model config after override: {actor_model_config}') - - self.share_embeddings_and_output_weights = getattr(actor_model_config, "tie_word_embeddings", False) - self.architectures = getattr(actor_model_config, "architectures", None) - - tf_config = hf_to_mcore_config(actor_model_config, dtype) - if enable_gradient_checkpointing: - gradient_checkpointing_cfg = dict(self.config.model.get('gradient_checkpointing_kwargs', dict())) - tf_config.recompute_method = gradient_checkpointing_cfg['activations_checkpoint_method'] - tf_config.recompute_granularity = gradient_checkpointing_cfg['activations_checkpoint_granularity'] - tf_config.recompute_num_layers = gradient_checkpointing_cfg['activations_checkpoint_num_layers'] - print(f'TF config: {tf_config}') - self.hf_config = actor_model_config + self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config) + self.generation_config = get_generation_config(self.local_path) def megatron_actor_model_provider(pre_process, post_process): from verl.models.mcore import init_mcore_model parallel_model = init_mcore_model( - tf_config, - actor_model_config, + self.tf_config, + self.hf_config, pre_process, post_process, share_embeddings_and_output_weights=self.share_embeddings_and_output_weights, @@ -212,9 +178,9 @@ def megatron_actor_model_provider(pre_process, post_process): is_value_model=False) else: load_megatron_gptmodel_weights(self.config, - actor_model_config, + self.hf_config, actor_module, - params_dtype=dtype, + params_dtype=self.dtype, is_value_model=False) if self.rank == 0: @@ -237,12 +203,12 @@ def megatron_actor_model_provider(pre_process, post_process): is_value_model=False) else: load_megatron_gptmodel_weights(self.config, - actor_model_config, + self.hf_config, ref_module, - params_dtype=dtype, + params_dtype=self.dtype, is_value_model=False) log_gpu_memory_usage('After ref module init', logger=logger) - return ref_module, actor_model_config + return ref_module, self.hf_config # TODO: add more optimizer args into config if self._is_actor: @@ -254,7 +220,7 @@ def megatron_actor_model_provider(pre_process, post_process): log_gpu_memory_usage('After actor optimizer init', logger=logger) - return actor_module, hybrid_engine, actor_optimizer, actor_model_config, optim_config + return actor_module, hybrid_engine, actor_optimizer, self.hf_config, optim_config def _build_rollout(self): if self.config.rollout.name == 'vllm': @@ -319,12 +285,10 @@ def init_model(self): else: optim_config = None self.actor_module, self.hybrid_engine, self.actor_optimizer, \ - self.actor_model_config, self.actor_optim_config = self._build_model_optimizer( + self.actor_model_config, self.actor_hf_config = self._build_model_optimizer( model_path=self.config.model.path, - dtype=self.dtype, optim_config=optim_config, - override_model_config=override_model_config, - enable_gradient_checkpointing=self.config.model.get('enable_gradient_checkpointing', False) + override_model_config=override_model_config ) if self._is_actor: @@ -334,7 +298,7 @@ def init_model(self): tf_config=self.tf_config, actor_module=self.actor_module, 
actor_optimizer=self.actor_optimizer, - actor_optimizer_config=self.actor_optim_config) + ) if self._is_rollout: self.rollout, self.sharding_manager = self._build_rollout() @@ -342,17 +306,14 @@ def init_model(self): if self._is_ref: self.ref_module, self.ref_model_config = self._build_model_optimizer( model_path=self.config.model.path, - dtype=self.dtype, optim_config=None, - override_model_config=override_model_config, - enable_gradient_checkpointing=self.config.model.get('enable_gradient_checkpointing', False)) + override_model_config=override_model_config,) self.ref_policy = MegatronPPOActor(config=self.config.ref, model_config=self.ref_model_config, hf_config=self.hf_config, tf_config=self.tf_config, actor_module=self.ref_module, - actor_optimizer=None, - actor_optimizer_config=None) + actor_optimizer=None) if self._is_actor: self.flops_counter = FlopsCounter(self.actor_model_config) @@ -524,49 +485,19 @@ def __init__(self, config): def _build_critic_model_optimizer(self, model_path, - dtype, optim_config, - override_model_config, - enable_gradient_checkpointing=False): + override_model_config): from megatron.core.models.gpt.gpt_model import ModelType - from verl.utils.model import print_model_size, update_model_config + from verl.utils.model import print_model_size from verl.utils.megatron.optimizer import get_megatron_optimizer from verl.utils.megatron_utils import get_model, init_megatron_optim_config - from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig - from verl.models.mcore import hf_to_mcore_config - - # Step 1: initialize the tokenizer - local_path = copy_to_local(model_path) - self.tokenizer = hf_tokenizer(local_path) - # Step 2: get the critic_model_config - critic_model_config = AutoConfig.from_pretrained(local_path) - - override_config_kwargs = { - 'bos_token_id': self.tokenizer.bos_token_id, - 'eos_token_id': self.tokenizer.eos_token_id, - 'pad_token_id': self.tokenizer.pad_token_id, - } - override_config_kwargs.update(override_model_config) - self.share_embeddings_and_output_weights = getattr(critic_model_config, "tie_word_embeddings", False) - update_model_config(critic_model_config, override_config_kwargs=override_config_kwargs) - self.architectures = getattr(critic_model_config, "architectures", None) - if self.rank == 0: - print(f'Model config after override: {critic_model_config}') - tf_config = hf_to_mcore_config(critic_model_config, dtype) - if enable_gradient_checkpointing: - gradient_checkpointing_cfg = dict(self.config.model.get('gradient_checkpointing_kwargs', dict())) - tf_config.recompute_method = gradient_checkpointing_cfg['activations_checkpoint_method'] - tf_config.recompute_granularity = gradient_checkpointing_cfg['activations_checkpoint_granularity'] - tf_config.recompute_num_layers = gradient_checkpointing_cfg['activations_checkpoint_num_layers'] - print(f'Critic TF config: {tf_config}') - self.hf_config = critic_model_config - self.tf_config = tf_config + self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config) def megatron_critic_model_provider(pre_process, post_process): from verl.models.mcore import init_mcore_model - parallel_model = init_mcore_model(tf_config, - critic_model_config, + parallel_model = init_mcore_model(self.tf_config, + self.hf_config, pre_process, post_process, share_embeddings_and_output_weights=False, @@ -591,9 +522,9 @@ def megatron_critic_model_provider(pre_process, post_process): is_value_model=True) else: load_megatron_gptmodel_weights(self.config, - critic_model_config, + 
self.hf_config, critic_module, - params_dtype=dtype, + params_dtype=self.dtype, is_value_model=True) t1 = time.time() if torch.distributed.get_rank() == 0: @@ -605,7 +536,7 @@ def megatron_critic_model_provider(pre_process, post_process): optim_config = init_megatron_optim_config(optim_config) critic_optimizer = get_megatron_optimizer(model=critic_module, config=optim_config) torch.cuda.empty_cache() - return critic_module, critic_optimizer, critic_model_config, optim_config + return critic_module, critic_optimizer, self.hf_config, optim_config @register(dispatch_mode=Dispatch.ONE_TO_ALL) def init_model(self): @@ -622,10 +553,8 @@ def init_model(self): self.dtype = PrecisionType.to_dtype(self.param_dtype) self.critic_module, self.critic_optimizer, self.critic_model_config, critic_optimizer_config = self._build_critic_model_optimizer( model_path=self.config.model.path, - dtype=self.dtype, optim_config=self.config.optim, - override_model_config=override_model_config, - enable_gradient_checkpointing=self.config.model.get('enable_gradient_checkpointing', False)) + override_model_config=override_model_config) self.critic = MegatronPPOCritic(config=self.config, model_config=self.critic_model_config, hf_config=self.hf_config, @@ -724,45 +653,25 @@ def __init__(self, config): self.config.micro_batch_size //= mpu.get_data_parallel_world_size() self.config.micro_batch_size_per_gpu = self.config.micro_batch_size - def _build_rm_model(self, model_path, megatron_config: ModelParallelConfig, override_model_config): + def _build_rm_model(self, model_path, override_model_config): from megatron.core.models.gpt.gpt_model import ModelType - from verl.utils.model import update_model_config - from verl.utils.megatron_utils import get_model - from transformers import AutoConfig - - # Step 1: initialize the tokenizer - local_path = copy_to_local(model_path) - self.tokenizer = hf_tokenizer(local_path) - - # Step 2: get the actor_model_config - rm_model_config = AutoConfig.from_pretrained(local_path) - - override_config_kwargs = { - 'bos_token_id': self.tokenizer.bos_token_id, - 'eos_token_id': self.tokenizer.eos_token_id, - 'pad_token_id': self.tokenizer.pad_token_id, - } - override_config_kwargs.update(override_model_config) - update_model_config(rm_model_config, override_config_kwargs=override_config_kwargs) + from verl.utils.model import print_model_size + from verl.utils.megatron.optimizer import get_megatron_optimizer + from verl.utils.megatron_utils import get_model, init_megatron_optim_config - if self.rank == 0: - print(f'Model config after override: rm_model_config {rm_model_config}') + self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config) def megatron_rm_model_provider(pre_process, post_process): - from verl.utils.model import get_parallel_model_from_config - # vpp is not supported yet because it will hang for some reason. 
Need debugging - vpp_rank = mpu.get_virtual_pipeline_model_parallel_rank() # this will be set inside get_model - # this_megatron_config = copy.deepcopy(megatron_config) - # this_megatron_config.virtual_pipeline_model_parallel_rank = vpp_rank - parallel_model = get_parallel_model_from_config(config=rm_model_config, - megatron_config=megatron_config, - pre_process=pre_process, - post_process=post_process, - share_embeddings_and_output_weights=False, - value=True) + from verl.models.mcore import init_mcore_model + parallel_model = init_mcore_model(self.tf_config, + self.hf_config, + pre_process, + post_process, + share_embeddings_and_output_weights=False, + value=True) parallel_model.cuda() return parallel_model - + # Step 3: initialize the megatron model reward_model = get_model(model_provider_func=megatron_rm_model_provider, model_type=ModelType.encoder_or_decoder, @@ -773,15 +682,20 @@ def megatron_rm_model_provider(pre_process, post_process): # reward_model = nn.ModuleList(reward_model) if self.config.load_weight: - load_megatron_model_weights(self.config, - rm_model_config, - reward_model, - params_dtype=megatron_config.params_dtype, + if self.config.megatron.use_dist_checkpointing: + load_mcore_dist_weights(reward_model, + self.config.megatron.dist_checkpointing_path, is_value_model=True) + else: + load_megatron_gptmodel_weights(self.config, + self.hf_config, + reward_model, + params_dtype=self.dtype, + is_value_model=True) # TODO: add more optimizer args into config torch.cuda.empty_cache() - return reward_model, rm_model_config + return reward_model, self.hf_config @register(dispatch_mode=Dispatch.ONE_TO_ALL) def init_model(self): @@ -804,14 +718,10 @@ def init_model(self): rm_tokenizer = hf_tokenizer(rm_tokenizer_local_path) self.param_dtype = torch.bfloat16 - - megatron_config = mcore_model_parallel_config(sequence_parallel=self.config.megatron.get( - 'sequence_parallel', True), - params_dtype=PrecisionType.to_dtype(self.param_dtype)) + self.dtype = PrecisionType.to_dtype(self.param_dtype) reward_model_module, reward_model_config = self._build_rm_model( model_path=self.config.model.path, - megatron_config=megatron_config, override_model_config=override_model_config, ) # FIXME(sgm): reward model param offload is implemented in MegatronRewardModel @@ -819,7 +729,8 @@ def init_model(self): self.rm = MegatronRewardModel(config=self.config, reward_model_module=reward_model_module, model_config=reward_model_config, - megatron_config=megatron_config, + hf_config=self.hf_config, + tf_config=self.tf_config, sft_tokenizer=sft_tokenizer, rm_tokenizer=rm_tokenizer) diff --git a/verl/workers/reward_model/megatron/reward_model.py b/verl/workers/reward_model/megatron/reward_model.py index a890f288435..1b2cbee8ffa 100644 --- a/verl/workers/reward_model/megatron/reward_model.py +++ b/verl/workers/reward_model/megatron/reward_model.py @@ -35,12 +35,14 @@ def __init__(self, config, model_config, reward_model_module: torch.nn.ModuleList, - megatron_config, + hf_config, + tf_config, sft_tokenizer=None, rm_tokenizer=None): self.config = config self.reward_model_module = reward_model_module - self.megatron_config = megatron_config + self.hf_config = hf_config + self.tf_config = tf_config self.model_config = model_config self.device = 'cuda' self.sft_tokenizer = sft_tokenizer @@ -133,7 +135,7 @@ def compute_reward(self, data: DataProto) -> DataProto: with torch.no_grad(): output = self.forward_batch(data) if mpu.is_pipeline_last_stage(ignore_virtual=True): - logits = torch.cat([o['logits'] for o in 
output], dim=0) + logits = torch.cat([output], dim=0) else: logits = torch.empty( (input_ids.shape[0], input_ids.shape[1]), @@ -205,21 +207,27 @@ def forward_batch(self, data: DataProto): input_shapes = compute_transformers_input_shapes( batches, meta_info={ - 'sequence_parallel': self.megatron_config.sequence_parallel, + 'sequence_parallel': self.tf_config.sequence_parallel, 'hidden_size': self.model_config.hidden_size }) # compute input shapes for pp stages forward_backward_func = get_forward_backward_func() def loss_func(output): - return 1., {'logits': output.logits} + return 1., {'logits': output} def forward_step(batch_iter, model): batch = next(batch_iter) input_ids = batch['input_ids'] attention_mask = batch['attention_mask'] position_ids = batch['position_ids'] - output = model(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids) + from verl.models.mcore import gptmodel_forward + + output = gptmodel_forward(model, + input_ids, + attention_mask, + position_ids, + sequence_parallel=self.tf_config.sequence_parallel) return output, loss_func # batch should be a list of batches inside micro-batches From 6c46c2a7e4b3d801bae918b6f3c78c5e51bc1825 Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Sun, 13 Apr 2025 09:43:14 -0700 Subject: [PATCH 04/19] mcore model_forward for registry --- verl/models/mcore/__init__.py | 5 +- verl/models/mcore/config_converter.py | 2 +- verl/models/mcore/model_forward.py | 107 ++++++++++++++++++ verl/models/mcore/registry.py | 20 ++++ verl/models/mcore/{gpt_model.py => util.py} | 42 ------- .../single_controller/base/megatron/worker.py | 10 +- verl/utils/megatron_utils.py | 10 +- verl/workers/actor/megatron_actor.py | 26 ++--- verl/workers/critic/megatron_critic.py | 32 +++--- verl/workers/megatron_workers.py | 34 +++--- .../reward_model/megatron/reward_model.py | 26 +++-- 11 files changed, 197 insertions(+), 117 deletions(-) create mode 100644 verl/models/mcore/model_forward.py rename verl/models/mcore/{gpt_model.py => util.py} (80%) diff --git a/verl/models/mcore/__init__.py b/verl/models/mcore/__init__.py index fbc26864c92..ccb2309f9e5 100644 --- a/verl/models/mcore/__init__.py +++ b/verl/models/mcore/__init__.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .gpt_model import gptmodel_forward -from .registry import init_mcore_model, hf_to_mcore_config +from .registry import init_mcore_model, hf_to_mcore_config, get_mcore_forward_fn -__all__ = ['init_mcore_model', 'hf_to_mcore_config', 'gptmodel_forward'] \ No newline at end of file +__all__ = ['init_mcore_model', 'hf_to_mcore_config', 'get_mcore_forward_fn'] \ No newline at end of file diff --git a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py index d8359ed2284..f25e6211fcc 100644 --- a/verl/models/mcore/config_converter.py +++ b/verl/models/mcore/config_converter.py @@ -19,7 +19,7 @@ from megatron.core.transformer import TransformerConfig import torch import torch.nn.functional as F -from megatron.core.enums import AttnBackend +from megatron.core.transformer.enums import AttnBackend def hf_to_mcore_config_dense(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: diff --git a/verl/models/mcore/model_forward.py b/verl/models/mcore/model_forward.py new file mode 100644 index 00000000000..42fd702efee --- /dev/null +++ b/verl/models/mcore/model_forward.py @@ -0,0 +1,107 @@ +# Copyright 2025 Bytedance Ltd. 
and/or its affiliates +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from verl.utils.megatron import sequence_parallel as sp_utils +from verl.utils.megatron import tensor_parallel as tp_utils +import torch +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core import parallel_state as mpu +from verl.utils.megatron_utils import unwrap_model +from .util import preprocess_packed_seqs, postprocess_packed_seqs, remove_left_padding, recover_left_padding + + +def gptmodel_forward_dense(model, + input_ids, + attention_mask, + position_ids, + sequence_parallel, + value_model=False, + pack_seqs=True): + pre_process = unwrap_model(model).pre_process + post_process = unwrap_model(model).post_process + if pack_seqs: + batch_size, seq_len = attention_mask.shape[:2] + input_ids_rmpad, packed_seq_params = preprocess_packed_seqs(input_ids, attention_mask, pre_process=pre_process) + input_ids_rmpad = input_ids_rmpad.contiguous() + output_orig = model(input_ids=input_ids_rmpad, + attention_mask=None, + position_ids=position_ids, + packed_seq_params=packed_seq_params) + + output = postprocess_packed_seqs(output_orig, + packed_seq_params, + attention_mask, + batch_size, + seq_len, + post_process=post_process) + else: + batch_size, sequence_length = attention_mask.shape + new_input_ids, new_attention_mask, new_position_ids = remove_left_padding(input_ids, + attention_mask, + position_ids, + sequence_parallel, + pre_process=pre_process) + output = model(input_ids=new_input_ids, attention_mask=new_attention_mask, position_ids=new_position_ids) + output = recover_left_padding(output, + new_attention_mask, + attention_mask, + sequence_length, + post_process=post_process) + if value_model and post_process: + output = output[..., 0] + return output + + +def gptmodel_forward_qwen2_moe(model, + input_ids, + attention_mask, + position_ids, + sequence_parallel, + value_model=False, + pack_seqs=True): + return gptmodel_forward_dense(model, input_ids, attention_mask, position_ids, sequence_parallel, value_model, + pack_seqs) + + +def gptmodel_forward_llama4(model, + input_ids, + attention_mask, + position_ids, + sequence_parallel, + value_model=False, + pack_seqs=True): + return gptmodel_forward_dense(model, input_ids, attention_mask, position_ids, sequence_parallel, value_model, + pack_seqs) + + +def gptmodel_forward_dpskv3(model, + input_ids, + attention_mask, + position_ids, + sequence_parallel, + value_model=False, + pack_seqs=True): + return gptmodel_forward_dense(model, input_ids, attention_mask, position_ids, sequence_parallel, value_model, + pack_seqs) + + +def gptmodel_forward_qwen2_5_vl(model, + input_ids, + attention_mask, + position_ids, + sequence_parallel, + value_model=False, + pack_seqs=True): + raise NotImplementedError("VLM is not supported yet") diff --git a/verl/models/mcore/registry.py b/verl/models/mcore/registry.py index a2a92924b00..a54363026fe 100644 --- a/verl/models/mcore/registry.py +++ 
b/verl/models/mcore/registry.py @@ -63,3 +63,23 @@ def init_mcore_model( f"Supported architectures: {MODEL_INITIALIZER_REGISTRY.keys()}") return MODEL_INITIALIZER_REGISTRY[arch](tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, value, **extra_kwargs) + + +from .model_forward import gptmodel_forward_dense, gptmodel_forward_qwen2_moe, gptmodel_forward_llama4, gptmodel_forward_dpskv3, gptmodel_forward_qwen2_5_vl + + +def get_mcore_forward_fn(hf_config: PretrainedConfig): + MODEL_FORWARD_REGISTRY = { + "LlamaForCausalLM": gptmodel_forward_dense, + "Qwen2ForCausalLM": gptmodel_forward_dense, + "Qwen2MoeForCausalLM": gptmodel_forward_qwen2_moe, + "DeepseekV3ForCausalLM": gptmodel_forward_dpskv3, + "Qwen2_5_VLForConditionalGeneration": gptmodel_forward_qwen2_5_vl, + "Llama4ForConditionalGeneration": gptmodel_forward_llama4, + } + assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" + arch = hf_config.architectures[0] + if arch not in MODEL_FORWARD_REGISTRY: + raise ValueError(f"Model architectures {arch} forward function are not supported for now. " + f"Supported architectures: {MODEL_FORWARD_REGISTRY.keys()}") + return MODEL_FORWARD_REGISTRY[arch] diff --git a/verl/models/mcore/gpt_model.py b/verl/models/mcore/util.py similarity index 80% rename from verl/models/mcore/gpt_model.py rename to verl/models/mcore/util.py index 814dd3e8ac0..fcf406d4475 100644 --- a/verl/models/mcore/gpt_model.py +++ b/verl/models/mcore/util.py @@ -21,48 +21,6 @@ from verl.utils.megatron_utils import unwrap_model -def gptmodel_forward(model, - input_ids, - attention_mask, - position_ids, - sequence_parallel, - value_model=False, - pack_seqs=True): - pre_process = unwrap_model(model).pre_process - post_process = unwrap_model(model).post_process - if pack_seqs: - batch_size, seq_len = attention_mask.shape[:2] - input_ids_rmpad, packed_seq_params = preprocess_packed_seqs(input_ids, attention_mask, pre_process=pre_process) - input_ids_rmpad = input_ids_rmpad.contiguous() - output_orig = model(input_ids=input_ids_rmpad, - attention_mask=None, - position_ids=position_ids, - packed_seq_params=packed_seq_params) - - output = postprocess_packed_seqs(output_orig, - packed_seq_params, - attention_mask, - batch_size, - seq_len, - post_process=post_process) - else: - batch_size, sequence_length = attention_mask.shape - new_input_ids, new_attention_mask, new_position_ids = remove_left_padding(input_ids, - attention_mask, - position_ids, - sequence_parallel, - pre_process=pre_process) - output = model(input_ids=new_input_ids, attention_mask=new_attention_mask, position_ids=new_position_ids) - output = recover_left_padding(output, - new_attention_mask, - attention_mask, - sequence_length, - post_process=post_process) - if value_model and post_process: - output = output[..., 0] - return output - - def preprocess_packed_seqs(input_ids: torch.Tensor, attention_mask: torch.Tensor, pre_process: bool = True) -> tuple[torch.Tensor, PackedSeqParams]: diff --git a/verl/single_controller/base/megatron/worker.py b/verl/single_controller/base/megatron/worker.py index c6594ee4618..af9f612ead0 100644 --- a/verl/single_controller/base/megatron/worker.py +++ b/verl/single_controller/base/megatron/worker.py @@ -38,10 +38,7 @@ def get_megatron_rank_info(self): info = DistRankInfo(tp_rank=tp_rank, dp_rank=dp_rank, pp_rank=pp_rank, cp_rank=cp_rank) return info - def _init_hf_config_and_tf_config(self, - model_path, - dtype, - override_model_config): + def 
_init_hf_config_and_tf_config(self, model_path, dtype, override_model_config): from verl.utils.model import print_model_size, update_model_config from verl.utils.fs import copy_to_local from verl.utils import hf_tokenizer @@ -74,11 +71,12 @@ def add_optimization_config_to_tf_config(tf_config, verl_model_config): if verl_model_config.get('enable_gradient_checkpointing', False): gradient_checkpointing_cfg = dict(verl_model_config.get('gradient_checkpointing_kwargs', dict())) tf_config.recompute_method = gradient_checkpointing_cfg.get('activations_checkpoint_method', 'full') - tf_config.recompute_granularity = gradient_checkpointing_cfg.get('activations_checkpoint_granularity', 'full') + tf_config.recompute_granularity = gradient_checkpointing_cfg.get('activations_checkpoint_granularity', + 'full') tf_config.recompute_num_layers = gradient_checkpointing_cfg.get('activations_checkpoint_num_layers', -1) add_optimization_config_to_tf_config(tf_config, self.config.model) print(f'TF config: {tf_config}') self.hf_config = hf_config - self.tf_config = tf_config \ No newline at end of file + self.tf_config = tf_config diff --git a/verl/utils/megatron_utils.py b/verl/utils/megatron_utils.py index ffeb114f6d2..0dfb5ec1117 100644 --- a/verl/utils/megatron_utils.py +++ b/verl/utils/megatron_utils.py @@ -219,9 +219,11 @@ def mcore_model_parallel_config( ) -> ModelParallelConfig: # WARNING: Code should not reach this point. This function is deprecated and will be removed. # Please use hf_to_mcore_config_dense() from verl.models.mcore.config_converter instead. - warnings.warn("Code should not reach this point. This function is deprecated and will be removed. " - "Please use hf_to_mcore_config_dense() from verl.models.mcore.config_converter instead.", - DeprecationWarning, stacklevel=2) + warnings.warn( + "Code should not reach this point. This function is deprecated and will be removed. 
" + "Please use hf_to_mcore_config_dense() from verl.models.mcore.config_converter instead.", + DeprecationWarning, + stacklevel=2) return ModelParallelConfig( tensor_model_parallel_size=mpu.get_tensor_model_parallel_world_size(), pipeline_model_parallel_size=mpu.get_pipeline_model_parallel_world_size(), @@ -306,4 +308,4 @@ def get_rng_states_checkpoint_path(checkpoint_path, only_rank0_save=True): tp_rank = mpu.get_tensor_model_parallel_rank() cp_rank = mpu.get_context_parallel_rank() return os.path.join(checkpoint_path, f'rng_states', - f"rng_states_pp{pp_rank}_tp{tp_rank}_cp{cp_rank}_dp{dp_rank}.pt") \ No newline at end of file + f"rng_states_pp{pp_rank}_tp{tp_rank}_cp{cp_rank}_dp{dp_rank}.pt") diff --git a/verl/workers/actor/megatron_actor.py b/verl/workers/actor/megatron_actor.py index 39fe6aa2d51..b8c328cfaca 100644 --- a/verl/workers/actor/megatron_actor.py +++ b/verl/workers/actor/megatron_actor.py @@ -239,12 +239,11 @@ def forward_backward_batch(self, data: DataProto, forward_only=False, post_proce batch_size = self.config.ppo_micro_batch_size_per_gpu batches = split_dict_tensor_into_batches(data.batch, batch_size=batch_size) # compute input shapes for pp stages - input_shapes = compute_transformers_input_shapes( - batches, - meta_info={ - 'sequence_parallel': self.tf_config.sequence_parallel, - 'hidden_size': self.model_config.hidden_size - }) + input_shapes = compute_transformers_input_shapes(batches, + meta_info={ + 'sequence_parallel': self.tf_config.sequence_parallel, + 'hidden_size': self.model_config.hidden_size + }) n_micro_batch = len(batches) seq_len = batches[0]['input_ids'].shape[1] @@ -317,13 +316,14 @@ def forward_step(batch_iter, model): input_ids = batch['input_ids'] attention_mask = batch['attention_mask'] position_ids = batch['position_ids'] - from verl.models.mcore import gptmodel_forward - - output = gptmodel_forward(model, - input_ids, - attention_mask, - position_ids, - sequence_parallel=self.tf_config.sequence_parallel) + from verl.models.mcore import get_mcore_forward_fn + forward_fn = get_mcore_forward_fn(self.hf_config) + + output = forward_fn(model, + input_ids, + attention_mask, + position_ids, + sequence_parallel=self.tf_config.sequence_parallel) if forward_only: meta_info = None else: diff --git a/verl/workers/critic/megatron_critic.py b/verl/workers/critic/megatron_critic.py index af1cbcc4db7..be69a938c55 100644 --- a/verl/workers/critic/megatron_critic.py +++ b/verl/workers/critic/megatron_critic.py @@ -47,8 +47,8 @@ def __init__(self, config, model_config, hf_config, tf_config, critic_module: nn super().__init__(config=config) self._validate_config(config) self.model_config = model_config - self.hf_config = hf_config # huggingface config - self.tf_config = tf_config # mcore transformer config + self.hf_config = hf_config # huggingface config + self.tf_config = tf_config # mcore transformer config self.critic_module = critic_module self.critic_optimizer = critic_optimizer @@ -120,12 +120,11 @@ def forward_backward_batch(self, data: DataProto, forward_only=False): seq_len = batches[0]['input_ids'].shape[1] # compute input shapes for pp stages - input_shapes = compute_transformers_input_shapes( - batches, - meta_info={ - 'sequence_parallel': self.tf_config.sequence_parallel, - 'hidden_size': self.model_config.hidden_size - }) + input_shapes = compute_transformers_input_shapes(batches, + meta_info={ + 'sequence_parallel': self.tf_config.sequence_parallel, + 'hidden_size': self.model_config.hidden_size + }) forward_backward_func = 
get_forward_backward_func() @@ -164,14 +163,15 @@ def forward_step(batch_iter, model): input_ids = batch['input_ids'] attention_mask = batch['attention_mask'] position_ids = batch['position_ids'] - from verl.models.mcore import gptmodel_forward - - output = gptmodel_forward(model, - input_ids, - attention_mask, - position_ids, - sequence_parallel=self.tf_config.sequence_parallel, - value_model=True) + from verl.models.mcore import get_mcore_forward_fn + forward_fn = get_mcore_forward_fn(self.hf_config) + + output = forward_fn(model, + input_ids, + attention_mask, + position_ids, + sequence_parallel=self.tf_config.sequence_parallel, + value_model=True) return output, partial(loss_func, data=batch, meta_info={}) diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py index 76572de2026..46df4960d0f 100644 --- a/verl/workers/megatron_workers.py +++ b/verl/workers/megatron_workers.py @@ -133,10 +133,7 @@ def __init__(self, config: DictConfig, role: str): self.config.ref.ppo_micro_batch_size_per_gpu = self.config.ref.ppo_micro_batch_size self._is_offload_param = self.config.ref.get('param_offload', False) - def _build_model_optimizer(self, - model_path, - optim_config, - override_model_config): + def _build_model_optimizer(self, model_path, optim_config, override_model_config): from verl.utils.megatron.optimizer import get_megatron_optimizer from megatron.core.models.gpt.gpt_model import ModelType from verl.utils.model import print_model_size, get_generation_config @@ -292,13 +289,14 @@ def init_model(self): ) if self._is_actor: - self.actor = MegatronPPOActor(config=self.config.actor, - model_config=self.actor_model_config, - hf_config=self.hf_config, - tf_config=self.tf_config, - actor_module=self.actor_module, - actor_optimizer=self.actor_optimizer, - ) + self.actor = MegatronPPOActor( + config=self.config.actor, + model_config=self.actor_model_config, + hf_config=self.hf_config, + tf_config=self.tf_config, + actor_module=self.actor_module, + actor_optimizer=self.actor_optimizer, + ) if self._is_rollout: self.rollout, self.sharding_manager = self._build_rollout() @@ -307,7 +305,8 @@ def init_model(self): self.ref_module, self.ref_model_config = self._build_model_optimizer( model_path=self.config.model.path, optim_config=None, - override_model_config=override_model_config,) + override_model_config=override_model_config, + ) self.ref_policy = MegatronPPOActor(config=self.config.ref, model_config=self.ref_model_config, hf_config=self.hf_config, @@ -483,10 +482,7 @@ def __init__(self, config): # TODO(sgm): support critic model offload - def _build_critic_model_optimizer(self, - model_path, - optim_config, - override_model_config): + def _build_critic_model_optimizer(self, model_path, optim_config, override_model_config): from megatron.core.models.gpt.gpt_model import ModelType from verl.utils.model import print_model_size from verl.utils.megatron.optimizer import get_megatron_optimizer @@ -671,7 +667,7 @@ def megatron_rm_model_provider(pre_process, post_process): value=True) parallel_model.cuda() return parallel_model - + # Step 3: initialize the megatron model reward_model = get_model(model_provider_func=megatron_rm_model_provider, model_type=ModelType.encoder_or_decoder, @@ -683,9 +679,7 @@ def megatron_rm_model_provider(pre_process, post_process): if self.config.load_weight: if self.config.megatron.use_dist_checkpointing: - load_mcore_dist_weights(reward_model, - self.config.megatron.dist_checkpointing_path, - is_value_model=True) + 
load_mcore_dist_weights(reward_model, self.config.megatron.dist_checkpointing_path, is_value_model=True) else: load_megatron_gptmodel_weights(self.config, self.hf_config, diff --git a/verl/workers/reward_model/megatron/reward_model.py b/verl/workers/reward_model/megatron/reward_model.py index 1b2cbee8ffa..54027dfcc11 100644 --- a/verl/workers/reward_model/megatron/reward_model.py +++ b/verl/workers/reward_model/megatron/reward_model.py @@ -204,12 +204,11 @@ def forward_batch(self, data: DataProto): seq_len = batches[0]['input_ids'].shape[1] # compute input shapes for pp stages - input_shapes = compute_transformers_input_shapes( - batches, - meta_info={ - 'sequence_parallel': self.tf_config.sequence_parallel, - 'hidden_size': self.model_config.hidden_size - }) + input_shapes = compute_transformers_input_shapes(batches, + meta_info={ + 'sequence_parallel': self.tf_config.sequence_parallel, + 'hidden_size': self.model_config.hidden_size + }) # compute input shapes for pp stages forward_backward_func = get_forward_backward_func() @@ -221,13 +220,16 @@ def forward_step(batch_iter, model): input_ids = batch['input_ids'] attention_mask = batch['attention_mask'] position_ids = batch['position_ids'] - from verl.models.mcore import gptmodel_forward + from verl.models.mcore import get_mcore_forward_fn + forward_fn = get_mcore_forward_fn(self.hf_config) + + output = forward_fn(model, + input_ids, + attention_mask, + position_ids, + sequence_parallel=self.tf_config.sequence_parallel, + value_model=True) - output = gptmodel_forward(model, - input_ids, - attention_mask, - position_ids, - sequence_parallel=self.tf_config.sequence_parallel) return output, loss_func # batch should be a list of batches inside micro-batches From e709dc3676032060f859fdb07a1408437cce60f7 Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Wed, 16 Apr 2025 03:35:39 -0700 Subject: [PATCH 05/19] (WIP) support qwen2moe --- verl/models/mcore/__init__.py | 4 +- verl/models/mcore/config_converter.py | 57 ++++++- verl/models/mcore/model_initializer.py | 34 ++++- verl/models/mcore/registry.py | 18 +++ verl/models/mcore/saver.py | 8 + verl/models/mcore/weight_converter.py | 143 ++++++++++++++++++ verl/models/weight_loader_registry.py | 7 +- verl/utils/model.py | 1 + verl/workers/actor/megatron_actor.py | 4 +- verl/workers/critic/megatron_critic.py | 2 +- verl/workers/megatron_workers.py | 5 +- .../workers/sharding_manager/megatron_vllm.py | 56 ++++++- 12 files changed, 316 insertions(+), 23 deletions(-) create mode 100644 verl/models/mcore/weight_converter.py diff --git a/verl/models/mcore/__init__.py b/verl/models/mcore/__init__.py index ccb2309f9e5..fbc2dc566bf 100644 --- a/verl/models/mcore/__init__.py +++ b/verl/models/mcore/__init__.py @@ -13,6 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
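For reference, the registry added in verl/models/mcore/registry.py dispatches purely on hf_config.architectures[0]. A minimal sketch of the intended call pattern, assuming a config loaded with transformers.AutoConfig and a GPTModel already built through init_mcore_model (the model and batch tensors below are placeholders, not defined here):

    from transformers import AutoConfig
    from verl.models.mcore import get_mcore_forward_fn

    hf_config = AutoConfig.from_pretrained("Qwen/Qwen2-7B-Instruct")  # example of a supported architecture
    forward_fn = get_mcore_forward_fn(hf_config)  # raises ValueError for unsupported architectures
    # output = forward_fn(model, input_ids, attention_mask, position_ids,
    #                     sequence_parallel=tf_config.sequence_parallel)

This mirrors the call sites rewritten above in megatron_actor.py, megatron_critic.py and reward_model.py.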
-from .registry import init_mcore_model, hf_to_mcore_config, get_mcore_forward_fn +from .registry import init_mcore_model, hf_to_mcore_config, get_mcore_forward_fn, get_mcore_weight_converter -__all__ = ['init_mcore_model', 'hf_to_mcore_config', 'get_mcore_forward_fn'] \ No newline at end of file +__all__ = ['init_mcore_model', 'hf_to_mcore_config', 'get_mcore_forward_fn', 'get_mcore_weight_converter'] \ No newline at end of file diff --git a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py index f25e6211fcc..e95d4ca73ba 100644 --- a/verl/models/mcore/config_converter.py +++ b/verl/models/mcore/config_converter.py @@ -65,8 +65,61 @@ def hf_to_mcore_config_dense(hf_config: PretrainedConfig, dtype: torch.dtype) -> def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: - # Qwen2MoeForCausalLM - raise NotImplementedError("Qwen2MoeForCausalLM is not supported yet") + from megatron.core import parallel_state as mpu + overlap_p2p_comm = mpu.get_virtual_pipeline_model_parallel_world_size( + ) is not None and mpu.get_virtual_pipeline_model_parallel_world_size() > 1 + batch_p2p_comm = False + transformer_config = TransformerConfig( + num_layers=hf_config.num_hidden_layers, + hidden_size=hf_config.hidden_size, + num_attention_heads=hf_config.num_attention_heads, + num_query_groups=hf_config.num_key_value_heads, + attention_dropout=hf_config.attention_dropout, + hidden_dropout=getattr(hf_config, 'hidden_dropout', 0.0), + activation_func=F.silu, + normalization='RMSNorm', + gated_linear_unit=True, + use_cpu_initialization=False, + add_bias_linear=False, + pipeline_dtype=dtype, + params_dtype=dtype, + variable_seq_lengths=True, + masked_softmax_fusion=True, + attention_backend=AttnBackend.flash, + bf16=dtype is torch.bfloat16, + layernorm_epsilon=hf_config.rms_norm_eps, + + # parallel config + tensor_model_parallel_size=mpu.get_tensor_model_parallel_world_size(), + pipeline_model_parallel_size=mpu.get_pipeline_model_parallel_world_size(), + virtual_pipeline_model_parallel_size=mpu.get_virtual_pipeline_model_parallel_world_size(), + context_parallel_size=mpu.get_context_parallel_world_size(), + overlap_p2p_comm=overlap_p2p_comm, + batch_p2p_comm=batch_p2p_comm, + sequence_parallel=mpu.get_tensor_model_parallel_world_size() > 1, + + # moe specific + ffn_hidden_size=hf_config.moe_intermediate_size, + moe_token_dispatcher_type="alltoall", + moe_router_bias_update_rate=0.001, + moe_router_topk=hf_config.num_experts_per_tok, + num_moe_experts=hf_config.num_experts, + moe_shared_expert_intermediate_size=hf_config.shared_expert_intermediate_size, + # moe_aux_loss_coeff=hf_config.router_aux_loss_coef, + moe_aux_loss_coeff=0.0, + moe_router_load_balancing_type="aux_loss", + moe_router_pre_softmax=False, #? 
+ moe_shared_expert_overlap=True, + # moe_permute_fusion=True, + moe_grouped_gemm=True, + + # mcore 0.12 + moe_router_dtype="fp64", + disable_bf16_reduced_precision_matmul=True, + + # qwen specific + add_qkv_bias=True) + return transformer_config def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: diff --git a/verl/models/mcore/model_initializer.py b/verl/models/mcore/model_initializer.py index 4ae2fe4e5d4..48ea980fd6b 100644 --- a/verl/models/mcore/model_initializer.py +++ b/verl/models/mcore/model_initializer.py @@ -54,8 +54,38 @@ def init_mcore_model_qwen2_moe(tfconfig, post_process=None, share_embeddings_and_output_weights=False, value=False): - return init_mcore_model_dense(tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, - value) + + from megatron.core.models.gpt.gpt_model import GPTModel + from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec + use_te = True + + def patch_layer_spec(transformer_layer_spec): + # shared_experts.gate=True + for i in range(len(transformer_layer_spec.layer_specs)): + transformer_layer_spec.layer_specs[i].submodules.mlp.submodules.shared_experts.params['gate'] = True + return transformer_layer_spec + + assert tfconfig.normalization == "RMSNorm", 'only RMSNorm is supported for now' + transformer_layer_spec = get_gpt_decoder_block_spec(tfconfig, use_transformer_engine=use_te) + transformer_layer_spec = patch_layer_spec(transformer_layer_spec) + rope_scaling_args = {} + if hf_config.rope_scaling is not None: + assert hf_config.rope_scaling['type'] == 'linear', "only linear scaling is supported for now" + rope_scaling_args['seq_len_interpolation_factor'] = hf_config.rope_scaling['factor'] + model = GPTModel(config=tfconfig, + transformer_layer_spec=transformer_layer_spec, + vocab_size=hf_config.vocab_size, + max_sequence_length=hf_config.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + share_embeddings_and_output_weights=share_embeddings_and_output_weights, + position_embedding_type='rope', + rotary_base=hf_config.rope_theta, + **rope_scaling_args) + if post_process and value: + from verl.models.llama.megatron.layers.parallel_linear import LinearForLastLayer + model.output_layer = LinearForLastLayer(input_size=tfconfig.hidden_size, output_size=1, config=tfconfig) + return model def init_mcore_model_llama4(tfconfig, diff --git a/verl/models/mcore/registry.py b/verl/models/mcore/registry.py index a54363026fe..ab02c69e5be 100644 --- a/verl/models/mcore/registry.py +++ b/verl/models/mcore/registry.py @@ -83,3 +83,21 @@ def get_mcore_forward_fn(hf_config: PretrainedConfig): raise ValueError(f"Model architectures {arch} forward function are not supported for now. " f"Supported architectures: {MODEL_FORWARD_REGISTRY.keys()}") return MODEL_FORWARD_REGISTRY[arch] + + +from .weight_converter import McoreToHFWeightConverterDense, McoreToHFWeightConverterQwen2Moe + + +def get_mcore_weight_converter(hf_config: PretrainedConfig, dtype: torch.dtype): + MODEL_WEIGHT_CONVERTER_REGISTRY = { + "LlamaForCausalLM": McoreToHFWeightConverterDense, + "Qwen2ForCausalLM": McoreToHFWeightConverterDense, + "Qwen2MoeForCausalLM": McoreToHFWeightConverterQwen2Moe, + } + assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" + arch = hf_config.architectures[0] + if arch not in MODEL_WEIGHT_CONVERTER_REGISTRY: + raise ValueError(f"Model architectures {arch} weight converter are not supported for now. 
" + f"Supported architectures: {MODEL_WEIGHT_CONVERTER_REGISTRY.keys()}") + tfconfig = hf_to_mcore_config(hf_config, dtype) + return MODEL_WEIGHT_CONVERTER_REGISTRY[arch](hf_config, tfconfig) diff --git a/verl/models/mcore/saver.py b/verl/models/mcore/saver.py index 5598ab2d2cc..d4ab4610cc5 100644 --- a/verl/models/mcore/saver.py +++ b/verl/models/mcore/saver.py @@ -466,3 +466,11 @@ def _broadcast_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name, src_pp_rank): print_rank_0(f"merge megatron ckpt done, time elapsed {time.time() - start_time}s") return state_dict + + +def merge_megatron_ckpt_gptmodel_qwen_moe(wrapped_models, + config, + dtype, + is_value_model=False, + tie_word_embeddings=False): + raise NotImplementedError("merge_megatron_ckpt_gptmodel_qwen_moe is not implemented") diff --git a/verl/models/mcore/weight_converter.py b/verl/models/mcore/weight_converter.py new file mode 100644 index 00000000000..155361a532a --- /dev/null +++ b/verl/models/mcore/weight_converter.py @@ -0,0 +1,143 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# online convert mcore weight to pure huggingface weight, no any fusion +# including format conversion and name mapping +# not including resharding +import torch +from transformers import PretrainedConfig +from megatron.core.transformer import TransformerConfig + + +class McoreToHFWeightConverterBase: + + def __init__(self, hf_config: PretrainedConfig, mcore_config: TransformerConfig): + self.hf_config = hf_config + self.mcore_config = mcore_config + + def convert_param(self, name: str, params_one_group: list[torch.Tensor]) -> torch.Tensor: + raise NotImplementedError + + +class McoreToHFWeightConverterDense(McoreToHFWeightConverterBase): + + def _convert_attention_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]: + # 'decoder.layers.0.self_attention.linear_proj.weight' + # 'decoder.layers.0.self_attention.linear_qkv.layer_norm_weight' + # 'decoder.layers.0.self_attention.linear_qkv.weight' + # 'decoder.layers.0.self_attention.linear_qkv.bias' + layer_number = name.split('.')[2] + convert_names = [] + if "self_attention.linear_qkv.bias" in name or "self_attention.linear_qkv.weight" in name: + param_type = name.split('.')[-1] + assert param_type == 'bias' or param_type == 'weight' + convert_names.append(f'model.layers.{layer_number}.self_attn.q_proj.{param_type}') + convert_names.append(f'model.layers.{layer_number}.self_attn.k_proj.{param_type}') + convert_names.append(f'model.layers.{layer_number}.self_attn.v_proj.{param_type}') + assert len(params) == 3 + elif "self_attention.linear_proj.weight" in name: + convert_names.append(f'model.layers.{layer_number}.self_attn.o_proj.weight') + assert len(params) == 1 + elif "self_attention.linear_qkv.layer_norm_weight" in name: + convert_names.append(f'model.layers.{layer_number}.input_layernorm.weight') + assert len(params) == 1 + else: + raise 
NotImplementedError(f"Unsupported parameter name: {name}") + return convert_names, params + + def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]: + # 'decoder.layers.0.mlp.linear_fc1.layer_norm_weight' + # 'decoder.layers.0.mlp.linear_fc1.weight' + # 'decoder.layers.0.mlp.linear_fc2.weight' + layer_number = name.split('.')[2] + convert_names = [] + if "mlp.linear_fc1.weight" in name: + # split gate_proj and up_proj + convert_names.append(f'model.layers.{layer_number}.mlp.gate_proj.weight') + convert_names.append(f'model.layers.{layer_number}.mlp.up_proj.weight') + assert len(params) == 2 + elif "mlp.linear_fc1.layer_norm_weight" in name: + convert_names.append(f'model.layers.{layer_number}.post_attention_layernorm.weight') + assert len(params) == 1 + elif "mlp.linear_fc2.weight" in name: + convert_names.append(f'model.layers.{layer_number}.mlp.down_proj.weight') + assert len(params) == 1 + else: + raise NotImplementedError(f"Unsupported parameter name: {name}") + return convert_names, params + + def convert_param(self, name: str, params_one_group: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]: + direct_name_mapping = { + "embedding.word_embeddings.weight": "model.embed_tokens.weight", + "decoder.final_layernorm.weight": "model.norm.weight", + "output_layer.weight": "lm_head.weight", + } + if name in direct_name_mapping: + return [direct_name_mapping[name]], [params_one_group[0]] + + if "self_attention" in name: + return self._convert_attention_param(name, params_one_group) + elif "mlp" in name: + return self._convert_mlp_param(name, params_one_group) + else: + raise NotImplementedError(f"Unsupported parameter name: {name}") + + +class McoreToHFWeightConverterQwen2Moe(McoreToHFWeightConverterDense): + + def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]: + # 'decoder.layers.0.pre_mlp_layernorm.weight', + # 'decoder.layers.0.mlp.router.weight', + # 'decoder.layers.0.mlp.shared_experts.gate_weight', + # 'decoder.layers.0.mlp.shared_experts.linear_fc1.weight', + # 'decoder.layers.0.mlp.shared_experts.linear_fc2.weight' + # moe1 + # 'decoder.layers.0.mlp.experts.linear_fc1.weight0', + # 'decoder.layers.0.mlp.experts.linear_fc1.weight1', + # 'decoder.layers.0.mlp.experts.linear_fc1.weight2', + # 'decoder.layers.0.mlp.experts.linear_fc1.weight3', + # moe2 + # 'decoder.layers.0.mlp.experts.linear_fc2.weight0', + # 'decoder.layers.0.mlp.experts.linear_fc2.weight1', + layer_number = name.split('.')[2] + convert_names = [] + if "pre_mlp_layernorm" in name: + convert_names.append(f'model.layers.{layer_number}.post_attention_layernorm.weight') + assert len(params) == 1 + elif "mlp.router.weight" in name: + convert_names.append(f'model.layers.{layer_number}.mlp.gate.weight') + assert len(params) == 1 + elif "shared_experts.gate_weight" in name: + convert_names.append(f'model.layers.{layer_number}.mlp.shared_expert_gate.weight') + assert len(params) == 1 + elif "shared_experts.linear_fc1.weight" in name: # split gate_proj and up_proj + convert_names.append(f'model.layers.{layer_number}.mlp.shared_expert.gate_proj.weight') + convert_names.append(f'model.layers.{layer_number}.mlp.shared_expert.up_proj.weight') + assert len(params) == 2 + elif "shared_experts.linear_fc2.weight" in name: + convert_names.append(f'model.layers.{layer_number}.mlp.shared_expert.down_proj.weight') + assert len(params) == 1 + elif "mlp.experts.linear_fc1" in name: # split gate_proj and up_proj + expert_id 
= name.split('weight')[-1] + convert_names.append(f'model.layers.{layer_number}.mlp.experts.{expert_id}.gate_proj.weight') + convert_names.append(f'model.layers.{layer_number}.mlp.experts.{expert_id}.up_proj.weight') + assert len(params) == 2 + elif "mlp.experts.linear_fc2" in name: + expert_id = name.split('weight')[-1] + convert_names.append(f'model.layers.{layer_number}.mlp.experts.{expert_id}.down_proj.weight') + assert len(params) == 1 + else: + raise NotImplementedError(f"Unsupported parameter name: {name}") + return convert_names, params diff --git a/verl/models/weight_loader_registry.py b/verl/models/weight_loader_registry.py index a700761268b..8687b0cac59 100644 --- a/verl/models/weight_loader_registry.py +++ b/verl/models/weight_loader_registry.py @@ -14,8 +14,6 @@ def get_weight_loader(arch: str): - from verl.models.llama.megatron.checkpoint_utils.llama_loader import load_state_dict_to_megatron_llama - from verl.models.qwen2.megatron.checkpoint_utils.qwen2_loader import load_state_dict_to_megatron_qwen2 from verl.models.mcore.loader import load_state_dict_to_megatron_gptmodel _MODEL_WEIGHT_MEGATRON_LOADER_REGISTRY = { 'LlamaForCausalLM': load_state_dict_to_megatron_gptmodel, @@ -29,12 +27,11 @@ def get_weight_loader(arch: str): def get_weight_saver(arch: str): - from verl.models.llama.megatron.checkpoint_utils.llama_saver import merge_megatron_ckpt_llama - from verl.models.qwen2.megatron.checkpoint_utils.qwen2_saver import merge_megatron_ckpt_qwen2 - from verl.models.mcore.saver import merge_megatron_ckpt_gptmodel + from verl.models.mcore.saver import merge_megatron_ckpt_gptmodel, merge_megatron_ckpt_gptmodel_qwen_moe _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY = { 'LlamaForCausalLM': merge_megatron_ckpt_gptmodel, 'Qwen2ForCausalLM': merge_megatron_ckpt_gptmodel, + "Qwen2MoeForCausalLM": merge_megatron_ckpt_gptmodel_qwen_moe, } if arch in _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY: return _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY[arch] diff --git a/verl/utils/model.py b/verl/utils/model.py index fbe26691e8e..52cb5a06af7 100644 --- a/verl/utils/model.py +++ b/verl/utils/model.py @@ -210,6 +210,7 @@ def normalize_model_name(name, pp_rank, vpp_rank, pp_size, vpp_size, num_layers) """ Transform the model name in each model_chunk in each pp stage into the name in inference engine """ + # TODO for mcore uneven pp/vpp, things are different if vpp_size > 1: # print(f'try to bind vpp params to inference engine...') layers_per_pp = num_layers // pp_size diff --git a/verl/workers/actor/megatron_actor.py b/verl/workers/actor/megatron_actor.py index 7cdc268e836..a5c34111fb4 100644 --- a/verl/workers/actor/megatron_actor.py +++ b/verl/workers/actor/megatron_actor.py @@ -259,9 +259,9 @@ def forward_backward_batch(self, data: DataProto, forward_only=False, post_proce def loss_func(output, data, meta_info): if forward_only: if post_process_fn is None: - return 1.0, {'logits': output} + return torch.tensor(1.0, device=output.device), {'logits': output} else: - return 1.0, post_process_fn(output, data) + return torch.tensor(1.0, device=output.device), post_process_fn(output, data) responses = data['responses'] response_length = responses.size(1) diff --git a/verl/workers/critic/megatron_critic.py b/verl/workers/critic/megatron_critic.py index 9f5b08b6281..30f8e4c64eb 100644 --- a/verl/workers/critic/megatron_critic.py +++ b/verl/workers/critic/megatron_critic.py @@ -137,7 +137,7 @@ def forward_backward_batch(self, data: DataProto, forward_only=False): def loss_func(output, data, meta_info): if forward_only: - 
return 1.0, {'vpreds': output} + return torch.tensor(1.0, device=output.device), {'vpreds': output} responses = data['responses'] attention_mask = data['attention_mask'] diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py index 55775caf196..200ffd77274 100644 --- a/verl/workers/megatron_workers.py +++ b/verl/workers/megatron_workers.py @@ -255,10 +255,13 @@ def _build_rollout(self, trust_remote_code=False): log_gpu_memory_usage('After building vllm rollout', logger=logger) # perform weight resharding between actor and rollout + from verl.models.mcore import get_mcore_weight_converter + weight_converter = get_mcore_weight_converter(self.actor_model_config, self.dtype) sharding_manager = MegatronVLLMShardingManager(module=self.hybrid_engine, inference_engine=rollout.inference_engine, model_config=self.actor_model_config, - layer_name_mapping=layer_name_mapping) + layer_name_mapping=layer_name_mapping, + weight_converter=weight_converter) log_gpu_memory_usage('After building sharding manager', logger=logger) else: raise NotImplementedError('Only vllmRollout is supported with Megatron now') diff --git a/verl/workers/sharding_manager/megatron_vllm.py b/verl/workers/sharding_manager/megatron_vllm.py index 60d8cbc6700..f456cde7d0a 100644 --- a/verl/workers/sharding_manager/megatron_vllm.py +++ b/verl/workers/sharding_manager/megatron_vllm.py @@ -258,6 +258,7 @@ def pp_models(self): from verl.third_party.vllm import LLM from verl.utils.model import normalize_pp_vpp_params from verl.utils.megatron_utils import convert_megatron_model_to_transformers_model +from verl.models.mcore.weight_converter import McoreToHFWeightConverterBase # Micro Data parallel group. Micro data parallel group is additional dp group that origins from splitting training tp # into infer_tp and micro_tp. 
By default, we use order micro_dp - tp # NOTICE: in new version of vLLM, We need to all-gather all tp rank's model weights @@ -267,12 +268,14 @@ def pp_models(self): class MegatronVLLMShardingManager(BaseShardingManager): - def __init__(self, module: AllGatherPPModel, inference_engine: LLM, model_config, layer_name_mapping): + def __init__(self, module: AllGatherPPModel, inference_engine: LLM, model_config, layer_name_mapping, + weight_converter: McoreToHFWeightConverterBase): from megatron.core import parallel_state as mpu self.module = module self.inference_engine = inference_engine self.model_config = model_config self.layer_name_mapping = layer_name_mapping + self.weight_converter = weight_converter # initialize micro_dp group for vllm inference global _MICRO_DATA_PARALLEL_GROUP @@ -362,6 +365,9 @@ def default_tp_concat_fn(self, name, param, infer_params, model_config, convert_ else: infer_params = [gate, up] + elif "mlp.experts.linear_fc2.weight" in name: # moe + infer_params = torch.cat(infer_params, dim=1) + else: # concat tensor infer_params = torch.cat(infer_params, dim=tp_utils.get_tensor_parallel_partition_dim(param)) @@ -393,13 +399,18 @@ def _post_process_params(self, params, convert_qkv_gate_up_by_simple_split=False convert_qkv_gate_up_by_simple_split) else: infer_params = param - converted_names, converted_params = convert_megatron_model_to_transformers_model( - name, - infer_params, - self.model_config, - self.train_tp_size, - self.module.pp_models[0][0].config.num_query_groups, - convert_qkv_gate_up_by_trunk_concat=False) + if vllm_version in ('0.4.2', '0.5.4', '0.6.3'): + converted_names, converted_params = convert_megatron_model_to_transformers_model( + name, + infer_params, + self.model_config, + self.train_tp_size, + self.module.pp_models[0][0].config.num_query_groups, + convert_qkv_gate_up_by_trunk_concat=False) + else: + if not isinstance(infer_params, list): + infer_params = [infer_params] + converted_names, converted_params = self.weight_converter.convert_param(name, infer_params) for converted_name, infer_param in zip(converted_names, converted_params): yield converted_name, infer_param @@ -425,6 +436,7 @@ def __enter__(self): per_tensor_param = self._post_process_params(cur_tp_rank_param, convert_qkv_gate_up_by_simple_split=True) self.inference_engine.wake_up() model = self.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model + _patch_vllm_qwen2_moe_model_weight_loader(model) loaded_params = model.load_weights(per_tensor_param) logger.info(f"vLLM load weights, loaded_params: {len(loaded_params)}") log_gpu_memory_usage('After load_weights sharding manager memory', logger=logger) @@ -484,3 +496,31 @@ def get_micro_data_parallel_world_size(): def get_micro_data_parallel_rank(): return torch.distributed.get_rank(group=get_micro_data_parallel_group()) + + +def _patch_vllm_qwen2_moe_model_weight_loader(model): + # this is a work around to load the weight of vllm qwen2 moe model + # it is from a bug from vllm 0.8.2 + # all the weights are supposed to have a weight_loader, but the moe weights + # do not have a weight_loader, so we need to patch it + # (True, 'model.embed_tokens.weight') + # (True, 'model.layers.0.self_attn.qkv_proj.weight') + # (True, 'model.layers.0.self_attn.qkv_proj.bias') + # (True, 'model.layers.0.self_attn.o_proj.weight') + # (True, 'model.layers.0.mlp.gate.weight') + # (True, 'model.layers.0.mlp.shared_expert.gate_up_proj.weight') + # (True, 'model.layers.0.mlp.shared_expert.down_proj.weight') + # (False, 
'model.layers.0.mlp.shared_expert_gate.weight') use default + # (False, 'model.layers.0.input_layernorm.weight') use default + # (False, 'model.layers.0.post_attention_layernorm.weight') use default + # (False, 'model.layers.0.mlp.experts.w13_weight') use mlp.experts.weight_loader + # (False, 'model.layers.0.mlp.experts.w2_weight') use mlp.experts.weight_loader + from vllm.model_executor.models.qwen2_moe import Qwen2MoeForCausalLM + if not isinstance(model, Qwen2MoeForCausalLM): + return + for layer in model.model.layers: + mlp = layer.mlp + param_dict = dict(mlp.named_parameters()) + for name, param in param_dict.items(): + if "w13_weight" in name or "w2_weight" in name: + param.weight_loader = mlp.experts.weight_loader From 0775d36852c8d11fa697e71ca35f29c05a737254 Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Thu, 17 Apr 2025 04:10:29 -0700 Subject: [PATCH 06/19] qwen2moe config converter and weight converter --- scripts/converter_hf_to_mcore.py | 100 ++++++++++++++++++------- verl/models/mcore/config_converter.py | 26 +++++-- verl/models/mcore/model_initializer.py | 26 +++++-- verl/models/mcore/registry.py | 2 +- 4 files changed, 113 insertions(+), 41 deletions(-) diff --git a/scripts/converter_hf_to_mcore.py b/scripts/converter_hf_to_mcore.py index c7c10b3fc85..cc4af55b512 100644 --- a/scripts/converter_hf_to_mcore.py +++ b/scripts/converter_hf_to_mcore.py @@ -24,11 +24,14 @@ from concurrent.futures import ThreadPoolExecutor from safetensors.torch import load_file from torch.distributed._tensor import Shard, Placement -from verl.utils.megatron_utils import get_model, convert_config +from verl.utils.megatron_utils import get_model from megatron.core.models.gpt.gpt_model import ModelType from megatron.core import parallel_state as mpu from megatron.core import dist_checkpointing from megatron.core.dist_checkpointing.serialization import StrictHandling +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed + +from verl.models.mcore import hf_to_mcore_config def _init_args(): @@ -58,6 +61,48 @@ def __init__(self): self.model = ModelConfig() +def convert_checkpoint_from_transformers_to_megatron(hf_model, model, hf_config): + num_attention_heads = hf_config.num_attention_heads + hidden_dim = hf_config.hidden_size + head_dim = hidden_dim // num_attention_heads + with torch.no_grad(): + model.embedding.word_embeddings.weight.copy_(hf_model.model.embed_tokens.weight) + for layer, hf_layer in zip(model.decoder.layers, hf_model.model.layers): + layer.self_attention.linear_qkv.layer_norm_weight.copy_(hf_layer.input_layernorm.weight) + + q = hf_layer.self_attn.q_proj.weight.view([num_attention_heads, -1, head_dim, hidden_dim]) + k = hf_layer.self_attn.k_proj.weight.view([num_attention_heads, -1, head_dim, hidden_dim]) + v = hf_layer.self_attn.v_proj.weight.view([num_attention_heads, -1, head_dim, hidden_dim]) + qkv = torch.cat([q, k, v], dim=1).view(-1, hidden_dim).contiguous() + + q_bias = hf_layer.self_attn.q_proj.bias.view([num_attention_heads, -1]) + k_bias = hf_layer.self_attn.k_proj.bias.view([num_attention_heads, -1]) + v_bias = hf_layer.self_attn.v_proj.bias.view([num_attention_heads, -1]) + qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=1).view(-1).contiguous() + + layer.self_attention.linear_qkv.weight.copy_(qkv) + layer.self_attention.linear_qkv.bias.copy_(qkv_bias) + + layer.self_attention.linear_proj.weight.copy_(hf_layer.self_attn.o_proj.weight) + layer.pre_mlp_layernorm.weight.copy_(hf_layer.post_attention_layernorm.weight) + + 
layer.mlp.router.weight.copy_(hf_layer.mlp.gate.weight) + + for idx, hf_expert in enumerate(hf_layer.mlp.experts): + fc1_weight = torch.cat([hf_expert.gate_proj.weight, hf_expert.up_proj.weight]) + layer.mlp.experts.linear_fc1._parameters[f'weight{idx}'].copy_(fc1_weight) + layer.mlp.experts.linear_fc2._parameters[f'weight{idx}'].copy_(hf_expert.down_proj.weight) + + layer.mlp.shared_experts.gate_weight.copy_(hf_layer.mlp.shared_expert_gate.weight) + shared_fc1_weight = torch.cat( + [hf_layer.mlp.shared_expert.gate_proj.weight, hf_layer.mlp.shared_expert.up_proj.weight]) + layer.mlp.shared_experts.linear_fc1.weight.copy_(shared_fc1_weight) + layer.mlp.shared_experts.linear_fc2.weight.copy_(hf_layer.mlp.shared_expert.down_proj.weight) + + model.decoder.final_layernorm.weight.copy_(hf_model.model.norm.weight) + model.output_layer.weight.copy_(hf_model.lm_head.weight) + + def convert_hf_to_mcore(hf_model_path, output_path, test=False): os.makedirs(output_path, exist_ok=True) if len(os.listdir(output_path)) > 0 and not test: @@ -74,46 +119,51 @@ def convert_hf_to_mcore(hf_model_path, output_path, test=False): virtual_pipeline_model_parallel_size=None, context_parallel_size=1, expert_model_parallel_size=1) + model_parallel_cuda_manual_seed(0) # init hf config hf_config = AutoConfig.from_pretrained(hf_model_path) print(hf_config) - megatron_config = MegatronConfig() + cfg = Config() cfg.model.path = hf_model_path - tfconfig = convert_config(hf_config, megatron_config) + tfconfig = hf_to_mcore_config(hf_config, torch.bfloat16) tie_word_embeddings = getattr(hf_config, "tie_word_embeddings", False) # init megatron model def megatron_model_provider(pre_process, post_process): - from verl.utils.model import get_parallel_gptmodel_from_config - parallel_model = get_parallel_gptmodel_from_config(tfconfig, - hf_config, - pre_process, - post_process, - share_embeddings_and_output_weights=tie_word_embeddings, - value=False) + from verl.models.mcore import init_mcore_model + parallel_model = init_mcore_model(tfconfig, + hf_config, + pre_process, + post_process, + share_embeddings_and_output_weights=tie_word_embeddings, + value=False) return parallel_model model = get_model(model_provider_func=megatron_model_provider, model_type=ModelType.encoder_or_decoder, - wrap_with_ddp=True) + wrap_with_ddp=False) with warnings.catch_warnings(): warnings.simplefilter("ignore") # init hf model - hf_model = AutoModelForCausalLM.from_pretrained(hf_model_path) + hf_model = AutoModelForCausalLM.from_pretrained(hf_model_path, torch_dtype=torch.bfloat16) ref_state_dict = hf_model.state_dict() # load hf state dict to megatron model - from verl.models.mcore.loader import load_state_dict_to_megatron_gptmodel - load_state_dict_to_megatron_gptmodel(state_dict=ref_state_dict, - wrapped_models=model, - config=hf_config, - params_dtype=torch.bfloat16, - is_value_model=False) - ssd = model[0].module.module.sharded_state_dict() + if "Qwen2MoeForCausalLM" in hf_config.architectures: + convert_checkpoint_from_transformers_to_megatron(hf_model, model[0].module, hf_config) + else: + from verl.models.mcore.loader import load_state_dict_to_megatron_gptmodel + load_state_dict_to_megatron_gptmodel(state_dict=ref_state_dict, + wrapped_models=model, + config=hf_config, + params_dtype=torch.bfloat16, + is_value_model=False) + + ssd = model[0].module.sharded_state_dict() del ref_state_dict, hf_model # save megatron model @@ -125,11 +175,11 @@ def megatron_model_provider(pre_process, post_process): model_test = 
get_model(model_provider_func=megatron_model_provider, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True) - ssd2 = model_test[0].module.module.sharded_state_dict() + ssd2 = model_test[0].module.sharded_state_dict() dist_checkpointing.load(ssd2, output_path, strict=StrictHandling.ASSUME_OK_UNEXPECTED) - sd = model[0].module.module.state_dict() - sd2 = model_test[0].module.module.state_dict() + sd = model[0].module.state_dict() + sd2 = model_test[0].module.state_dict() for k in sd.keys(): if sd[k] is None: continue @@ -162,11 +212,11 @@ def megatron_value_model_provider(pre_process, post_process): model_value = get_model(model_provider_func=megatron_value_model_provider, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True) - ssd2 = model_value[0].module.module.sharded_state_dict() + ssd2 = model_value[0].module.sharded_state_dict() dist_checkpointing.load(ssd2, output_path, strict=StrictHandling.IGNORE_ALL) - sd = model[0].module.module.state_dict() - sd2 = model_value[0].module.module.state_dict() + sd = model[0].module.state_dict() + sd2 = model_value[0].module.state_dict() for k in sd.keys(): if sd[k] is None: continue diff --git a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py index e95d4ca73ba..6e419d6b857 100644 --- a/verl/models/mcore/config_converter.py +++ b/verl/models/mcore/config_converter.py @@ -86,8 +86,10 @@ def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) variable_seq_lengths=True, masked_softmax_fusion=True, attention_backend=AttnBackend.flash, + # attention_backend=AttnBackend.fused, bf16=dtype is torch.bfloat16, layernorm_epsilon=hf_config.rms_norm_eps, + ffn_hidden_size=hf_config.intermediate_size, # parallel config tensor_model_parallel_size=mpu.get_tensor_model_parallel_world_size(), @@ -99,25 +101,33 @@ def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) sequence_parallel=mpu.get_tensor_model_parallel_world_size() > 1, # moe specific - ffn_hidden_size=hf_config.moe_intermediate_size, + moe_ffn_hidden_size=hf_config.moe_intermediate_size, moe_token_dispatcher_type="alltoall", moe_router_bias_update_rate=0.001, moe_router_topk=hf_config.num_experts_per_tok, num_moe_experts=hf_config.num_experts, moe_shared_expert_intermediate_size=hf_config.shared_expert_intermediate_size, - # moe_aux_loss_coeff=hf_config.router_aux_loss_coef, - moe_aux_loss_coeff=0.0, + moe_aux_loss_coeff=hf_config.router_aux_loss_coef, + # moe_aux_loss_coeff=0.0, moe_router_load_balancing_type="aux_loss", - moe_router_pre_softmax=False, #? 
moe_shared_expert_overlap=True, - # moe_permute_fusion=True, + # moe_permute_fusion=True, # need TE 2.1+ moe_grouped_gemm=True, + moe_router_score_function="softmax", + + # # mcore 0.12 moe + # moe_router_dtype="fp64", + # disable_bf16_reduced_precision_matmul=True, - # mcore 0.12 - moe_router_dtype="fp64", - disable_bf16_reduced_precision_matmul=True, + # other + # deallocate_pipeline_outputs=True, + # gradient_accumulation_fusion=True, + persist_layer_norm=True, + bias_activation_fusion=True, + bias_dropout_fusion=True, # qwen specific + moe_router_pre_softmax=True, add_qkv_bias=True) return transformer_config diff --git a/verl/models/mcore/model_initializer.py b/verl/models/mcore/model_initializer.py index 48ea980fd6b..823cf3af9d7 100644 --- a/verl/models/mcore/model_initializer.py +++ b/verl/models/mcore/model_initializer.py @@ -21,7 +21,8 @@ def init_mcore_model_dense(tfconfig, pre_process=None, post_process=None, share_embeddings_and_output_weights=False, - value=False): + value=False, + **extra_kwargs): # for LlamaForCausalLM, Qwen2ForCausalLM from megatron.core.models.gpt.gpt_model import GPTModel from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec @@ -53,11 +54,15 @@ def init_mcore_model_qwen2_moe(tfconfig, pre_process=None, post_process=None, share_embeddings_and_output_weights=False, - value=False): + value=False, + freeze_moe_router=True, + **extra_kwargs): from megatron.core.models.gpt.gpt_model import GPTModel from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec use_te = True + if freeze_moe_router: + tfconfig.moe_router_load_balancing_type = "none" def patch_layer_spec(transformer_layer_spec): # shared_experts.gate=True @@ -82,6 +87,10 @@ def patch_layer_spec(transformer_layer_spec): position_embedding_type='rope', rotary_base=hf_config.rope_theta, **rope_scaling_args) + if freeze_moe_router: + for layer in model.decoder.layers: + layer.mlp.router.weight.requires_grad = False + layer.mlp.shared_experts.gate_weight.requires_grad = False if post_process and value: from verl.models.llama.megatron.layers.parallel_linear import LinearForLastLayer model.output_layer = LinearForLastLayer(input_size=tfconfig.hidden_size, output_size=1, config=tfconfig) @@ -93,9 +102,10 @@ def init_mcore_model_llama4(tfconfig, pre_process=None, post_process=None, share_embeddings_and_output_weights=False, - value=False): + value=False, + **extra_kwargs): return init_mcore_model_dense(tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, - value) + value, **extra_kwargs) def init_mcore_model_dpskv3(tfconfig, @@ -103,9 +113,10 @@ def init_mcore_model_dpskv3(tfconfig, pre_process=None, post_process=None, share_embeddings_and_output_weights=False, - value=False): + value=False, + **extra_kwargs): return init_mcore_model_dense(tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, - value) + value, **extra_kwargs) def init_mcore_model_qwen2_5_vl(tfconfig, @@ -113,6 +124,7 @@ def init_mcore_model_qwen2_5_vl(tfconfig, pre_process=None, post_process=None, share_embeddings_and_output_weights=False, - value=False): + value=False, + **extra_kwargs): # Qwen2_5_VLForConditionalGeneration raise NotImplementedError("VLM is not supported yet") diff --git a/verl/models/mcore/registry.py b/verl/models/mcore/registry.py index ab02c69e5be..6d9c3bbe37a 100644 --- a/verl/models/mcore/registry.py +++ b/verl/models/mcore/registry.py @@ -46,7 +46,7 @@ def init_mcore_model( post_process=None, 
share_embeddings_and_output_weights=False, value=False, - **extra_kwargs # may be used for vlm + **extra_kwargs # may be used for vlm and moe ) -> nn.Module: MODEL_INITIALIZER_REGISTRY = { "LlamaForCausalLM": init_mcore_model_dense, From 6113b1095ce28d1526ed78841540a5ba7514de32 Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Thu, 17 Apr 2025 05:55:07 -0700 Subject: [PATCH 07/19] add scripts to run qwen1.5moe_a2.7b --- .../run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh diff --git a/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh b/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh new file mode 100644 index 00000000000..dad28cd6dd3 --- /dev/null +++ b/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh @@ -0,0 +1,70 @@ +set -x +# 0. download the model +huggingface-cli download Qwen/Qwen1.5-MoE-A2.7B-Chat + +# 1. convert the model to mcore format +# change the HF_MODEL_PATH and DIST_CKPT_PATH to your own path +HF_MODEL_PATH=/data/models/Qwen/Qwen1.5-MoE-A2.7B-Chat +DIST_CKPT_PATH=/data/mcore_ckpt/Qwen1.5-MoE-A2.7B-Chat +python scripts/converter_hf_to_mcore.py --hf_model_path $HF_MODEL_PATH --output_path $DIST_CKPT_PATH + +# 2. run the script +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +train_files=$gsm8k_train_path +test_files=$gsm8k_test_path + +NODES=4 +PP=2 +TP=4 +CP=1 +VLLM_TP=4 + +RAY_ADDRESS='auto' ray job submit --working-dir . -- python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=512 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=$HF_MODEL_PATH \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \ + critic.optim.lr=1e-5 \ + critic.model.path=$HF_MODEL_PATH \ + critic.model.enable_gradient_checkpointing=False \ + critic.ppo_micro_batch_size_per_gpu=4 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger=['console','wandb'] \ + trainer.project_name='verl_megatron_gsm8k_examples' \ + trainer.experiment_name='qwen1.5_moe_nochat' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=$NODES \ + trainer.save_freq=-1 \ + trainer.test_freq=5 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=$VLLM_TP \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP \ + critic.megatron.pipeline_model_parallel_size=$PP \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP \ + critic.megatron.tensor_model_parallel_size=$TP \ + actor_rollout_ref.actor.megatron.context_parallel_size=$CP \ + actor_rollout_ref.ref.megatron.context_parallel_size=$CP \ + critic.megatron.context_parallel_size=$CP \ + actor_rollout_ref.actor.megatron.use_dist_checkpointing=True \ + actor_rollout_ref.ref.megatron.use_dist_checkpointing=True \ + 
critic.megatron.use_dist_checkpointing=True \ + actor_rollout_ref.actor.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ + actor_rollout_ref.ref.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ + critic.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ + trainer.total_epochs=100 $@ + \ No newline at end of file From 5f8d8a0f5aa18a7cb89b8f9da965ff35c86737c9 Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Thu, 17 Apr 2025 23:58:30 -0700 Subject: [PATCH 08/19] format --- verl/workers/sharding_manager/megatron_vllm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/verl/workers/sharding_manager/megatron_vllm.py b/verl/workers/sharding_manager/megatron_vllm.py index b7e8ae88c13..b09f4a8efa4 100644 --- a/verl/workers/sharding_manager/megatron_vllm.py +++ b/verl/workers/sharding_manager/megatron_vllm.py @@ -271,6 +271,7 @@ def pp_models(self): class MegatronVLLMShardingManager(BaseShardingManager): + def __init__(self, actor_module: nn.ModuleList, inference_engine: LLM, From d2376eca9eb12cc5aa301bf9304934a236cb4221 Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Fri, 18 Apr 2025 07:27:54 -0700 Subject: [PATCH 09/19] update scripts --- examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh b/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh index dad28cd6dd3..0d84d28046e 100644 --- a/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh +++ b/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh @@ -20,7 +20,8 @@ TP=4 CP=1 VLLM_TP=4 -RAY_ADDRESS='auto' ray job submit --working-dir . -- python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\ +# RAY_ADDRESS='auto' ray job submit --working-dir . -- +python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\ algorithm.adv_estimator=gae \ data.train_files="$train_files" \ data.val_files="$test_files" \ From 57d9671e0589f3f2c73b879b25d7aea4e6aa830c Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Fri, 18 Apr 2025 08:39:02 -0700 Subject: [PATCH 10/19] fix for pre-commit --- scripts/converter_hf_to_mcore.py | 123 ++++++------- verl/models/mcore/__init__.py | 4 +- verl/models/mcore/config_converter.py | 41 +++-- verl/models/mcore/model_forward.py | 108 +++++------ verl/models/mcore/model_initializer.py | 167 ++++++++++-------- verl/models/mcore/registry.py | 79 ++++++--- verl/models/mcore/saver.py | 31 ++-- verl/models/mcore/weight_converter.py | 55 +++--- verl/models/weight_loader_registry.py | 5 +- verl/workers/critic/megatron_critic.py | 13 +- verl/workers/megatron_workers.py | 20 +-- .../workers/sharding_manager/megatron_vllm.py | 83 +++++---- 12 files changed, 362 insertions(+), 367 deletions(-) diff --git a/scripts/converter_hf_to_mcore.py b/scripts/converter_hf_to_mcore.py index 1aa3d0f9c13..aa4256b67a4 100644 --- a/scripts/converter_hf_to_mcore.py +++ b/scripts/converter_hf_to_mcore.py @@ -13,50 +13,42 @@ # See the License for the specific language governing permissions and # limitations under the License. 
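For reference, the converter being reformatted below runs as a standalone script; the example script added earlier in this series drives it as

    python scripts/converter_hf_to_mcore.py --hf_model_path $HF_MODEL_PATH --output_path $DIST_CKPT_PATH

with an optional --test flag that reloads the saved dist-checkpoint and compares it against the in-memory model.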
-from typing import List, Tuple, Dict -import re -import os -import torch import argparse +import os import warnings -import numpy as np -from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForTokenClassification, AutoModelForVision2Seq -from concurrent.futures import ThreadPoolExecutor -from safetensors.torch import load_file -from torch.distributed._tensor import Shard, Placement -from verl.utils.megatron_utils import get_model -from megatron.core.models.gpt.gpt_model import ModelType -from megatron.core import parallel_state as mpu + +import torch from megatron.core import dist_checkpointing +from megatron.core import parallel_state as mpu from megatron.core.dist_checkpointing.serialization import StrictHandling +from megatron.core.models.gpt.gpt_model import ModelType from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from transformers import AutoConfig, AutoModelForCausalLM from verl.models.mcore import hf_to_mcore_config +from verl.utils.megatron_utils import get_model def _init_args(): parser = argparse.ArgumentParser() - parser.add_argument('--hf_model_path', type=str, required=True, help="The path for the huggingface model") - parser.add_argument('--output_path', type=str, required=True, help="The path for the output mcore model") - parser.add_argument('--test', action='store_true', help="Whether to test the conversion") + parser.add_argument("--hf_model_path", type=str, required=True, help="The path for the huggingface model") + parser.add_argument("--output_path", type=str, required=True, help="The path for the output mcore model") + parser.add_argument("--test", action="store_true", help="Whether to test the conversion") args = parser.parse_args() return args class MegatronConfig: - def __init__(self): self.params_dtype = torch.bfloat16 class ModelConfig: - def __init__(self): self.path = None class Config: - def __init__(self): self.model = ModelConfig() @@ -90,12 +82,13 @@ def convert_checkpoint_from_transformers_to_megatron(hf_model, model, hf_config) for idx, hf_expert in enumerate(hf_layer.mlp.experts): fc1_weight = torch.cat([hf_expert.gate_proj.weight, hf_expert.up_proj.weight]) - layer.mlp.experts.linear_fc1._parameters[f'weight{idx}'].copy_(fc1_weight) - layer.mlp.experts.linear_fc2._parameters[f'weight{idx}'].copy_(hf_expert.down_proj.weight) + layer.mlp.experts.linear_fc1._parameters[f"weight{idx}"].copy_(fc1_weight) + layer.mlp.experts.linear_fc2._parameters[f"weight{idx}"].copy_(hf_expert.down_proj.weight) layer.mlp.shared_experts.gate_weight.copy_(hf_layer.mlp.shared_expert_gate.weight) shared_fc1_weight = torch.cat( - [hf_layer.mlp.shared_expert.gate_proj.weight, hf_layer.mlp.shared_expert.up_proj.weight]) + [hf_layer.mlp.shared_expert.gate_proj.weight, hf_layer.mlp.shared_expert.up_proj.weight] + ) layer.mlp.shared_experts.linear_fc1.weight.copy_(shared_fc1_weight) layer.mlp.shared_experts.linear_fc2.weight.copy_(hf_layer.mlp.shared_expert.down_proj.weight) @@ -110,15 +103,17 @@ def convert_hf_to_mcore(hf_model_path, output_path, test=False): return # init torch distributed and mpu - os.environ['RANK'] = '0' - os.environ['WORLD_SIZE'] = '1' - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '12355' - torch.distributed.init_process_group('nccl') - mpu.initialize_model_parallel(tensor_model_parallel_size=1, - virtual_pipeline_model_parallel_size=None, - context_parallel_size=1, - expert_model_parallel_size=1) + os.environ["RANK"] = "0" + os.environ["WORLD_SIZE"] = "1" + os.environ["MASTER_ADDR"] = 
"localhost" + os.environ["MASTER_PORT"] = "12355" + torch.distributed.init_process_group("nccl") + mpu.initialize_model_parallel( + tensor_model_parallel_size=1, + virtual_pipeline_model_parallel_size=None, + context_parallel_size=1, + expert_model_parallel_size=1, + ) model_parallel_cuda_manual_seed(0) # init hf config @@ -133,17 +128,20 @@ def convert_hf_to_mcore(hf_model_path, output_path, test=False): # init megatron model def megatron_model_provider(pre_process, post_process): from verl.models.mcore import init_mcore_model - parallel_model = init_mcore_model(tfconfig, - hf_config, - pre_process, - post_process, - share_embeddings_and_output_weights=tie_word_embeddings, - value=False) + + parallel_model = init_mcore_model( + tfconfig, + hf_config, + pre_process, + post_process, + share_embeddings_and_output_weights=tie_word_embeddings, + value=False, + ) return parallel_model - model = get_model(model_provider_func=megatron_model_provider, - model_type=ModelType.encoder_or_decoder, - wrap_with_ddp=False) + model = get_model( + model_provider_func=megatron_model_provider, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=False + ) with warnings.catch_warnings(): warnings.simplefilter("ignore") @@ -157,11 +155,14 @@ def megatron_model_provider(pre_process, post_process): convert_checkpoint_from_transformers_to_megatron(hf_model, model[0].module, hf_config) else: from verl.models.mcore.loader import load_state_dict_to_megatron_gptmodel - load_state_dict_to_megatron_gptmodel(state_dict=ref_state_dict, - wrapped_models=model, - config=hf_config, - params_dtype=torch.bfloat16, - is_value_model=False) + + load_state_dict_to_megatron_gptmodel( + state_dict=ref_state_dict, + wrapped_models=model, + config=hf_config, + params_dtype=torch.bfloat16, + is_value_model=False, + ) ssd = model[0].module.sharded_state_dict() del ref_state_dict, hf_model @@ -172,9 +173,9 @@ def megatron_model_provider(pre_process, post_process): if test: ########### test ########### # load model - model_test = get_model(model_provider_func=megatron_model_provider, - model_type=ModelType.encoder_or_decoder, - wrap_with_ddp=True) + model_test = get_model( + model_provider_func=megatron_model_provider, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True + ) ssd2 = model_test[0].module.sharded_state_dict() dist_checkpointing.load(ssd2, output_path, strict=StrictHandling.ASSUME_OK_UNEXPECTED) @@ -186,7 +187,7 @@ def megatron_model_provider(pre_process, post_process): d1 = sd[k].data if k in sd2: d2 = sd2[k].data - assert d1.shape == d2.shape, f'{k=} {d1.shape=} {d2.shape=}' + assert d1.shape == d2.shape, f"{k=} {d1.shape=} {d2.shape=}" assert (d1 == d2).all(), f"{k} is not equal" for k in sd2.keys(): if sd2[k] is None: @@ -194,24 +195,24 @@ def megatron_model_provider(pre_process, post_process): d1 = sd2[k].data if k in sd: d2 = sd[k].data - assert d1.shape == d2.shape, f'{k=} {d1.shape=} {d2.shape=}' + assert d1.shape == d2.shape, f"{k=} {d1.shape=} {d2.shape=}" assert (d1 == d2).all(), f"{k} is not equal" # load value model def megatron_value_model_provider(pre_process, post_process): from verl.utils.model import get_parallel_gptmodel_from_config - parallel_model = get_parallel_gptmodel_from_config(tfconfig, - hf_config, - pre_process, - post_process, - share_embeddings_and_output_weights=False, - value=True) + + parallel_model = get_parallel_gptmodel_from_config( + tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights=False, value=True + ) parallel_model.cuda() return 
parallel_model - model_value = get_model(model_provider_func=megatron_value_model_provider, - model_type=ModelType.encoder_or_decoder, - wrap_with_ddp=True) + model_value = get_model( + model_provider_func=megatron_value_model_provider, + model_type=ModelType.encoder_or_decoder, + wrap_with_ddp=True, + ) ssd2 = model_value[0].module.sharded_state_dict() dist_checkpointing.load(ssd2, output_path, strict=StrictHandling.IGNORE_ALL) @@ -223,7 +224,7 @@ def megatron_value_model_provider(pre_process, post_process): d1 = sd[k].data if k in sd2: d2 = sd2[k].data - assert d1.shape == d2.shape, f'{k=} {d1.shape=} {d2.shape=}' + assert d1.shape == d2.shape, f"{k=} {d1.shape=} {d2.shape=}" assert (d1 == d2).all(), f"{k} is not equal" for k in sd2.keys(): if sd2[k] is None: @@ -231,10 +232,10 @@ def megatron_value_model_provider(pre_process, post_process): d1 = sd2[k].data if k in sd: d2 = sd[k].data - assert d1.shape == d2.shape, f'{k=} {d1.shape=} {d2.shape=}' + assert d1.shape == d2.shape, f"{k=} {d1.shape=} {d2.shape=}" assert (d1 == d2).all(), f"{k} is not equal" if __name__ == "__main__": args = _init_args() - convert_hf_to_mcore(args.hf_model_path, args.output_path, args.test) \ No newline at end of file + convert_hf_to_mcore(args.hf_model_path, args.output_path, args.test) diff --git a/verl/models/mcore/__init__.py b/verl/models/mcore/__init__.py index 8782330500d..6ee338cd15c 100644 --- a/verl/models/mcore/__init__.py +++ b/verl/models/mcore/__init__.py @@ -13,6 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .registry import init_mcore_model, hf_to_mcore_config, get_mcore_forward_fn, get_mcore_weight_converter +from .registry import get_mcore_forward_fn, get_mcore_weight_converter, hf_to_mcore_config, init_mcore_model -__all__ = ['init_mcore_model', 'hf_to_mcore_config', 'get_mcore_forward_fn', 'get_mcore_weight_converter'] +__all__ = ["init_mcore_model", "hf_to_mcore_config", "get_mcore_forward_fn", "get_mcore_weight_converter"] diff --git a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py index 6e419d6b857..c14228d4261 100644 --- a/verl/models/mcore/config_converter.py +++ b/verl/models/mcore/config_converter.py @@ -15,22 +15,25 @@ # convert huggingface config to mcore transformer config -from transformers import PretrainedConfig -from megatron.core.transformer import TransformerConfig import torch import torch.nn.functional as F +from megatron.core.transformer import TransformerConfig from megatron.core.transformer.enums import AttnBackend +from transformers import PretrainedConfig def hf_to_mcore_config_dense(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: # for LlamaForCausalLM or Qwen2ForCausalLM from megatron.core import parallel_state as mpu + + qkv_bias = getattr(hf_config, "attention_bias", False) if "Qwen2ForCausalLM" in hf_config.architectures: qkv_bias = True - else: - qkv_bias = getattr(hf_config, 'attention_bias', False) - overlap_p2p_comm = mpu.get_virtual_pipeline_model_parallel_world_size( - ) is not None and mpu.get_virtual_pipeline_model_parallel_world_size() > 1 + + overlap_p2p_comm = ( + mpu.get_virtual_pipeline_model_parallel_world_size() is not None + and mpu.get_virtual_pipeline_model_parallel_world_size() > 1 + ) batch_p2p_comm = False transformer_config = TransformerConfig( num_layers=hf_config.num_hidden_layers, @@ -39,7 +42,7 @@ def hf_to_mcore_config_dense(hf_config: PretrainedConfig, dtype: torch.dtype) -> 
num_query_groups=hf_config.num_key_value_heads, ffn_hidden_size=hf_config.intermediate_size, activation_func=F.silu, - normalization='RMSNorm', + normalization="RMSNorm", gated_linear_unit=True, use_cpu_initialization=True, add_bias_linear=False, @@ -56,18 +59,22 @@ def hf_to_mcore_config_dense(hf_config: PretrainedConfig, dtype: torch.dtype) -> masked_softmax_fusion=True, moe_token_dispatcher_type="alltoall", attention_dropout=hf_config.attention_dropout, - hidden_dropout=getattr(hf_config, 'hidden_dropout', 0.0), + hidden_dropout=getattr(hf_config, "hidden_dropout", 0.0), add_qkv_bias=qkv_bias, attention_backend=AttnBackend.flash, - bf16=dtype is torch.bfloat16) + bf16=dtype is torch.bfloat16, + ) return transformer_config def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: from megatron.core import parallel_state as mpu - overlap_p2p_comm = mpu.get_virtual_pipeline_model_parallel_world_size( - ) is not None and mpu.get_virtual_pipeline_model_parallel_world_size() > 1 + + overlap_p2p_comm = ( + mpu.get_virtual_pipeline_model_parallel_world_size() is not None + and mpu.get_virtual_pipeline_model_parallel_world_size() > 1 + ) batch_p2p_comm = False transformer_config = TransformerConfig( num_layers=hf_config.num_hidden_layers, @@ -75,9 +82,9 @@ def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) num_attention_heads=hf_config.num_attention_heads, num_query_groups=hf_config.num_key_value_heads, attention_dropout=hf_config.attention_dropout, - hidden_dropout=getattr(hf_config, 'hidden_dropout', 0.0), + hidden_dropout=getattr(hf_config, "hidden_dropout", 0.0), activation_func=F.silu, - normalization='RMSNorm', + normalization="RMSNorm", gated_linear_unit=True, use_cpu_initialization=False, add_bias_linear=False, @@ -90,7 +97,6 @@ def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) bf16=dtype is torch.bfloat16, layernorm_epsilon=hf_config.rms_norm_eps, ffn_hidden_size=hf_config.intermediate_size, - # parallel config tensor_model_parallel_size=mpu.get_tensor_model_parallel_world_size(), pipeline_model_parallel_size=mpu.get_pipeline_model_parallel_world_size(), @@ -99,7 +105,6 @@ def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) overlap_p2p_comm=overlap_p2p_comm, batch_p2p_comm=batch_p2p_comm, sequence_parallel=mpu.get_tensor_model_parallel_world_size() > 1, - # moe specific moe_ffn_hidden_size=hf_config.moe_intermediate_size, moe_token_dispatcher_type="alltoall", @@ -114,21 +119,19 @@ def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) # moe_permute_fusion=True, # need TE 2.1+ moe_grouped_gemm=True, moe_router_score_function="softmax", - # # mcore 0.12 moe # moe_router_dtype="fp64", # disable_bf16_reduced_precision_matmul=True, - # other # deallocate_pipeline_outputs=True, # gradient_accumulation_fusion=True, persist_layer_norm=True, bias_activation_fusion=True, bias_dropout_fusion=True, - # qwen specific moe_router_pre_softmax=True, - add_qkv_bias=True) + add_qkv_bias=True, + ) return transformer_config diff --git a/verl/models/mcore/model_forward.py b/verl/models/mcore/model_forward.py index 42fd702efee..a615fefbfc1 100644 --- a/verl/models/mcore/model_forward.py +++ b/verl/models/mcore/model_forward.py @@ -13,95 +13,69 @@ # See the License for the specific language governing permissions and # limitations under the License. 
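+# Architecture-specific forward helpers for the mcore GPTModel: each gptmodel_forward_* variant
+# supports a packed-sequence (remove-padding) path and a left-padding-removal path, and squeezes
+# the scalar value head from the output when value_model=True.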
-from verl.utils.megatron import sequence_parallel as sp_utils -from verl.utils.megatron import tensor_parallel as tp_utils -import torch -from megatron.core.packed_seq_params import PackedSeqParams -from megatron.core import parallel_state as mpu from verl.utils.megatron_utils import unwrap_model -from .util import preprocess_packed_seqs, postprocess_packed_seqs, remove_left_padding, recover_left_padding +from .util import postprocess_packed_seqs, preprocess_packed_seqs, recover_left_padding, remove_left_padding -def gptmodel_forward_dense(model, - input_ids, - attention_mask, - position_ids, - sequence_parallel, - value_model=False, - pack_seqs=True): + +def gptmodel_forward_dense( + model, input_ids, attention_mask, position_ids, sequence_parallel, value_model=False, pack_seqs=True +): pre_process = unwrap_model(model).pre_process post_process = unwrap_model(model).post_process if pack_seqs: batch_size, seq_len = attention_mask.shape[:2] input_ids_rmpad, packed_seq_params = preprocess_packed_seqs(input_ids, attention_mask, pre_process=pre_process) input_ids_rmpad = input_ids_rmpad.contiguous() - output_orig = model(input_ids=input_ids_rmpad, - attention_mask=None, - position_ids=position_ids, - packed_seq_params=packed_seq_params) + output_orig = model( + input_ids=input_ids_rmpad, + attention_mask=None, + position_ids=position_ids, + packed_seq_params=packed_seq_params, + ) - output = postprocess_packed_seqs(output_orig, - packed_seq_params, - attention_mask, - batch_size, - seq_len, - post_process=post_process) + output = postprocess_packed_seqs( + output_orig, packed_seq_params, attention_mask, batch_size, seq_len, post_process=post_process + ) else: batch_size, sequence_length = attention_mask.shape - new_input_ids, new_attention_mask, new_position_ids = remove_left_padding(input_ids, - attention_mask, - position_ids, - sequence_parallel, - pre_process=pre_process) + new_input_ids, new_attention_mask, new_position_ids = remove_left_padding( + input_ids, attention_mask, position_ids, sequence_parallel, pre_process=pre_process + ) output = model(input_ids=new_input_ids, attention_mask=new_attention_mask, position_ids=new_position_ids) - output = recover_left_padding(output, - new_attention_mask, - attention_mask, - sequence_length, - post_process=post_process) + output = recover_left_padding( + output, new_attention_mask, attention_mask, sequence_length, post_process=post_process + ) if value_model and post_process: output = output[..., 0] return output -def gptmodel_forward_qwen2_moe(model, - input_ids, - attention_mask, - position_ids, - sequence_parallel, - value_model=False, - pack_seqs=True): - return gptmodel_forward_dense(model, input_ids, attention_mask, position_ids, sequence_parallel, value_model, - pack_seqs) +def gptmodel_forward_qwen2_moe( + model, input_ids, attention_mask, position_ids, sequence_parallel, value_model=False, pack_seqs=True +): + return gptmodel_forward_dense( + model, input_ids, attention_mask, position_ids, sequence_parallel, value_model, pack_seqs + ) -def gptmodel_forward_llama4(model, - input_ids, - attention_mask, - position_ids, - sequence_parallel, - value_model=False, - pack_seqs=True): - return gptmodel_forward_dense(model, input_ids, attention_mask, position_ids, sequence_parallel, value_model, - pack_seqs) +def gptmodel_forward_llama4( + model, input_ids, attention_mask, position_ids, sequence_parallel, value_model=False, pack_seqs=True +): + return gptmodel_forward_dense( + model, input_ids, attention_mask, position_ids, sequence_parallel, 
value_model, pack_seqs + ) -def gptmodel_forward_dpskv3(model, - input_ids, - attention_mask, - position_ids, - sequence_parallel, - value_model=False, - pack_seqs=True): - return gptmodel_forward_dense(model, input_ids, attention_mask, position_ids, sequence_parallel, value_model, - pack_seqs) +def gptmodel_forward_dpskv3( + model, input_ids, attention_mask, position_ids, sequence_parallel, value_model=False, pack_seqs=True +): + return gptmodel_forward_dense( + model, input_ids, attention_mask, position_ids, sequence_parallel, value_model, pack_seqs + ) -def gptmodel_forward_qwen2_5_vl(model, - input_ids, - attention_mask, - position_ids, - sequence_parallel, - value_model=False, - pack_seqs=True): +def gptmodel_forward_qwen2_5_vl( + model, input_ids, attention_mask, position_ids, sequence_parallel, value_model=False, pack_seqs=True +): raise NotImplementedError("VLM is not supported yet") diff --git a/verl/models/mcore/model_initializer.py b/verl/models/mcore/model_initializer.py index 823cf3af9d7..0be8c9eb7ac 100644 --- a/verl/models/mcore/model_initializer.py +++ b/verl/models/mcore/model_initializer.py @@ -16,50 +16,58 @@ # use mcore transformer config to initialize the model -def init_mcore_model_dense(tfconfig, - hf_config, - pre_process=None, - post_process=None, - share_embeddings_and_output_weights=False, - value=False, - **extra_kwargs): +def init_mcore_model_dense( + tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False, + **extra_kwargs, +): # for LlamaForCausalLM, Qwen2ForCausalLM - from megatron.core.models.gpt.gpt_model import GPTModel from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec + from megatron.core.models.gpt.gpt_model import GPTModel + use_te = True - assert tfconfig.normalization == "RMSNorm", 'only RMSNorm is supported for now' + assert tfconfig.normalization == "RMSNorm", "only RMSNorm is supported for now" transformer_layer_spec = get_gpt_decoder_block_spec(tfconfig, use_transformer_engine=use_te) rope_scaling_args = {} if hf_config.rope_scaling is not None: - assert hf_config.rope_scaling['type'] == 'linear', "only linear scaling is supported for now" - rope_scaling_args['seq_len_interpolation_factor'] = hf_config.rope_scaling['factor'] - model = GPTModel(config=tfconfig, - transformer_layer_spec=transformer_layer_spec, - vocab_size=hf_config.vocab_size, - max_sequence_length=hf_config.max_position_embeddings, - pre_process=pre_process, - post_process=post_process, - share_embeddings_and_output_weights=share_embeddings_and_output_weights, - position_embedding_type='rope', - rotary_base=hf_config.rope_theta, - **rope_scaling_args) + assert hf_config.rope_scaling["type"] == "linear", "only linear scaling is supported for now" + rope_scaling_args["seq_len_interpolation_factor"] = hf_config.rope_scaling["factor"] + model = GPTModel( + config=tfconfig, + transformer_layer_spec=transformer_layer_spec, + vocab_size=hf_config.vocab_size, + max_sequence_length=hf_config.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + share_embeddings_and_output_weights=share_embeddings_and_output_weights, + position_embedding_type="rope", + rotary_base=hf_config.rope_theta, + **rope_scaling_args, + ) if post_process and value: from verl.models.llama.megatron.layers.parallel_linear import LinearForLastLayer + model.output_layer = LinearForLastLayer(input_size=tfconfig.hidden_size, output_size=1, config=tfconfig) return model -def 
init_mcore_model_qwen2_moe(tfconfig, - hf_config, - pre_process=None, - post_process=None, - share_embeddings_and_output_weights=False, - value=False, - freeze_moe_router=True, - **extra_kwargs): - - from megatron.core.models.gpt.gpt_model import GPTModel +def init_mcore_model_qwen2_moe( + tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False, + freeze_moe_router=True, + **extra_kwargs, +): from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec + from megatron.core.models.gpt.gpt_model import GPTModel + use_te = True if freeze_moe_router: tfconfig.moe_router_load_balancing_type = "none" @@ -67,64 +75,75 @@ def init_mcore_model_qwen2_moe(tfconfig, def patch_layer_spec(transformer_layer_spec): # shared_experts.gate=True for i in range(len(transformer_layer_spec.layer_specs)): - transformer_layer_spec.layer_specs[i].submodules.mlp.submodules.shared_experts.params['gate'] = True + transformer_layer_spec.layer_specs[i].submodules.mlp.submodules.shared_experts.params["gate"] = True return transformer_layer_spec - assert tfconfig.normalization == "RMSNorm", 'only RMSNorm is supported for now' + assert tfconfig.normalization == "RMSNorm", "only RMSNorm is supported for now" transformer_layer_spec = get_gpt_decoder_block_spec(tfconfig, use_transformer_engine=use_te) transformer_layer_spec = patch_layer_spec(transformer_layer_spec) rope_scaling_args = {} if hf_config.rope_scaling is not None: - assert hf_config.rope_scaling['type'] == 'linear', "only linear scaling is supported for now" - rope_scaling_args['seq_len_interpolation_factor'] = hf_config.rope_scaling['factor'] - model = GPTModel(config=tfconfig, - transformer_layer_spec=transformer_layer_spec, - vocab_size=hf_config.vocab_size, - max_sequence_length=hf_config.max_position_embeddings, - pre_process=pre_process, - post_process=post_process, - share_embeddings_and_output_weights=share_embeddings_and_output_weights, - position_embedding_type='rope', - rotary_base=hf_config.rope_theta, - **rope_scaling_args) + assert hf_config.rope_scaling["type"] == "linear", "only linear scaling is supported for now" + rope_scaling_args["seq_len_interpolation_factor"] = hf_config.rope_scaling["factor"] + model = GPTModel( + config=tfconfig, + transformer_layer_spec=transformer_layer_spec, + vocab_size=hf_config.vocab_size, + max_sequence_length=hf_config.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + share_embeddings_and_output_weights=share_embeddings_and_output_weights, + position_embedding_type="rope", + rotary_base=hf_config.rope_theta, + **rope_scaling_args, + ) if freeze_moe_router: for layer in model.decoder.layers: layer.mlp.router.weight.requires_grad = False layer.mlp.shared_experts.gate_weight.requires_grad = False if post_process and value: from verl.models.llama.megatron.layers.parallel_linear import LinearForLastLayer + model.output_layer = LinearForLastLayer(input_size=tfconfig.hidden_size, output_size=1, config=tfconfig) return model -def init_mcore_model_llama4(tfconfig, - hf_config, - pre_process=None, - post_process=None, - share_embeddings_and_output_weights=False, - value=False, - **extra_kwargs): - return init_mcore_model_dense(tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, - value, **extra_kwargs) - - -def init_mcore_model_dpskv3(tfconfig, - hf_config, - pre_process=None, - post_process=None, - share_embeddings_and_output_weights=False, - value=False, - **extra_kwargs): 
- return init_mcore_model_dense(tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, - value, **extra_kwargs) - - -def init_mcore_model_qwen2_5_vl(tfconfig, - hf_config, - pre_process=None, - post_process=None, - share_embeddings_and_output_weights=False, - value=False, - **extra_kwargs): +def init_mcore_model_llama4( + tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False, + **extra_kwargs, +): + return init_mcore_model_dense( + tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, value, **extra_kwargs + ) + + +def init_mcore_model_dpskv3( + tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False, + **extra_kwargs, +): + return init_mcore_model_dense( + tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, value, **extra_kwargs + ) + + +def init_mcore_model_qwen2_5_vl( + tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False, + **extra_kwargs, +): # Qwen2_5_VLForConditionalGeneration raise NotImplementedError("VLM is not supported yet") diff --git a/verl/models/mcore/registry.py b/verl/models/mcore/registry.py index 6d9c3bbe37a..19d8433db08 100644 --- a/verl/models/mcore/registry.py +++ b/verl/models/mcore/registry.py @@ -13,11 +13,34 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .config_converter import hf_to_mcore_config_dense, hf_to_mcore_config_qwen2moe, hf_to_mcore_config_dpskv3, hf_to_mcore_config_qwen2_5_vl, hf_to_mcore_config_llama4 -from .config_converter import PretrainedConfig, TransformerConfig import torch import torch.nn as nn +from .config_converter import ( + PretrainedConfig, + TransformerConfig, + hf_to_mcore_config_dense, + hf_to_mcore_config_dpskv3, + hf_to_mcore_config_llama4, + hf_to_mcore_config_qwen2_5_vl, + hf_to_mcore_config_qwen2moe, +) +from .model_forward import ( + gptmodel_forward_dense, + gptmodel_forward_dpskv3, + gptmodel_forward_llama4, + gptmodel_forward_qwen2_5_vl, + gptmodel_forward_qwen2_moe, +) +from .model_initializer import ( + init_mcore_model_dense, + init_mcore_model_dpskv3, + init_mcore_model_llama4, + init_mcore_model_qwen2_5_vl, + init_mcore_model_qwen2_moe, +) +from .weight_converter import McoreToHFWeightConverterDense, McoreToHFWeightConverterQwen2Moe + def hf_to_mcore_config(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: MODEL_CONFIG_CONVERTER_REGISTRY = { @@ -31,22 +54,21 @@ def hf_to_mcore_config(hf_config: PretrainedConfig, dtype: torch.dtype) -> Trans assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" arch = hf_config.architectures[0] if arch not in MODEL_CONFIG_CONVERTER_REGISTRY: - raise ValueError(f"Model architectures {arch} converter are not supported for now. " - f"Supported architectures: {MODEL_CONFIG_CONVERTER_REGISTRY.keys()}") + raise ValueError( + f"Model architectures {arch} converter are not supported for now. 
" + f"Supported architectures: {MODEL_CONFIG_CONVERTER_REGISTRY.keys()}" + ) return MODEL_CONFIG_CONVERTER_REGISTRY[arch](hf_config, dtype) -from .model_initializer import init_mcore_model_dense, init_mcore_model_qwen2_moe, init_mcore_model_dpskv3, init_mcore_model_qwen2_5_vl, init_mcore_model_llama4 - - def init_mcore_model( - tfconfig, - hf_config, - pre_process=None, - post_process=None, - share_embeddings_and_output_weights=False, - value=False, - **extra_kwargs # may be used for vlm and moe + tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False, + **extra_kwargs, # may be used for vlm and moe ) -> nn.Module: MODEL_INITIALIZER_REGISTRY = { "LlamaForCausalLM": init_mcore_model_dense, @@ -59,13 +81,13 @@ def init_mcore_model( assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" arch = hf_config.architectures[0] if arch not in MODEL_INITIALIZER_REGISTRY: - raise ValueError(f"Model architectures {arch} initializer are not supported for now. " - f"Supported architectures: {MODEL_INITIALIZER_REGISTRY.keys()}") - return MODEL_INITIALIZER_REGISTRY[arch](tfconfig, hf_config, pre_process, post_process, - share_embeddings_and_output_weights, value, **extra_kwargs) - - -from .model_forward import gptmodel_forward_dense, gptmodel_forward_qwen2_moe, gptmodel_forward_llama4, gptmodel_forward_dpskv3, gptmodel_forward_qwen2_5_vl + raise ValueError( + f"Model architectures {arch} initializer are not supported for now. " + f"Supported architectures: {MODEL_INITIALIZER_REGISTRY.keys()}" + ) + return MODEL_INITIALIZER_REGISTRY[arch]( + tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, value, **extra_kwargs + ) def get_mcore_forward_fn(hf_config: PretrainedConfig): @@ -80,14 +102,13 @@ def get_mcore_forward_fn(hf_config: PretrainedConfig): assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" arch = hf_config.architectures[0] if arch not in MODEL_FORWARD_REGISTRY: - raise ValueError(f"Model architectures {arch} forward function are not supported for now. " - f"Supported architectures: {MODEL_FORWARD_REGISTRY.keys()}") + raise ValueError( + f"Model architectures {arch} forward function are not supported for now. " + f"Supported architectures: {MODEL_FORWARD_REGISTRY.keys()}" + ) return MODEL_FORWARD_REGISTRY[arch] -from .weight_converter import McoreToHFWeightConverterDense, McoreToHFWeightConverterQwen2Moe - - def get_mcore_weight_converter(hf_config: PretrainedConfig, dtype: torch.dtype): MODEL_WEIGHT_CONVERTER_REGISTRY = { "LlamaForCausalLM": McoreToHFWeightConverterDense, @@ -97,7 +118,9 @@ def get_mcore_weight_converter(hf_config: PretrainedConfig, dtype: torch.dtype): assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" arch = hf_config.architectures[0] if arch not in MODEL_WEIGHT_CONVERTER_REGISTRY: - raise ValueError(f"Model architectures {arch} weight converter are not supported for now. " - f"Supported architectures: {MODEL_WEIGHT_CONVERTER_REGISTRY.keys()}") + raise ValueError( + f"Model architectures {arch} weight converter are not supported for now. 
" + f"Supported architectures: {MODEL_WEIGHT_CONVERTER_REGISTRY.keys()}" + ) tfconfig = hf_to_mcore_config(hf_config, dtype) return MODEL_WEIGHT_CONVERTER_REGISTRY[arch](hf_config, tfconfig) diff --git a/verl/models/mcore/saver.py b/verl/models/mcore/saver.py index e8dc6bdf679..df8721aa56a 100644 --- a/verl/models/mcore/saver.py +++ b/verl/models/mcore/saver.py @@ -35,7 +35,7 @@ def _megatron_calc_global_rank( dp_size = mpu.get_data_parallel_world_size() pp_size = mpu.get_pipeline_model_parallel_world_size() cp_size = mpu.get_context_parallel_world_size() - ep_size = mpu.get_expert_model_parallel_world_size() + # ep_size = mpu.get_expert_model_parallel_world_size() # Verify total GPU count matches (must be consistent with parallel_state.py) total_size = tp_size * dp_size * pp_size * cp_size @@ -179,14 +179,11 @@ def _broadcast_tp_shard_tensor(tensor, name, src_pp_rank, concat_dim=0, mutate_f """broadcast tensor in tp shards across mp_group""" nonlocal state_dict nonlocal mp_group - tp_rank = mpu.get_tensor_model_parallel_rank() + # tp_rank = mpu.get_tensor_model_parallel_rank() tp_size = mpu.get_tensor_model_parallel_world_size() src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank, cp_rank=cp_rank) - if torch.distributed.get_rank() == src_rank: - chunk_shape = tensor.shape - else: - chunk_shape = None + chunk_shape = tensor.shape if torch.distributed.get_rank() == src_rank else None obj_list = [chunk_shape] dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group) @@ -223,14 +220,11 @@ def _broadcast_tp_shard_tensor_gate_up(tensor, gate_name, up_name, src_pp_rank) """broadcast tensor in tp shards across mp_group""" nonlocal state_dict nonlocal mp_group - tp_rank = mpu.get_tensor_model_parallel_rank() + # tp_rank = mpu.get_tensor_model_parallel_rank() tp_size = mpu.get_tensor_model_parallel_world_size() src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank, cp_rank=cp_rank) - if torch.distributed.get_rank() == src_rank: - chunk_shape = tensor.shape - else: - chunk_shape = None + chunk_shape = tensor.shape if torch.distributed.get_rank() == src_rank else None obj_list = [chunk_shape] dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group) @@ -276,14 +270,11 @@ def _broadcast_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name, src_pp_rank): """broadcast tensor in tp shards across mp_group""" nonlocal state_dict nonlocal mp_group - tp_rank = mpu.get_tensor_model_parallel_rank() + # tp_rank = mpu.get_tensor_model_parallel_rank() tp_size = mpu.get_tensor_model_parallel_world_size() src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank, cp_rank=cp_rank) - if torch.distributed.get_rank() == src_rank: - chunk_shape = tensor.shape - else: - chunk_shape = None + chunk_shape = tensor.shape if torch.distributed.get_rank() == src_rank else None obj_list = [chunk_shape] dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group) @@ -473,9 +464,7 @@ def _broadcast_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name, src_pp_rank): return state_dict -def merge_megatron_ckpt_gptmodel_qwen_moe(wrapped_models, - config, - dtype, - is_value_model=False, - tie_word_embeddings=False): +def merge_megatron_ckpt_gptmodel_qwen_moe( + wrapped_models, config, dtype, is_value_model=False, tie_word_embeddings=False +): raise NotImplementedError("merge_megatron_ckpt_gptmodel_qwen_moe is not implemented") diff --git a/verl/models/mcore/weight_converter.py b/verl/models/mcore/weight_converter.py index 
155361a532a..6bdee51b3e9 100644 --- a/verl/models/mcore/weight_converter.py +++ b/verl/models/mcore/weight_converter.py @@ -17,12 +17,11 @@ # including format conversion and name mapping # not including resharding import torch -from transformers import PretrainedConfig from megatron.core.transformer import TransformerConfig +from transformers import PretrainedConfig class McoreToHFWeightConverterBase: - def __init__(self, hf_config: PretrainedConfig, mcore_config: TransformerConfig): self.hf_config = hf_config self.mcore_config = mcore_config @@ -32,26 +31,25 @@ def convert_param(self, name: str, params_one_group: list[torch.Tensor]) -> torc class McoreToHFWeightConverterDense(McoreToHFWeightConverterBase): - def _convert_attention_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]: # 'decoder.layers.0.self_attention.linear_proj.weight' # 'decoder.layers.0.self_attention.linear_qkv.layer_norm_weight' # 'decoder.layers.0.self_attention.linear_qkv.weight' # 'decoder.layers.0.self_attention.linear_qkv.bias' - layer_number = name.split('.')[2] + layer_number = name.split(".")[2] convert_names = [] if "self_attention.linear_qkv.bias" in name or "self_attention.linear_qkv.weight" in name: - param_type = name.split('.')[-1] - assert param_type == 'bias' or param_type == 'weight' - convert_names.append(f'model.layers.{layer_number}.self_attn.q_proj.{param_type}') - convert_names.append(f'model.layers.{layer_number}.self_attn.k_proj.{param_type}') - convert_names.append(f'model.layers.{layer_number}.self_attn.v_proj.{param_type}') + param_type = name.split(".")[-1] + assert param_type == "bias" or param_type == "weight" + convert_names.append(f"model.layers.{layer_number}.self_attn.q_proj.{param_type}") + convert_names.append(f"model.layers.{layer_number}.self_attn.k_proj.{param_type}") + convert_names.append(f"model.layers.{layer_number}.self_attn.v_proj.{param_type}") assert len(params) == 3 elif "self_attention.linear_proj.weight" in name: - convert_names.append(f'model.layers.{layer_number}.self_attn.o_proj.weight') + convert_names.append(f"model.layers.{layer_number}.self_attn.o_proj.weight") assert len(params) == 1 elif "self_attention.linear_qkv.layer_norm_weight" in name: - convert_names.append(f'model.layers.{layer_number}.input_layernorm.weight') + convert_names.append(f"model.layers.{layer_number}.input_layernorm.weight") assert len(params) == 1 else: raise NotImplementedError(f"Unsupported parameter name: {name}") @@ -61,18 +59,18 @@ def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[lis # 'decoder.layers.0.mlp.linear_fc1.layer_norm_weight' # 'decoder.layers.0.mlp.linear_fc1.weight' # 'decoder.layers.0.mlp.linear_fc2.weight' - layer_number = name.split('.')[2] + layer_number = name.split(".")[2] convert_names = [] if "mlp.linear_fc1.weight" in name: # split gate_proj and up_proj - convert_names.append(f'model.layers.{layer_number}.mlp.gate_proj.weight') - convert_names.append(f'model.layers.{layer_number}.mlp.up_proj.weight') + convert_names.append(f"model.layers.{layer_number}.mlp.gate_proj.weight") + convert_names.append(f"model.layers.{layer_number}.mlp.up_proj.weight") assert len(params) == 2 elif "mlp.linear_fc1.layer_norm_weight" in name: - convert_names.append(f'model.layers.{layer_number}.post_attention_layernorm.weight') + convert_names.append(f"model.layers.{layer_number}.post_attention_layernorm.weight") assert len(params) == 1 elif "mlp.linear_fc2.weight" in name: - 
convert_names.append(f'model.layers.{layer_number}.mlp.down_proj.weight') + convert_names.append(f"model.layers.{layer_number}.mlp.down_proj.weight") assert len(params) == 1 else: raise NotImplementedError(f"Unsupported parameter name: {name}") @@ -96,7 +94,6 @@ def convert_param(self, name: str, params_one_group: list[torch.Tensor]) -> tupl class McoreToHFWeightConverterQwen2Moe(McoreToHFWeightConverterDense): - def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]: # 'decoder.layers.0.pre_mlp_layernorm.weight', # 'decoder.layers.0.mlp.router.weight', @@ -111,32 +108,32 @@ def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[lis # moe2 # 'decoder.layers.0.mlp.experts.linear_fc2.weight0', # 'decoder.layers.0.mlp.experts.linear_fc2.weight1', - layer_number = name.split('.')[2] + layer_number = name.split(".")[2] convert_names = [] if "pre_mlp_layernorm" in name: - convert_names.append(f'model.layers.{layer_number}.post_attention_layernorm.weight') + convert_names.append(f"model.layers.{layer_number}.post_attention_layernorm.weight") assert len(params) == 1 elif "mlp.router.weight" in name: - convert_names.append(f'model.layers.{layer_number}.mlp.gate.weight') + convert_names.append(f"model.layers.{layer_number}.mlp.gate.weight") assert len(params) == 1 elif "shared_experts.gate_weight" in name: - convert_names.append(f'model.layers.{layer_number}.mlp.shared_expert_gate.weight') + convert_names.append(f"model.layers.{layer_number}.mlp.shared_expert_gate.weight") assert len(params) == 1 elif "shared_experts.linear_fc1.weight" in name: # split gate_proj and up_proj - convert_names.append(f'model.layers.{layer_number}.mlp.shared_expert.gate_proj.weight') - convert_names.append(f'model.layers.{layer_number}.mlp.shared_expert.up_proj.weight') + convert_names.append(f"model.layers.{layer_number}.mlp.shared_expert.gate_proj.weight") + convert_names.append(f"model.layers.{layer_number}.mlp.shared_expert.up_proj.weight") assert len(params) == 2 elif "shared_experts.linear_fc2.weight" in name: - convert_names.append(f'model.layers.{layer_number}.mlp.shared_expert.down_proj.weight') + convert_names.append(f"model.layers.{layer_number}.mlp.shared_expert.down_proj.weight") assert len(params) == 1 elif "mlp.experts.linear_fc1" in name: # split gate_proj and up_proj - expert_id = name.split('weight')[-1] - convert_names.append(f'model.layers.{layer_number}.mlp.experts.{expert_id}.gate_proj.weight') - convert_names.append(f'model.layers.{layer_number}.mlp.experts.{expert_id}.up_proj.weight') + expert_id = name.split("weight")[-1] + convert_names.append(f"model.layers.{layer_number}.mlp.experts.{expert_id}.gate_proj.weight") + convert_names.append(f"model.layers.{layer_number}.mlp.experts.{expert_id}.up_proj.weight") assert len(params) == 2 elif "mlp.experts.linear_fc2" in name: - expert_id = name.split('weight')[-1] - convert_names.append(f'model.layers.{layer_number}.mlp.experts.{expert_id}.down_proj.weight') + expert_id = name.split("weight")[-1] + convert_names.append(f"model.layers.{layer_number}.mlp.experts.{expert_id}.down_proj.weight") assert len(params) == 1 else: raise NotImplementedError(f"Unsupported parameter name: {name}") diff --git a/verl/models/weight_loader_registry.py b/verl/models/weight_loader_registry.py index b274f087718..31942b7cfe3 100644 --- a/verl/models/weight_loader_registry.py +++ b/verl/models/weight_loader_registry.py @@ -31,9 +31,10 @@ def get_weight_loader(arch: str): def get_weight_saver(arch: 
str): from verl.models.mcore.saver import merge_megatron_ckpt_gptmodel, merge_megatron_ckpt_gptmodel_qwen_moe + _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY = { - 'LlamaForCausalLM': merge_megatron_ckpt_gptmodel, - 'Qwen2ForCausalLM': merge_megatron_ckpt_gptmodel, + "LlamaForCausalLM": merge_megatron_ckpt_gptmodel, + "Qwen2ForCausalLM": merge_megatron_ckpt_gptmodel, "Qwen2MoeForCausalLM": merge_megatron_ckpt_gptmodel_qwen_moe, } if arch in _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY: diff --git a/verl/workers/critic/megatron_critic.py b/verl/workers/critic/megatron_critic.py index 47a535ee0f4..db014d96b1f 100644 --- a/verl/workers/critic/megatron_critic.py +++ b/verl/workers/critic/megatron_critic.py @@ -28,7 +28,7 @@ from verl import DataProto from verl.trainer.ppo import core_algos -from verl.utils.megatron.pipeline_parallel import compute_transformers_input_shapes, make_batch_generator +from verl.utils.megatron.pipeline_parallel import make_batch_generator from verl.utils.py_functional import append_to_dict from verl.utils.torch_functional import broadcast_dict_tensor, masked_mean, split_dict_tensor_into_batches from verl.workers.critic import BasePPOCritic @@ -133,20 +133,11 @@ def forward_backward_batch(self, data: DataProto, forward_only=False): n_micro_batch = len(batches) seq_len = batches[0]["input_ids"].shape[1] - # compute input shapes for pp stages - input_shapes = compute_transformers_input_shapes( - batches, - meta_info={ - "sequence_parallel": self.tf_config.sequence_parallel, - "hidden_size": self.model_config.hidden_size, - }, - ) - forward_backward_func = get_forward_backward_func() def loss_func(output, data, meta_info): if forward_only: - return torch.tensor(1.0, device=output.device), {'vpreds': output} + return torch.tensor(1.0, device=output.device), {"vpreds": output} responses = data["responses"] attention_mask = data["attention_mask"] diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py index 9e9e613b105..c9ae207a8e7 100644 --- a/verl/workers/megatron_workers.py +++ b/verl/workers/megatron_workers.py @@ -257,13 +257,16 @@ def _build_rollout(self, trust_remote_code=False): # perform weight resharding between actor and rollout from verl.models.mcore import get_mcore_weight_converter + weight_converter = get_mcore_weight_converter(self.actor_model_config, self.dtype) - sharding_manager = MegatronVLLMShardingManager(inference_engine=rollout.inference_engine, - model_config=self.actor_model_config, - layer_name_mapping=layer_name_mapping, - actor_module=self.actor.actor_module, - weight_converter=weight_converter) - log_gpu_memory_usage('After building sharding manager', logger=logger) + sharding_manager = MegatronVLLMShardingManager( + inference_engine=rollout.inference_engine, + model_config=self.actor_model_config, + layer_name_mapping=layer_name_mapping, + actor_module=self.actor.actor_module, + weight_converter=weight_converter, + ) + log_gpu_memory_usage("After building sharding manager", logger=logger) else: raise NotImplementedError("Only vllmRollout is supported with Megatron now") @@ -287,10 +290,7 @@ def init_model(self): self.dtype = PrecisionType.to_dtype(self.param_dtype) if self._is_actor or self._is_rollout: # we need the model for actor and rollout - if self._is_actor: - optim_config = self.config.actor.optim - else: - optim_config = None + optim_config = self.config.actor.optim if self._is_actor else None self.actor_module, self.actor_optimizer, self.actor_model_config, self.actor_optim_config = ( self._build_model_optimizer( 
model_path=self.config.model.path, diff --git a/verl/workers/sharding_manager/megatron_vllm.py b/verl/workers/sharding_manager/megatron_vllm.py index 33d9a51aa24..b60ce9f73e3 100644 --- a/verl/workers/sharding_manager/megatron_vllm.py +++ b/verl/workers/sharding_manager/megatron_vllm.py @@ -15,21 +15,30 @@ This file contains a Megatron style Hybrid Engine that shares the weights of the actor with the inference engine. """ +import inspect import logging import os import torch +import torch.distributed import torch.distributed as dist from megatron.core import DistributedDataParallel as LocalDDP from megatron.core import parallel_state as mpu from megatron.core.transformer.module import Float16Module from torch import nn +from torch.distributed import new_group from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP +import verl.utils.megatron.tensor_parallel as tp_utils +from verl import DataProto +from verl.models.mcore.weight_converter import McoreToHFWeightConverterBase +from verl.third_party.vllm import LLM, vllm_version +from verl.third_party.vllm import parallel_state as vllm_ps from verl.utils.debug import log_gpu_memory_usage from verl.utils.megatron_utils import ( broadcast_from_megatron_pp, broadcast_str_from_megatron_pp, + convert_megatron_model_to_transformers_model, get_model, unwrap_model, ) @@ -39,6 +48,9 @@ get_weight_buffer_meta_from_module, ) from verl.utils.model import normalize_model_name +from verl.utils.torch_functional import allgather_dict_tensors + +from .base import BaseShardingManager logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_PPO_LOGGING_LEVEL", "WARN")) @@ -47,7 +59,8 @@ class AllGatherPPModel: def __init__(self, model_provider, use_distributed_optimizer=True) -> None: print( - "[WARNING] This class is deprecated and will no longer be supported. Consider using the `MegatronPPOActor` class directly as a replacement." + "[WARNING] This class is deprecated and will no longer be supported. \ +Consider using the `MegatronPPOActor` class directly as a replacement." ) self._pp_group = mpu.get_pipeline_model_parallel_group() self._pp_rank = mpu.get_pipeline_model_parallel_rank() @@ -243,26 +256,13 @@ def pp_models(self): """ Megatron Hybrid Engine: - During training, only the current pp stage holds the parameters -- Before inference, broadcast the parameters of the current pp rank to all other pp ranks (all pp ranks holds all the parameters) +- Before inference, broadcast the parameters of the current pp rank + to all other pp ranks (all pp ranks holds all the parameters) - Bind the parameters to the inference engine - Do inference in tp. pp is treated as additional dp - After inference, all the parameters that doesn't belong to this pp rank is freed. """ -import inspect - -import torch.distributed -from torch.distributed import new_group - -import verl.utils.megatron.tensor_parallel as tp_utils -from verl import DataProto -from verl.third_party.vllm import LLM, vllm_version -from verl.third_party.vllm import parallel_state as vllm_ps -from verl.utils.megatron_utils import convert_megatron_model_to_transformers_model -from verl.models.mcore.weight_converter import McoreToHFWeightConverterBase -from verl.utils.torch_functional import allgather_dict_tensors - -from .base import BaseShardingManager # Micro Data parallel group. Micro data parallel group is additional dp group that origins from splitting training tp # into infer_tp and micro_tp. 
By default, we use order micro_dp - tp @@ -272,14 +272,15 @@ def pp_models(self): class MegatronVLLMShardingManager(BaseShardingManager): - - def __init__(self, - actor_module: nn.ModuleList, - inference_engine: LLM, - model_config, - layer_name_mapping, - weight_converter: McoreToHFWeightConverterBase, - module: AllGatherPPModel = None): + def __init__( + self, + actor_module: nn.ModuleList, + inference_engine: LLM, + model_config, + layer_name_mapping, + weight_converter: McoreToHFWeightConverterBase, + module: AllGatherPPModel = None, + ): from megatron.core import parallel_state as mpu self.actor_module = actor_module @@ -331,8 +332,7 @@ def per_tensor_generator(self, convert_qkv_gate_up_by_simple_split=True): def tensor_generator(): for scan_vpp_idx in range(vpp_size): - for name, param in self.actor_module[scan_vpp_idx].named_parameters(): - yield name, param + yield from self.actor_module[scan_vpp_idx].named_parameters() # we need first make all rank get full model information meta_info = [] @@ -395,14 +395,14 @@ def tensor_generator(): convert_qkv_gate_up_by_trunk_concat=False, ) # defualt false - for converted_name, infer_param in zip(converted_names, converted_params): - yield converted_name, infer_param + yield from zip(converted_names, converted_params) def default_tp_concat_fn(self, name, param, infer_params, model_config, convert_qkv_gate_up_by_simple_split=False): """ name: name of the parameter param: training parameters - infer_params (Iterable[torch.Tensor]): a iterator towards list of parameters all-gathered from train tp group (vllm 0.8.2) or micro-dp group (vllm <= 0.6.3) + infer_params (Iterable[torch.Tensor]): a iterator towards list of parameters all-gathered + from train tp group (vllm 0.8.2) or micro-dp group (vllm <= 0.6.3) model_config: huggingface model_config TODO(zhangchi.usc1992): currently, the implementation is adhoc. We can move this function to the model definition so that it is model-agnostic. If the model doesn't implement this function, @@ -436,10 +436,7 @@ def default_tp_concat_fn(self, name, param, infer_params, model_config, convert_ q = torch.cat(q_lst, dim=0) k = torch.cat(k_lst, dim=0) v = torch.cat(v_lst, dim=0) - if not convert_qkv_gate_up_by_simple_split: - infer_params = torch.cat((q, k, v), dim=0) - else: - infer_params = [q, k, v] + infer_params = torch.cat((q, k, v), dim=0) if not convert_qkv_gate_up_by_simple_split else [q, k, v] elif self.layer_name_mapping.get("gate_proj_layer_name") in name: # if the tensor is gate and proj @@ -451,10 +448,7 @@ def default_tp_concat_fn(self, name, param, infer_params, model_config, convert_ up_lst.append(up) gate = torch.cat(gate_lst, dim=0) up = torch.cat(up_lst, dim=0) - if not convert_qkv_gate_up_by_simple_split: - infer_params = torch.cat((gate, up), dim=0) - else: - infer_params = [gate, up] + infer_params = torch.cat((gate, up), dim=0) if not convert_qkv_gate_up_by_simple_split else [gate, up] elif "mlp.experts.linear_fc2.weight" in name: # moe infer_params = torch.cat(infer_params, dim=1) @@ -467,7 +461,8 @@ def default_tp_concat_fn(self, name, param, infer_params, model_config, convert_ def _post_process_params(self, params, convert_qkv_gate_up_by_simple_split=False): """ - For each param, if it is a tp-splited param, we all-gather from train tp group (vllm 0.8.2) or micro-dp group (vllm <= 0.6.3) + For each param, if it is a tp-splited param, we all-gather from train + tp group (vllm 0.8.2) or micro-dp group (vllm <= 0.6.3) """ # here the params are in train tp format. 
we iterate params and all-gather # TODO(zhangchi.usc1992) We can consider copy non-tp weight to another infer buffer. @@ -491,20 +486,20 @@ def _post_process_params(self, params, convert_qkv_gate_up_by_simple_split=False ) else: infer_params = param - if vllm_version in ('0.4.2', '0.5.4', '0.6.3'): + if vllm_version in ("0.4.2", "0.5.4", "0.6.3"): converted_names, converted_params = convert_megatron_model_to_transformers_model( name, infer_params, self.model_config, self.train_tp_size, self.module.pp_models[0][0].config.num_query_groups, - convert_qkv_gate_up_by_trunk_concat=False) + convert_qkv_gate_up_by_trunk_concat=False, + ) else: if not isinstance(infer_params, list): infer_params = [infer_params] converted_names, converted_params = self.weight_converter.convert_param(name, infer_params) - for converted_name, infer_param in zip(converted_names, converted_params): - yield converted_name, infer_param + yield from zip(converted_names, converted_params) def __enter__(self): if vllm_version in ("0.4.2", "0.5.4", "0.6.3"): @@ -520,7 +515,8 @@ def __enter__(self): model = self.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model _patch_vllm_qwen2_moe_model_weight_loader(model) loaded_params = model.load_weights(per_tensor_param) - logger.info(f"vLLM load weights, loaded_params: {len(loaded_params)}") + info = f"vLLM load weights, loaded_params: {len(loaded_params)}" + logger.info(info) log_gpu_memory_usage("After load_weights sharding manager memory", logger=logger) if "tags" in inspect.signature(self.inference_engine.wake_up).parameters: @@ -597,6 +593,7 @@ def _patch_vllm_qwen2_moe_model_weight_loader(model): # (False, 'model.layers.0.mlp.experts.w13_weight') use mlp.experts.weight_loader # (False, 'model.layers.0.mlp.experts.w2_weight') use mlp.experts.weight_loader from vllm.model_executor.models.qwen2_moe import Qwen2MoeForCausalLM + if not isinstance(model, Qwen2MoeForCausalLM): return for layer in model.model.layers: From 7b66d82b4619820ddc324786b16318d2f874eab7 Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Fri, 18 Apr 2025 23:58:10 -0700 Subject: [PATCH 11/19] fix bug of merge --- .../workers/sharding_manager/megatron_vllm.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/verl/workers/sharding_manager/megatron_vllm.py b/verl/workers/sharding_manager/megatron_vllm.py index a2ea7b288b4..12bc569d78b 100644 --- a/verl/workers/sharding_manager/megatron_vllm.py +++ b/verl/workers/sharding_manager/megatron_vllm.py @@ -390,15 +390,19 @@ def tensor_generator(): else: infer_params = broad_pp_tensor - # change megatron tensor name to hf model name - converted_names, converted_params = convert_megatron_model_to_transformers_model( - cur_name, - infer_params, - self.model_config, - self.train_tp_size, - 0, # no impact - convert_qkv_gate_up_by_trunk_concat=False, - ) # defualt false + if vllm_version in ("0.4.2", "0.5.4", "0.6.3"): + converted_names, converted_params = convert_megatron_model_to_transformers_model( + cur_name, + infer_params, + self.model_config, + self.train_tp_size, + 0, # no impact + convert_qkv_gate_up_by_trunk_concat=False, + ) # defualt false + else: + if not isinstance(infer_params, list): + infer_params = [infer_params] + converted_names, converted_params = self.weight_converter.convert_param(cur_name, infer_params) yield from zip(converted_names, converted_params) From 941ab9589e471c4648b7bb7efc16c82cebd423fa Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Sat, 19 Apr 2025 01:00:01 -0700 Subject: [PATCH 
12/19] compatible to mcore 0.12

---
 verl/workers/actor/megatron_actor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/verl/workers/actor/megatron_actor.py b/verl/workers/actor/megatron_actor.py
index c7ce1161e97..49a76687402 100644
--- a/verl/workers/actor/megatron_actor.py
+++ b/verl/workers/actor/megatron_actor.py
@@ -344,7 +344,7 @@ def loss_func(output, data, meta_info):
 
             stats = {}
             if forward_only:
-                policy_loss = 1.0
+                policy_loss = torch.tensor(1.0, device=output.device)
             else:
                 if self.config.use_kl_loss:
                     ref_log_prob = data["ref_log_prob"]

From 267a119410eec1ab9778c407a0de0ab8fca86d19 Mon Sep 17 00:00:00 2001
From: Yan Bai
Date: Mon, 21 Apr 2025 08:29:28 -0700
Subject: [PATCH 13/19] WIP support moonlight

---
 scripts/converter_hf_to_mcore.py              |  76 ++++++++++-
 verl/models/mcore/config_converter.py         | 107 ++++++++++++++-
 verl/models/mcore/registry.py                 |   7 +-
 verl/models/mcore/saver.py                    |   5 +
 verl/models/mcore/weight_converter.py         | 128 ++++++++++++++++++
 verl/models/weight_loader_registry.py         |   7 +-
 .../single_controller/base/megatron/worker.py |  18 ++-
 verl/workers/megatron_workers.py              |   9 +-
 .../workers/sharding_manager/megatron_vllm.py |  21 ++-
 9 files changed, 347 insertions(+), 31 deletions(-)

diff --git a/scripts/converter_hf_to_mcore.py b/scripts/converter_hf_to_mcore.py
index aa4256b67a4..f917a9c1d58 100644
--- a/scripts/converter_hf_to_mcore.py
+++ b/scripts/converter_hf_to_mcore.py
@@ -53,7 +53,7 @@ def __init__(self):
         self.model = ModelConfig()
 
 
-def convert_checkpoint_from_transformers_to_megatron(hf_model, model, hf_config):
+def convert_checkpoint_from_transformers_to_megatron(hf_model, model, hf_config, tfconfig):
     num_attention_heads = hf_config.num_attention_heads
     hidden_dim = hf_config.hidden_size
     head_dim = hidden_dim // num_attention_heads
@@ -95,6 +95,70 @@ def convert_checkpoint_from_transformers_to_megatron(hf_model, model, hf_config)
     model.decoder.final_layernorm.weight.copy_(hf_model.model.norm.weight)
     model.output_layer.weight.copy_(hf_model.lm_head.weight)
 
 
+@torch.no_grad()
+def convert_checkpoint_from_transformers_to_megatron_dpskv3(hf_model, model, hf_config, tfconfig):
+    warnings.warn("MTP (multi-token prediction) layers are not supported yet")
+
+    def safe_copy(
+        src_tensor: torch.Tensor,
+        dst_tensor: torch.Tensor,
+        skip_dtype_assert: bool = False,
+    ):
+        if not skip_dtype_assert:
+            if src_tensor.dtype != dst_tensor.dtype:
+                raise ValueError(f"Get source dtype {src_tensor.dtype}, but target dtype {dst_tensor.dtype}")
+        assert src_tensor.shape == dst_tensor.shape
+        dst_tensor.data.copy_(src_tensor.data)
+        return src_tensor.numel()
+
+    model.embedding.word_embeddings.weight.copy_(hf_model.model.embed_tokens.weight)
+    for layer_idx, (layer, hf_layer) in enumerate(zip(model.decoder.layers, hf_model.model.layers)):
+        print(f"converting layer {layer_idx}")
+        layer.input_layernorm.weight.copy_(hf_layer.input_layernorm.weight)
+
+        if hf_config.q_lora_rank is None:
+            layer.self_attention.linear_q_proj.weight.copy_(hf_layer.self_attn.q_proj.weight)
+        else:
+            layer.self_attention.linear_q_down_proj.weight.copy_(hf_layer.self_attn.q_a_proj.weight)
+            layer.self_attention.linear_q_up_proj.weight.copy_(hf_layer.self_attn.q_b_proj.weight)
+            layer.self_attention.linear_q_up_proj.layer_norm_weight.copy_(hf_layer.self_attn.q_a_layernorm.weight)
+
+        layer.self_attention.linear_kv_down_proj.weight.copy_(hf_layer.self_attn.kv_a_proj_with_mqa.weight)
+        layer.self_attention.linear_kv_up_proj.weight.copy_(hf_layer.self_attn.kv_b_proj.weight)
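+        # The q_a_layernorm / kv_a_layernorm weights of the HF MLA attention are stored in mcore
+        # as the fused layer_norm_weight of linear_q_up_proj / linear_kv_up_proj respectively.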
+        layer.self_attention.linear_kv_up_proj.layer_norm_weight.copy_(hf_layer.self_attn.kv_a_layernorm.weight)
+        layer.self_attention.linear_proj.weight.copy_(hf_layer.self_attn.o_proj.weight)
+
+        if not hasattr(layer.mlp, "router"):
+            layer.mlp.linear_fc1.layer_norm_weight.copy_(hf_layer.post_attention_layernorm.weight)
+            layer.mlp.linear_fc1.weight.copy_(
+                torch.cat([hf_layer.mlp.gate_proj.weight, hf_layer.mlp.up_proj.weight]))
+            layer.mlp.linear_fc2.weight.copy_(hf_layer.mlp.down_proj.weight)
+        else:
+            layer.mlp.router.weight.copy_(hf_layer.mlp.gate.weight)
+            # NOTE: the e_score_correction_bias in the mcore model is initialized in bfloat16 and
+            # recovered to fp32 in the first forward, so there is always a small diff (~0.3%) in
+            # this bias between the two models.
+            safe_copy(hf_layer.mlp.gate.e_score_correction_bias, layer.mlp.router.expert_bias, skip_dtype_assert=True)
+            if tfconfig.moe_grouped_gemm:
+                for i, hf_expert in enumerate(hf_layer.mlp.experts):
+                    fc1_weight = torch.cat([hf_expert.gate_proj.weight, hf_expert.up_proj.weight])
+                    linear_fc1_weighti = getattr(layer.mlp.experts.linear_fc1, f"weight{i}")
+                    linear_fc1_weighti.copy_(fc1_weight)
+                    linear_fc2_weighti = getattr(layer.mlp.experts.linear_fc2, f"weight{i}")
+                    linear_fc2_weighti.copy_(hf_expert.down_proj.weight)
+            else:
+                for i, hf_expert in enumerate(hf_layer.mlp.experts):
+                    expert = layer.mlp.experts.local_experts[i]
+                    fc1_weight = torch.cat([hf_expert.gate_proj.weight, hf_expert.up_proj.weight])
+                    expert.linear_fc1.weight.copy_(fc1_weight)
+                    expert.linear_fc2.weight.copy_(hf_expert.down_proj.weight)
+            layer.pre_mlp_layernorm.weight.copy_(hf_layer.post_attention_layernorm.weight)
+            shared_fc1_weight = torch.cat(
+                [hf_layer.mlp.shared_experts.gate_proj.weight, hf_layer.mlp.shared_experts.up_proj.weight])
+            layer.mlp.shared_experts.linear_fc1.weight.copy_(shared_fc1_weight)
+            layer.mlp.shared_experts.linear_fc2.weight.copy_(hf_layer.mlp.shared_experts.down_proj.weight)
+
+    model.decoder.final_layernorm.weight.copy_(hf_model.model.norm.weight)
+    if not hf_config.tie_word_embeddings:
+        model.output_layer.weight.copy_(hf_model.lm_head.weight)
+
+
 def convert_hf_to_mcore(hf_model_path, output_path, test=False):
     os.makedirs(output_path, exist_ok=True)
@@ -117,7 +181,7 @@ def convert_hf_to_mcore(hf_model_path, output_path, test=False):
     model_parallel_cuda_manual_seed(0)
 
     # init hf config
-    hf_config = AutoConfig.from_pretrained(hf_model_path)
+    hf_config = AutoConfig.from_pretrained(hf_model_path, trust_remote_code=True)
     print(hf_config)
 
     cfg = Config()
@@ -147,12 +211,14 @@ def megatron_model_provider(pre_process, post_process):
         warnings.simplefilter("ignore")
 
         # init hf model
-        hf_model = AutoModelForCausalLM.from_pretrained(hf_model_path, torch_dtype=torch.bfloat16)
+        hf_model = AutoModelForCausalLM.from_pretrained(hf_model_path, torch_dtype=torch.bfloat16, trust_remote_code=True)
         ref_state_dict = hf_model.state_dict()
 
         # load hf state dict to megatron model
-        if "Qwen2MoeForCausalLM" in hf_config.architectures:
-            convert_checkpoint_from_transformers_to_megatron(hf_model, model[0].module, hf_config)
+        if "DeepseekV3ForCausalLM" in hf_config.architectures:
+            convert_checkpoint_from_transformers_to_megatron_dpskv3(hf_model, model[0].module, hf_config, tfconfig=tfconfig)
+        elif "Qwen2MoeForCausalLM" in hf_config.architectures:
+            convert_checkpoint_from_transformers_to_megatron(hf_model, model[0].module, hf_config, tfconfig)
         else:
             from verl.models.mcore.loader import load_state_dict_to_megatron_gptmodel
 
diff --git 
a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py index c43f7f75526..9602df1c0fb 100644 --- a/verl/models/mcore/config_converter.py +++ b/verl/models/mcore/config_converter.py @@ -17,7 +17,7 @@ import torch import torch.nn.functional as F -from megatron.core.transformer import TransformerConfig +from megatron.core.transformer import TransformerConfig, MLATransformerConfig from megatron.core.transformer.enums import AttnBackend from transformers import PretrainedConfig @@ -94,6 +94,7 @@ def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) bf16=dtype is torch.bfloat16, layernorm_epsilon=hf_config.rms_norm_eps, ffn_hidden_size=hf_config.intermediate_size, + # parallel config tensor_model_parallel_size=mpu.get_tensor_model_parallel_world_size(), pipeline_model_parallel_size=mpu.get_pipeline_model_parallel_world_size(), @@ -102,6 +103,7 @@ def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) overlap_p2p_comm=overlap_p2p_comm, batch_p2p_comm=batch_p2p_comm, sequence_parallel=mpu.get_tensor_model_parallel_world_size() > 1, + # moe specific moe_ffn_hidden_size=hf_config.moe_intermediate_size, moe_token_dispatcher_type="alltoall", @@ -116,15 +118,18 @@ def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) # moe_permute_fusion=True, # need TE 2.1+ moe_grouped_gemm=True, moe_router_score_function="softmax", + # # mcore 0.12 moe # moe_router_dtype="fp64", # disable_bf16_reduced_precision_matmul=True, + # other # deallocate_pipeline_outputs=True, # gradient_accumulation_fusion=True, persist_layer_norm=True, bias_activation_fusion=True, bias_dropout_fusion=True, + # qwen specific moe_router_pre_softmax=True, add_qkv_bias=True, @@ -132,9 +137,105 @@ def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) return transformer_config -def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: +def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype) -> MLATransformerConfig: # DeepseekV3ForCausalLM - raise NotImplementedError("DeepseekV3ForCausalLM is not supported yet") + from megatron.core import parallel_state as mpu + + overlap_p2p_comm = ( + mpu.get_virtual_pipeline_model_parallel_world_size() is not None + and mpu.get_virtual_pipeline_model_parallel_world_size() > 1 + ) + batch_p2p_comm = False + + mla_rope_config = { + "beta_fast": 32, + "beta_slow": 1, + "factor": 40, + "mscale": 1.0, + "mscale_all_dim": 1.0, + "original_max_position_embeddings": 4096, + } + if "rope_scaling" in hf_config and hf_config.rope_scaling is not None: + mla_rope_config.update(hf_config.rope_scaling) + moe_layer_freq = [1] * hf_config.num_hidden_layers + for i in range(hf_config.first_k_dense_replace): + moe_layer_freq[i] = 0 + transformer_config = MLATransformerConfig( + num_layers=hf_config.num_hidden_layers, + hidden_size=hf_config.hidden_size, + num_attention_heads=hf_config.num_attention_heads, + num_query_groups=hf_config.num_key_value_heads, + attention_dropout=hf_config.attention_dropout, + hidden_dropout=getattr(hf_config, "hidden_dropout", 0.0), + activation_func=F.silu, + normalization="RMSNorm", + gated_linear_unit=True, + use_cpu_initialization=False, + add_bias_linear=False, + pipeline_dtype=dtype, + params_dtype=dtype, + variable_seq_lengths=True, + masked_softmax_fusion=True, + # attention_backend=AttnBackend.flash, + attention_backend=AttnBackend.unfused, + bf16=dtype is torch.bfloat16, + 
layernorm_epsilon=hf_config.rms_norm_eps, + ffn_hidden_size=hf_config.intermediate_size, + qk_layernorm=True, + + # parallel config + tensor_model_parallel_size=mpu.get_tensor_model_parallel_world_size(), + pipeline_model_parallel_size=mpu.get_pipeline_model_parallel_world_size(), + virtual_pipeline_model_parallel_size=mpu.get_virtual_pipeline_model_parallel_world_size(), + context_parallel_size=mpu.get_context_parallel_world_size(), + overlap_p2p_comm=overlap_p2p_comm, + batch_p2p_comm=batch_p2p_comm, + sequence_parallel=mpu.get_tensor_model_parallel_world_size() > 1, + + # moe specific + moe_ffn_hidden_size=hf_config.moe_intermediate_size, + moe_token_dispatcher_type="alltoall", + moe_router_bias_update_rate=0.001, + moe_router_enable_expert_bias=True, + moe_router_topk=hf_config.num_experts_per_tok, + num_moe_experts=hf_config.n_routed_experts, + moe_shared_expert_intermediate_size=hf_config.moe_intermediate_size * hf_config.n_shared_experts, + moe_aux_loss_coeff=getattr(hf_config, "aux_loss_alpha", 0.001), + moe_router_load_balancing_type="seq_aux_loss", + moe_shared_expert_overlap=True, + # moe_permute_fusion=True, # need TE 2.1+ + moe_grouped_gemm=True, + moe_router_score_function="sigmoid", + moe_router_pre_softmax=True, + moe_router_topk_scaling_factor=hf_config.routed_scaling_factor, + moe_layer_freq=moe_layer_freq, + + # MLA + q_lora_rank=hf_config.q_lora_rank, + kv_lora_rank=hf_config.kv_lora_rank, + qk_head_dim=hf_config.qk_nope_head_dim, + qk_pos_emb_head_dim=hf_config.qk_rope_head_dim, + v_head_dim=hf_config.v_head_dim, + rotary_base=hf_config.rope_theta, + rotary_scaling_factor=mla_rope_config["factor"], + mscale=mla_rope_config["mscale"], + mscale_all_dim=mla_rope_config["mscale_all_dim"], + max_position_embeddings=mla_rope_config["original_max_position_embeddings"], + beta_fast=mla_rope_config["beta_fast"], + beta_slow=mla_rope_config["beta_slow"], + + # mcore 0.12 moe + # moe_router_dtype="fp64", + # disable_bf16_reduced_precision_matmul=True, + + # other + # deallocate_pipeline_outputs=True, + # gradient_accumulation_fusion=True, + persist_layer_norm=True, + bias_activation_fusion=True, + bias_dropout_fusion=True, + ) + return transformer_config def hf_to_mcore_config_qwen2_5_vl(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: diff --git a/verl/models/mcore/registry.py b/verl/models/mcore/registry.py index 19d8433db08..2fdab2160a4 100644 --- a/verl/models/mcore/registry.py +++ b/verl/models/mcore/registry.py @@ -39,7 +39,11 @@ init_mcore_model_qwen2_5_vl, init_mcore_model_qwen2_moe, ) -from .weight_converter import McoreToHFWeightConverterDense, McoreToHFWeightConverterQwen2Moe +from .weight_converter import ( + McoreToHFWeightConverterDense, + McoreToHFWeightConverterQwen2Moe, + McoreToHFWeightConverterDpskv3, +) def hf_to_mcore_config(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: @@ -114,6 +118,7 @@ def get_mcore_weight_converter(hf_config: PretrainedConfig, dtype: torch.dtype): "LlamaForCausalLM": McoreToHFWeightConverterDense, "Qwen2ForCausalLM": McoreToHFWeightConverterDense, "Qwen2MoeForCausalLM": McoreToHFWeightConverterQwen2Moe, + "DeepseekV3ForCausalLM": McoreToHFWeightConverterDpskv3, } assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" arch = hf_config.architectures[0] diff --git a/verl/models/mcore/saver.py b/verl/models/mcore/saver.py index df8721aa56a..14c7e29278e 100644 --- a/verl/models/mcore/saver.py +++ b/verl/models/mcore/saver.py @@ -468,3 +468,8 @@ def 
merge_megatron_ckpt_gptmodel_qwen_moe( wrapped_models, config, dtype, is_value_model=False, tie_word_embeddings=False ): raise NotImplementedError("merge_megatron_ckpt_gptmodel_qwen_moe is not implemented") + +def merge_megatron_ckpt_gptmodel_dpskv3( + wrapped_models, config, dtype, is_value_model=False, tie_word_embeddings=False +): + raise NotImplementedError("merge_megatron_ckpt_gptmodel_dpskv3 is not implemented") diff --git a/verl/models/mcore/weight_converter.py b/verl/models/mcore/weight_converter.py index 6bdee51b3e9..ce8c9fb7eab 100644 --- a/verl/models/mcore/weight_converter.py +++ b/verl/models/mcore/weight_converter.py @@ -138,3 +138,131 @@ def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[lis else: raise NotImplementedError(f"Unsupported parameter name: {name}") return convert_names, params + +class McoreToHFWeightConverterDpskv3(McoreToHFWeightConverterBase): + + def _convert_attention_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]: + # mcore + # 'decoder.layers.0.input_layernorm.weight' + # 'decoder.layers.0.self_attention.linear_proj.weight' + # 'decoder.layers.0.self_attention.linear_q_proj.weight' + # 'decoder.layers.0.self_attention.linear_kv_down_proj.weight' + # 'decoder.layers.0.self_attention.linear_kv_up_proj.layer_norm_weight' + # 'decoder.layers.0.self_attention.linear_kv_up_proj.weight' + # 'decoder.layers.0.self_attention.linear_q_down_proj.weight' + # 'decoder.layers.0.self_attention.linear_q_up_proj.weight' + # 'decoder.layers.0.self_attention.linear_q_up_proj.layer_norm_weight' + # hf + # 'model.layers.0.input_layernorm.weight' + # 'model.layers.0.self_attn.o_proj.weight' + # 'model.layers.0.self_attn.q_proj.weight' + # 'model.layers.0.self_attn.kv_a_proj_with_mqa.weight' + # 'model.layers.0.self_attn.kv_a_layernorm.weight' + # 'model.layers.0.self_attn.kv_b_proj.weight' + # 'model.layers.0.self_attn.q_a_proj.weight' + # 'model.layers.0.self_attn.q_b_proj.weight' + # 'model.layers.0.self_attn.q_a_layernorm.weight' + name_map_after_layer = { + "input_layernorm.weight": "input_layernorm.weight", + "self_attention.linear_proj.weight": "self_attn.o_proj.weight", + "self_attention.linear_q_proj.weight": "self_attn.q_proj.weight", + "self_attention.linear_kv_down_proj.weight": "self_attn.kv_a_proj_with_mqa.weight", + "self_attention.linear_kv_up_proj.layer_norm_weight": "self_attn.kv_a_layernorm.weight", + "self_attention.linear_kv_up_proj.weight": "self_attn.kv_b_proj.weight", + "self_attention.linear_q_down_proj.weight": "self_attn.q_a_proj.weight", + "self_attention.linear_q_up_proj.weight": "self_attn.q_b_proj.weight", + "self_attention.linear_q_up_proj.layer_norm_weight": "self_attn.q_a_layernorm.weight", + } + assert len(params) == 1 + convert_names = [] + layer_number = name.split(".")[2] + name_after_layer = name.split(f'.{layer_number}.')[1] + convert_names.append(f"model.layers.{layer_number}.{name_map_after_layer[name_after_layer]}") + return convert_names, params + + def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]: + # mcore dense + # 'decoder.layers.0.mlp.linear_fc1.layer_norm_weight' + # 'decoder.layers.0.mlp.linear_fc2.weight' + # 'decoder.layers.0.mlp.linear_fc1.weight' + # --- + # 'decoder.layers.1.mlp.shared_experts.linear_fc1.weight' + # --- + # 'decoder.layers.1.mlp.shared_experts.linear_fc2.weight' + # hf dense + # 'model.layers.0.post_attention_layernorm.weight' + # 'model.layers.0.mlp.down_proj.weight' + # 
'model.layers.0.mlp.gate_proj.weight' + # 'model.layers.0.mlp.up_proj.weight' + # 'model.layers.1.mlp.shared_experts.gate_proj.weight' + # 'model.layers.1.mlp.shared_experts.up_proj.weight' + # 'model.layers.1.mlp.shared_experts.down_proj.weight' + + # mcore moe + # 'decoder.layers.1.pre_mlp_layernorm.weight' + # 'decoder.layers.1.mlp.router.weight' + # 'decoder.layers.1.mlp.router.expert_bias' + # 'decoder.layers.1.mlp.experts.linear_fc1.weight0' + # --- + # 'decoder.layers.1.mlp.experts.linear_fc2.weight0' + # hf moe + # 'model.layers.1.post_attention_layernorm.weight' + # 'model.layers.1.mlp.gate.weight' + # 'model.layers.1.mlp.gate.e_score_correction_bias' + # 'model.layers.1.mlp.experts.0.gate_proj.weight' + # 'model.layers.1.mlp.experts.0.up_proj.weight' + # 'model.layers.1.mlp.experts.0.down_proj.weight' + + name_map_after_layer = { + "mlp.linear_fc1.layer_norm_weight": "post_attention_layernorm.weight", + "mlp.linear_fc2.weight": "mlp.down_proj.weight", + "mlp.shared_experts.linear_fc2.weight": "mlp.shared_experts.down_proj.weight", + "mlp.linear_fc1.weight": ["mlp.gate_proj.weight", "mlp.up_proj.weight"], + "mlp.shared_experts.linear_fc1.weight": ["mlp.shared_experts.gate_proj.weight", "mlp.shared_experts.up_proj.weight"], + "pre_mlp_layernorm.weight": "post_attention_layernorm.weight", + "mlp.router.weight": "mlp.gate.weight", + "mlp.router.expert_bias": "mlp.gate.e_score_correction_bias", + } + convert_names = [] + layer_number = name.split(".")[2] + name_after_layer = name.split(f'.{layer_number}.')[1] + if name_after_layer in name_map_after_layer: + mapped_name = name_map_after_layer[name_after_layer] + if isinstance(mapped_name, list): + assert len(params) == len(mapped_name) + for one in mapped_name: + convert_names.append(f"model.layers.{layer_number}.{one}") + else: + assert len(params) == 1 + convert_names.append(f"model.layers.{layer_number}.{mapped_name}") + else: + if "mlp.experts.linear_fc1.weight" in name: + expert_id = name.split("weight")[-1] + convert_names.append(f"model.layers.{layer_number}.mlp.experts.{expert_id}.gate_proj.weight") + convert_names.append(f"model.layers.{layer_number}.mlp.experts.{expert_id}.up_proj.weight") + assert len(params) == 2 + elif "mlp.experts.linear_fc2.weight" in name: + expert_id = name.split("weight")[-1] + convert_names.append(f"model.layers.{layer_number}.mlp.experts.{expert_id}.down_proj.weight") + assert len(params) == 1 + else: + raise NotImplementedError(f"Unsupported parameter name: {name}") + + return convert_names, params + + + def convert_param(self, name: str, params_one_group: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]: + direct_name_mapping = { + "embedding.word_embeddings.weight": "model.embed_tokens.weight", + "decoder.final_layernorm.weight": "model.norm.weight", + "output_layer.weight": "lm_head.weight", + } + if name in direct_name_mapping: + return [direct_name_mapping[name]], [params_one_group[0]] + + if "self_attention" in name or "input_layernorm.weight" in name: + return self._convert_attention_param(name, params_one_group) + elif "mlp" in name: + return self._convert_mlp_param(name, params_one_group) + else: + raise NotImplementedError(f"Unsupported parameter name: {name}") \ No newline at end of file diff --git a/verl/models/weight_loader_registry.py b/verl/models/weight_loader_registry.py index 31942b7cfe3..5931a1c866f 100644 --- a/verl/models/weight_loader_registry.py +++ b/verl/models/weight_loader_registry.py @@ -30,12 +30,17 @@ def get_weight_loader(arch: str): def 
get_weight_saver(arch: str): - from verl.models.mcore.saver import merge_megatron_ckpt_gptmodel, merge_megatron_ckpt_gptmodel_qwen_moe + from verl.models.mcore.saver import ( + merge_megatron_ckpt_gptmodel, + merge_megatron_ckpt_gptmodel_qwen_moe, + merge_megatron_ckpt_gptmodel_dpskv3, + ) _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY = { "LlamaForCausalLM": merge_megatron_ckpt_gptmodel, "Qwen2ForCausalLM": merge_megatron_ckpt_gptmodel, "Qwen2MoeForCausalLM": merge_megatron_ckpt_gptmodel_qwen_moe, + "DeepseekV3ForCausalLM": merge_megatron_ckpt_gptmodel_dpskv3, } if arch in _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY: return _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY[arch] diff --git a/verl/single_controller/base/megatron/worker.py b/verl/single_controller/base/megatron/worker.py index 5fc71128169..01e493f9ebc 100644 --- a/verl/single_controller/base/megatron/worker.py +++ b/verl/single_controller/base/megatron/worker.py @@ -39,7 +39,7 @@ def get_megatron_rank_info(self): info = DistRankInfo(tp_rank=tp_rank, dp_rank=dp_rank, pp_rank=pp_rank, cp_rank=cp_rank) return info - def _init_hf_config_and_tf_config(self, model_path, dtype, override_model_config): + def _init_hf_config_and_tf_config(self, model_path, dtype, override_model_config, trust_remote_code=False): from transformers import AutoConfig from verl.models.mcore import hf_to_mcore_config @@ -49,10 +49,10 @@ def _init_hf_config_and_tf_config(self, model_path, dtype, override_model_config # Step 1: initialize the tokenizer self.local_path = copy_to_local(model_path) - self.tokenizer = hf_tokenizer(self.local_path) + self.tokenizer = hf_tokenizer(self.local_path, trust_remote_code=trust_remote_code) # Step 2: get the hf - hf_config = AutoConfig.from_pretrained(self.local_path) + hf_config = AutoConfig.from_pretrained(self.local_path, trust_remote_code=trust_remote_code) # Step 3: override the hf config override_config_kwargs = { @@ -68,17 +68,21 @@ def _init_hf_config_and_tf_config(self, model_path, dtype, override_model_config print(f"Model config after override: {hf_config}") tf_config = hf_to_mcore_config(hf_config, dtype) - def add_optimization_config_to_tf_config(tf_config, verl_model_config): + def add_optimization_config_to_tf_config(tf_config): # add optimization config to tf_config, e.g. 
checkpointing - if verl_model_config.get("enable_gradient_checkpointing", False): - gradient_checkpointing_cfg = dict(verl_model_config.get("gradient_checkpointing_kwargs", dict())) + if self.config.model.get("enable_gradient_checkpointing", False): + gradient_checkpointing_cfg = dict(self.config.model.get("gradient_checkpointing_kwargs", dict())) tf_config.recompute_method = gradient_checkpointing_cfg.get("activations_checkpoint_method", "full") tf_config.recompute_granularity = gradient_checkpointing_cfg.get( "activations_checkpoint_granularity", "full" ) tf_config.recompute_num_layers = gradient_checkpointing_cfg.get("activations_checkpoint_num_layers", -1) + if megatron_config:=self.config.get("megatron", {}): + if extra:=megatron_config.get("extra", {}): + for k, v in extra.items(): + setattr(tf_config, k, v) - add_optimization_config_to_tf_config(tf_config, self.config.model) + add_optimization_config_to_tf_config(tf_config) print(f"TF config: {tf_config}") self.hf_config = hf_config diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py index c9ae207a8e7..e452db19239 100644 --- a/verl/workers/megatron_workers.py +++ b/verl/workers/megatron_workers.py @@ -140,7 +140,8 @@ def _build_model_optimizer(self, model_path, optim_config, override_model_config from verl.utils.megatron_utils import get_model, init_megatron_optim_config from verl.utils.model import get_generation_config, print_model_size - self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config) + trust_remote_code = self.config.model.get("trust_remote_code", False) + self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config, trust_remote_code) self.generation_config = get_generation_config(self.local_path) def megatron_actor_model_provider(pre_process, post_process): @@ -504,7 +505,8 @@ def _build_critic_model_optimizer(self, model_path, optim_config, override_model from verl.utils.megatron_utils import get_model, init_megatron_optim_config from verl.utils.model import print_model_size - self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config) + trust_remote_code = self.config.model.get("trust_remote_code", False) + self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config, trust_remote_code) def megatron_critic_model_provider(pre_process, post_process): from verl.models.mcore import init_mcore_model @@ -680,7 +682,8 @@ def _build_rm_model(self, model_path, override_model_config): from verl.utils.megatron_utils import get_model - self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config) + trust_remote_code = self.config.model.get("trust_remote_code", False) + self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config, trust_remote_code) def megatron_rm_model_provider(pre_process, post_process): from verl.models.mcore import init_mcore_model diff --git a/verl/workers/sharding_manager/megatron_vllm.py b/verl/workers/sharding_manager/megatron_vllm.py index 12bc569d78b..f02ae3e49dc 100644 --- a/verl/workers/sharding_manager/megatron_vllm.py +++ b/verl/workers/sharding_manager/megatron_vllm.py @@ -530,7 +530,7 @@ def __enter__(self): self.inference_engine.wake_up() per_tensor_param = self.per_tensor_generator() model = self.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model - _patch_vllm_qwen2_moe_model_weight_loader(model) + _patch_vllm_moe_model_weight_loader(model) loaded_params = model.load_weights(per_tensor_param) info = f"vLLM 
load weights, loaded_params: {len(loaded_params)}" logger.info(info) @@ -595,7 +595,7 @@ def get_micro_data_parallel_rank(): return torch.distributed.get_rank(group=get_micro_data_parallel_group()) -def _patch_vllm_qwen2_moe_model_weight_loader(model): +def _patch_vllm_moe_model_weight_loader(model): # this is a work around to load the weight of vllm qwen2 moe model # it is from a bug from vllm 0.8.2 # all the weights are supposed to have a weight_loader, but the moe weights @@ -613,12 +613,11 @@ def _patch_vllm_qwen2_moe_model_weight_loader(model): # (False, 'model.layers.0.mlp.experts.w13_weight') use mlp.experts.weight_loader # (False, 'model.layers.0.mlp.experts.w2_weight') use mlp.experts.weight_loader from vllm.model_executor.models.qwen2_moe import Qwen2MoeForCausalLM - - if not isinstance(model, Qwen2MoeForCausalLM): - return - for layer in model.model.layers: - mlp = layer.mlp - param_dict = dict(mlp.named_parameters()) - for name, param in param_dict.items(): - if "w13_weight" in name or "w2_weight" in name: - param.weight_loader = mlp.experts.weight_loader + from vllm.model_executor.models.deepseek_v2 import DeepseekV3ForCausalLM + if isinstance(model, DeepseekV3ForCausalLM) or isinstance(model, Qwen2MoeForCausalLM): + for layer in model.model.layers: + mlp = layer.mlp + param_dict = dict(mlp.named_parameters()) + for name, param in param_dict.items(): + if "w13_weight" in name or "w2_weight" in name: + param.weight_loader = mlp.experts.weight_loader From 880184175d9e51d8c9a51bfa7fbd7886728053ad Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Sun, 27 Apr 2025 20:30:07 -0700 Subject: [PATCH 14/19] fix --- verl/models/mcore/config_converter.py | 6 +++-- verl/models/mcore/model_initializer.py | 31 ++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py index 9602df1c0fb..967d8510f20 100644 --- a/verl/models/mcore/config_converter.py +++ b/verl/models/mcore/config_converter.py @@ -150,10 +150,11 @@ def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype) - mla_rope_config = { "beta_fast": 32, "beta_slow": 1, - "factor": 40, + "factor": 1, "mscale": 1.0, "mscale_all_dim": 1.0, "original_max_position_embeddings": 4096, + "type": "rope", } if "rope_scaling" in hf_config and hf_config.rope_scaling is not None: mla_rope_config.update(hf_config.rope_scaling) @@ -177,7 +178,7 @@ def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype) - variable_seq_lengths=True, masked_softmax_fusion=True, # attention_backend=AttnBackend.flash, - attention_backend=AttnBackend.unfused, + attention_backend=AttnBackend.fused, bf16=dtype is torch.bfloat16, layernorm_epsilon=hf_config.rms_norm_eps, ffn_hidden_size=hf_config.intermediate_size, @@ -218,6 +219,7 @@ def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype) - v_head_dim=hf_config.v_head_dim, rotary_base=hf_config.rope_theta, rotary_scaling_factor=mla_rope_config["factor"], + rope_type=mla_rope_config["type"], mscale=mla_rope_config["mscale"], mscale_all_dim=mla_rope_config["mscale_all_dim"], max_position_embeddings=mla_rope_config["original_max_position_embeddings"], diff --git a/verl/models/mcore/model_initializer.py b/verl/models/mcore/model_initializer.py index 0be8c9eb7ac..e83ce4960a4 100644 --- a/verl/models/mcore/model_initializer.py +++ b/verl/models/mcore/model_initializer.py @@ -129,11 +129,38 @@ def init_mcore_model_dpskv3( post_process=None, 
share_embeddings_and_output_weights=False, value=False, + freeze_moe_router=True, **extra_kwargs, ): - return init_mcore_model_dense( - tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, value, **extra_kwargs + from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec + from megatron.core.models.gpt.gpt_model import GPTModel + + use_te = True + if freeze_moe_router: + tfconfig.moe_router_load_balancing_type = "none" + + assert tfconfig.normalization == "RMSNorm", "only RMSNorm is supported for now" + transformer_layer_spec = get_gpt_decoder_block_spec(tfconfig, use_transformer_engine=use_te) + model = GPTModel( + config=tfconfig, + transformer_layer_spec=transformer_layer_spec, + vocab_size=hf_config.vocab_size, + max_sequence_length=hf_config.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + share_embeddings_and_output_weights=share_embeddings_and_output_weights, + position_embedding_type="rope", + rotary_base=hf_config.rope_theta, ) + if freeze_moe_router: + for layer in model.decoder.layers: + if hasattr(layer.mlp, "router"): + layer.mlp.router.weight.requires_grad = False + + if post_process and value: + from verl.models.llama.megatron.layers.parallel_linear import LinearForLastLayer + model.output_layer = LinearForLastLayer(input_size=tfconfig.hidden_size, output_size=1, config=tfconfig) + return model def init_mcore_model_qwen2_5_vl( From e5d6ca0f4acf75c6e2a084dc33f841d18487721d Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Sun, 27 Apr 2025 20:34:34 -0700 Subject: [PATCH 15/19] typo --- scripts/converter_hf_to_mcore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/converter_hf_to_mcore.py b/scripts/converter_hf_to_mcore.py index f917a9c1d58..77bd981249a 100644 --- a/scripts/converter_hf_to_mcore.py +++ b/scripts/converter_hf_to_mcore.py @@ -97,7 +97,7 @@ def convert_checkpoint_from_transformers_to_megatron(hf_model, model, hf_config, @torch.no_grad() def convert_checkpoint_from_transformers_to_megatron_dpskv3(hf_model, model, hf_config, tfconfig): - warnings.warn("MPT model is not supported yet") + warnings.warn("MTP model is not supported yet") def safe_copy( src_tensor: torch.Tensor, dst_tensor: torch.Tensor, From ae550a889a3a60d3ccc65304f035253b35cf4021 Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Mon, 28 Apr 2025 05:38:21 -0700 Subject: [PATCH 16/19] add scripts --- .../run_moonlight16b_a3b_gsm8k_megatron.sh | 81 +++++++++++++++++++ verl/models/mcore/config_converter.py | 2 +- 2 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh diff --git a/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh b/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh new file mode 100644 index 00000000000..efd788f0eec --- /dev/null +++ b/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh @@ -0,0 +1,81 @@ +set -x + +# If you are using vllm<=0.6.3, you might need to set the following environment variable to avoid bugs: +# export VLLM_ATTENTION_BACKEND=XFORMERS +export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping + + +# 0. download the model +huggingface-cli download moonshotai/Moonlight-16B-A3B-Instruct + +# 1. 
convert the model to mcore format +# change the HF_MODEL_PATH and DIST_CKPT_PATH to your own path +HF_MODEL_PATH=/data/models/moonshotai/Moonlight-16B-A3B-Instruct +DIST_CKPT_PATH=/data/mcore_ckpt/Moonlight-16B-A3B-Instruct +python scripts/converter_hf_to_mcore.py --hf_model_path $HF_MODEL_PATH --output_path $DIST_CKPT_PATH + + +# 2. run the script +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +train_files=$gsm8k_train_path +test_files=$gsm8k_test_path + +NODES=4 +PP=2 +TP=4 +CP=1 +VLLM_TP=4 + +# RAY_ADDRESS='auto' ray job submit --working-dir . -- +python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=512 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + +data.trust_remote_code=True \ + actor_rollout_ref.model.path=$LLM \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \ + critic.optim.lr=1e-5 \ + critic.model.path=$LLM \ + critic.model.enable_gradient_checkpointing=False \ + critic.ppo_micro_batch_size_per_gpu=4 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger=['console','wandb'] \ + trainer.project_name='verl_megatron_gsm8k_examples' \ + trainer.experiment_name='moonlight_freeze_moe_router' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=$NODES \ + trainer.save_freq=-1 \ + trainer.test_freq=5 \ + +actor_rollout_ref.model.trust_remote_code=True \ + +critic.model.trust_remote_code=True \ + +actor_rollout_ref.megatron.extra.num_layers_in_last_pipeline_stage=13 \ + +critic.megatron.extra.num_layers_in_last_pipeline_stage=13 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=$VLLM_TP \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP \ + critic.megatron.pipeline_model_parallel_size=$PP \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP \ + critic.megatron.tensor_model_parallel_size=$TP \ + actor_rollout_ref.actor.megatron.use_dist_checkpointing=True \ + actor_rollout_ref.ref.megatron.use_dist_checkpointing=True \ + critic.megatron.use_dist_checkpointing=True \ + actor_rollout_ref.actor.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ + actor_rollout_ref.ref.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ + critic.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ + trainer.val_before_train=False \ + trainer.total_epochs=100 $@ + \ No newline at end of file diff --git a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py index 1ed2ed6f995..84bd1f30520 100644 --- a/verl/models/mcore/config_converter.py +++ b/verl/models/mcore/config_converter.py @@ -138,7 +138,7 @@ def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype) - "mscale": 1.0, "mscale_all_dim": 1.0, "original_max_position_embeddings": 4096, - "type": "rope", + # "type": "rope", } if "rope_scaling" in hf_config and hf_config.rope_scaling is not None: 
mla_rope_config.update(hf_config.rope_scaling) From dce2c402fd4f7a3286375659b700b9f9cad9052b Mon Sep 17 00:00:00 2001 From: spacegoing Date: Fri, 23 May 2025 03:20:17 +0000 Subject: [PATCH 17/19] [Fix] config_converter signature --- verl/models/mcore/config_converter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py index a926ef70a2a..f6bb361c133 100644 --- a/verl/models/mcore/config_converter.py +++ b/verl/models/mcore/config_converter.py @@ -191,6 +191,8 @@ def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype, * moe_layer_freq[i] = 0 base_config = _get_base_transformer_config( + hf_config=hf_config, + dtype=dtype, activation_func=F.silu, use_cpu_initialization=False, add_bias_linear=False, @@ -227,7 +229,7 @@ def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype, * "mscale": 1.0, "mscale_all_dim": 1.0, "original_max_position_embeddings": 4096, - # "type": "rope", + "type": "rope", } if "rope_scaling" in hf_config and hf_config.rope_scaling is not None: mla_rope_config.update(hf_config.rope_scaling) From 3e09ccf41823e8f72132d75c3c29918f1e7db58d Mon Sep 17 00:00:00 2001 From: spacegoing Date: Fri, 23 May 2025 03:20:53 +0000 Subject: [PATCH 18/19] [Fix] restore trust remote code arg --- .../run_moonlight16b_a3b_gsm8k_megatron.sh | 3 +-- scripts/converter_hf_to_mcore.py | 11 +++++++---- verl/single_controller/base/megatron/worker.py | 6 +++--- verl/workers/megatron_workers.py | 11 +++++++---- 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh b/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh index efd788f0eec..361d9e8061e 100644 --- a/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh +++ b/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh @@ -12,7 +12,7 @@ huggingface-cli download moonshotai/Moonlight-16B-A3B-Instruct # change the HF_MODEL_PATH and DIST_CKPT_PATH to your own path HF_MODEL_PATH=/data/models/moonshotai/Moonlight-16B-A3B-Instruct DIST_CKPT_PATH=/data/mcore_ckpt/Moonlight-16B-A3B-Instruct -python scripts/converter_hf_to_mcore.py --hf_model_path $HF_MODEL_PATH --output_path $DIST_CKPT_PATH +python scripts/converter_hf_to_mcore.py --hf_model_path $HF_MODEL_PATH --output_path $DIST_CKPT_PATH --trust_remote_code # 2. 
run the script @@ -78,4 +78,3 @@ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megat critic.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ trainer.val_before_train=False \ trainer.total_epochs=100 $@ - \ No newline at end of file diff --git a/scripts/converter_hf_to_mcore.py b/scripts/converter_hf_to_mcore.py index 27c47c8558c..f5048d24c8f 100644 --- a/scripts/converter_hf_to_mcore.py +++ b/scripts/converter_hf_to_mcore.py @@ -35,6 +35,7 @@ def _init_args(): parser.add_argument("--output_path", type=str, required=True, help="The path for the output mcore model") parser.add_argument("--use_cpu_initialization", action="store_true", help="Whether to use cpu initialization") parser.add_argument("--test", action="store_true", help="Whether to test the conversion") + parser.add_argument("--trust_remote_code", action="store_true", help="Whether to trust remote hf code") args = parser.parse_args() return args @@ -210,7 +211,7 @@ def safe_copy( model.output_layer.weight.copy_(hf_model.lm_head.weight) -def convert_hf_to_mcore(hf_model_path, output_path, use_cpu_initialization=False, test=False): +def convert_hf_to_mcore(hf_model_path, output_path, use_cpu_initialization=False, test=False, trust_remote_code=False): os.makedirs(output_path, exist_ok=True) if len(os.listdir(output_path)) > 0 and not test: print(f"Output path {output_path} is not empty, skipping conversion") @@ -231,7 +232,7 @@ def convert_hf_to_mcore(hf_model_path, output_path, use_cpu_initialization=False model_parallel_cuda_manual_seed(0) # init hf config - hf_config = AutoConfig.from_pretrained(hf_model_path) + hf_config = AutoConfig.from_pretrained(hf_model_path, trust_remote_code=trust_remote_code) print(hf_config, flush=True) cfg = Config() @@ -265,7 +266,9 @@ def megatron_model_provider(pre_process, post_process): warnings.simplefilter("ignore") # init hf model - hf_model = AutoModelForCausalLM.from_pretrained(hf_model_path, torch_dtype=torch.bfloat16) + hf_model = AutoModelForCausalLM.from_pretrained(hf_model_path, + torch_dtype=torch.bfloat16, + trust_remote_code=trust_remote_code) hf_state_dict = hf_model.state_dict() # load hf state dict to megatron model @@ -299,4 +302,4 @@ def megatron_model_provider(pre_process, post_process): if __name__ == "__main__": args = _init_args() - convert_hf_to_mcore(args.hf_model_path, args.output_path, args.use_cpu_initialization, args.test) + convert_hf_to_mcore(args.hf_model_path, args.output_path, args.use_cpu_initialization, args.test, args.trust_remote_code) diff --git a/verl/single_controller/base/megatron/worker.py b/verl/single_controller/base/megatron/worker.py index 7615367e9fe..251e9585a0b 100644 --- a/verl/single_controller/base/megatron/worker.py +++ b/verl/single_controller/base/megatron/worker.py @@ -39,7 +39,7 @@ def get_megatron_rank_info(self): info = DistRankInfo(tp_rank=tp_rank, dp_rank=dp_rank, pp_rank=pp_rank, cp_rank=cp_rank) return info - def _init_hf_config_and_tf_config(self, model_path, dtype, override_model_config, override_transformer_config): + def _init_hf_config_and_tf_config(self, model_path, dtype, override_model_config, override_transformer_config, trust_remote_code=False): from transformers import AutoConfig from verl.models.mcore import hf_to_mcore_config @@ -49,10 +49,10 @@ def _init_hf_config_and_tf_config(self, model_path, dtype, override_model_config # Step 1: initialize the tokenizer self.local_path = copy_to_local(model_path) - self.tokenizer = hf_tokenizer(self.local_path) + self.tokenizer = 
hf_tokenizer(self.local_path, trust_remote_code=trust_remote_code) # Step 2: get the hf - hf_config = AutoConfig.from_pretrained(self.local_path) + hf_config = AutoConfig.from_pretrained(self.local_path, trust_remote_code=trust_remote_code) # Step 3: override the hf config override_config_kwargs = { diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py index 01587c43dec..1c0af9cca00 100644 --- a/verl/workers/megatron_workers.py +++ b/verl/workers/megatron_workers.py @@ -142,8 +142,8 @@ def _build_model_optimizer(self, model_path, optim_config, override_model_config from verl.utils.megatron.optimizer import get_megatron_optimizer from verl.utils.megatron_utils import get_model, init_megatron_optim_config from verl.utils.model import get_generation_config, print_model_size - - self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config, override_transformer_config) + trust_remote_code=self.config.model.get("trust_remote_code", False) + self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config, override_transformer_config, trust_remote_code=trust_remote_code) self.generation_config = get_generation_config(self.local_path) def megatron_actor_model_provider(pre_process, post_process): @@ -237,6 +237,7 @@ def _build_rollout(self, trust_remote_code=False): tokenizer=self.tokenizer, model_hf_config=self.actor_model_config, device_mesh=rollout_device_mesh, + trust_remote_code=trust_remote_code, ) log_gpu_memory_usage("After building vllm rollout", logger=logger) @@ -560,7 +561,8 @@ def _build_critic_model_optimizer(self, model_path, optim_config, override_model from verl.utils.megatron_utils import get_model, init_megatron_optim_config from verl.utils.model import print_model_size - self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config, override_transformer_config) + trust_remote_code=self.config.model.get("trust_remote_code", False) + self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config, override_transformer_config, trust_remote_code=trust_remote_code) def megatron_critic_model_provider(pre_process, post_process): from verl.models.mcore import init_mcore_model @@ -752,7 +754,8 @@ def _build_rm_model(self, model_path, override_model_config, override_transforme from verl.utils.megatron_utils import get_model - self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config, override_transformer_config) + trust_remote_code=self.config.model.get("trust_remote_code", False) + self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config, override_transformer_config, trust_remote_code=trust_remote_code) def megatron_rm_model_provider(pre_process, post_process): from verl.models.mcore import init_mcore_model From b1df278d64545c999ff9afe20c01b7e077b13497 Mon Sep 17 00:00:00 2001 From: spacegoing Date: Fri, 23 May 2025 09:33:32 +0000 Subject: [PATCH 19/19] [Fix] adapt to use base class in config_converter --- verl/models/mcore/config_converter.py | 3 +++ verl/models/mcore/registry.py | 3 ++- verl/workers/megatron_workers.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py index f6bb361c133..e3fccb57683 100644 --- a/verl/models/mcore/config_converter.py +++ b/verl/models/mcore/config_converter.py @@ -222,6 +222,9 @@ def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype, * ) base_config_dict = asdict(base_config) + # transformer 
config default multi_latent_attention = False + base_config_dict.update({"multi_latent_attention": True}) + mla_rope_config = { "beta_fast": 32, "beta_slow": 1, diff --git a/verl/models/mcore/registry.py b/verl/models/mcore/registry.py index c32b78dd8dd..e20670774df 100644 --- a/verl/models/mcore/registry.py +++ b/verl/models/mcore/registry.py @@ -85,7 +85,7 @@ class SupportedModel(Enum): SupportedModel.QWEN2: DenseModel, SupportedModel.QWEN2_MOE: Qwen2MoEModel, SupportedModel.MIXTRAL: MixtralModel, - SupportedModel.DEEPSEEK_V3: DenseModel, + SupportedModel.DEEPSEEK_V3: Dpskv3Model, SupportedModel.QWEN2_5_VL: Qwen25VLModel, SupportedModel.LLAMA4: DenseModel, SupportedModel.QWEN3: DenseModel, @@ -113,6 +113,7 @@ class SupportedModel(Enum): SupportedModel.MIXTRAL: McoreToHFWeightConverterMixtral, SupportedModel.QWEN3: McoreToHFWeightConverterDense, SupportedModel.QWEN3_MOE: McoreToHFWeightConverterQwen3Moe, + SupportedModel.DEEPSEEK_V3: McoreToHFWeightConverterDpskv3, } diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py index 1c0af9cca00..2b7ea092661 100644 --- a/verl/workers/megatron_workers.py +++ b/verl/workers/megatron_workers.py @@ -142,7 +142,7 @@ def _build_model_optimizer(self, model_path, optim_config, override_model_config from verl.utils.megatron.optimizer import get_megatron_optimizer from verl.utils.megatron_utils import get_model, init_megatron_optim_config from verl.utils.model import get_generation_config, print_model_size - trust_remote_code=self.config.model.get("trust_remote_code", False) + trust_remote_code = self.config.model.get("trust_remote_code", False) self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config, override_transformer_config, trust_remote_code=trust_remote_code) self.generation_config = get_generation_config(self.local_path)
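Note on the fused MLP weights that appear throughout the DeepseekV3/Moonlight conversion in this series: Megatron's gated-SiLU MLP keeps gate_proj and up_proj in a single linear_fc1 tensor, which is why convert_checkpoint_from_transformers_to_megatron_dpskv3 concatenates the two HF tensors and why McoreToHFWeightConverterDpskv3 maps one mcore name back to two HF names. Below is a minimal PyTorch sketch of that round trip with made-up shapes (not Moonlight's real dimensions):

    import torch

    # Illustrative sizes only; real values come from the HF DeepseekV3 config.
    hidden_size, ffn_hidden_size = 16, 64
    gate_proj = torch.randn(ffn_hidden_size, hidden_size)  # hf: mlp.gate_proj.weight
    up_proj = torch.randn(ffn_hidden_size, hidden_size)    # hf: mlp.up_proj.weight

    # HF -> mcore: concatenate gate and up along dim 0 into the fused linear_fc1 weight.
    linear_fc1 = torch.cat([gate_proj, up_proj], dim=0)    # shape (2 * ffn_hidden_size, hidden_size)

    # mcore -> HF: split the fused tensor back into its two halves.
    gate_back, up_back = torch.chunk(linear_fc1, 2, dim=0)
    assert torch.equal(gate_back, gate_proj) and torch.equal(up_back, up_proj)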
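The megatron.extra passthrough added to _init_hf_config_and_tf_config copies arbitrary keys from the worker's megatron.extra config onto the transformer config via setattr, which appears to be how the Moonlight example script's +actor_rollout_ref.megatron.extra.num_layers_in_last_pipeline_stage=13 override reaches the config. A minimal sketch of that mechanism, using a SimpleNamespace as a stand-in for the real mcore TransformerConfig and a hand-built dict in place of the Hydra/OmegaConf worker config:

    from types import SimpleNamespace

    # Stand-in for the TransformerConfig returned by hf_to_mcore_config().
    tf_config = SimpleNamespace(num_layers=27, pipeline_model_parallel_size=2)

    # Fragment of the worker config as produced by an override such as
    #   +actor_rollout_ref.megatron.extra.num_layers_in_last_pipeline_stage=13
    megatron_cfg = {"extra": {"num_layers_in_last_pipeline_stage": 13}}

    # Same loop as inside add_optimization_config_to_tf_config: every key under
    # megatron.extra lands on the transformer config unchanged.
    for k, v in megatron_cfg.get("extra", {}).items():
        setattr(tf_config, k, v)

    assert tf_config.num_layers_in_last_pipeline_stage == 13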
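The grouped-GEMM expert weights are the one place in McoreToHFWeightConverterDpskv3 where the expert index is parsed out of the parameter name itself (mcore stores per-expert tensors as linear_fc1.weight0, weight1, ... while HF keys them as experts.0, experts.1, ...). A condensed restatement of just that naming rule as a standalone helper (dpskv3_expert_fc1_to_hf is an illustrative name, not a function defined in the patch):

    def dpskv3_expert_fc1_to_hf(mcore_name: str) -> list:
        """Map e.g. 'decoder.layers.1.mlp.experts.linear_fc1.weight0' to its HF names."""
        layer_number = mcore_name.split(".")[2]       # '1'
        expert_id = mcore_name.split("weight")[-1]    # '0'
        prefix = f"model.layers.{layer_number}.mlp.experts.{expert_id}"
        # linear_fc1 is the fused gate/up tensor, so one mcore name maps to two HF names.
        return [f"{prefix}.gate_proj.weight", f"{prefix}.up_proj.weight"]

    assert dpskv3_expert_fc1_to_hf("decoder.layers.1.mlp.experts.linear_fc1.weight0") == [
        "model.layers.1.mlp.experts.0.gate_proj.weight",
        "model.layers.1.mlp.experts.0.up_proj.weight",
    ]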