From 6155a65fa275539ac1ff8299e493aa03254fba8b Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Sun, 13 Apr 2025 04:26:03 -0700 Subject: [PATCH 01/19] use mcore config_converter and model_initializer for more types of models --- verl/models/mcore/__init__.py | 5 +- verl/models/mcore/config_converter.py | 84 ++++++++++++++++++++++++ verl/models/mcore/model_initializer.py | 88 ++++++++++++++++++++++++++ verl/models/mcore/registry.py | 65 +++++++++++++++++++ verl/workers/megatron_workers.py | 28 ++++---- 5 files changed, 256 insertions(+), 14 deletions(-) create mode 100644 verl/models/mcore/config_converter.py create mode 100644 verl/models/mcore/model_initializer.py create mode 100644 verl/models/mcore/registry.py diff --git a/verl/models/mcore/__init__.py b/verl/models/mcore/__init__.py index e4e2a3861a8..fbc26864c92 100644 --- a/verl/models/mcore/__init__.py +++ b/verl/models/mcore/__init__.py @@ -13,4 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .gpt_model import gptmodel_forward \ No newline at end of file +from .gpt_model import gptmodel_forward +from .registry import init_mcore_model, hf_to_mcore_config + +__all__ = ['init_mcore_model', 'hf_to_mcore_config', 'gptmodel_forward'] \ No newline at end of file diff --git a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py new file mode 100644 index 00000000000..d8359ed2284 --- /dev/null +++ b/verl/models/mcore/config_converter.py @@ -0,0 +1,84 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# convert huggingface config to mcore transformer config + +from transformers import PretrainedConfig +from megatron.core.transformer import TransformerConfig +import torch +import torch.nn.functional as F +from megatron.core.enums import AttnBackend + + +def hf_to_mcore_config_dense(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: + # for LlamaForCausalLM or Qwen2ForCausalLM + from megatron.core import parallel_state as mpu + if "Qwen2ForCausalLM" in hf_config.architectures: + qkv_bias = True + else: + qkv_bias = getattr(hf_config, 'attention_bias', False) + overlap_p2p_comm = mpu.get_virtual_pipeline_model_parallel_world_size( + ) is not None and mpu.get_virtual_pipeline_model_parallel_world_size() > 1 + batch_p2p_comm = False + transformer_config = TransformerConfig( + num_layers=hf_config.num_hidden_layers, + hidden_size=hf_config.hidden_size, + num_attention_heads=hf_config.num_attention_heads, + num_query_groups=hf_config.num_key_value_heads, + ffn_hidden_size=hf_config.intermediate_size, + activation_func=F.silu, + normalization='RMSNorm', + gated_linear_unit=True, + use_cpu_initialization=True, + add_bias_linear=False, + tensor_model_parallel_size=mpu.get_tensor_model_parallel_world_size(), + pipeline_model_parallel_size=mpu.get_pipeline_model_parallel_world_size(), + virtual_pipeline_model_parallel_size=mpu.get_virtual_pipeline_model_parallel_world_size(), + context_parallel_size=mpu.get_context_parallel_world_size(), + overlap_p2p_comm=overlap_p2p_comm, + batch_p2p_comm=batch_p2p_comm, + pipeline_dtype=dtype, + params_dtype=dtype, + sequence_parallel=mpu.get_tensor_model_parallel_world_size() > 1, + variable_seq_lengths=True, + masked_softmax_fusion=True, + moe_token_dispatcher_type="alltoall", + attention_dropout=hf_config.attention_dropout, + hidden_dropout=getattr(hf_config, 'hidden_dropout', 0.0), + add_qkv_bias=qkv_bias, + attention_backend=AttnBackend.flash, + bf16=dtype is torch.bfloat16) + + return transformer_config + + +def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: + # Qwen2MoeForCausalLM + raise NotImplementedError("Qwen2MoeForCausalLM is not supported yet") + + +def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: + # DeepseekV3ForCausalLM + raise NotImplementedError("DeepseekV3ForCausalLM is not supported yet") + + +def hf_to_mcore_config_qwen2_5_vl(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: + # Qwen2_5_VLForConditionalGeneration + raise NotImplementedError("Qwen2_5_VLForConditionalGeneration is not supported yet") + + +def hf_to_mcore_config_llama4(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: + # Llama4ForConditionalGeneration + raise NotImplementedError("Llama4ForConditionalGeneration is not supported yet") diff --git a/verl/models/mcore/model_initializer.py b/verl/models/mcore/model_initializer.py new file mode 100644 index 00000000000..4ae2fe4e5d4 --- /dev/null +++ b/verl/models/mcore/model_initializer.py @@ -0,0 +1,88 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# use mcore transformer config to initialize the model + + +def init_mcore_model_dense(tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False): + # for LlamaForCausalLM, Qwen2ForCausalLM + from megatron.core.models.gpt.gpt_model import GPTModel + from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec + use_te = True + assert tfconfig.normalization == "RMSNorm", 'only RMSNorm is supported for now' + transformer_layer_spec = get_gpt_decoder_block_spec(tfconfig, use_transformer_engine=use_te) + rope_scaling_args = {} + if hf_config.rope_scaling is not None: + assert hf_config.rope_scaling['type'] == 'linear', "only linear scaling is supported for now" + rope_scaling_args['seq_len_interpolation_factor'] = hf_config.rope_scaling['factor'] + model = GPTModel(config=tfconfig, + transformer_layer_spec=transformer_layer_spec, + vocab_size=hf_config.vocab_size, + max_sequence_length=hf_config.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + share_embeddings_and_output_weights=share_embeddings_and_output_weights, + position_embedding_type='rope', + rotary_base=hf_config.rope_theta, + **rope_scaling_args) + if post_process and value: + from verl.models.llama.megatron.layers.parallel_linear import LinearForLastLayer + model.output_layer = LinearForLastLayer(input_size=tfconfig.hidden_size, output_size=1, config=tfconfig) + return model + + +def init_mcore_model_qwen2_moe(tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False): + return init_mcore_model_dense(tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, + value) + + +def init_mcore_model_llama4(tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False): + return init_mcore_model_dense(tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, + value) + + +def init_mcore_model_dpskv3(tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False): + return init_mcore_model_dense(tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, + value) + + +def init_mcore_model_qwen2_5_vl(tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False): + # Qwen2_5_VLForConditionalGeneration + raise NotImplementedError("VLM is not supported yet") diff --git a/verl/models/mcore/registry.py b/verl/models/mcore/registry.py new file mode 100644 index 00000000000..a2a92924b00 --- /dev/null +++ b/verl/models/mcore/registry.py @@ -0,0 +1,65 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .config_converter import hf_to_mcore_config_dense, hf_to_mcore_config_qwen2moe, hf_to_mcore_config_dpskv3, hf_to_mcore_config_qwen2_5_vl, hf_to_mcore_config_llama4 +from .config_converter import PretrainedConfig, TransformerConfig +import torch +import torch.nn as nn + + +def hf_to_mcore_config(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: + MODEL_CONFIG_CONVERTER_REGISTRY = { + "LlamaForCausalLM": hf_to_mcore_config_dense, + "Qwen2ForCausalLM": hf_to_mcore_config_dense, + "Qwen2MoeForCausalLM": hf_to_mcore_config_qwen2moe, + "DeepseekV3ForCausalLM": hf_to_mcore_config_dpskv3, + "Qwen2_5_VLForConditionalGeneration": hf_to_mcore_config_qwen2_5_vl, + "Llama4ForConditionalGeneration": hf_to_mcore_config_llama4, + } + assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" + arch = hf_config.architectures[0] + if arch not in MODEL_CONFIG_CONVERTER_REGISTRY: + raise ValueError(f"Model architecture {arch} converter is not supported for now. " + f"Supported architectures: {MODEL_CONFIG_CONVERTER_REGISTRY.keys()}") + return MODEL_CONFIG_CONVERTER_REGISTRY[arch](hf_config, dtype) + + +from .model_initializer import init_mcore_model_dense, init_mcore_model_qwen2_moe, init_mcore_model_dpskv3, init_mcore_model_qwen2_5_vl, init_mcore_model_llama4 + + +def init_mcore_model( + tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False, + **extra_kwargs # may be used for vlm +) -> nn.Module: + MODEL_INITIALIZER_REGISTRY = { + "LlamaForCausalLM": init_mcore_model_dense, + "Qwen2ForCausalLM": init_mcore_model_dense, + "Qwen2MoeForCausalLM": init_mcore_model_qwen2_moe, + "DeepseekV3ForCausalLM": init_mcore_model_dpskv3, + "Qwen2_5_VLForConditionalGeneration": init_mcore_model_qwen2_5_vl, + "Llama4ForConditionalGeneration": init_mcore_model_llama4, + } + assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" + arch = hf_config.architectures[0] + if arch not in MODEL_INITIALIZER_REGISTRY: + raise ValueError(f"Model architecture {arch} initializer is not supported for now. 
" + f"Supported architectures: {MODEL_INITIALIZER_REGISTRY.keys()}") + return MODEL_INITIALIZER_REGISTRY[arch](tfconfig, hf_config, pre_process, post_process, + share_embeddings_and_output_weights, value, **extra_kwargs) diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py index ffd7404fa0d..0b441372235 100644 --- a/verl/workers/megatron_workers.py +++ b/verl/workers/megatron_workers.py @@ -143,7 +143,8 @@ def _build_model_optimizer(self, from verl.utils.megatron.optimizer import get_megatron_optimizer from megatron.core.models.gpt.gpt_model import ModelType from verl.utils.model import print_model_size, update_model_config, get_generation_config - from verl.utils.megatron_utils import get_model, init_megatron_optim_config, convert_config + from verl.utils.megatron_utils import get_model, init_megatron_optim_config + from verl.models.mcore import hf_to_mcore_config from transformers import AutoConfig # Step 1: initialize the tokenizer @@ -169,7 +170,7 @@ def _build_model_optimizer(self, self.share_embeddings_and_output_weights = getattr(actor_model_config, "tie_word_embeddings", False) self.architectures = getattr(actor_model_config, "architectures", None) - tfconfig = convert_config(actor_model_config, megatron_config) + tfconfig = hf_to_mcore_config(actor_model_config, megatron_config.dtype) if enable_gradient_checkpointing: gradient_checkpointing_cfg = dict(self.config.model.get('gradient_checkpointing_kwargs', dict())) tfconfig.recompute_method = gradient_checkpointing_cfg['activations_checkpoint_method'] @@ -179,8 +180,8 @@ def _build_model_optimizer(self, self.hf_config = actor_model_config def megatron_actor_model_provider(pre_process, post_process): - from verl.utils.model import get_parallel_gptmodel_from_config - parallel_model = get_parallel_gptmodel_from_config( + from verl.models.mcore import init_mcore_model + parallel_model = init_mcore_model( tfconfig, actor_model_config, pre_process, @@ -531,8 +532,9 @@ def _build_critic_model_optimizer(self, from megatron.core.models.gpt.gpt_model import ModelType from verl.utils.model import print_model_size, update_model_config from verl.utils.megatron.optimizer import get_megatron_optimizer - from verl.utils.megatron_utils import get_model, init_megatron_optim_config, convert_config + from verl.utils.megatron_utils import get_model, init_megatron_optim_config from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig + from verl.models.mcore import hf_to_mcore_config # Step 1: initialize the tokenizer local_path = copy_to_local(model_path) @@ -552,7 +554,7 @@ def _build_critic_model_optimizer(self, self.architectures = getattr(critic_model_config, "architectures", None) if self.rank == 0: print(f'Model config after override: {critic_model_config}') - tfconfig = convert_config(critic_model_config, megatron_config) + tfconfig = hf_to_mcore_config(critic_model_config, megatron_config.dtype) if enable_gradient_checkpointing: gradient_checkpointing_cfg = dict(self.config.model.get('gradient_checkpointing_kwargs', dict())) tfconfig.recompute_method = gradient_checkpointing_cfg['activations_checkpoint_method'] @@ -562,13 +564,13 @@ def _build_critic_model_optimizer(self, self.hf_config = critic_model_config def megatron_critic_model_provider(pre_process, post_process): - from verl.utils.model import get_parallel_gptmodel_from_config - parallel_model = get_parallel_gptmodel_from_config(tfconfig, - critic_model_config, - pre_process, - post_process, - share_embeddings_and_output_weights=False, - 
value=True) + from verl.models.mcore import init_mcore_model + parallel_model = init_mcore_model(tfconfig, + critic_model_config, + pre_process, + post_process, + share_embeddings_and_output_weights=False, + value=True) parallel_model.cuda() return parallel_model From 8869168babef06257f29a7bfd3a919df99356672 Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Sun, 13 Apr 2025 04:40:45 -0700 Subject: [PATCH 02/19] remove megatron_config from actor/critic --- verl/utils/megatron_utils.py | 5 +++ verl/workers/actor/megatron_actor.py | 30 +++++-------- verl/workers/critic/megatron_critic.py | 11 ++--- verl/workers/megatron_workers.py | 60 +++++++++++++------------- 4 files changed, 50 insertions(+), 56 deletions(-) diff --git a/verl/utils/megatron_utils.py b/verl/utils/megatron_utils.py index e84f154622a..ffeb114f6d2 100644 --- a/verl/utils/megatron_utils.py +++ b/verl/utils/megatron_utils.py @@ -217,6 +217,11 @@ def mcore_model_parallel_config( sequence_parallel: bool, params_dtype: torch.dtype, ) -> ModelParallelConfig: + # WARNING: Code should not reach this point. This function is deprecated and will be removed. + # Please use hf_to_mcore_config_dense() from verl.models.mcore.config_converter instead. + warnings.warn("Code should not reach this point. This function is deprecated and will be removed. " + "Please use hf_to_mcore_config_dense() from verl.models.mcore.config_converter instead.", + DeprecationWarning, stacklevel=2) return ModelParallelConfig( tensor_model_parallel_size=mpu.get_tensor_model_parallel_world_size(), pipeline_model_parallel_size=mpu.get_pipeline_model_parallel_world_size(), diff --git a/verl/workers/actor/megatron_actor.py b/verl/workers/actor/megatron_actor.py index 2d0adc807f7..9fd73fff137 100644 --- a/verl/workers/actor/megatron_actor.py +++ b/verl/workers/actor/megatron_actor.py @@ -52,7 +52,7 @@ class MegatronPPOActor(BasePPOActor): - def __init__(self, config, model_config, megatron_config: ModelParallelConfig, actor_module: nn.ModuleList, + def __init__(self, config, model_config, hf_config, tf_config, actor_module: nn.ModuleList, actor_optimizer: DistributedOptimizer, actor_optimizer_config: OptimizerConfig): """MeagtronPPOActor class. This class implements the simple PPO logics when the model is built with Megatron. @@ -72,13 +72,8 @@ def __init__(self, config, model_config, megatron_config: ModelParallelConfig, a ``entropy_coeff``: entropy coefficient of the PPO loss. See https://arxiv.org/abs/1707.06347. model_config (OmegaConf): model configuration. It must contains ``model_config.vocab_size`` and ``model_config.hidden_size`` - megatron_config (OmegaConf): megatron configuration. It must contains - - ``sequence_parallel_enabled``: whether the sequence parallel is enabled. - - ``param_dtype``: the dtype of the parameters. - - ``virtual_pipeline_model_parallel_size``: virtual pipeline model parallel size. a.k.a number of chunks in each pp stage. + hf_config (PretrainedConfig): huggingface config + tf_config (TransformerConfig): mcore transformer config actor_module (nn.ModuleList): actor module is a ModuleList that contains a list of nn.Module in this pp stage. each nn.Module in this rank holds a vpp module chunk. See https://arxiv.org/pdf/2104.04473.pdf for more details. 
The actor module has some constraints to follow in order to use the updating logics implemented here @@ -93,13 +88,6 @@ def __init__(self, config, model_config, megatron_config: ModelParallelConfig, a actor_optimizer (DistributedOptimizer): currently, we only support DistributedOptimizer in Megatron. It implements zero1 optimizer that shards the optimizer state across dp ranks. - >>> def megatron_actor_model_provider(pre_process, post_process): - >>> vpp_rank = mpu.get_virtual_pipeline_model_parallel_rank() - >>> parallel_model = ParallelMistralForCausalLMRmPadPP(config=actor_model_config, - >>> megatron_config=megatron_config, - >>> pre_process=pre_process, - >>> post_process=post_process).cuda() - >>> return parallel_model >>> from megatron.training import get_model >>> from megatron.optimizer import get_megatron_optimizer >>> actor_module = get_model(megatron_actor_model_provider, wrap_with_ddp=True) @@ -107,14 +95,16 @@ def __init__(self, config, model_config, megatron_config: ModelParallelConfig, a >>> actor_optimizer = get_megatron_optimizer(actor_module) >>> actor = MegatronPPOActor(config=config, >>> model_config=actor_model_config, - >>> megatron_config=megatron_config, + >>> hf_config=hf_config, + >>> tf_config=tf_config, >>> actor_module=actor_module, >>> actor_optimizer=actor_optimizer) """ super().__init__(config) self._validate_config(config) self.model_config = model_config - self.megatron_config = megatron_config + self.hf_config = hf_config + self.tf_config = tf_config self.actor_module = actor_module self.actor_optimizer: DistributedOptimizer = actor_optimizer self.actor_optimizer_config = actor_optimizer_config @@ -124,7 +114,7 @@ def __init__(self, config, model_config, megatron_config: ModelParallelConfig, a 'overlap_dp_param_comm': False, 'overlap_dp_grad_comm': False, 'gradient_accumulation_steps': 1, - 'sequence_parallel': self.megatron_config.sequence_parallel, + 'sequence_parallel': self.tf_config.sequence_parallel, 'DDP_impl': 'local', 'layernorm_allreduce_bucket_threshold': 0, 'pipeline_model_parallel_split_rank': None, @@ -253,7 +243,7 @@ def forward_backward_batch(self, data: DataProto, forward_only=False, post_proce input_shapes = compute_transformers_input_shapes( batches, meta_info={ - 'sequence_parallel': self.megatron_config.sequence_parallel, + 'sequence_parallel': self.tf_config.sequence_parallel, 'hidden_size': self.model_config.hidden_size }) n_micro_batch = len(batches) @@ -334,7 +324,7 @@ def forward_step(batch_iter, model): input_ids, attention_mask, position_ids, - sequence_parallel=self.megatron_config.sequence_parallel) + sequence_parallel=self.tf_config.sequence_parallel) if forward_only: meta_info = None else: diff --git a/verl/workers/critic/megatron_critic.py b/verl/workers/critic/megatron_critic.py index 55076075c5a..af1cbcc4db7 100644 --- a/verl/workers/critic/megatron_critic.py +++ b/verl/workers/critic/megatron_critic.py @@ -42,12 +42,13 @@ class MegatronPPOCritic(BasePPOCritic): - def __init__(self, config, model_config, megatron_config, critic_module: nn.ModuleList, + def __init__(self, config, model_config, hf_config, tf_config, critic_module: nn.ModuleList, critic_optimizer: DistributedOptimizer, critic_optimizer_config: OptimizerConfig): super().__init__(config=config) self._validate_config(config) self.model_config = model_config - self.megatron_config = megatron_config + self.hf_config = hf_config # huggingface config + self.tf_config = tf_config # mcore transformer config self.critic_module = critic_module self.critic_optimizer = 
critic_optimizer @@ -59,7 +60,7 @@ def __init__(self, config, model_config, megatron_config, critic_module: nn.Modu 'overlap_dp_param_comm': False, 'overlap_dp_grad_comm': False, 'gradient_accumulation_steps': 1, - 'sequence_parallel': self.megatron_config.sequence_parallel, + 'sequence_parallel': self.tf_config.sequence_parallel, 'DDP_impl': 'local', 'layernorm_allreduce_bucket_threshold': 0, 'pipeline_model_parallel_split_rank': None, @@ -122,7 +123,7 @@ def forward_backward_batch(self, data: DataProto, forward_only=False): input_shapes = compute_transformers_input_shapes( batches, meta_info={ - 'sequence_parallel': self.megatron_config.sequence_parallel, + 'sequence_parallel': self.tf_config.sequence_parallel, 'hidden_size': self.model_config.hidden_size }) @@ -169,7 +170,7 @@ def forward_step(batch_iter, model): input_ids, attention_mask, position_ids, - sequence_parallel=self.megatron_config.sequence_parallel, + sequence_parallel=self.tf_config.sequence_parallel, value_model=True) return output, partial(loss_func, data=batch, meta_info={}) diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py index 0b441372235..178327c375f 100644 --- a/verl/workers/megatron_workers.py +++ b/verl/workers/megatron_workers.py @@ -136,7 +136,7 @@ def __init__(self, config: DictConfig, role: str): def _build_model_optimizer(self, model_path, - megatron_config: ModelParallelConfig, + dtype, optim_config, override_model_config, enable_gradient_checkpointing=False): @@ -170,19 +170,19 @@ def _build_model_optimizer(self, self.share_embeddings_and_output_weights = getattr(actor_model_config, "tie_word_embeddings", False) self.architectures = getattr(actor_model_config, "architectures", None) - tfconfig = hf_to_mcore_config(actor_model_config, megatron_config.dtype) + tf_config = hf_to_mcore_config(actor_model_config, dtype) if enable_gradient_checkpointing: gradient_checkpointing_cfg = dict(self.config.model.get('gradient_checkpointing_kwargs', dict())) - tfconfig.recompute_method = gradient_checkpointing_cfg['activations_checkpoint_method'] - tfconfig.recompute_granularity = gradient_checkpointing_cfg['activations_checkpoint_granularity'] - tfconfig.recompute_num_layers = gradient_checkpointing_cfg['activations_checkpoint_num_layers'] - print(f'TF config: {tfconfig}') + tf_config.recompute_method = gradient_checkpointing_cfg['activations_checkpoint_method'] + tf_config.recompute_granularity = gradient_checkpointing_cfg['activations_checkpoint_granularity'] + tf_config.recompute_num_layers = gradient_checkpointing_cfg['activations_checkpoint_num_layers'] + print(f'TF config: {tf_config}') self.hf_config = actor_model_config def megatron_actor_model_provider(pre_process, post_process): from verl.models.mcore import init_mcore_model parallel_model = init_mcore_model( - tfconfig, + tf_config, actor_model_config, pre_process, post_process, @@ -214,7 +214,7 @@ def megatron_actor_model_provider(pre_process, post_process): load_megatron_gptmodel_weights(self.config, actor_model_config, actor_module, - params_dtype=megatron_config.params_dtype, + params_dtype=dtype, is_value_model=False) if self.rank == 0: @@ -239,7 +239,7 @@ def megatron_actor_model_provider(pre_process, post_process): load_megatron_gptmodel_weights(self.config, actor_model_config, ref_module, - params_dtype=megatron_config.params_dtype, + params_dtype=dtype, is_value_model=False) log_gpu_memory_usage('After ref module init', logger=logger) return ref_module, actor_model_config @@ -311,10 +311,7 @@ def init_model(self): 
override_model_config = OmegaConf.to_container(self.config.model.get('override_config', OmegaConf.create())) self.param_dtype = torch.bfloat16 - megatron_config = mcore_model_parallel_config(sequence_parallel=self.config.actor.megatron.get( - 'sequence_parallel', True), - params_dtype=PrecisionType.to_dtype(self.param_dtype)) - + self.dtype = PrecisionType.to_dtype(self.param_dtype) if self._is_actor or self._is_rollout: # we need the model for actor and rollout if self._is_actor: @@ -324,7 +321,7 @@ def init_model(self): self.actor_module, self.hybrid_engine, self.actor_optimizer, \ self.actor_model_config, self.actor_optim_config = self._build_model_optimizer( model_path=self.config.model.path, - megatron_config=megatron_config, + dtype=self.dtype, optim_config=optim_config, override_model_config=override_model_config, enable_gradient_checkpointing=self.config.model.get('enable_gradient_checkpointing', False) @@ -333,7 +330,8 @@ def init_model(self): if self._is_actor: self.actor = MegatronPPOActor(config=self.config.actor, model_config=self.actor_model_config, - megatron_config=megatron_config, + hf_config=self.hf_config, + tf_config=self.tf_config, actor_module=self.actor_module, actor_optimizer=self.actor_optimizer, actor_optimizer_config=self.actor_optim_config) @@ -344,13 +342,14 @@ def init_model(self): if self._is_ref: self.ref_module, self.ref_model_config = self._build_model_optimizer( model_path=self.config.model.path, - megatron_config=megatron_config, + dtype=self.dtype, optim_config=None, override_model_config=override_model_config, enable_gradient_checkpointing=self.config.model.get('enable_gradient_checkpointing', False)) self.ref_policy = MegatronPPOActor(config=self.config.ref, model_config=self.ref_model_config, - megatron_config=megatron_config, + hf_config=self.hf_config, + tf_config=self.tf_config, actor_module=self.ref_module, actor_optimizer=None, actor_optimizer_config=None) @@ -525,7 +524,7 @@ def __init__(self, config): def _build_critic_model_optimizer(self, model_path, - megatron_config: ModelParallelConfig, + dtype, optim_config, override_model_config, enable_gradient_checkpointing=False): @@ -554,18 +553,19 @@ def _build_critic_model_optimizer(self, self.architectures = getattr(critic_model_config, "architectures", None) if self.rank == 0: print(f'Model config after override: {critic_model_config}') - tfconfig = hf_to_mcore_config(critic_model_config, megatron_config.dtype) + tf_config = hf_to_mcore_config(critic_model_config, dtype) if enable_gradient_checkpointing: gradient_checkpointing_cfg = dict(self.config.model.get('gradient_checkpointing_kwargs', dict())) - tfconfig.recompute_method = gradient_checkpointing_cfg['activations_checkpoint_method'] - tfconfig.recompute_granularity = gradient_checkpointing_cfg['activations_checkpoint_granularity'] - tfconfig.recompute_num_layers = gradient_checkpointing_cfg['activations_checkpoint_num_layers'] - print(f'Critic TF config: {tfconfig}') + tf_config.recompute_method = gradient_checkpointing_cfg['activations_checkpoint_method'] + tf_config.recompute_granularity = gradient_checkpointing_cfg['activations_checkpoint_granularity'] + tf_config.recompute_num_layers = gradient_checkpointing_cfg['activations_checkpoint_num_layers'] + print(f'Critic TF config: {tf_config}') self.hf_config = critic_model_config + self.tf_config = tf_config def megatron_critic_model_provider(pre_process, post_process): from verl.models.mcore import init_mcore_model - parallel_model = init_mcore_model(tfconfig, + parallel_model = 
init_mcore_model(tf_config, critic_model_config, pre_process, post_process, @@ -593,7 +593,7 @@ def megatron_critic_model_provider(pre_process, post_process): load_megatron_gptmodel_weights(self.config, critic_model_config, critic_module, - params_dtype=megatron_config.params_dtype, + params_dtype=dtype, is_value_model=True) t1 = time.time() if torch.distributed.get_rank() == 0: @@ -619,19 +619,17 @@ def init_model(self): importlib.import_module(self.config.model.external_lib) override_model_config = OmegaConf.to_container(self.config.model.get('override_config', OmegaConf.create())) self.param_dtype = torch.bfloat16 - - megatron_config = mcore_model_parallel_config(sequence_parallel=self.config.megatron.get( - 'sequence_parallel', True), - params_dtype=PrecisionType.to_dtype(self.param_dtype)) + self.dtype = PrecisionType.to_dtype(self.param_dtype) self.critic_module, self.critic_optimizer, self.critic_model_config, critic_optimizer_config = self._build_critic_model_optimizer( model_path=self.config.model.path, - megatron_config=megatron_config, + dtype=self.dtype, optim_config=self.config.optim, override_model_config=override_model_config, enable_gradient_checkpointing=self.config.model.get('enable_gradient_checkpointing', False)) self.critic = MegatronPPOCritic(config=self.config, model_config=self.critic_model_config, - megatron_config=megatron_config, + hf_config=self.hf_config, + tf_config=self.tf_config, critic_module=self.critic_module, critic_optimizer=self.critic_optimizer, critic_optimizer_config=critic_optimizer_config) From 9216811f2a1f24bdff3e1eeb9f3bbfeba322201e Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Sun, 13 Apr 2025 09:00:51 -0700 Subject: [PATCH 03/19] reward model use gptmodel api, clean megatron_worker --- .../single_controller/base/megatron/worker.py | 47 ++++- verl/workers/actor/megatron_actor.py | 3 +- verl/workers/megatron_workers.py | 193 +++++------------- .../reward_model/megatron/reward_model.py | 20 +- 4 files changed, 113 insertions(+), 150 deletions(-) diff --git a/verl/single_controller/base/megatron/worker.py b/verl/single_controller/base/megatron/worker.py index 3adce5cce4a..c6594ee4618 100644 --- a/verl/single_controller/base/megatron/worker.py +++ b/verl/single_controller/base/megatron/worker.py @@ -36,4 +36,49 @@ def get_megatron_rank_info(self): pp_rank = mpu.get_pipeline_model_parallel_rank() cp_rank = mpu.get_context_parallel_rank() info = DistRankInfo(tp_rank=tp_rank, dp_rank=dp_rank, pp_rank=pp_rank, cp_rank=cp_rank) - return info \ No newline at end of file + return info + + def _init_hf_config_and_tf_config(self, + model_path, + dtype, + override_model_config): + from verl.utils.model import print_model_size, update_model_config + from verl.utils.fs import copy_to_local + from verl.utils import hf_tokenizer + from transformers import AutoConfig + from verl.models.mcore import hf_to_mcore_config + + # Step 1: initialize the tokenizer + self.local_path = copy_to_local(model_path) + self.tokenizer = hf_tokenizer(self.local_path) + + # Step 2: get the hf + hf_config = AutoConfig.from_pretrained(self.local_path) + + # Step 3: override the hf config + override_config_kwargs = { + 'bos_token_id': self.tokenizer.bos_token_id, + 'eos_token_id': self.tokenizer.eos_token_id, + 'pad_token_id': self.tokenizer.pad_token_id, + } + override_config_kwargs.update(override_model_config) + self.share_embeddings_and_output_weights = getattr(hf_config, "tie_word_embeddings", False) + update_model_config(hf_config, override_config_kwargs=override_config_kwargs) + 
self.architectures = getattr(hf_config, "architectures", None) + if self.rank == 0: + print(f'Model config after override: {hf_config}') + tf_config = hf_to_mcore_config(hf_config, dtype) + + def add_optimization_config_to_tf_config(tf_config, verl_model_config): + # add optimization config to tf_config, e.g. checkpointing + if verl_model_config.get('enable_gradient_checkpointing', False): + gradient_checkpointing_cfg = dict(verl_model_config.get('gradient_checkpointing_kwargs', dict())) + tf_config.recompute_method = gradient_checkpointing_cfg.get('activations_checkpoint_method', 'full') + tf_config.recompute_granularity = gradient_checkpointing_cfg.get('activations_checkpoint_granularity', 'full') + tf_config.recompute_num_layers = gradient_checkpointing_cfg.get('activations_checkpoint_num_layers', -1) + + add_optimization_config_to_tf_config(tf_config, self.config.model) + + print(f'TF config: {tf_config}') + self.hf_config = hf_config + self.tf_config = tf_config \ No newline at end of file diff --git a/verl/workers/actor/megatron_actor.py b/verl/workers/actor/megatron_actor.py index 9fd73fff137..39fe6aa2d51 100644 --- a/verl/workers/actor/megatron_actor.py +++ b/verl/workers/actor/megatron_actor.py @@ -53,7 +53,7 @@ class MegatronPPOActor(BasePPOActor): def __init__(self, config, model_config, hf_config, tf_config, actor_module: nn.ModuleList, - actor_optimizer: DistributedOptimizer, actor_optimizer_config: OptimizerConfig): + actor_optimizer: DistributedOptimizer): """MeagtronPPOActor class. This class implements the simple PPO logics when the model is built with Megatron. Args: @@ -107,7 +107,6 @@ def __init__(self, config, model_config, hf_config, tf_config, actor_module: nn. self.tf_config = tf_config self.actor_module = actor_module self.actor_optimizer: DistributedOptimizer = actor_optimizer - self.actor_optimizer_config = actor_optimizer_config self.optimizer_step_args = OmegaConf.create({ 'skip_grad': None, diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py index 178327c375f..76572de2026 100644 --- a/verl/workers/megatron_workers.py +++ b/verl/workers/megatron_workers.py @@ -37,7 +37,6 @@ from verl.utils.model import load_megatron_model_weights, load_megatron_gptmodel_weights, load_mcore_dist_weights from verl.utils.flops_counter import FlopsCounter from verl.utils.checkpoint.megatron_checkpoint_manager import MegatronCheckpointManager -from verl.utils.megatron_utils import mcore_model_parallel_config from verl.utils.megatron_utils import offload_megatron_param_and_grad, load_megatron_param_and_grad from verl.utils import hf_tokenizer @@ -136,54 +135,21 @@ def __init__(self, config: DictConfig, role: str): def _build_model_optimizer(self, model_path, - dtype, optim_config, - override_model_config, - enable_gradient_checkpointing=False): + override_model_config): from verl.utils.megatron.optimizer import get_megatron_optimizer from megatron.core.models.gpt.gpt_model import ModelType - from verl.utils.model import print_model_size, update_model_config, get_generation_config + from verl.utils.model import print_model_size, get_generation_config from verl.utils.megatron_utils import get_model, init_megatron_optim_config - from verl.models.mcore import hf_to_mcore_config - from transformers import AutoConfig - # Step 1: initialize the tokenizer - local_path = copy_to_local(model_path) - self.tokenizer = hf_tokenizer(local_path) - - # Step 2: get the actor_model_config - actor_model_config = AutoConfig.from_pretrained(local_path) - - 
self.generation_config = get_generation_config(local_path) - - override_config_kwargs = { - 'bos_token_id': self.tokenizer.bos_token_id, - 'eos_token_id': self.tokenizer.eos_token_id, - 'pad_token_id': self.tokenizer.pad_token_id, - } - override_config_kwargs.update(override_model_config) - update_model_config(actor_model_config, override_config_kwargs=override_config_kwargs) - - if self.rank == 0: - print(f'Model config after override: {actor_model_config}') - - self.share_embeddings_and_output_weights = getattr(actor_model_config, "tie_word_embeddings", False) - self.architectures = getattr(actor_model_config, "architectures", None) - - tf_config = hf_to_mcore_config(actor_model_config, dtype) - if enable_gradient_checkpointing: - gradient_checkpointing_cfg = dict(self.config.model.get('gradient_checkpointing_kwargs', dict())) - tf_config.recompute_method = gradient_checkpointing_cfg['activations_checkpoint_method'] - tf_config.recompute_granularity = gradient_checkpointing_cfg['activations_checkpoint_granularity'] - tf_config.recompute_num_layers = gradient_checkpointing_cfg['activations_checkpoint_num_layers'] - print(f'TF config: {tf_config}') - self.hf_config = actor_model_config + self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config) + self.generation_config = get_generation_config(self.local_path) def megatron_actor_model_provider(pre_process, post_process): from verl.models.mcore import init_mcore_model parallel_model = init_mcore_model( - tf_config, - actor_model_config, + self.tf_config, + self.hf_config, pre_process, post_process, share_embeddings_and_output_weights=self.share_embeddings_and_output_weights, @@ -212,9 +178,9 @@ def megatron_actor_model_provider(pre_process, post_process): is_value_model=False) else: load_megatron_gptmodel_weights(self.config, - actor_model_config, + self.hf_config, actor_module, - params_dtype=dtype, + params_dtype=self.dtype, is_value_model=False) if self.rank == 0: @@ -237,12 +203,12 @@ def megatron_actor_model_provider(pre_process, post_process): is_value_model=False) else: load_megatron_gptmodel_weights(self.config, - actor_model_config, + self.hf_config, ref_module, - params_dtype=dtype, + params_dtype=self.dtype, is_value_model=False) log_gpu_memory_usage('After ref module init', logger=logger) - return ref_module, actor_model_config + return ref_module, self.hf_config # TODO: add more optimizer args into config if self._is_actor: @@ -254,7 +220,7 @@ def megatron_actor_model_provider(pre_process, post_process): log_gpu_memory_usage('After actor optimizer init', logger=logger) - return actor_module, hybrid_engine, actor_optimizer, actor_model_config, optim_config + return actor_module, hybrid_engine, actor_optimizer, self.hf_config, optim_config def _build_rollout(self): if self.config.rollout.name == 'vllm': @@ -319,12 +285,10 @@ def init_model(self): else: optim_config = None self.actor_module, self.hybrid_engine, self.actor_optimizer, \ - self.actor_model_config, self.actor_optim_config = self._build_model_optimizer( + self.actor_model_config, self.actor_hf_config = self._build_model_optimizer( model_path=self.config.model.path, - dtype=self.dtype, optim_config=optim_config, - override_model_config=override_model_config, - enable_gradient_checkpointing=self.config.model.get('enable_gradient_checkpointing', False) + override_model_config=override_model_config ) if self._is_actor: @@ -334,7 +298,7 @@ def init_model(self): tf_config=self.tf_config, actor_module=self.actor_module, 
actor_optimizer=self.actor_optimizer, - actor_optimizer_config=self.actor_optim_config) + ) if self._is_rollout: self.rollout, self.sharding_manager = self._build_rollout() @@ -342,17 +306,14 @@ def init_model(self): if self._is_ref: self.ref_module, self.ref_model_config = self._build_model_optimizer( model_path=self.config.model.path, - dtype=self.dtype, optim_config=None, - override_model_config=override_model_config, - enable_gradient_checkpointing=self.config.model.get('enable_gradient_checkpointing', False)) + override_model_config=override_model_config,) self.ref_policy = MegatronPPOActor(config=self.config.ref, model_config=self.ref_model_config, hf_config=self.hf_config, tf_config=self.tf_config, actor_module=self.ref_module, - actor_optimizer=None, - actor_optimizer_config=None) + actor_optimizer=None) if self._is_actor: self.flops_counter = FlopsCounter(self.actor_model_config) @@ -524,49 +485,19 @@ def __init__(self, config): def _build_critic_model_optimizer(self, model_path, - dtype, optim_config, - override_model_config, - enable_gradient_checkpointing=False): + override_model_config): from megatron.core.models.gpt.gpt_model import ModelType - from verl.utils.model import print_model_size, update_model_config + from verl.utils.model import print_model_size from verl.utils.megatron.optimizer import get_megatron_optimizer from verl.utils.megatron_utils import get_model, init_megatron_optim_config - from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig - from verl.models.mcore import hf_to_mcore_config - - # Step 1: initialize the tokenizer - local_path = copy_to_local(model_path) - self.tokenizer = hf_tokenizer(local_path) - # Step 2: get the critic_model_config - critic_model_config = AutoConfig.from_pretrained(local_path) - - override_config_kwargs = { - 'bos_token_id': self.tokenizer.bos_token_id, - 'eos_token_id': self.tokenizer.eos_token_id, - 'pad_token_id': self.tokenizer.pad_token_id, - } - override_config_kwargs.update(override_model_config) - self.share_embeddings_and_output_weights = getattr(critic_model_config, "tie_word_embeddings", False) - update_model_config(critic_model_config, override_config_kwargs=override_config_kwargs) - self.architectures = getattr(critic_model_config, "architectures", None) - if self.rank == 0: - print(f'Model config after override: {critic_model_config}') - tf_config = hf_to_mcore_config(critic_model_config, dtype) - if enable_gradient_checkpointing: - gradient_checkpointing_cfg = dict(self.config.model.get('gradient_checkpointing_kwargs', dict())) - tf_config.recompute_method = gradient_checkpointing_cfg['activations_checkpoint_method'] - tf_config.recompute_granularity = gradient_checkpointing_cfg['activations_checkpoint_granularity'] - tf_config.recompute_num_layers = gradient_checkpointing_cfg['activations_checkpoint_num_layers'] - print(f'Critic TF config: {tf_config}') - self.hf_config = critic_model_config - self.tf_config = tf_config + self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config) def megatron_critic_model_provider(pre_process, post_process): from verl.models.mcore import init_mcore_model - parallel_model = init_mcore_model(tf_config, - critic_model_config, + parallel_model = init_mcore_model(self.tf_config, + self.hf_config, pre_process, post_process, share_embeddings_and_output_weights=False, @@ -591,9 +522,9 @@ def megatron_critic_model_provider(pre_process, post_process): is_value_model=True) else: load_megatron_gptmodel_weights(self.config, - critic_model_config, + 
self.hf_config, critic_module, - params_dtype=dtype, + params_dtype=self.dtype, is_value_model=True) t1 = time.time() if torch.distributed.get_rank() == 0: @@ -605,7 +536,7 @@ def megatron_critic_model_provider(pre_process, post_process): optim_config = init_megatron_optim_config(optim_config) critic_optimizer = get_megatron_optimizer(model=critic_module, config=optim_config) torch.cuda.empty_cache() - return critic_module, critic_optimizer, critic_model_config, optim_config + return critic_module, critic_optimizer, self.hf_config, optim_config @register(dispatch_mode=Dispatch.ONE_TO_ALL) def init_model(self): @@ -622,10 +553,8 @@ def init_model(self): self.dtype = PrecisionType.to_dtype(self.param_dtype) self.critic_module, self.critic_optimizer, self.critic_model_config, critic_optimizer_config = self._build_critic_model_optimizer( model_path=self.config.model.path, - dtype=self.dtype, optim_config=self.config.optim, - override_model_config=override_model_config, - enable_gradient_checkpointing=self.config.model.get('enable_gradient_checkpointing', False)) + override_model_config=override_model_config) self.critic = MegatronPPOCritic(config=self.config, model_config=self.critic_model_config, hf_config=self.hf_config, @@ -724,45 +653,25 @@ def __init__(self, config): self.config.micro_batch_size //= mpu.get_data_parallel_world_size() self.config.micro_batch_size_per_gpu = self.config.micro_batch_size - def _build_rm_model(self, model_path, megatron_config: ModelParallelConfig, override_model_config): + def _build_rm_model(self, model_path, override_model_config): from megatron.core.models.gpt.gpt_model import ModelType - from verl.utils.model import update_model_config - from verl.utils.megatron_utils import get_model - from transformers import AutoConfig - - # Step 1: initialize the tokenizer - local_path = copy_to_local(model_path) - self.tokenizer = hf_tokenizer(local_path) - - # Step 2: get the actor_model_config - rm_model_config = AutoConfig.from_pretrained(local_path) - - override_config_kwargs = { - 'bos_token_id': self.tokenizer.bos_token_id, - 'eos_token_id': self.tokenizer.eos_token_id, - 'pad_token_id': self.tokenizer.pad_token_id, - } - override_config_kwargs.update(override_model_config) - update_model_config(rm_model_config, override_config_kwargs=override_config_kwargs) + from verl.utils.model import print_model_size + from verl.utils.megatron.optimizer import get_megatron_optimizer + from verl.utils.megatron_utils import get_model, init_megatron_optim_config - if self.rank == 0: - print(f'Model config after override: rm_model_config {rm_model_config}') + self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config) def megatron_rm_model_provider(pre_process, post_process): - from verl.utils.model import get_parallel_model_from_config - # vpp is not supported yet because it will hang for some reason. 
Need debugging - vpp_rank = mpu.get_virtual_pipeline_model_parallel_rank() # this will be set inside get_model - # this_megatron_config = copy.deepcopy(megatron_config) - # this_megatron_config.virtual_pipeline_model_parallel_rank = vpp_rank - parallel_model = get_parallel_model_from_config(config=rm_model_config, - megatron_config=megatron_config, - pre_process=pre_process, - post_process=post_process, - share_embeddings_and_output_weights=False, - value=True) + from verl.models.mcore import init_mcore_model + parallel_model = init_mcore_model(self.tf_config, + self.hf_config, + pre_process, + post_process, + share_embeddings_and_output_weights=False, + value=True) parallel_model.cuda() return parallel_model - + # Step 3: initialize the megatron model reward_model = get_model(model_provider_func=megatron_rm_model_provider, model_type=ModelType.encoder_or_decoder, @@ -773,15 +682,20 @@ def megatron_rm_model_provider(pre_process, post_process): # reward_model = nn.ModuleList(reward_model) if self.config.load_weight: - load_megatron_model_weights(self.config, - rm_model_config, - reward_model, - params_dtype=megatron_config.params_dtype, + if self.config.megatron.use_dist_checkpointing: + load_mcore_dist_weights(reward_model, + self.config.megatron.dist_checkpointing_path, is_value_model=True) + else: + load_megatron_gptmodel_weights(self.config, + self.hf_config, + reward_model, + params_dtype=self.dtype, + is_value_model=True) # TODO: add more optimizer args into config torch.cuda.empty_cache() - return reward_model, rm_model_config + return reward_model, self.hf_config @register(dispatch_mode=Dispatch.ONE_TO_ALL) def init_model(self): @@ -804,14 +718,10 @@ def init_model(self): rm_tokenizer = hf_tokenizer(rm_tokenizer_local_path) self.param_dtype = torch.bfloat16 - - megatron_config = mcore_model_parallel_config(sequence_parallel=self.config.megatron.get( - 'sequence_parallel', True), - params_dtype=PrecisionType.to_dtype(self.param_dtype)) + self.dtype = PrecisionType.to_dtype(self.param_dtype) reward_model_module, reward_model_config = self._build_rm_model( model_path=self.config.model.path, - megatron_config=megatron_config, override_model_config=override_model_config, ) # FIXME(sgm): reward model param offload is implemented in MegatronRewardModel @@ -819,7 +729,8 @@ def init_model(self): self.rm = MegatronRewardModel(config=self.config, reward_model_module=reward_model_module, model_config=reward_model_config, - megatron_config=megatron_config, + hf_config=self.hf_config, + tf_config=self.tf_config, sft_tokenizer=sft_tokenizer, rm_tokenizer=rm_tokenizer) diff --git a/verl/workers/reward_model/megatron/reward_model.py b/verl/workers/reward_model/megatron/reward_model.py index a890f288435..1b2cbee8ffa 100644 --- a/verl/workers/reward_model/megatron/reward_model.py +++ b/verl/workers/reward_model/megatron/reward_model.py @@ -35,12 +35,14 @@ def __init__(self, config, model_config, reward_model_module: torch.nn.ModuleList, - megatron_config, + hf_config, + tf_config, sft_tokenizer=None, rm_tokenizer=None): self.config = config self.reward_model_module = reward_model_module - self.megatron_config = megatron_config + self.hf_config = hf_config + self.tf_config = tf_config self.model_config = model_config self.device = 'cuda' self.sft_tokenizer = sft_tokenizer @@ -133,7 +135,7 @@ def compute_reward(self, data: DataProto) -> DataProto: with torch.no_grad(): output = self.forward_batch(data) if mpu.is_pipeline_last_stage(ignore_virtual=True): - logits = torch.cat([o['logits'] for o in 
output], dim=0) + logits = torch.cat([output], dim=0) else: logits = torch.empty( (input_ids.shape[0], input_ids.shape[1]), @@ -205,21 +207,27 @@ def forward_batch(self, data: DataProto): input_shapes = compute_transformers_input_shapes( batches, meta_info={ - 'sequence_parallel': self.megatron_config.sequence_parallel, + 'sequence_parallel': self.tf_config.sequence_parallel, 'hidden_size': self.model_config.hidden_size }) # compute input shapes for pp stages forward_backward_func = get_forward_backward_func() def loss_func(output): - return 1., {'logits': output.logits} + return 1., {'logits': output} def forward_step(batch_iter, model): batch = next(batch_iter) input_ids = batch['input_ids'] attention_mask = batch['attention_mask'] position_ids = batch['position_ids'] - output = model(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids) + from verl.models.mcore import gptmodel_forward + + output = gptmodel_forward(model, + input_ids, + attention_mask, + position_ids, + sequence_parallel=self.tf_config.sequence_parallel) return output, loss_func # batch should be a list of batches inside micro-batches From 6c46c2a7e4b3d801bae918b6f3c78c5e51bc1825 Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Sun, 13 Apr 2025 09:43:14 -0700 Subject: [PATCH 04/19] mcore model_forward for registry --- verl/models/mcore/__init__.py | 5 +- verl/models/mcore/config_converter.py | 2 +- verl/models/mcore/model_forward.py | 107 ++++++++++++++++++ verl/models/mcore/registry.py | 20 ++++ verl/models/mcore/{gpt_model.py => util.py} | 42 ------- .../single_controller/base/megatron/worker.py | 10 +- verl/utils/megatron_utils.py | 10 +- verl/workers/actor/megatron_actor.py | 26 ++--- verl/workers/critic/megatron_critic.py | 32 +++--- verl/workers/megatron_workers.py | 34 +++--- .../reward_model/megatron/reward_model.py | 26 +++-- 11 files changed, 197 insertions(+), 117 deletions(-) create mode 100644 verl/models/mcore/model_forward.py rename verl/models/mcore/{gpt_model.py => util.py} (80%) diff --git a/verl/models/mcore/__init__.py b/verl/models/mcore/__init__.py index fbc26864c92..ccb2309f9e5 100644 --- a/verl/models/mcore/__init__.py +++ b/verl/models/mcore/__init__.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .gpt_model import gptmodel_forward -from .registry import init_mcore_model, hf_to_mcore_config +from .registry import init_mcore_model, hf_to_mcore_config, get_mcore_forward_fn -__all__ = ['init_mcore_model', 'hf_to_mcore_config', 'gptmodel_forward'] \ No newline at end of file +__all__ = ['init_mcore_model', 'hf_to_mcore_config', 'get_mcore_forward_fn'] \ No newline at end of file diff --git a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py index d8359ed2284..f25e6211fcc 100644 --- a/verl/models/mcore/config_converter.py +++ b/verl/models/mcore/config_converter.py @@ -19,7 +19,7 @@ from megatron.core.transformer import TransformerConfig import torch import torch.nn.functional as F -from megatron.core.enums import AttnBackend +from megatron.core.transformer.enums import AttnBackend def hf_to_mcore_config_dense(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: diff --git a/verl/models/mcore/model_forward.py b/verl/models/mcore/model_forward.py new file mode 100644 index 00000000000..42fd702efee --- /dev/null +++ b/verl/models/mcore/model_forward.py @@ -0,0 +1,107 @@ +# Copyright 2025 Bytedance Ltd. 
and/or its affiliates +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from verl.utils.megatron import sequence_parallel as sp_utils +from verl.utils.megatron import tensor_parallel as tp_utils +import torch +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core import parallel_state as mpu +from verl.utils.megatron_utils import unwrap_model +from .util import preprocess_packed_seqs, postprocess_packed_seqs, remove_left_padding, recover_left_padding + + +def gptmodel_forward_dense(model, + input_ids, + attention_mask, + position_ids, + sequence_parallel, + value_model=False, + pack_seqs=True): + pre_process = unwrap_model(model).pre_process + post_process = unwrap_model(model).post_process + if pack_seqs: + batch_size, seq_len = attention_mask.shape[:2] + input_ids_rmpad, packed_seq_params = preprocess_packed_seqs(input_ids, attention_mask, pre_process=pre_process) + input_ids_rmpad = input_ids_rmpad.contiguous() + output_orig = model(input_ids=input_ids_rmpad, + attention_mask=None, + position_ids=position_ids, + packed_seq_params=packed_seq_params) + + output = postprocess_packed_seqs(output_orig, + packed_seq_params, + attention_mask, + batch_size, + seq_len, + post_process=post_process) + else: + batch_size, sequence_length = attention_mask.shape + new_input_ids, new_attention_mask, new_position_ids = remove_left_padding(input_ids, + attention_mask, + position_ids, + sequence_parallel, + pre_process=pre_process) + output = model(input_ids=new_input_ids, attention_mask=new_attention_mask, position_ids=new_position_ids) + output = recover_left_padding(output, + new_attention_mask, + attention_mask, + sequence_length, + post_process=post_process) + if value_model and post_process: + output = output[..., 0] + return output + + +def gptmodel_forward_qwen2_moe(model, + input_ids, + attention_mask, + position_ids, + sequence_parallel, + value_model=False, + pack_seqs=True): + return gptmodel_forward_dense(model, input_ids, attention_mask, position_ids, sequence_parallel, value_model, + pack_seqs) + + +def gptmodel_forward_llama4(model, + input_ids, + attention_mask, + position_ids, + sequence_parallel, + value_model=False, + pack_seqs=True): + return gptmodel_forward_dense(model, input_ids, attention_mask, position_ids, sequence_parallel, value_model, + pack_seqs) + + +def gptmodel_forward_dpskv3(model, + input_ids, + attention_mask, + position_ids, + sequence_parallel, + value_model=False, + pack_seqs=True): + return gptmodel_forward_dense(model, input_ids, attention_mask, position_ids, sequence_parallel, value_model, + pack_seqs) + + +def gptmodel_forward_qwen2_5_vl(model, + input_ids, + attention_mask, + position_ids, + sequence_parallel, + value_model=False, + pack_seqs=True): + raise NotImplementedError("VLM is not supported yet") diff --git a/verl/models/mcore/registry.py b/verl/models/mcore/registry.py index a2a92924b00..a54363026fe 100644 --- a/verl/models/mcore/registry.py +++ 
b/verl/models/mcore/registry.py @@ -63,3 +63,23 @@ def init_mcore_model( f"Supported architectures: {MODEL_INITIALIZER_REGISTRY.keys()}") return MODEL_INITIALIZER_REGISTRY[arch](tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, value, **extra_kwargs) + + +from .model_forward import gptmodel_forward_dense, gptmodel_forward_qwen2_moe, gptmodel_forward_llama4, gptmodel_forward_dpskv3, gptmodel_forward_qwen2_5_vl + + +def get_mcore_forward_fn(hf_config: PretrainedConfig): + MODEL_FORWARD_REGISTRY = { + "LlamaForCausalLM": gptmodel_forward_dense, + "Qwen2ForCausalLM": gptmodel_forward_dense, + "Qwen2MoeForCausalLM": gptmodel_forward_qwen2_moe, + "DeepseekV3ForCausalLM": gptmodel_forward_dpskv3, + "Qwen2_5_VLForConditionalGeneration": gptmodel_forward_qwen2_5_vl, + "Llama4ForConditionalGeneration": gptmodel_forward_llama4, + } + assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" + arch = hf_config.architectures[0] + if arch not in MODEL_FORWARD_REGISTRY: + raise ValueError(f"Model architectures {arch} forward function are not supported for now. " + f"Supported architectures: {MODEL_FORWARD_REGISTRY.keys()}") + return MODEL_FORWARD_REGISTRY[arch] diff --git a/verl/models/mcore/gpt_model.py b/verl/models/mcore/util.py similarity index 80% rename from verl/models/mcore/gpt_model.py rename to verl/models/mcore/util.py index 814dd3e8ac0..fcf406d4475 100644 --- a/verl/models/mcore/gpt_model.py +++ b/verl/models/mcore/util.py @@ -21,48 +21,6 @@ from verl.utils.megatron_utils import unwrap_model -def gptmodel_forward(model, - input_ids, - attention_mask, - position_ids, - sequence_parallel, - value_model=False, - pack_seqs=True): - pre_process = unwrap_model(model).pre_process - post_process = unwrap_model(model).post_process - if pack_seqs: - batch_size, seq_len = attention_mask.shape[:2] - input_ids_rmpad, packed_seq_params = preprocess_packed_seqs(input_ids, attention_mask, pre_process=pre_process) - input_ids_rmpad = input_ids_rmpad.contiguous() - output_orig = model(input_ids=input_ids_rmpad, - attention_mask=None, - position_ids=position_ids, - packed_seq_params=packed_seq_params) - - output = postprocess_packed_seqs(output_orig, - packed_seq_params, - attention_mask, - batch_size, - seq_len, - post_process=post_process) - else: - batch_size, sequence_length = attention_mask.shape - new_input_ids, new_attention_mask, new_position_ids = remove_left_padding(input_ids, - attention_mask, - position_ids, - sequence_parallel, - pre_process=pre_process) - output = model(input_ids=new_input_ids, attention_mask=new_attention_mask, position_ids=new_position_ids) - output = recover_left_padding(output, - new_attention_mask, - attention_mask, - sequence_length, - post_process=post_process) - if value_model and post_process: - output = output[..., 0] - return output - - def preprocess_packed_seqs(input_ids: torch.Tensor, attention_mask: torch.Tensor, pre_process: bool = True) -> tuple[torch.Tensor, PackedSeqParams]: diff --git a/verl/single_controller/base/megatron/worker.py b/verl/single_controller/base/megatron/worker.py index c6594ee4618..af9f612ead0 100644 --- a/verl/single_controller/base/megatron/worker.py +++ b/verl/single_controller/base/megatron/worker.py @@ -38,10 +38,7 @@ def get_megatron_rank_info(self): info = DistRankInfo(tp_rank=tp_rank, dp_rank=dp_rank, pp_rank=pp_rank, cp_rank=cp_rank) return info - def _init_hf_config_and_tf_config(self, - model_path, - dtype, - override_model_config): + def 
_init_hf_config_and_tf_config(self, model_path, dtype, override_model_config): from verl.utils.model import print_model_size, update_model_config from verl.utils.fs import copy_to_local from verl.utils import hf_tokenizer @@ -74,11 +71,12 @@ def add_optimization_config_to_tf_config(tf_config, verl_model_config): if verl_model_config.get('enable_gradient_checkpointing', False): gradient_checkpointing_cfg = dict(verl_model_config.get('gradient_checkpointing_kwargs', dict())) tf_config.recompute_method = gradient_checkpointing_cfg.get('activations_checkpoint_method', 'full') - tf_config.recompute_granularity = gradient_checkpointing_cfg.get('activations_checkpoint_granularity', 'full') + tf_config.recompute_granularity = gradient_checkpointing_cfg.get('activations_checkpoint_granularity', + 'full') tf_config.recompute_num_layers = gradient_checkpointing_cfg.get('activations_checkpoint_num_layers', -1) add_optimization_config_to_tf_config(tf_config, self.config.model) print(f'TF config: {tf_config}') self.hf_config = hf_config - self.tf_config = tf_config \ No newline at end of file + self.tf_config = tf_config diff --git a/verl/utils/megatron_utils.py b/verl/utils/megatron_utils.py index ffeb114f6d2..0dfb5ec1117 100644 --- a/verl/utils/megatron_utils.py +++ b/verl/utils/megatron_utils.py @@ -219,9 +219,11 @@ def mcore_model_parallel_config( ) -> ModelParallelConfig: # WARNING: Code should not reach this point. This function is deprecated and will be removed. # Please use hf_to_mcore_config_dense() from verl.models.mcore.config_converter instead. - warnings.warn("Code should not reach this point. This function is deprecated and will be removed. " - "Please use hf_to_mcore_config_dense() from verl.models.mcore.config_converter instead.", - DeprecationWarning, stacklevel=2) + warnings.warn( + "Code should not reach this point. This function is deprecated and will be removed. 
" + "Please use hf_to_mcore_config_dense() from verl.models.mcore.config_converter instead.", + DeprecationWarning, + stacklevel=2) return ModelParallelConfig( tensor_model_parallel_size=mpu.get_tensor_model_parallel_world_size(), pipeline_model_parallel_size=mpu.get_pipeline_model_parallel_world_size(), @@ -306,4 +308,4 @@ def get_rng_states_checkpoint_path(checkpoint_path, only_rank0_save=True): tp_rank = mpu.get_tensor_model_parallel_rank() cp_rank = mpu.get_context_parallel_rank() return os.path.join(checkpoint_path, f'rng_states', - f"rng_states_pp{pp_rank}_tp{tp_rank}_cp{cp_rank}_dp{dp_rank}.pt") \ No newline at end of file + f"rng_states_pp{pp_rank}_tp{tp_rank}_cp{cp_rank}_dp{dp_rank}.pt") diff --git a/verl/workers/actor/megatron_actor.py b/verl/workers/actor/megatron_actor.py index 39fe6aa2d51..b8c328cfaca 100644 --- a/verl/workers/actor/megatron_actor.py +++ b/verl/workers/actor/megatron_actor.py @@ -239,12 +239,11 @@ def forward_backward_batch(self, data: DataProto, forward_only=False, post_proce batch_size = self.config.ppo_micro_batch_size_per_gpu batches = split_dict_tensor_into_batches(data.batch, batch_size=batch_size) # compute input shapes for pp stages - input_shapes = compute_transformers_input_shapes( - batches, - meta_info={ - 'sequence_parallel': self.tf_config.sequence_parallel, - 'hidden_size': self.model_config.hidden_size - }) + input_shapes = compute_transformers_input_shapes(batches, + meta_info={ + 'sequence_parallel': self.tf_config.sequence_parallel, + 'hidden_size': self.model_config.hidden_size + }) n_micro_batch = len(batches) seq_len = batches[0]['input_ids'].shape[1] @@ -317,13 +316,14 @@ def forward_step(batch_iter, model): input_ids = batch['input_ids'] attention_mask = batch['attention_mask'] position_ids = batch['position_ids'] - from verl.models.mcore import gptmodel_forward - - output = gptmodel_forward(model, - input_ids, - attention_mask, - position_ids, - sequence_parallel=self.tf_config.sequence_parallel) + from verl.models.mcore import get_mcore_forward_fn + forward_fn = get_mcore_forward_fn(self.hf_config) + + output = forward_fn(model, + input_ids, + attention_mask, + position_ids, + sequence_parallel=self.tf_config.sequence_parallel) if forward_only: meta_info = None else: diff --git a/verl/workers/critic/megatron_critic.py b/verl/workers/critic/megatron_critic.py index af1cbcc4db7..be69a938c55 100644 --- a/verl/workers/critic/megatron_critic.py +++ b/verl/workers/critic/megatron_critic.py @@ -47,8 +47,8 @@ def __init__(self, config, model_config, hf_config, tf_config, critic_module: nn super().__init__(config=config) self._validate_config(config) self.model_config = model_config - self.hf_config = hf_config # huggingface config - self.tf_config = tf_config # mcore transformer config + self.hf_config = hf_config # huggingface config + self.tf_config = tf_config # mcore transformer config self.critic_module = critic_module self.critic_optimizer = critic_optimizer @@ -120,12 +120,11 @@ def forward_backward_batch(self, data: DataProto, forward_only=False): seq_len = batches[0]['input_ids'].shape[1] # compute input shapes for pp stages - input_shapes = compute_transformers_input_shapes( - batches, - meta_info={ - 'sequence_parallel': self.tf_config.sequence_parallel, - 'hidden_size': self.model_config.hidden_size - }) + input_shapes = compute_transformers_input_shapes(batches, + meta_info={ + 'sequence_parallel': self.tf_config.sequence_parallel, + 'hidden_size': self.model_config.hidden_size + }) forward_backward_func = 
get_forward_backward_func() @@ -164,14 +163,15 @@ def forward_step(batch_iter, model): input_ids = batch['input_ids'] attention_mask = batch['attention_mask'] position_ids = batch['position_ids'] - from verl.models.mcore import gptmodel_forward - - output = gptmodel_forward(model, - input_ids, - attention_mask, - position_ids, - sequence_parallel=self.tf_config.sequence_parallel, - value_model=True) + from verl.models.mcore import get_mcore_forward_fn + forward_fn = get_mcore_forward_fn(self.hf_config) + + output = forward_fn(model, + input_ids, + attention_mask, + position_ids, + sequence_parallel=self.tf_config.sequence_parallel, + value_model=True) return output, partial(loss_func, data=batch, meta_info={}) diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py index 76572de2026..46df4960d0f 100644 --- a/verl/workers/megatron_workers.py +++ b/verl/workers/megatron_workers.py @@ -133,10 +133,7 @@ def __init__(self, config: DictConfig, role: str): self.config.ref.ppo_micro_batch_size_per_gpu = self.config.ref.ppo_micro_batch_size self._is_offload_param = self.config.ref.get('param_offload', False) - def _build_model_optimizer(self, - model_path, - optim_config, - override_model_config): + def _build_model_optimizer(self, model_path, optim_config, override_model_config): from verl.utils.megatron.optimizer import get_megatron_optimizer from megatron.core.models.gpt.gpt_model import ModelType from verl.utils.model import print_model_size, get_generation_config @@ -292,13 +289,14 @@ def init_model(self): ) if self._is_actor: - self.actor = MegatronPPOActor(config=self.config.actor, - model_config=self.actor_model_config, - hf_config=self.hf_config, - tf_config=self.tf_config, - actor_module=self.actor_module, - actor_optimizer=self.actor_optimizer, - ) + self.actor = MegatronPPOActor( + config=self.config.actor, + model_config=self.actor_model_config, + hf_config=self.hf_config, + tf_config=self.tf_config, + actor_module=self.actor_module, + actor_optimizer=self.actor_optimizer, + ) if self._is_rollout: self.rollout, self.sharding_manager = self._build_rollout() @@ -307,7 +305,8 @@ def init_model(self): self.ref_module, self.ref_model_config = self._build_model_optimizer( model_path=self.config.model.path, optim_config=None, - override_model_config=override_model_config,) + override_model_config=override_model_config, + ) self.ref_policy = MegatronPPOActor(config=self.config.ref, model_config=self.ref_model_config, hf_config=self.hf_config, @@ -483,10 +482,7 @@ def __init__(self, config): # TODO(sgm): support critic model offload - def _build_critic_model_optimizer(self, - model_path, - optim_config, - override_model_config): + def _build_critic_model_optimizer(self, model_path, optim_config, override_model_config): from megatron.core.models.gpt.gpt_model import ModelType from verl.utils.model import print_model_size from verl.utils.megatron.optimizer import get_megatron_optimizer @@ -671,7 +667,7 @@ def megatron_rm_model_provider(pre_process, post_process): value=True) parallel_model.cuda() return parallel_model - + # Step 3: initialize the megatron model reward_model = get_model(model_provider_func=megatron_rm_model_provider, model_type=ModelType.encoder_or_decoder, @@ -683,9 +679,7 @@ def megatron_rm_model_provider(pre_process, post_process): if self.config.load_weight: if self.config.megatron.use_dist_checkpointing: - load_mcore_dist_weights(reward_model, - self.config.megatron.dist_checkpointing_path, - is_value_model=True) + 
load_mcore_dist_weights(reward_model, self.config.megatron.dist_checkpointing_path, is_value_model=True) else: load_megatron_gptmodel_weights(self.config, self.hf_config, diff --git a/verl/workers/reward_model/megatron/reward_model.py b/verl/workers/reward_model/megatron/reward_model.py index 1b2cbee8ffa..54027dfcc11 100644 --- a/verl/workers/reward_model/megatron/reward_model.py +++ b/verl/workers/reward_model/megatron/reward_model.py @@ -204,12 +204,11 @@ def forward_batch(self, data: DataProto): seq_len = batches[0]['input_ids'].shape[1] # compute input shapes for pp stages - input_shapes = compute_transformers_input_shapes( - batches, - meta_info={ - 'sequence_parallel': self.tf_config.sequence_parallel, - 'hidden_size': self.model_config.hidden_size - }) + input_shapes = compute_transformers_input_shapes(batches, + meta_info={ + 'sequence_parallel': self.tf_config.sequence_parallel, + 'hidden_size': self.model_config.hidden_size + }) # compute input shapes for pp stages forward_backward_func = get_forward_backward_func() @@ -221,13 +220,16 @@ def forward_step(batch_iter, model): input_ids = batch['input_ids'] attention_mask = batch['attention_mask'] position_ids = batch['position_ids'] - from verl.models.mcore import gptmodel_forward + from verl.models.mcore import get_mcore_forward_fn + forward_fn = get_mcore_forward_fn(self.hf_config) + + output = forward_fn(model, + input_ids, + attention_mask, + position_ids, + sequence_parallel=self.tf_config.sequence_parallel, + value_model=True) - output = gptmodel_forward(model, - input_ids, - attention_mask, - position_ids, - sequence_parallel=self.tf_config.sequence_parallel) return output, loss_func # batch should be a list of batches inside micro-batches From e709dc3676032060f859fdb07a1408437cce60f7 Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Wed, 16 Apr 2025 03:35:39 -0700 Subject: [PATCH 05/19] (WIP) support qwen2moe --- verl/models/mcore/__init__.py | 4 +- verl/models/mcore/config_converter.py | 57 ++++++- verl/models/mcore/model_initializer.py | 34 ++++- verl/models/mcore/registry.py | 18 +++ verl/models/mcore/saver.py | 8 + verl/models/mcore/weight_converter.py | 143 ++++++++++++++++++ verl/models/weight_loader_registry.py | 7 +- verl/utils/model.py | 1 + verl/workers/actor/megatron_actor.py | 4 +- verl/workers/critic/megatron_critic.py | 2 +- verl/workers/megatron_workers.py | 5 +- .../workers/sharding_manager/megatron_vllm.py | 56 ++++++- 12 files changed, 316 insertions(+), 23 deletions(-) create mode 100644 verl/models/mcore/weight_converter.py diff --git a/verl/models/mcore/__init__.py b/verl/models/mcore/__init__.py index ccb2309f9e5..fbc2dc566bf 100644 --- a/verl/models/mcore/__init__.py +++ b/verl/models/mcore/__init__.py @@ -13,6 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
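For reference, the registry added in verl/models/mcore/registry.py dispatches purely on hf_config.architectures[0]. A minimal sketch of the intended call pattern, assuming a config loaded with transformers.AutoConfig and a GPTModel already built through init_mcore_model (the model and batch tensors below are placeholders, not defined here):

    from transformers import AutoConfig
    from verl.models.mcore import get_mcore_forward_fn

    hf_config = AutoConfig.from_pretrained("Qwen/Qwen2-7B-Instruct")  # example of a supported architecture
    forward_fn = get_mcore_forward_fn(hf_config)  # raises ValueError for unsupported architectures
    # output = forward_fn(model, input_ids, attention_mask, position_ids,
    #                     sequence_parallel=tf_config.sequence_parallel)

This mirrors the call sites rewritten above in megatron_actor.py, megatron_critic.py and reward_model.py.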
-from .registry import init_mcore_model, hf_to_mcore_config, get_mcore_forward_fn +from .registry import init_mcore_model, hf_to_mcore_config, get_mcore_forward_fn, get_mcore_weight_converter -__all__ = ['init_mcore_model', 'hf_to_mcore_config', 'get_mcore_forward_fn'] \ No newline at end of file +__all__ = ['init_mcore_model', 'hf_to_mcore_config', 'get_mcore_forward_fn', 'get_mcore_weight_converter'] \ No newline at end of file diff --git a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py index f25e6211fcc..e95d4ca73ba 100644 --- a/verl/models/mcore/config_converter.py +++ b/verl/models/mcore/config_converter.py @@ -65,8 +65,61 @@ def hf_to_mcore_config_dense(hf_config: PretrainedConfig, dtype: torch.dtype) -> def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: - # Qwen2MoeForCausalLM - raise NotImplementedError("Qwen2MoeForCausalLM is not supported yet") + from megatron.core import parallel_state as mpu + overlap_p2p_comm = mpu.get_virtual_pipeline_model_parallel_world_size( + ) is not None and mpu.get_virtual_pipeline_model_parallel_world_size() > 1 + batch_p2p_comm = False + transformer_config = TransformerConfig( + num_layers=hf_config.num_hidden_layers, + hidden_size=hf_config.hidden_size, + num_attention_heads=hf_config.num_attention_heads, + num_query_groups=hf_config.num_key_value_heads, + attention_dropout=hf_config.attention_dropout, + hidden_dropout=getattr(hf_config, 'hidden_dropout', 0.0), + activation_func=F.silu, + normalization='RMSNorm', + gated_linear_unit=True, + use_cpu_initialization=False, + add_bias_linear=False, + pipeline_dtype=dtype, + params_dtype=dtype, + variable_seq_lengths=True, + masked_softmax_fusion=True, + attention_backend=AttnBackend.flash, + bf16=dtype is torch.bfloat16, + layernorm_epsilon=hf_config.rms_norm_eps, + + # parallel config + tensor_model_parallel_size=mpu.get_tensor_model_parallel_world_size(), + pipeline_model_parallel_size=mpu.get_pipeline_model_parallel_world_size(), + virtual_pipeline_model_parallel_size=mpu.get_virtual_pipeline_model_parallel_world_size(), + context_parallel_size=mpu.get_context_parallel_world_size(), + overlap_p2p_comm=overlap_p2p_comm, + batch_p2p_comm=batch_p2p_comm, + sequence_parallel=mpu.get_tensor_model_parallel_world_size() > 1, + + # moe specific + ffn_hidden_size=hf_config.moe_intermediate_size, + moe_token_dispatcher_type="alltoall", + moe_router_bias_update_rate=0.001, + moe_router_topk=hf_config.num_experts_per_tok, + num_moe_experts=hf_config.num_experts, + moe_shared_expert_intermediate_size=hf_config.shared_expert_intermediate_size, + # moe_aux_loss_coeff=hf_config.router_aux_loss_coef, + moe_aux_loss_coeff=0.0, + moe_router_load_balancing_type="aux_loss", + moe_router_pre_softmax=False, #? 
+ moe_shared_expert_overlap=True, + # moe_permute_fusion=True, + moe_grouped_gemm=True, + + # mcore 0.12 + moe_router_dtype="fp64", + disable_bf16_reduced_precision_matmul=True, + + # qwen specific + add_qkv_bias=True) + return transformer_config def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: diff --git a/verl/models/mcore/model_initializer.py b/verl/models/mcore/model_initializer.py index 4ae2fe4e5d4..48ea980fd6b 100644 --- a/verl/models/mcore/model_initializer.py +++ b/verl/models/mcore/model_initializer.py @@ -54,8 +54,38 @@ def init_mcore_model_qwen2_moe(tfconfig, post_process=None, share_embeddings_and_output_weights=False, value=False): - return init_mcore_model_dense(tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, - value) + + from megatron.core.models.gpt.gpt_model import GPTModel + from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec + use_te = True + + def patch_layer_spec(transformer_layer_spec): + # shared_experts.gate=True + for i in range(len(transformer_layer_spec.layer_specs)): + transformer_layer_spec.layer_specs[i].submodules.mlp.submodules.shared_experts.params['gate'] = True + return transformer_layer_spec + + assert tfconfig.normalization == "RMSNorm", 'only RMSNorm is supported for now' + transformer_layer_spec = get_gpt_decoder_block_spec(tfconfig, use_transformer_engine=use_te) + transformer_layer_spec = patch_layer_spec(transformer_layer_spec) + rope_scaling_args = {} + if hf_config.rope_scaling is not None: + assert hf_config.rope_scaling['type'] == 'linear', "only linear scaling is supported for now" + rope_scaling_args['seq_len_interpolation_factor'] = hf_config.rope_scaling['factor'] + model = GPTModel(config=tfconfig, + transformer_layer_spec=transformer_layer_spec, + vocab_size=hf_config.vocab_size, + max_sequence_length=hf_config.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + share_embeddings_and_output_weights=share_embeddings_and_output_weights, + position_embedding_type='rope', + rotary_base=hf_config.rope_theta, + **rope_scaling_args) + if post_process and value: + from verl.models.llama.megatron.layers.parallel_linear import LinearForLastLayer + model.output_layer = LinearForLastLayer(input_size=tfconfig.hidden_size, output_size=1, config=tfconfig) + return model def init_mcore_model_llama4(tfconfig, diff --git a/verl/models/mcore/registry.py b/verl/models/mcore/registry.py index a54363026fe..ab02c69e5be 100644 --- a/verl/models/mcore/registry.py +++ b/verl/models/mcore/registry.py @@ -83,3 +83,21 @@ def get_mcore_forward_fn(hf_config: PretrainedConfig): raise ValueError(f"Model architectures {arch} forward function are not supported for now. " f"Supported architectures: {MODEL_FORWARD_REGISTRY.keys()}") return MODEL_FORWARD_REGISTRY[arch] + + +from .weight_converter import McoreToHFWeightConverterDense, McoreToHFWeightConverterQwen2Moe + + +def get_mcore_weight_converter(hf_config: PretrainedConfig, dtype: torch.dtype): + MODEL_WEIGHT_CONVERTER_REGISTRY = { + "LlamaForCausalLM": McoreToHFWeightConverterDense, + "Qwen2ForCausalLM": McoreToHFWeightConverterDense, + "Qwen2MoeForCausalLM": McoreToHFWeightConverterQwen2Moe, + } + assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" + arch = hf_config.architectures[0] + if arch not in MODEL_WEIGHT_CONVERTER_REGISTRY: + raise ValueError(f"Model architectures {arch} weight converter are not supported for now. 
" + f"Supported architectures: {MODEL_WEIGHT_CONVERTER_REGISTRY.keys()}") + tfconfig = hf_to_mcore_config(hf_config, dtype) + return MODEL_WEIGHT_CONVERTER_REGISTRY[arch](hf_config, tfconfig) diff --git a/verl/models/mcore/saver.py b/verl/models/mcore/saver.py index 5598ab2d2cc..d4ab4610cc5 100644 --- a/verl/models/mcore/saver.py +++ b/verl/models/mcore/saver.py @@ -466,3 +466,11 @@ def _broadcast_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name, src_pp_rank): print_rank_0(f"merge megatron ckpt done, time elapsed {time.time() - start_time}s") return state_dict + + +def merge_megatron_ckpt_gptmodel_qwen_moe(wrapped_models, + config, + dtype, + is_value_model=False, + tie_word_embeddings=False): + raise NotImplementedError("merge_megatron_ckpt_gptmodel_qwen_moe is not implemented") diff --git a/verl/models/mcore/weight_converter.py b/verl/models/mcore/weight_converter.py new file mode 100644 index 00000000000..155361a532a --- /dev/null +++ b/verl/models/mcore/weight_converter.py @@ -0,0 +1,143 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# online convert mcore weight to pure huggingface weight, no any fusion +# including format conversion and name mapping +# not including resharding +import torch +from transformers import PretrainedConfig +from megatron.core.transformer import TransformerConfig + + +class McoreToHFWeightConverterBase: + + def __init__(self, hf_config: PretrainedConfig, mcore_config: TransformerConfig): + self.hf_config = hf_config + self.mcore_config = mcore_config + + def convert_param(self, name: str, params_one_group: list[torch.Tensor]) -> torch.Tensor: + raise NotImplementedError + + +class McoreToHFWeightConverterDense(McoreToHFWeightConverterBase): + + def _convert_attention_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]: + # 'decoder.layers.0.self_attention.linear_proj.weight' + # 'decoder.layers.0.self_attention.linear_qkv.layer_norm_weight' + # 'decoder.layers.0.self_attention.linear_qkv.weight' + # 'decoder.layers.0.self_attention.linear_qkv.bias' + layer_number = name.split('.')[2] + convert_names = [] + if "self_attention.linear_qkv.bias" in name or "self_attention.linear_qkv.weight" in name: + param_type = name.split('.')[-1] + assert param_type == 'bias' or param_type == 'weight' + convert_names.append(f'model.layers.{layer_number}.self_attn.q_proj.{param_type}') + convert_names.append(f'model.layers.{layer_number}.self_attn.k_proj.{param_type}') + convert_names.append(f'model.layers.{layer_number}.self_attn.v_proj.{param_type}') + assert len(params) == 3 + elif "self_attention.linear_proj.weight" in name: + convert_names.append(f'model.layers.{layer_number}.self_attn.o_proj.weight') + assert len(params) == 1 + elif "self_attention.linear_qkv.layer_norm_weight" in name: + convert_names.append(f'model.layers.{layer_number}.input_layernorm.weight') + assert len(params) == 1 + else: + raise 
NotImplementedError(f"Unsupported parameter name: {name}") + return convert_names, params + + def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]: + # 'decoder.layers.0.mlp.linear_fc1.layer_norm_weight' + # 'decoder.layers.0.mlp.linear_fc1.weight' + # 'decoder.layers.0.mlp.linear_fc2.weight' + layer_number = name.split('.')[2] + convert_names = [] + if "mlp.linear_fc1.weight" in name: + # split gate_proj and up_proj + convert_names.append(f'model.layers.{layer_number}.mlp.gate_proj.weight') + convert_names.append(f'model.layers.{layer_number}.mlp.up_proj.weight') + assert len(params) == 2 + elif "mlp.linear_fc1.layer_norm_weight" in name: + convert_names.append(f'model.layers.{layer_number}.post_attention_layernorm.weight') + assert len(params) == 1 + elif "mlp.linear_fc2.weight" in name: + convert_names.append(f'model.layers.{layer_number}.mlp.down_proj.weight') + assert len(params) == 1 + else: + raise NotImplementedError(f"Unsupported parameter name: {name}") + return convert_names, params + + def convert_param(self, name: str, params_one_group: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]: + direct_name_mapping = { + "embedding.word_embeddings.weight": "model.embed_tokens.weight", + "decoder.final_layernorm.weight": "model.norm.weight", + "output_layer.weight": "lm_head.weight", + } + if name in direct_name_mapping: + return [direct_name_mapping[name]], [params_one_group[0]] + + if "self_attention" in name: + return self._convert_attention_param(name, params_one_group) + elif "mlp" in name: + return self._convert_mlp_param(name, params_one_group) + else: + raise NotImplementedError(f"Unsupported parameter name: {name}") + + +class McoreToHFWeightConverterQwen2Moe(McoreToHFWeightConverterDense): + + def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]: + # 'decoder.layers.0.pre_mlp_layernorm.weight', + # 'decoder.layers.0.mlp.router.weight', + # 'decoder.layers.0.mlp.shared_experts.gate_weight', + # 'decoder.layers.0.mlp.shared_experts.linear_fc1.weight', + # 'decoder.layers.0.mlp.shared_experts.linear_fc2.weight' + # moe1 + # 'decoder.layers.0.mlp.experts.linear_fc1.weight0', + # 'decoder.layers.0.mlp.experts.linear_fc1.weight1', + # 'decoder.layers.0.mlp.experts.linear_fc1.weight2', + # 'decoder.layers.0.mlp.experts.linear_fc1.weight3', + # moe2 + # 'decoder.layers.0.mlp.experts.linear_fc2.weight0', + # 'decoder.layers.0.mlp.experts.linear_fc2.weight1', + layer_number = name.split('.')[2] + convert_names = [] + if "pre_mlp_layernorm" in name: + convert_names.append(f'model.layers.{layer_number}.post_attention_layernorm.weight') + assert len(params) == 1 + elif "mlp.router.weight" in name: + convert_names.append(f'model.layers.{layer_number}.mlp.gate.weight') + assert len(params) == 1 + elif "shared_experts.gate_weight" in name: + convert_names.append(f'model.layers.{layer_number}.mlp.shared_expert_gate.weight') + assert len(params) == 1 + elif "shared_experts.linear_fc1.weight" in name: # split gate_proj and up_proj + convert_names.append(f'model.layers.{layer_number}.mlp.shared_expert.gate_proj.weight') + convert_names.append(f'model.layers.{layer_number}.mlp.shared_expert.up_proj.weight') + assert len(params) == 2 + elif "shared_experts.linear_fc2.weight" in name: + convert_names.append(f'model.layers.{layer_number}.mlp.shared_expert.down_proj.weight') + assert len(params) == 1 + elif "mlp.experts.linear_fc1" in name: # split gate_proj and up_proj + expert_id 
= name.split('weight')[-1] + convert_names.append(f'model.layers.{layer_number}.mlp.experts.{expert_id}.gate_proj.weight') + convert_names.append(f'model.layers.{layer_number}.mlp.experts.{expert_id}.up_proj.weight') + assert len(params) == 2 + elif "mlp.experts.linear_fc2" in name: + expert_id = name.split('weight')[-1] + convert_names.append(f'model.layers.{layer_number}.mlp.experts.{expert_id}.down_proj.weight') + assert len(params) == 1 + else: + raise NotImplementedError(f"Unsupported parameter name: {name}") + return convert_names, params diff --git a/verl/models/weight_loader_registry.py b/verl/models/weight_loader_registry.py index a700761268b..8687b0cac59 100644 --- a/verl/models/weight_loader_registry.py +++ b/verl/models/weight_loader_registry.py @@ -14,8 +14,6 @@ def get_weight_loader(arch: str): - from verl.models.llama.megatron.checkpoint_utils.llama_loader import load_state_dict_to_megatron_llama - from verl.models.qwen2.megatron.checkpoint_utils.qwen2_loader import load_state_dict_to_megatron_qwen2 from verl.models.mcore.loader import load_state_dict_to_megatron_gptmodel _MODEL_WEIGHT_MEGATRON_LOADER_REGISTRY = { 'LlamaForCausalLM': load_state_dict_to_megatron_gptmodel, @@ -29,12 +27,11 @@ def get_weight_loader(arch: str): def get_weight_saver(arch: str): - from verl.models.llama.megatron.checkpoint_utils.llama_saver import merge_megatron_ckpt_llama - from verl.models.qwen2.megatron.checkpoint_utils.qwen2_saver import merge_megatron_ckpt_qwen2 - from verl.models.mcore.saver import merge_megatron_ckpt_gptmodel + from verl.models.mcore.saver import merge_megatron_ckpt_gptmodel, merge_megatron_ckpt_gptmodel_qwen_moe _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY = { 'LlamaForCausalLM': merge_megatron_ckpt_gptmodel, 'Qwen2ForCausalLM': merge_megatron_ckpt_gptmodel, + "Qwen2MoeForCausalLM": merge_megatron_ckpt_gptmodel_qwen_moe, } if arch in _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY: return _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY[arch] diff --git a/verl/utils/model.py b/verl/utils/model.py index fbe26691e8e..52cb5a06af7 100644 --- a/verl/utils/model.py +++ b/verl/utils/model.py @@ -210,6 +210,7 @@ def normalize_model_name(name, pp_rank, vpp_rank, pp_size, vpp_size, num_layers) """ Transform the model name in each model_chunk in each pp stage into the name in inference engine """ + # TODO for mcore uneven pp/vpp, things are different if vpp_size > 1: # print(f'try to bind vpp params to inference engine...') layers_per_pp = num_layers // pp_size diff --git a/verl/workers/actor/megatron_actor.py b/verl/workers/actor/megatron_actor.py index 7cdc268e836..a5c34111fb4 100644 --- a/verl/workers/actor/megatron_actor.py +++ b/verl/workers/actor/megatron_actor.py @@ -259,9 +259,9 @@ def forward_backward_batch(self, data: DataProto, forward_only=False, post_proce def loss_func(output, data, meta_info): if forward_only: if post_process_fn is None: - return 1.0, {'logits': output} + return torch.tensor(1.0, device=output.device), {'logits': output} else: - return 1.0, post_process_fn(output, data) + return torch.tensor(1.0, device=output.device), post_process_fn(output, data) responses = data['responses'] response_length = responses.size(1) diff --git a/verl/workers/critic/megatron_critic.py b/verl/workers/critic/megatron_critic.py index 9f5b08b6281..30f8e4c64eb 100644 --- a/verl/workers/critic/megatron_critic.py +++ b/verl/workers/critic/megatron_critic.py @@ -137,7 +137,7 @@ def forward_backward_batch(self, data: DataProto, forward_only=False): def loss_func(output, data, meta_info): if forward_only: - 
return 1.0, {'vpreds': output} + return torch.tensor(1.0, device=output.device), {'vpreds': output} responses = data['responses'] attention_mask = data['attention_mask'] diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py index 55775caf196..200ffd77274 100644 --- a/verl/workers/megatron_workers.py +++ b/verl/workers/megatron_workers.py @@ -255,10 +255,13 @@ def _build_rollout(self, trust_remote_code=False): log_gpu_memory_usage('After building vllm rollout', logger=logger) # perform weight resharding between actor and rollout + from verl.models.mcore import get_mcore_weight_converter + weight_converter = get_mcore_weight_converter(self.actor_model_config, self.dtype) sharding_manager = MegatronVLLMShardingManager(module=self.hybrid_engine, inference_engine=rollout.inference_engine, model_config=self.actor_model_config, - layer_name_mapping=layer_name_mapping) + layer_name_mapping=layer_name_mapping, + weight_converter=weight_converter) log_gpu_memory_usage('After building sharding manager', logger=logger) else: raise NotImplementedError('Only vllmRollout is supported with Megatron now') diff --git a/verl/workers/sharding_manager/megatron_vllm.py b/verl/workers/sharding_manager/megatron_vllm.py index 60d8cbc6700..f456cde7d0a 100644 --- a/verl/workers/sharding_manager/megatron_vllm.py +++ b/verl/workers/sharding_manager/megatron_vllm.py @@ -258,6 +258,7 @@ def pp_models(self): from verl.third_party.vllm import LLM from verl.utils.model import normalize_pp_vpp_params from verl.utils.megatron_utils import convert_megatron_model_to_transformers_model +from verl.models.mcore.weight_converter import McoreToHFWeightConverterBase # Micro Data parallel group. Micro data parallel group is additional dp group that origins from splitting training tp # into infer_tp and micro_tp. 
By default, we use order micro_dp - tp # NOTICE: in new version of vLLM, We need to all-gather all tp rank's model weights @@ -267,12 +268,14 @@ def pp_models(self): class MegatronVLLMShardingManager(BaseShardingManager): - def __init__(self, module: AllGatherPPModel, inference_engine: LLM, model_config, layer_name_mapping): + def __init__(self, module: AllGatherPPModel, inference_engine: LLM, model_config, layer_name_mapping, + weight_converter: McoreToHFWeightConverterBase): from megatron.core import parallel_state as mpu self.module = module self.inference_engine = inference_engine self.model_config = model_config self.layer_name_mapping = layer_name_mapping + self.weight_converter = weight_converter # initialize micro_dp group for vllm inference global _MICRO_DATA_PARALLEL_GROUP @@ -362,6 +365,9 @@ def default_tp_concat_fn(self, name, param, infer_params, model_config, convert_ else: infer_params = [gate, up] + elif "mlp.experts.linear_fc2.weight" in name: # moe + infer_params = torch.cat(infer_params, dim=1) + else: # concat tensor infer_params = torch.cat(infer_params, dim=tp_utils.get_tensor_parallel_partition_dim(param)) @@ -393,13 +399,18 @@ def _post_process_params(self, params, convert_qkv_gate_up_by_simple_split=False convert_qkv_gate_up_by_simple_split) else: infer_params = param - converted_names, converted_params = convert_megatron_model_to_transformers_model( - name, - infer_params, - self.model_config, - self.train_tp_size, - self.module.pp_models[0][0].config.num_query_groups, - convert_qkv_gate_up_by_trunk_concat=False) + if vllm_version in ('0.4.2', '0.5.4', '0.6.3'): + converted_names, converted_params = convert_megatron_model_to_transformers_model( + name, + infer_params, + self.model_config, + self.train_tp_size, + self.module.pp_models[0][0].config.num_query_groups, + convert_qkv_gate_up_by_trunk_concat=False) + else: + if not isinstance(infer_params, list): + infer_params = [infer_params] + converted_names, converted_params = self.weight_converter.convert_param(name, infer_params) for converted_name, infer_param in zip(converted_names, converted_params): yield converted_name, infer_param @@ -425,6 +436,7 @@ def __enter__(self): per_tensor_param = self._post_process_params(cur_tp_rank_param, convert_qkv_gate_up_by_simple_split=True) self.inference_engine.wake_up() model = self.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model + _patch_vllm_qwen2_moe_model_weight_loader(model) loaded_params = model.load_weights(per_tensor_param) logger.info(f"vLLM load weights, loaded_params: {len(loaded_params)}") log_gpu_memory_usage('After load_weights sharding manager memory', logger=logger) @@ -484,3 +496,31 @@ def get_micro_data_parallel_world_size(): def get_micro_data_parallel_rank(): return torch.distributed.get_rank(group=get_micro_data_parallel_group()) + + +def _patch_vllm_qwen2_moe_model_weight_loader(model): + # this is a work around to load the weight of vllm qwen2 moe model + # it is from a bug from vllm 0.8.2 + # all the weights are supposed to have a weight_loader, but the moe weights + # do not have a weight_loader, so we need to patch it + # (True, 'model.embed_tokens.weight') + # (True, 'model.layers.0.self_attn.qkv_proj.weight') + # (True, 'model.layers.0.self_attn.qkv_proj.bias') + # (True, 'model.layers.0.self_attn.o_proj.weight') + # (True, 'model.layers.0.mlp.gate.weight') + # (True, 'model.layers.0.mlp.shared_expert.gate_up_proj.weight') + # (True, 'model.layers.0.mlp.shared_expert.down_proj.weight') + # (False, 
'model.layers.0.mlp.shared_expert_gate.weight') use default + # (False, 'model.layers.0.input_layernorm.weight') use default + # (False, 'model.layers.0.post_attention_layernorm.weight') use default + # (False, 'model.layers.0.mlp.experts.w13_weight') use mlp.experts.weight_loader + # (False, 'model.layers.0.mlp.experts.w2_weight') use mlp.experts.weight_loader + from vllm.model_executor.models.qwen2_moe import Qwen2MoeForCausalLM + if not isinstance(model, Qwen2MoeForCausalLM): + return + for layer in model.model.layers: + mlp = layer.mlp + param_dict = dict(mlp.named_parameters()) + for name, param in param_dict.items(): + if "w13_weight" in name or "w2_weight" in name: + param.weight_loader = mlp.experts.weight_loader From 0775d36852c8d11fa697e71ca35f29c05a737254 Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Thu, 17 Apr 2025 04:10:29 -0700 Subject: [PATCH 06/19] qwen2moe config converter and weight converter --- scripts/converter_hf_to_mcore.py | 100 ++++++++++++++++++------- verl/models/mcore/config_converter.py | 26 +++++-- verl/models/mcore/model_initializer.py | 26 +++++-- verl/models/mcore/registry.py | 2 +- 4 files changed, 113 insertions(+), 41 deletions(-) diff --git a/scripts/converter_hf_to_mcore.py b/scripts/converter_hf_to_mcore.py index c7c10b3fc85..cc4af55b512 100644 --- a/scripts/converter_hf_to_mcore.py +++ b/scripts/converter_hf_to_mcore.py @@ -24,11 +24,14 @@ from concurrent.futures import ThreadPoolExecutor from safetensors.torch import load_file from torch.distributed._tensor import Shard, Placement -from verl.utils.megatron_utils import get_model, convert_config +from verl.utils.megatron_utils import get_model from megatron.core.models.gpt.gpt_model import ModelType from megatron.core import parallel_state as mpu from megatron.core import dist_checkpointing from megatron.core.dist_checkpointing.serialization import StrictHandling +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed + +from verl.models.mcore import hf_to_mcore_config def _init_args(): @@ -58,6 +61,48 @@ def __init__(self): self.model = ModelConfig() +def convert_checkpoint_from_transformers_to_megatron(hf_model, model, hf_config): + num_attention_heads = hf_config.num_attention_heads + hidden_dim = hf_config.hidden_size + head_dim = hidden_dim // num_attention_heads + with torch.no_grad(): + model.embedding.word_embeddings.weight.copy_(hf_model.model.embed_tokens.weight) + for layer, hf_layer in zip(model.decoder.layers, hf_model.model.layers): + layer.self_attention.linear_qkv.layer_norm_weight.copy_(hf_layer.input_layernorm.weight) + + q = hf_layer.self_attn.q_proj.weight.view([num_attention_heads, -1, head_dim, hidden_dim]) + k = hf_layer.self_attn.k_proj.weight.view([num_attention_heads, -1, head_dim, hidden_dim]) + v = hf_layer.self_attn.v_proj.weight.view([num_attention_heads, -1, head_dim, hidden_dim]) + qkv = torch.cat([q, k, v], dim=1).view(-1, hidden_dim).contiguous() + + q_bias = hf_layer.self_attn.q_proj.bias.view([num_attention_heads, -1]) + k_bias = hf_layer.self_attn.k_proj.bias.view([num_attention_heads, -1]) + v_bias = hf_layer.self_attn.v_proj.bias.view([num_attention_heads, -1]) + qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=1).view(-1).contiguous() + + layer.self_attention.linear_qkv.weight.copy_(qkv) + layer.self_attention.linear_qkv.bias.copy_(qkv_bias) + + layer.self_attention.linear_proj.weight.copy_(hf_layer.self_attn.o_proj.weight) + layer.pre_mlp_layernorm.weight.copy_(hf_layer.post_attention_layernorm.weight) + + 
layer.mlp.router.weight.copy_(hf_layer.mlp.gate.weight) + + for idx, hf_expert in enumerate(hf_layer.mlp.experts): + fc1_weight = torch.cat([hf_expert.gate_proj.weight, hf_expert.up_proj.weight]) + layer.mlp.experts.linear_fc1._parameters[f'weight{idx}'].copy_(fc1_weight) + layer.mlp.experts.linear_fc2._parameters[f'weight{idx}'].copy_(hf_expert.down_proj.weight) + + layer.mlp.shared_experts.gate_weight.copy_(hf_layer.mlp.shared_expert_gate.weight) + shared_fc1_weight = torch.cat( + [hf_layer.mlp.shared_expert.gate_proj.weight, hf_layer.mlp.shared_expert.up_proj.weight]) + layer.mlp.shared_experts.linear_fc1.weight.copy_(shared_fc1_weight) + layer.mlp.shared_experts.linear_fc2.weight.copy_(hf_layer.mlp.shared_expert.down_proj.weight) + + model.decoder.final_layernorm.weight.copy_(hf_model.model.norm.weight) + model.output_layer.weight.copy_(hf_model.lm_head.weight) + + def convert_hf_to_mcore(hf_model_path, output_path, test=False): os.makedirs(output_path, exist_ok=True) if len(os.listdir(output_path)) > 0 and not test: @@ -74,46 +119,51 @@ def convert_hf_to_mcore(hf_model_path, output_path, test=False): virtual_pipeline_model_parallel_size=None, context_parallel_size=1, expert_model_parallel_size=1) + model_parallel_cuda_manual_seed(0) # init hf config hf_config = AutoConfig.from_pretrained(hf_model_path) print(hf_config) - megatron_config = MegatronConfig() + cfg = Config() cfg.model.path = hf_model_path - tfconfig = convert_config(hf_config, megatron_config) + tfconfig = hf_to_mcore_config(hf_config, torch.bfloat16) tie_word_embeddings = getattr(hf_config, "tie_word_embeddings", False) # init megatron model def megatron_model_provider(pre_process, post_process): - from verl.utils.model import get_parallel_gptmodel_from_config - parallel_model = get_parallel_gptmodel_from_config(tfconfig, - hf_config, - pre_process, - post_process, - share_embeddings_and_output_weights=tie_word_embeddings, - value=False) + from verl.models.mcore import init_mcore_model + parallel_model = init_mcore_model(tfconfig, + hf_config, + pre_process, + post_process, + share_embeddings_and_output_weights=tie_word_embeddings, + value=False) return parallel_model model = get_model(model_provider_func=megatron_model_provider, model_type=ModelType.encoder_or_decoder, - wrap_with_ddp=True) + wrap_with_ddp=False) with warnings.catch_warnings(): warnings.simplefilter("ignore") # init hf model - hf_model = AutoModelForCausalLM.from_pretrained(hf_model_path) + hf_model = AutoModelForCausalLM.from_pretrained(hf_model_path, torch_dtype=torch.bfloat16) ref_state_dict = hf_model.state_dict() # load hf state dict to megatron model - from verl.models.mcore.loader import load_state_dict_to_megatron_gptmodel - load_state_dict_to_megatron_gptmodel(state_dict=ref_state_dict, - wrapped_models=model, - config=hf_config, - params_dtype=torch.bfloat16, - is_value_model=False) - ssd = model[0].module.module.sharded_state_dict() + if "Qwen2MoeForCausalLM" in hf_config.architectures: + convert_checkpoint_from_transformers_to_megatron(hf_model, model[0].module, hf_config) + else: + from verl.models.mcore.loader import load_state_dict_to_megatron_gptmodel + load_state_dict_to_megatron_gptmodel(state_dict=ref_state_dict, + wrapped_models=model, + config=hf_config, + params_dtype=torch.bfloat16, + is_value_model=False) + + ssd = model[0].module.sharded_state_dict() del ref_state_dict, hf_model # save megatron model @@ -125,11 +175,11 @@ def megatron_model_provider(pre_process, post_process): model_test = 
get_model(model_provider_func=megatron_model_provider, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True) - ssd2 = model_test[0].module.module.sharded_state_dict() + ssd2 = model_test[0].module.sharded_state_dict() dist_checkpointing.load(ssd2, output_path, strict=StrictHandling.ASSUME_OK_UNEXPECTED) - sd = model[0].module.module.state_dict() - sd2 = model_test[0].module.module.state_dict() + sd = model[0].module.state_dict() + sd2 = model_test[0].module.state_dict() for k in sd.keys(): if sd[k] is None: continue @@ -162,11 +212,11 @@ def megatron_value_model_provider(pre_process, post_process): model_value = get_model(model_provider_func=megatron_value_model_provider, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True) - ssd2 = model_value[0].module.module.sharded_state_dict() + ssd2 = model_value[0].module.sharded_state_dict() dist_checkpointing.load(ssd2, output_path, strict=StrictHandling.IGNORE_ALL) - sd = model[0].module.module.state_dict() - sd2 = model_value[0].module.module.state_dict() + sd = model[0].module.state_dict() + sd2 = model_value[0].module.state_dict() for k in sd.keys(): if sd[k] is None: continue diff --git a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py index e95d4ca73ba..6e419d6b857 100644 --- a/verl/models/mcore/config_converter.py +++ b/verl/models/mcore/config_converter.py @@ -86,8 +86,10 @@ def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) variable_seq_lengths=True, masked_softmax_fusion=True, attention_backend=AttnBackend.flash, + # attention_backend=AttnBackend.fused, bf16=dtype is torch.bfloat16, layernorm_epsilon=hf_config.rms_norm_eps, + ffn_hidden_size=hf_config.intermediate_size, # parallel config tensor_model_parallel_size=mpu.get_tensor_model_parallel_world_size(), @@ -99,25 +101,33 @@ def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) sequence_parallel=mpu.get_tensor_model_parallel_world_size() > 1, # moe specific - ffn_hidden_size=hf_config.moe_intermediate_size, + moe_ffn_hidden_size=hf_config.moe_intermediate_size, moe_token_dispatcher_type="alltoall", moe_router_bias_update_rate=0.001, moe_router_topk=hf_config.num_experts_per_tok, num_moe_experts=hf_config.num_experts, moe_shared_expert_intermediate_size=hf_config.shared_expert_intermediate_size, - # moe_aux_loss_coeff=hf_config.router_aux_loss_coef, - moe_aux_loss_coeff=0.0, + moe_aux_loss_coeff=hf_config.router_aux_loss_coef, + # moe_aux_loss_coeff=0.0, moe_router_load_balancing_type="aux_loss", - moe_router_pre_softmax=False, #? 
moe_shared_expert_overlap=True, - # moe_permute_fusion=True, + # moe_permute_fusion=True, # need TE 2.1+ moe_grouped_gemm=True, + moe_router_score_function="softmax", + + # # mcore 0.12 moe + # moe_router_dtype="fp64", + # disable_bf16_reduced_precision_matmul=True, - # mcore 0.12 - moe_router_dtype="fp64", - disable_bf16_reduced_precision_matmul=True, + # other + # deallocate_pipeline_outputs=True, + # gradient_accumulation_fusion=True, + persist_layer_norm=True, + bias_activation_fusion=True, + bias_dropout_fusion=True, # qwen specific + moe_router_pre_softmax=True, add_qkv_bias=True) return transformer_config diff --git a/verl/models/mcore/model_initializer.py b/verl/models/mcore/model_initializer.py index 48ea980fd6b..823cf3af9d7 100644 --- a/verl/models/mcore/model_initializer.py +++ b/verl/models/mcore/model_initializer.py @@ -21,7 +21,8 @@ def init_mcore_model_dense(tfconfig, pre_process=None, post_process=None, share_embeddings_and_output_weights=False, - value=False): + value=False, + **extra_kwargs): # for LlamaForCausalLM, Qwen2ForCausalLM from megatron.core.models.gpt.gpt_model import GPTModel from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec @@ -53,11 +54,15 @@ def init_mcore_model_qwen2_moe(tfconfig, pre_process=None, post_process=None, share_embeddings_and_output_weights=False, - value=False): + value=False, + freeze_moe_router=True, + **extra_kwargs): from megatron.core.models.gpt.gpt_model import GPTModel from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec use_te = True + if freeze_moe_router: + tfconfig.moe_router_load_balancing_type = "none" def patch_layer_spec(transformer_layer_spec): # shared_experts.gate=True @@ -82,6 +87,10 @@ def patch_layer_spec(transformer_layer_spec): position_embedding_type='rope', rotary_base=hf_config.rope_theta, **rope_scaling_args) + if freeze_moe_router: + for layer in model.decoder.layers: + layer.mlp.router.weight.requires_grad = False + layer.mlp.shared_experts.gate_weight.requires_grad = False if post_process and value: from verl.models.llama.megatron.layers.parallel_linear import LinearForLastLayer model.output_layer = LinearForLastLayer(input_size=tfconfig.hidden_size, output_size=1, config=tfconfig) @@ -93,9 +102,10 @@ def init_mcore_model_llama4(tfconfig, pre_process=None, post_process=None, share_embeddings_and_output_weights=False, - value=False): + value=False, + **extra_kwargs): return init_mcore_model_dense(tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, - value) + value, **extra_kwargs) def init_mcore_model_dpskv3(tfconfig, @@ -103,9 +113,10 @@ def init_mcore_model_dpskv3(tfconfig, pre_process=None, post_process=None, share_embeddings_and_output_weights=False, - value=False): + value=False, + **extra_kwargs): return init_mcore_model_dense(tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, - value) + value, **extra_kwargs) def init_mcore_model_qwen2_5_vl(tfconfig, @@ -113,6 +124,7 @@ def init_mcore_model_qwen2_5_vl(tfconfig, pre_process=None, post_process=None, share_embeddings_and_output_weights=False, - value=False): + value=False, + **extra_kwargs): # Qwen2_5_VLForConditionalGeneration raise NotImplementedError("VLM is not supported yet") diff --git a/verl/models/mcore/registry.py b/verl/models/mcore/registry.py index ab02c69e5be..6d9c3bbe37a 100644 --- a/verl/models/mcore/registry.py +++ b/verl/models/mcore/registry.py @@ -46,7 +46,7 @@ def init_mcore_model( post_process=None, 
share_embeddings_and_output_weights=False, value=False, - **extra_kwargs # may be used for vlm + **extra_kwargs # may be used for vlm and moe ) -> nn.Module: MODEL_INITIALIZER_REGISTRY = { "LlamaForCausalLM": init_mcore_model_dense, From 6113b1095ce28d1526ed78841540a5ba7514de32 Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Thu, 17 Apr 2025 05:55:07 -0700 Subject: [PATCH 07/19] add scripts to run qwen1.5moe_a2.7b --- .../run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh diff --git a/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh b/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh new file mode 100644 index 00000000000..dad28cd6dd3 --- /dev/null +++ b/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh @@ -0,0 +1,70 @@ +set -x +# 0. download the model +huggingface-cli download Qwen/Qwen1.5-MoE-A2.7B-Chat + +# 1. convert the model to mcore format +# change the HF_MODEL_PATH and DIST_CKPT_PATH to your own path +HF_MODEL_PATH=/data/models/Qwen/Qwen1.5-MoE-A2.7B-Chat +DIST_CKPT_PATH=/data/mcore_ckpt/Qwen1.5-MoE-A2.7B-Chat +python scripts/converter_hf_to_mcore.py --hf_model_path $HF_MODEL_PATH --output_path $DIST_CKPT_PATH + +# 2. run the script +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +train_files=$gsm8k_train_path +test_files=$gsm8k_test_path + +NODES=4 +PP=2 +TP=4 +CP=1 +VLLM_TP=4 + +RAY_ADDRESS='auto' ray job submit --working-dir . -- python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=512 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=$HF_MODEL_PATH \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \ + critic.optim.lr=1e-5 \ + critic.model.path=$HF_MODEL_PATH \ + critic.model.enable_gradient_checkpointing=False \ + critic.ppo_micro_batch_size_per_gpu=4 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger=['console','wandb'] \ + trainer.project_name='verl_megatron_gsm8k_examples' \ + trainer.experiment_name='qwen1.5_moe_nochat' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=$NODES \ + trainer.save_freq=-1 \ + trainer.test_freq=5 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=$VLLM_TP \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP \ + critic.megatron.pipeline_model_parallel_size=$PP \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP \ + critic.megatron.tensor_model_parallel_size=$TP \ + actor_rollout_ref.actor.megatron.context_parallel_size=$CP \ + actor_rollout_ref.ref.megatron.context_parallel_size=$CP \ + critic.megatron.context_parallel_size=$CP \ + actor_rollout_ref.actor.megatron.use_dist_checkpointing=True \ + actor_rollout_ref.ref.megatron.use_dist_checkpointing=True \ + 
critic.megatron.use_dist_checkpointing=True \ + actor_rollout_ref.actor.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ + actor_rollout_ref.ref.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ + critic.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ + trainer.total_epochs=100 $@ + \ No newline at end of file From 5f8d8a0f5aa18a7cb89b8f9da965ff35c86737c9 Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Thu, 17 Apr 2025 23:58:30 -0700 Subject: [PATCH 08/19] format --- verl/workers/sharding_manager/megatron_vllm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/verl/workers/sharding_manager/megatron_vllm.py b/verl/workers/sharding_manager/megatron_vllm.py index b7e8ae88c13..b09f4a8efa4 100644 --- a/verl/workers/sharding_manager/megatron_vllm.py +++ b/verl/workers/sharding_manager/megatron_vllm.py @@ -271,6 +271,7 @@ def pp_models(self): class MegatronVLLMShardingManager(BaseShardingManager): + def __init__(self, actor_module: nn.ModuleList, inference_engine: LLM, From d2376eca9eb12cc5aa301bf9304934a236cb4221 Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Fri, 18 Apr 2025 07:27:54 -0700 Subject: [PATCH 09/19] update scripts --- examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh b/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh index dad28cd6dd3..0d84d28046e 100644 --- a/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh +++ b/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh @@ -20,7 +20,8 @@ TP=4 CP=1 VLLM_TP=4 -RAY_ADDRESS='auto' ray job submit --working-dir . -- python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\ +# RAY_ADDRESS='auto' ray job submit --working-dir . -- +python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\ algorithm.adv_estimator=gae \ data.train_files="$train_files" \ data.val_files="$test_files" \ From 57d9671e0589f3f2c73b879b25d7aea4e6aa830c Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Fri, 18 Apr 2025 08:39:02 -0700 Subject: [PATCH 10/19] fix for pre-commit --- scripts/converter_hf_to_mcore.py | 123 ++++++------- verl/models/mcore/__init__.py | 4 +- verl/models/mcore/config_converter.py | 41 +++-- verl/models/mcore/model_forward.py | 108 +++++------ verl/models/mcore/model_initializer.py | 167 ++++++++++-------- verl/models/mcore/registry.py | 79 ++++++--- verl/models/mcore/saver.py | 31 ++-- verl/models/mcore/weight_converter.py | 55 +++--- verl/models/weight_loader_registry.py | 5 +- verl/workers/critic/megatron_critic.py | 13 +- verl/workers/megatron_workers.py | 20 +-- .../workers/sharding_manager/megatron_vllm.py | 83 +++++---- 12 files changed, 362 insertions(+), 367 deletions(-) diff --git a/scripts/converter_hf_to_mcore.py b/scripts/converter_hf_to_mcore.py index 1aa3d0f9c13..aa4256b67a4 100644 --- a/scripts/converter_hf_to_mcore.py +++ b/scripts/converter_hf_to_mcore.py @@ -13,50 +13,42 @@ # See the License for the specific language governing permissions and # limitations under the License. 
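For reference, the converter being reformatted below runs as a standalone script; the example script added earlier in this series drives it as

    python scripts/converter_hf_to_mcore.py --hf_model_path $HF_MODEL_PATH --output_path $DIST_CKPT_PATH

with an optional --test flag that reloads the saved dist-checkpoint and compares it against the in-memory model.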
-from typing import List, Tuple, Dict -import re -import os -import torch import argparse +import os import warnings -import numpy as np -from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForTokenClassification, AutoModelForVision2Seq -from concurrent.futures import ThreadPoolExecutor -from safetensors.torch import load_file -from torch.distributed._tensor import Shard, Placement -from verl.utils.megatron_utils import get_model -from megatron.core.models.gpt.gpt_model import ModelType -from megatron.core import parallel_state as mpu + +import torch from megatron.core import dist_checkpointing +from megatron.core import parallel_state as mpu from megatron.core.dist_checkpointing.serialization import StrictHandling +from megatron.core.models.gpt.gpt_model import ModelType from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from transformers import AutoConfig, AutoModelForCausalLM from verl.models.mcore import hf_to_mcore_config +from verl.utils.megatron_utils import get_model def _init_args(): parser = argparse.ArgumentParser() - parser.add_argument('--hf_model_path', type=str, required=True, help="The path for the huggingface model") - parser.add_argument('--output_path', type=str, required=True, help="The path for the output mcore model") - parser.add_argument('--test', action='store_true', help="Whether to test the conversion") + parser.add_argument("--hf_model_path", type=str, required=True, help="The path for the huggingface model") + parser.add_argument("--output_path", type=str, required=True, help="The path for the output mcore model") + parser.add_argument("--test", action="store_true", help="Whether to test the conversion") args = parser.parse_args() return args class MegatronConfig: - def __init__(self): self.params_dtype = torch.bfloat16 class ModelConfig: - def __init__(self): self.path = None class Config: - def __init__(self): self.model = ModelConfig() @@ -90,12 +82,13 @@ def convert_checkpoint_from_transformers_to_megatron(hf_model, model, hf_config) for idx, hf_expert in enumerate(hf_layer.mlp.experts): fc1_weight = torch.cat([hf_expert.gate_proj.weight, hf_expert.up_proj.weight]) - layer.mlp.experts.linear_fc1._parameters[f'weight{idx}'].copy_(fc1_weight) - layer.mlp.experts.linear_fc2._parameters[f'weight{idx}'].copy_(hf_expert.down_proj.weight) + layer.mlp.experts.linear_fc1._parameters[f"weight{idx}"].copy_(fc1_weight) + layer.mlp.experts.linear_fc2._parameters[f"weight{idx}"].copy_(hf_expert.down_proj.weight) layer.mlp.shared_experts.gate_weight.copy_(hf_layer.mlp.shared_expert_gate.weight) shared_fc1_weight = torch.cat( - [hf_layer.mlp.shared_expert.gate_proj.weight, hf_layer.mlp.shared_expert.up_proj.weight]) + [hf_layer.mlp.shared_expert.gate_proj.weight, hf_layer.mlp.shared_expert.up_proj.weight] + ) layer.mlp.shared_experts.linear_fc1.weight.copy_(shared_fc1_weight) layer.mlp.shared_experts.linear_fc2.weight.copy_(hf_layer.mlp.shared_expert.down_proj.weight) @@ -110,15 +103,17 @@ def convert_hf_to_mcore(hf_model_path, output_path, test=False): return # init torch distributed and mpu - os.environ['RANK'] = '0' - os.environ['WORLD_SIZE'] = '1' - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '12355' - torch.distributed.init_process_group('nccl') - mpu.initialize_model_parallel(tensor_model_parallel_size=1, - virtual_pipeline_model_parallel_size=None, - context_parallel_size=1, - expert_model_parallel_size=1) + os.environ["RANK"] = "0" + os.environ["WORLD_SIZE"] = "1" + os.environ["MASTER_ADDR"] = 
"localhost" + os.environ["MASTER_PORT"] = "12355" + torch.distributed.init_process_group("nccl") + mpu.initialize_model_parallel( + tensor_model_parallel_size=1, + virtual_pipeline_model_parallel_size=None, + context_parallel_size=1, + expert_model_parallel_size=1, + ) model_parallel_cuda_manual_seed(0) # init hf config @@ -133,17 +128,20 @@ def convert_hf_to_mcore(hf_model_path, output_path, test=False): # init megatron model def megatron_model_provider(pre_process, post_process): from verl.models.mcore import init_mcore_model - parallel_model = init_mcore_model(tfconfig, - hf_config, - pre_process, - post_process, - share_embeddings_and_output_weights=tie_word_embeddings, - value=False) + + parallel_model = init_mcore_model( + tfconfig, + hf_config, + pre_process, + post_process, + share_embeddings_and_output_weights=tie_word_embeddings, + value=False, + ) return parallel_model - model = get_model(model_provider_func=megatron_model_provider, - model_type=ModelType.encoder_or_decoder, - wrap_with_ddp=False) + model = get_model( + model_provider_func=megatron_model_provider, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=False + ) with warnings.catch_warnings(): warnings.simplefilter("ignore") @@ -157,11 +155,14 @@ def megatron_model_provider(pre_process, post_process): convert_checkpoint_from_transformers_to_megatron(hf_model, model[0].module, hf_config) else: from verl.models.mcore.loader import load_state_dict_to_megatron_gptmodel - load_state_dict_to_megatron_gptmodel(state_dict=ref_state_dict, - wrapped_models=model, - config=hf_config, - params_dtype=torch.bfloat16, - is_value_model=False) + + load_state_dict_to_megatron_gptmodel( + state_dict=ref_state_dict, + wrapped_models=model, + config=hf_config, + params_dtype=torch.bfloat16, + is_value_model=False, + ) ssd = model[0].module.sharded_state_dict() del ref_state_dict, hf_model @@ -172,9 +173,9 @@ def megatron_model_provider(pre_process, post_process): if test: ########### test ########### # load model - model_test = get_model(model_provider_func=megatron_model_provider, - model_type=ModelType.encoder_or_decoder, - wrap_with_ddp=True) + model_test = get_model( + model_provider_func=megatron_model_provider, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True + ) ssd2 = model_test[0].module.sharded_state_dict() dist_checkpointing.load(ssd2, output_path, strict=StrictHandling.ASSUME_OK_UNEXPECTED) @@ -186,7 +187,7 @@ def megatron_model_provider(pre_process, post_process): d1 = sd[k].data if k in sd2: d2 = sd2[k].data - assert d1.shape == d2.shape, f'{k=} {d1.shape=} {d2.shape=}' + assert d1.shape == d2.shape, f"{k=} {d1.shape=} {d2.shape=}" assert (d1 == d2).all(), f"{k} is not equal" for k in sd2.keys(): if sd2[k] is None: @@ -194,24 +195,24 @@ def megatron_model_provider(pre_process, post_process): d1 = sd2[k].data if k in sd: d2 = sd[k].data - assert d1.shape == d2.shape, f'{k=} {d1.shape=} {d2.shape=}' + assert d1.shape == d2.shape, f"{k=} {d1.shape=} {d2.shape=}" assert (d1 == d2).all(), f"{k} is not equal" # load value model def megatron_value_model_provider(pre_process, post_process): from verl.utils.model import get_parallel_gptmodel_from_config - parallel_model = get_parallel_gptmodel_from_config(tfconfig, - hf_config, - pre_process, - post_process, - share_embeddings_and_output_weights=False, - value=True) + + parallel_model = get_parallel_gptmodel_from_config( + tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights=False, value=True + ) parallel_model.cuda() return 
parallel_model - model_value = get_model(model_provider_func=megatron_value_model_provider, - model_type=ModelType.encoder_or_decoder, - wrap_with_ddp=True) + model_value = get_model( + model_provider_func=megatron_value_model_provider, + model_type=ModelType.encoder_or_decoder, + wrap_with_ddp=True, + ) ssd2 = model_value[0].module.sharded_state_dict() dist_checkpointing.load(ssd2, output_path, strict=StrictHandling.IGNORE_ALL) @@ -223,7 +224,7 @@ def megatron_value_model_provider(pre_process, post_process): d1 = sd[k].data if k in sd2: d2 = sd2[k].data - assert d1.shape == d2.shape, f'{k=} {d1.shape=} {d2.shape=}' + assert d1.shape == d2.shape, f"{k=} {d1.shape=} {d2.shape=}" assert (d1 == d2).all(), f"{k} is not equal" for k in sd2.keys(): if sd2[k] is None: @@ -231,10 +232,10 @@ def megatron_value_model_provider(pre_process, post_process): d1 = sd2[k].data if k in sd: d2 = sd[k].data - assert d1.shape == d2.shape, f'{k=} {d1.shape=} {d2.shape=}' + assert d1.shape == d2.shape, f"{k=} {d1.shape=} {d2.shape=}" assert (d1 == d2).all(), f"{k} is not equal" if __name__ == "__main__": args = _init_args() - convert_hf_to_mcore(args.hf_model_path, args.output_path, args.test) \ No newline at end of file + convert_hf_to_mcore(args.hf_model_path, args.output_path, args.test) diff --git a/verl/models/mcore/__init__.py b/verl/models/mcore/__init__.py index 8782330500d..6ee338cd15c 100644 --- a/verl/models/mcore/__init__.py +++ b/verl/models/mcore/__init__.py @@ -13,6 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .registry import init_mcore_model, hf_to_mcore_config, get_mcore_forward_fn, get_mcore_weight_converter +from .registry import get_mcore_forward_fn, get_mcore_weight_converter, hf_to_mcore_config, init_mcore_model -__all__ = ['init_mcore_model', 'hf_to_mcore_config', 'get_mcore_forward_fn', 'get_mcore_weight_converter'] +__all__ = ["init_mcore_model", "hf_to_mcore_config", "get_mcore_forward_fn", "get_mcore_weight_converter"] diff --git a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py index 6e419d6b857..c14228d4261 100644 --- a/verl/models/mcore/config_converter.py +++ b/verl/models/mcore/config_converter.py @@ -15,22 +15,25 @@ # convert huggingface config to mcore transformer config -from transformers import PretrainedConfig -from megatron.core.transformer import TransformerConfig import torch import torch.nn.functional as F +from megatron.core.transformer import TransformerConfig from megatron.core.transformer.enums import AttnBackend +from transformers import PretrainedConfig def hf_to_mcore_config_dense(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: # for LlamaForCausalLM or Qwen2ForCausalLM from megatron.core import parallel_state as mpu + + qkv_bias = getattr(hf_config, "attention_bias", False) if "Qwen2ForCausalLM" in hf_config.architectures: qkv_bias = True - else: - qkv_bias = getattr(hf_config, 'attention_bias', False) - overlap_p2p_comm = mpu.get_virtual_pipeline_model_parallel_world_size( - ) is not None and mpu.get_virtual_pipeline_model_parallel_world_size() > 1 + + overlap_p2p_comm = ( + mpu.get_virtual_pipeline_model_parallel_world_size() is not None + and mpu.get_virtual_pipeline_model_parallel_world_size() > 1 + ) batch_p2p_comm = False transformer_config = TransformerConfig( num_layers=hf_config.num_hidden_layers, @@ -39,7 +42,7 @@ def hf_to_mcore_config_dense(hf_config: PretrainedConfig, dtype: torch.dtype) -> 
num_query_groups=hf_config.num_key_value_heads, ffn_hidden_size=hf_config.intermediate_size, activation_func=F.silu, - normalization='RMSNorm', + normalization="RMSNorm", gated_linear_unit=True, use_cpu_initialization=True, add_bias_linear=False, @@ -56,18 +59,22 @@ def hf_to_mcore_config_dense(hf_config: PretrainedConfig, dtype: torch.dtype) -> masked_softmax_fusion=True, moe_token_dispatcher_type="alltoall", attention_dropout=hf_config.attention_dropout, - hidden_dropout=getattr(hf_config, 'hidden_dropout', 0.0), + hidden_dropout=getattr(hf_config, "hidden_dropout", 0.0), add_qkv_bias=qkv_bias, attention_backend=AttnBackend.flash, - bf16=dtype is torch.bfloat16) + bf16=dtype is torch.bfloat16, + ) return transformer_config def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: from megatron.core import parallel_state as mpu - overlap_p2p_comm = mpu.get_virtual_pipeline_model_parallel_world_size( - ) is not None and mpu.get_virtual_pipeline_model_parallel_world_size() > 1 + + overlap_p2p_comm = ( + mpu.get_virtual_pipeline_model_parallel_world_size() is not None + and mpu.get_virtual_pipeline_model_parallel_world_size() > 1 + ) batch_p2p_comm = False transformer_config = TransformerConfig( num_layers=hf_config.num_hidden_layers, @@ -75,9 +82,9 @@ def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) num_attention_heads=hf_config.num_attention_heads, num_query_groups=hf_config.num_key_value_heads, attention_dropout=hf_config.attention_dropout, - hidden_dropout=getattr(hf_config, 'hidden_dropout', 0.0), + hidden_dropout=getattr(hf_config, "hidden_dropout", 0.0), activation_func=F.silu, - normalization='RMSNorm', + normalization="RMSNorm", gated_linear_unit=True, use_cpu_initialization=False, add_bias_linear=False, @@ -90,7 +97,6 @@ def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) bf16=dtype is torch.bfloat16, layernorm_epsilon=hf_config.rms_norm_eps, ffn_hidden_size=hf_config.intermediate_size, - # parallel config tensor_model_parallel_size=mpu.get_tensor_model_parallel_world_size(), pipeline_model_parallel_size=mpu.get_pipeline_model_parallel_world_size(), @@ -99,7 +105,6 @@ def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) overlap_p2p_comm=overlap_p2p_comm, batch_p2p_comm=batch_p2p_comm, sequence_parallel=mpu.get_tensor_model_parallel_world_size() > 1, - # moe specific moe_ffn_hidden_size=hf_config.moe_intermediate_size, moe_token_dispatcher_type="alltoall", @@ -114,21 +119,19 @@ def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) # moe_permute_fusion=True, # need TE 2.1+ moe_grouped_gemm=True, moe_router_score_function="softmax", - # # mcore 0.12 moe # moe_router_dtype="fp64", # disable_bf16_reduced_precision_matmul=True, - # other # deallocate_pipeline_outputs=True, # gradient_accumulation_fusion=True, persist_layer_norm=True, bias_activation_fusion=True, bias_dropout_fusion=True, - # qwen specific moe_router_pre_softmax=True, - add_qkv_bias=True) + add_qkv_bias=True, + ) return transformer_config diff --git a/verl/models/mcore/model_forward.py b/verl/models/mcore/model_forward.py index 42fd702efee..a615fefbfc1 100644 --- a/verl/models/mcore/model_forward.py +++ b/verl/models/mcore/model_forward.py @@ -13,95 +13,69 @@ # See the License for the specific language governing permissions and # limitations under the License. 
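+# Architecture-specific forward helpers for the mcore GPTModel: each gptmodel_forward_* variant
+# supports a packed-sequence (remove-padding) path and a left-padding-removal path, and squeezes
+# the scalar value head from the output when value_model=True.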
-from verl.utils.megatron import sequence_parallel as sp_utils -from verl.utils.megatron import tensor_parallel as tp_utils -import torch -from megatron.core.packed_seq_params import PackedSeqParams -from megatron.core import parallel_state as mpu from verl.utils.megatron_utils import unwrap_model -from .util import preprocess_packed_seqs, postprocess_packed_seqs, remove_left_padding, recover_left_padding +from .util import postprocess_packed_seqs, preprocess_packed_seqs, recover_left_padding, remove_left_padding -def gptmodel_forward_dense(model, - input_ids, - attention_mask, - position_ids, - sequence_parallel, - value_model=False, - pack_seqs=True): + +def gptmodel_forward_dense( + model, input_ids, attention_mask, position_ids, sequence_parallel, value_model=False, pack_seqs=True +): pre_process = unwrap_model(model).pre_process post_process = unwrap_model(model).post_process if pack_seqs: batch_size, seq_len = attention_mask.shape[:2] input_ids_rmpad, packed_seq_params = preprocess_packed_seqs(input_ids, attention_mask, pre_process=pre_process) input_ids_rmpad = input_ids_rmpad.contiguous() - output_orig = model(input_ids=input_ids_rmpad, - attention_mask=None, - position_ids=position_ids, - packed_seq_params=packed_seq_params) + output_orig = model( + input_ids=input_ids_rmpad, + attention_mask=None, + position_ids=position_ids, + packed_seq_params=packed_seq_params, + ) - output = postprocess_packed_seqs(output_orig, - packed_seq_params, - attention_mask, - batch_size, - seq_len, - post_process=post_process) + output = postprocess_packed_seqs( + output_orig, packed_seq_params, attention_mask, batch_size, seq_len, post_process=post_process + ) else: batch_size, sequence_length = attention_mask.shape - new_input_ids, new_attention_mask, new_position_ids = remove_left_padding(input_ids, - attention_mask, - position_ids, - sequence_parallel, - pre_process=pre_process) + new_input_ids, new_attention_mask, new_position_ids = remove_left_padding( + input_ids, attention_mask, position_ids, sequence_parallel, pre_process=pre_process + ) output = model(input_ids=new_input_ids, attention_mask=new_attention_mask, position_ids=new_position_ids) - output = recover_left_padding(output, - new_attention_mask, - attention_mask, - sequence_length, - post_process=post_process) + output = recover_left_padding( + output, new_attention_mask, attention_mask, sequence_length, post_process=post_process + ) if value_model and post_process: output = output[..., 0] return output -def gptmodel_forward_qwen2_moe(model, - input_ids, - attention_mask, - position_ids, - sequence_parallel, - value_model=False, - pack_seqs=True): - return gptmodel_forward_dense(model, input_ids, attention_mask, position_ids, sequence_parallel, value_model, - pack_seqs) +def gptmodel_forward_qwen2_moe( + model, input_ids, attention_mask, position_ids, sequence_parallel, value_model=False, pack_seqs=True +): + return gptmodel_forward_dense( + model, input_ids, attention_mask, position_ids, sequence_parallel, value_model, pack_seqs + ) -def gptmodel_forward_llama4(model, - input_ids, - attention_mask, - position_ids, - sequence_parallel, - value_model=False, - pack_seqs=True): - return gptmodel_forward_dense(model, input_ids, attention_mask, position_ids, sequence_parallel, value_model, - pack_seqs) +def gptmodel_forward_llama4( + model, input_ids, attention_mask, position_ids, sequence_parallel, value_model=False, pack_seqs=True +): + return gptmodel_forward_dense( + model, input_ids, attention_mask, position_ids, sequence_parallel, 
value_model, pack_seqs + ) -def gptmodel_forward_dpskv3(model, - input_ids, - attention_mask, - position_ids, - sequence_parallel, - value_model=False, - pack_seqs=True): - return gptmodel_forward_dense(model, input_ids, attention_mask, position_ids, sequence_parallel, value_model, - pack_seqs) +def gptmodel_forward_dpskv3( + model, input_ids, attention_mask, position_ids, sequence_parallel, value_model=False, pack_seqs=True +): + return gptmodel_forward_dense( + model, input_ids, attention_mask, position_ids, sequence_parallel, value_model, pack_seqs + ) -def gptmodel_forward_qwen2_5_vl(model, - input_ids, - attention_mask, - position_ids, - sequence_parallel, - value_model=False, - pack_seqs=True): +def gptmodel_forward_qwen2_5_vl( + model, input_ids, attention_mask, position_ids, sequence_parallel, value_model=False, pack_seqs=True +): raise NotImplementedError("VLM is not supported yet") diff --git a/verl/models/mcore/model_initializer.py b/verl/models/mcore/model_initializer.py index 823cf3af9d7..0be8c9eb7ac 100644 --- a/verl/models/mcore/model_initializer.py +++ b/verl/models/mcore/model_initializer.py @@ -16,50 +16,58 @@ # use mcore transformer config to initialize the model -def init_mcore_model_dense(tfconfig, - hf_config, - pre_process=None, - post_process=None, - share_embeddings_and_output_weights=False, - value=False, - **extra_kwargs): +def init_mcore_model_dense( + tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False, + **extra_kwargs, +): # for LlamaForCausalLM, Qwen2ForCausalLM - from megatron.core.models.gpt.gpt_model import GPTModel from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec + from megatron.core.models.gpt.gpt_model import GPTModel + use_te = True - assert tfconfig.normalization == "RMSNorm", 'only RMSNorm is supported for now' + assert tfconfig.normalization == "RMSNorm", "only RMSNorm is supported for now" transformer_layer_spec = get_gpt_decoder_block_spec(tfconfig, use_transformer_engine=use_te) rope_scaling_args = {} if hf_config.rope_scaling is not None: - assert hf_config.rope_scaling['type'] == 'linear', "only linear scaling is supported for now" - rope_scaling_args['seq_len_interpolation_factor'] = hf_config.rope_scaling['factor'] - model = GPTModel(config=tfconfig, - transformer_layer_spec=transformer_layer_spec, - vocab_size=hf_config.vocab_size, - max_sequence_length=hf_config.max_position_embeddings, - pre_process=pre_process, - post_process=post_process, - share_embeddings_and_output_weights=share_embeddings_and_output_weights, - position_embedding_type='rope', - rotary_base=hf_config.rope_theta, - **rope_scaling_args) + assert hf_config.rope_scaling["type"] == "linear", "only linear scaling is supported for now" + rope_scaling_args["seq_len_interpolation_factor"] = hf_config.rope_scaling["factor"] + model = GPTModel( + config=tfconfig, + transformer_layer_spec=transformer_layer_spec, + vocab_size=hf_config.vocab_size, + max_sequence_length=hf_config.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + share_embeddings_and_output_weights=share_embeddings_and_output_weights, + position_embedding_type="rope", + rotary_base=hf_config.rope_theta, + **rope_scaling_args, + ) if post_process and value: from verl.models.llama.megatron.layers.parallel_linear import LinearForLastLayer + model.output_layer = LinearForLastLayer(input_size=tfconfig.hidden_size, output_size=1, config=tfconfig) return model -def 
init_mcore_model_qwen2_moe(tfconfig, - hf_config, - pre_process=None, - post_process=None, - share_embeddings_and_output_weights=False, - value=False, - freeze_moe_router=True, - **extra_kwargs): - - from megatron.core.models.gpt.gpt_model import GPTModel +def init_mcore_model_qwen2_moe( + tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False, + freeze_moe_router=True, + **extra_kwargs, +): from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec + from megatron.core.models.gpt.gpt_model import GPTModel + use_te = True if freeze_moe_router: tfconfig.moe_router_load_balancing_type = "none" @@ -67,64 +75,75 @@ def init_mcore_model_qwen2_moe(tfconfig, def patch_layer_spec(transformer_layer_spec): # shared_experts.gate=True for i in range(len(transformer_layer_spec.layer_specs)): - transformer_layer_spec.layer_specs[i].submodules.mlp.submodules.shared_experts.params['gate'] = True + transformer_layer_spec.layer_specs[i].submodules.mlp.submodules.shared_experts.params["gate"] = True return transformer_layer_spec - assert tfconfig.normalization == "RMSNorm", 'only RMSNorm is supported for now' + assert tfconfig.normalization == "RMSNorm", "only RMSNorm is supported for now" transformer_layer_spec = get_gpt_decoder_block_spec(tfconfig, use_transformer_engine=use_te) transformer_layer_spec = patch_layer_spec(transformer_layer_spec) rope_scaling_args = {} if hf_config.rope_scaling is not None: - assert hf_config.rope_scaling['type'] == 'linear', "only linear scaling is supported for now" - rope_scaling_args['seq_len_interpolation_factor'] = hf_config.rope_scaling['factor'] - model = GPTModel(config=tfconfig, - transformer_layer_spec=transformer_layer_spec, - vocab_size=hf_config.vocab_size, - max_sequence_length=hf_config.max_position_embeddings, - pre_process=pre_process, - post_process=post_process, - share_embeddings_and_output_weights=share_embeddings_and_output_weights, - position_embedding_type='rope', - rotary_base=hf_config.rope_theta, - **rope_scaling_args) + assert hf_config.rope_scaling["type"] == "linear", "only linear scaling is supported for now" + rope_scaling_args["seq_len_interpolation_factor"] = hf_config.rope_scaling["factor"] + model = GPTModel( + config=tfconfig, + transformer_layer_spec=transformer_layer_spec, + vocab_size=hf_config.vocab_size, + max_sequence_length=hf_config.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + share_embeddings_and_output_weights=share_embeddings_and_output_weights, + position_embedding_type="rope", + rotary_base=hf_config.rope_theta, + **rope_scaling_args, + ) if freeze_moe_router: for layer in model.decoder.layers: layer.mlp.router.weight.requires_grad = False layer.mlp.shared_experts.gate_weight.requires_grad = False if post_process and value: from verl.models.llama.megatron.layers.parallel_linear import LinearForLastLayer + model.output_layer = LinearForLastLayer(input_size=tfconfig.hidden_size, output_size=1, config=tfconfig) return model -def init_mcore_model_llama4(tfconfig, - hf_config, - pre_process=None, - post_process=None, - share_embeddings_and_output_weights=False, - value=False, - **extra_kwargs): - return init_mcore_model_dense(tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, - value, **extra_kwargs) - - -def init_mcore_model_dpskv3(tfconfig, - hf_config, - pre_process=None, - post_process=None, - share_embeddings_and_output_weights=False, - value=False, - **extra_kwargs): 
- return init_mcore_model_dense(tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, - value, **extra_kwargs) - - -def init_mcore_model_qwen2_5_vl(tfconfig, - hf_config, - pre_process=None, - post_process=None, - share_embeddings_and_output_weights=False, - value=False, - **extra_kwargs): +def init_mcore_model_llama4( + tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False, + **extra_kwargs, +): + return init_mcore_model_dense( + tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, value, **extra_kwargs + ) + + +def init_mcore_model_dpskv3( + tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False, + **extra_kwargs, +): + return init_mcore_model_dense( + tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, value, **extra_kwargs + ) + + +def init_mcore_model_qwen2_5_vl( + tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False, + **extra_kwargs, +): # Qwen2_5_VLForConditionalGeneration raise NotImplementedError("VLM is not supported yet") diff --git a/verl/models/mcore/registry.py b/verl/models/mcore/registry.py index 6d9c3bbe37a..19d8433db08 100644 --- a/verl/models/mcore/registry.py +++ b/verl/models/mcore/registry.py @@ -13,11 +13,34 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .config_converter import hf_to_mcore_config_dense, hf_to_mcore_config_qwen2moe, hf_to_mcore_config_dpskv3, hf_to_mcore_config_qwen2_5_vl, hf_to_mcore_config_llama4 -from .config_converter import PretrainedConfig, TransformerConfig import torch import torch.nn as nn +from .config_converter import ( + PretrainedConfig, + TransformerConfig, + hf_to_mcore_config_dense, + hf_to_mcore_config_dpskv3, + hf_to_mcore_config_llama4, + hf_to_mcore_config_qwen2_5_vl, + hf_to_mcore_config_qwen2moe, +) +from .model_forward import ( + gptmodel_forward_dense, + gptmodel_forward_dpskv3, + gptmodel_forward_llama4, + gptmodel_forward_qwen2_5_vl, + gptmodel_forward_qwen2_moe, +) +from .model_initializer import ( + init_mcore_model_dense, + init_mcore_model_dpskv3, + init_mcore_model_llama4, + init_mcore_model_qwen2_5_vl, + init_mcore_model_qwen2_moe, +) +from .weight_converter import McoreToHFWeightConverterDense, McoreToHFWeightConverterQwen2Moe + def hf_to_mcore_config(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: MODEL_CONFIG_CONVERTER_REGISTRY = { @@ -31,22 +54,21 @@ def hf_to_mcore_config(hf_config: PretrainedConfig, dtype: torch.dtype) -> Trans assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" arch = hf_config.architectures[0] if arch not in MODEL_CONFIG_CONVERTER_REGISTRY: - raise ValueError(f"Model architectures {arch} converter are not supported for now. " - f"Supported architectures: {MODEL_CONFIG_CONVERTER_REGISTRY.keys()}") + raise ValueError( + f"Model architectures {arch} converter are not supported for now. 
" + f"Supported architectures: {MODEL_CONFIG_CONVERTER_REGISTRY.keys()}" + ) return MODEL_CONFIG_CONVERTER_REGISTRY[arch](hf_config, dtype) -from .model_initializer import init_mcore_model_dense, init_mcore_model_qwen2_moe, init_mcore_model_dpskv3, init_mcore_model_qwen2_5_vl, init_mcore_model_llama4 - - def init_mcore_model( - tfconfig, - hf_config, - pre_process=None, - post_process=None, - share_embeddings_and_output_weights=False, - value=False, - **extra_kwargs # may be used for vlm and moe + tfconfig, + hf_config, + pre_process=None, + post_process=None, + share_embeddings_and_output_weights=False, + value=False, + **extra_kwargs, # may be used for vlm and moe ) -> nn.Module: MODEL_INITIALIZER_REGISTRY = { "LlamaForCausalLM": init_mcore_model_dense, @@ -59,13 +81,13 @@ def init_mcore_model( assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" arch = hf_config.architectures[0] if arch not in MODEL_INITIALIZER_REGISTRY: - raise ValueError(f"Model architectures {arch} initializer are not supported for now. " - f"Supported architectures: {MODEL_INITIALIZER_REGISTRY.keys()}") - return MODEL_INITIALIZER_REGISTRY[arch](tfconfig, hf_config, pre_process, post_process, - share_embeddings_and_output_weights, value, **extra_kwargs) - - -from .model_forward import gptmodel_forward_dense, gptmodel_forward_qwen2_moe, gptmodel_forward_llama4, gptmodel_forward_dpskv3, gptmodel_forward_qwen2_5_vl + raise ValueError( + f"Model architectures {arch} initializer are not supported for now. " + f"Supported architectures: {MODEL_INITIALIZER_REGISTRY.keys()}" + ) + return MODEL_INITIALIZER_REGISTRY[arch]( + tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, value, **extra_kwargs + ) def get_mcore_forward_fn(hf_config: PretrainedConfig): @@ -80,14 +102,13 @@ def get_mcore_forward_fn(hf_config: PretrainedConfig): assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" arch = hf_config.architectures[0] if arch not in MODEL_FORWARD_REGISTRY: - raise ValueError(f"Model architectures {arch} forward function are not supported for now. " - f"Supported architectures: {MODEL_FORWARD_REGISTRY.keys()}") + raise ValueError( + f"Model architectures {arch} forward function are not supported for now. " + f"Supported architectures: {MODEL_FORWARD_REGISTRY.keys()}" + ) return MODEL_FORWARD_REGISTRY[arch] -from .weight_converter import McoreToHFWeightConverterDense, McoreToHFWeightConverterQwen2Moe - - def get_mcore_weight_converter(hf_config: PretrainedConfig, dtype: torch.dtype): MODEL_WEIGHT_CONVERTER_REGISTRY = { "LlamaForCausalLM": McoreToHFWeightConverterDense, @@ -97,7 +118,9 @@ def get_mcore_weight_converter(hf_config: PretrainedConfig, dtype: torch.dtype): assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" arch = hf_config.architectures[0] if arch not in MODEL_WEIGHT_CONVERTER_REGISTRY: - raise ValueError(f"Model architectures {arch} weight converter are not supported for now. " - f"Supported architectures: {MODEL_WEIGHT_CONVERTER_REGISTRY.keys()}") + raise ValueError( + f"Model architectures {arch} weight converter are not supported for now. 
" + f"Supported architectures: {MODEL_WEIGHT_CONVERTER_REGISTRY.keys()}" + ) tfconfig = hf_to_mcore_config(hf_config, dtype) return MODEL_WEIGHT_CONVERTER_REGISTRY[arch](hf_config, tfconfig) diff --git a/verl/models/mcore/saver.py b/verl/models/mcore/saver.py index e8dc6bdf679..df8721aa56a 100644 --- a/verl/models/mcore/saver.py +++ b/verl/models/mcore/saver.py @@ -35,7 +35,7 @@ def _megatron_calc_global_rank( dp_size = mpu.get_data_parallel_world_size() pp_size = mpu.get_pipeline_model_parallel_world_size() cp_size = mpu.get_context_parallel_world_size() - ep_size = mpu.get_expert_model_parallel_world_size() + # ep_size = mpu.get_expert_model_parallel_world_size() # Verify total GPU count matches (must be consistent with parallel_state.py) total_size = tp_size * dp_size * pp_size * cp_size @@ -179,14 +179,11 @@ def _broadcast_tp_shard_tensor(tensor, name, src_pp_rank, concat_dim=0, mutate_f """broadcast tensor in tp shards across mp_group""" nonlocal state_dict nonlocal mp_group - tp_rank = mpu.get_tensor_model_parallel_rank() + # tp_rank = mpu.get_tensor_model_parallel_rank() tp_size = mpu.get_tensor_model_parallel_world_size() src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank, cp_rank=cp_rank) - if torch.distributed.get_rank() == src_rank: - chunk_shape = tensor.shape - else: - chunk_shape = None + chunk_shape = tensor.shape if torch.distributed.get_rank() == src_rank else None obj_list = [chunk_shape] dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group) @@ -223,14 +220,11 @@ def _broadcast_tp_shard_tensor_gate_up(tensor, gate_name, up_name, src_pp_rank) """broadcast tensor in tp shards across mp_group""" nonlocal state_dict nonlocal mp_group - tp_rank = mpu.get_tensor_model_parallel_rank() + # tp_rank = mpu.get_tensor_model_parallel_rank() tp_size = mpu.get_tensor_model_parallel_world_size() src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank, cp_rank=cp_rank) - if torch.distributed.get_rank() == src_rank: - chunk_shape = tensor.shape - else: - chunk_shape = None + chunk_shape = tensor.shape if torch.distributed.get_rank() == src_rank else None obj_list = [chunk_shape] dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group) @@ -276,14 +270,11 @@ def _broadcast_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name, src_pp_rank): """broadcast tensor in tp shards across mp_group""" nonlocal state_dict nonlocal mp_group - tp_rank = mpu.get_tensor_model_parallel_rank() + # tp_rank = mpu.get_tensor_model_parallel_rank() tp_size = mpu.get_tensor_model_parallel_world_size() src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank, cp_rank=cp_rank) - if torch.distributed.get_rank() == src_rank: - chunk_shape = tensor.shape - else: - chunk_shape = None + chunk_shape = tensor.shape if torch.distributed.get_rank() == src_rank else None obj_list = [chunk_shape] dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group) @@ -473,9 +464,7 @@ def _broadcast_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name, src_pp_rank): return state_dict -def merge_megatron_ckpt_gptmodel_qwen_moe(wrapped_models, - config, - dtype, - is_value_model=False, - tie_word_embeddings=False): +def merge_megatron_ckpt_gptmodel_qwen_moe( + wrapped_models, config, dtype, is_value_model=False, tie_word_embeddings=False +): raise NotImplementedError("merge_megatron_ckpt_gptmodel_qwen_moe is not implemented") diff --git a/verl/models/mcore/weight_converter.py b/verl/models/mcore/weight_converter.py index 
155361a532a..6bdee51b3e9 100644 --- a/verl/models/mcore/weight_converter.py +++ b/verl/models/mcore/weight_converter.py @@ -17,12 +17,11 @@ # including format conversion and name mapping # not including resharding import torch -from transformers import PretrainedConfig from megatron.core.transformer import TransformerConfig +from transformers import PretrainedConfig class McoreToHFWeightConverterBase: - def __init__(self, hf_config: PretrainedConfig, mcore_config: TransformerConfig): self.hf_config = hf_config self.mcore_config = mcore_config @@ -32,26 +31,25 @@ def convert_param(self, name: str, params_one_group: list[torch.Tensor]) -> torc class McoreToHFWeightConverterDense(McoreToHFWeightConverterBase): - def _convert_attention_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]: # 'decoder.layers.0.self_attention.linear_proj.weight' # 'decoder.layers.0.self_attention.linear_qkv.layer_norm_weight' # 'decoder.layers.0.self_attention.linear_qkv.weight' # 'decoder.layers.0.self_attention.linear_qkv.bias' - layer_number = name.split('.')[2] + layer_number = name.split(".")[2] convert_names = [] if "self_attention.linear_qkv.bias" in name or "self_attention.linear_qkv.weight" in name: - param_type = name.split('.')[-1] - assert param_type == 'bias' or param_type == 'weight' - convert_names.append(f'model.layers.{layer_number}.self_attn.q_proj.{param_type}') - convert_names.append(f'model.layers.{layer_number}.self_attn.k_proj.{param_type}') - convert_names.append(f'model.layers.{layer_number}.self_attn.v_proj.{param_type}') + param_type = name.split(".")[-1] + assert param_type == "bias" or param_type == "weight" + convert_names.append(f"model.layers.{layer_number}.self_attn.q_proj.{param_type}") + convert_names.append(f"model.layers.{layer_number}.self_attn.k_proj.{param_type}") + convert_names.append(f"model.layers.{layer_number}.self_attn.v_proj.{param_type}") assert len(params) == 3 elif "self_attention.linear_proj.weight" in name: - convert_names.append(f'model.layers.{layer_number}.self_attn.o_proj.weight') + convert_names.append(f"model.layers.{layer_number}.self_attn.o_proj.weight") assert len(params) == 1 elif "self_attention.linear_qkv.layer_norm_weight" in name: - convert_names.append(f'model.layers.{layer_number}.input_layernorm.weight') + convert_names.append(f"model.layers.{layer_number}.input_layernorm.weight") assert len(params) == 1 else: raise NotImplementedError(f"Unsupported parameter name: {name}") @@ -61,18 +59,18 @@ def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[lis # 'decoder.layers.0.mlp.linear_fc1.layer_norm_weight' # 'decoder.layers.0.mlp.linear_fc1.weight' # 'decoder.layers.0.mlp.linear_fc2.weight' - layer_number = name.split('.')[2] + layer_number = name.split(".")[2] convert_names = [] if "mlp.linear_fc1.weight" in name: # split gate_proj and up_proj - convert_names.append(f'model.layers.{layer_number}.mlp.gate_proj.weight') - convert_names.append(f'model.layers.{layer_number}.mlp.up_proj.weight') + convert_names.append(f"model.layers.{layer_number}.mlp.gate_proj.weight") + convert_names.append(f"model.layers.{layer_number}.mlp.up_proj.weight") assert len(params) == 2 elif "mlp.linear_fc1.layer_norm_weight" in name: - convert_names.append(f'model.layers.{layer_number}.post_attention_layernorm.weight') + convert_names.append(f"model.layers.{layer_number}.post_attention_layernorm.weight") assert len(params) == 1 elif "mlp.linear_fc2.weight" in name: - 
convert_names.append(f'model.layers.{layer_number}.mlp.down_proj.weight') + convert_names.append(f"model.layers.{layer_number}.mlp.down_proj.weight") assert len(params) == 1 else: raise NotImplementedError(f"Unsupported parameter name: {name}") @@ -96,7 +94,6 @@ def convert_param(self, name: str, params_one_group: list[torch.Tensor]) -> tupl class McoreToHFWeightConverterQwen2Moe(McoreToHFWeightConverterDense): - def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]: # 'decoder.layers.0.pre_mlp_layernorm.weight', # 'decoder.layers.0.mlp.router.weight', @@ -111,32 +108,32 @@ def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[lis # moe2 # 'decoder.layers.0.mlp.experts.linear_fc2.weight0', # 'decoder.layers.0.mlp.experts.linear_fc2.weight1', - layer_number = name.split('.')[2] + layer_number = name.split(".")[2] convert_names = [] if "pre_mlp_layernorm" in name: - convert_names.append(f'model.layers.{layer_number}.post_attention_layernorm.weight') + convert_names.append(f"model.layers.{layer_number}.post_attention_layernorm.weight") assert len(params) == 1 elif "mlp.router.weight" in name: - convert_names.append(f'model.layers.{layer_number}.mlp.gate.weight') + convert_names.append(f"model.layers.{layer_number}.mlp.gate.weight") assert len(params) == 1 elif "shared_experts.gate_weight" in name: - convert_names.append(f'model.layers.{layer_number}.mlp.shared_expert_gate.weight') + convert_names.append(f"model.layers.{layer_number}.mlp.shared_expert_gate.weight") assert len(params) == 1 elif "shared_experts.linear_fc1.weight" in name: # split gate_proj and up_proj - convert_names.append(f'model.layers.{layer_number}.mlp.shared_expert.gate_proj.weight') - convert_names.append(f'model.layers.{layer_number}.mlp.shared_expert.up_proj.weight') + convert_names.append(f"model.layers.{layer_number}.mlp.shared_expert.gate_proj.weight") + convert_names.append(f"model.layers.{layer_number}.mlp.shared_expert.up_proj.weight") assert len(params) == 2 elif "shared_experts.linear_fc2.weight" in name: - convert_names.append(f'model.layers.{layer_number}.mlp.shared_expert.down_proj.weight') + convert_names.append(f"model.layers.{layer_number}.mlp.shared_expert.down_proj.weight") assert len(params) == 1 elif "mlp.experts.linear_fc1" in name: # split gate_proj and up_proj - expert_id = name.split('weight')[-1] - convert_names.append(f'model.layers.{layer_number}.mlp.experts.{expert_id}.gate_proj.weight') - convert_names.append(f'model.layers.{layer_number}.mlp.experts.{expert_id}.up_proj.weight') + expert_id = name.split("weight")[-1] + convert_names.append(f"model.layers.{layer_number}.mlp.experts.{expert_id}.gate_proj.weight") + convert_names.append(f"model.layers.{layer_number}.mlp.experts.{expert_id}.up_proj.weight") assert len(params) == 2 elif "mlp.experts.linear_fc2" in name: - expert_id = name.split('weight')[-1] - convert_names.append(f'model.layers.{layer_number}.mlp.experts.{expert_id}.down_proj.weight') + expert_id = name.split("weight")[-1] + convert_names.append(f"model.layers.{layer_number}.mlp.experts.{expert_id}.down_proj.weight") assert len(params) == 1 else: raise NotImplementedError(f"Unsupported parameter name: {name}") diff --git a/verl/models/weight_loader_registry.py b/verl/models/weight_loader_registry.py index b274f087718..31942b7cfe3 100644 --- a/verl/models/weight_loader_registry.py +++ b/verl/models/weight_loader_registry.py @@ -31,9 +31,10 @@ def get_weight_loader(arch: str): def get_weight_saver(arch: 
str): from verl.models.mcore.saver import merge_megatron_ckpt_gptmodel, merge_megatron_ckpt_gptmodel_qwen_moe + _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY = { - 'LlamaForCausalLM': merge_megatron_ckpt_gptmodel, - 'Qwen2ForCausalLM': merge_megatron_ckpt_gptmodel, + "LlamaForCausalLM": merge_megatron_ckpt_gptmodel, + "Qwen2ForCausalLM": merge_megatron_ckpt_gptmodel, "Qwen2MoeForCausalLM": merge_megatron_ckpt_gptmodel_qwen_moe, } if arch in _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY: diff --git a/verl/workers/critic/megatron_critic.py b/verl/workers/critic/megatron_critic.py index 47a535ee0f4..db014d96b1f 100644 --- a/verl/workers/critic/megatron_critic.py +++ b/verl/workers/critic/megatron_critic.py @@ -28,7 +28,7 @@ from verl import DataProto from verl.trainer.ppo import core_algos -from verl.utils.megatron.pipeline_parallel import compute_transformers_input_shapes, make_batch_generator +from verl.utils.megatron.pipeline_parallel import make_batch_generator from verl.utils.py_functional import append_to_dict from verl.utils.torch_functional import broadcast_dict_tensor, masked_mean, split_dict_tensor_into_batches from verl.workers.critic import BasePPOCritic @@ -133,20 +133,11 @@ def forward_backward_batch(self, data: DataProto, forward_only=False): n_micro_batch = len(batches) seq_len = batches[0]["input_ids"].shape[1] - # compute input shapes for pp stages - input_shapes = compute_transformers_input_shapes( - batches, - meta_info={ - "sequence_parallel": self.tf_config.sequence_parallel, - "hidden_size": self.model_config.hidden_size, - }, - ) - forward_backward_func = get_forward_backward_func() def loss_func(output, data, meta_info): if forward_only: - return torch.tensor(1.0, device=output.device), {'vpreds': output} + return torch.tensor(1.0, device=output.device), {"vpreds": output} responses = data["responses"] attention_mask = data["attention_mask"] diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py index 9e9e613b105..c9ae207a8e7 100644 --- a/verl/workers/megatron_workers.py +++ b/verl/workers/megatron_workers.py @@ -257,13 +257,16 @@ def _build_rollout(self, trust_remote_code=False): # perform weight resharding between actor and rollout from verl.models.mcore import get_mcore_weight_converter + weight_converter = get_mcore_weight_converter(self.actor_model_config, self.dtype) - sharding_manager = MegatronVLLMShardingManager(inference_engine=rollout.inference_engine, - model_config=self.actor_model_config, - layer_name_mapping=layer_name_mapping, - actor_module=self.actor.actor_module, - weight_converter=weight_converter) - log_gpu_memory_usage('After building sharding manager', logger=logger) + sharding_manager = MegatronVLLMShardingManager( + inference_engine=rollout.inference_engine, + model_config=self.actor_model_config, + layer_name_mapping=layer_name_mapping, + actor_module=self.actor.actor_module, + weight_converter=weight_converter, + ) + log_gpu_memory_usage("After building sharding manager", logger=logger) else: raise NotImplementedError("Only vllmRollout is supported with Megatron now") @@ -287,10 +290,7 @@ def init_model(self): self.dtype = PrecisionType.to_dtype(self.param_dtype) if self._is_actor or self._is_rollout: # we need the model for actor and rollout - if self._is_actor: - optim_config = self.config.actor.optim - else: - optim_config = None + optim_config = self.config.actor.optim if self._is_actor else None self.actor_module, self.actor_optimizer, self.actor_model_config, self.actor_optim_config = ( self._build_model_optimizer( 
model_path=self.config.model.path, diff --git a/verl/workers/sharding_manager/megatron_vllm.py b/verl/workers/sharding_manager/megatron_vllm.py index 33d9a51aa24..b60ce9f73e3 100644 --- a/verl/workers/sharding_manager/megatron_vllm.py +++ b/verl/workers/sharding_manager/megatron_vllm.py @@ -15,21 +15,30 @@ This file contains a Megatron style Hybrid Engine that shares the weights of the actor with the inference engine. """ +import inspect import logging import os import torch +import torch.distributed import torch.distributed as dist from megatron.core import DistributedDataParallel as LocalDDP from megatron.core import parallel_state as mpu from megatron.core.transformer.module import Float16Module from torch import nn +from torch.distributed import new_group from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP +import verl.utils.megatron.tensor_parallel as tp_utils +from verl import DataProto +from verl.models.mcore.weight_converter import McoreToHFWeightConverterBase +from verl.third_party.vllm import LLM, vllm_version +from verl.third_party.vllm import parallel_state as vllm_ps from verl.utils.debug import log_gpu_memory_usage from verl.utils.megatron_utils import ( broadcast_from_megatron_pp, broadcast_str_from_megatron_pp, + convert_megatron_model_to_transformers_model, get_model, unwrap_model, ) @@ -39,6 +48,9 @@ get_weight_buffer_meta_from_module, ) from verl.utils.model import normalize_model_name +from verl.utils.torch_functional import allgather_dict_tensors + +from .base import BaseShardingManager logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_PPO_LOGGING_LEVEL", "WARN")) @@ -47,7 +59,8 @@ class AllGatherPPModel: def __init__(self, model_provider, use_distributed_optimizer=True) -> None: print( - "[WARNING] This class is deprecated and will no longer be supported. Consider using the `MegatronPPOActor` class directly as a replacement." + "[WARNING] This class is deprecated and will no longer be supported. \ +Consider using the `MegatronPPOActor` class directly as a replacement." ) self._pp_group = mpu.get_pipeline_model_parallel_group() self._pp_rank = mpu.get_pipeline_model_parallel_rank() @@ -243,26 +256,13 @@ def pp_models(self): """ Megatron Hybrid Engine: - During training, only the current pp stage holds the parameters -- Before inference, broadcast the parameters of the current pp rank to all other pp ranks (all pp ranks holds all the parameters) +- Before inference, broadcast the parameters of the current pp rank + to all other pp ranks (all pp ranks holds all the parameters) - Bind the parameters to the inference engine - Do inference in tp. pp is treated as additional dp - After inference, all the parameters that doesn't belong to this pp rank is freed. """ -import inspect - -import torch.distributed -from torch.distributed import new_group - -import verl.utils.megatron.tensor_parallel as tp_utils -from verl import DataProto -from verl.third_party.vllm import LLM, vllm_version -from verl.third_party.vllm import parallel_state as vllm_ps -from verl.utils.megatron_utils import convert_megatron_model_to_transformers_model -from verl.models.mcore.weight_converter import McoreToHFWeightConverterBase -from verl.utils.torch_functional import allgather_dict_tensors - -from .base import BaseShardingManager # Micro Data parallel group. Micro data parallel group is additional dp group that origins from splitting training tp # into infer_tp and micro_tp. 
By default, we use order micro_dp - tp @@ -272,14 +272,15 @@ def pp_models(self): class MegatronVLLMShardingManager(BaseShardingManager): - - def __init__(self, - actor_module: nn.ModuleList, - inference_engine: LLM, - model_config, - layer_name_mapping, - weight_converter: McoreToHFWeightConverterBase, - module: AllGatherPPModel = None): + def __init__( + self, + actor_module: nn.ModuleList, + inference_engine: LLM, + model_config, + layer_name_mapping, + weight_converter: McoreToHFWeightConverterBase, + module: AllGatherPPModel = None, + ): from megatron.core import parallel_state as mpu self.actor_module = actor_module @@ -331,8 +332,7 @@ def per_tensor_generator(self, convert_qkv_gate_up_by_simple_split=True): def tensor_generator(): for scan_vpp_idx in range(vpp_size): - for name, param in self.actor_module[scan_vpp_idx].named_parameters(): - yield name, param + yield from self.actor_module[scan_vpp_idx].named_parameters() # we need first make all rank get full model information meta_info = [] @@ -395,14 +395,14 @@ def tensor_generator(): convert_qkv_gate_up_by_trunk_concat=False, ) # defualt false - for converted_name, infer_param in zip(converted_names, converted_params): - yield converted_name, infer_param + yield from zip(converted_names, converted_params) def default_tp_concat_fn(self, name, param, infer_params, model_config, convert_qkv_gate_up_by_simple_split=False): """ name: name of the parameter param: training parameters - infer_params (Iterable[torch.Tensor]): a iterator towards list of parameters all-gathered from train tp group (vllm 0.8.2) or micro-dp group (vllm <= 0.6.3) + infer_params (Iterable[torch.Tensor]): a iterator towards list of parameters all-gathered + from train tp group (vllm 0.8.2) or micro-dp group (vllm <= 0.6.3) model_config: huggingface model_config TODO(zhangchi.usc1992): currently, the implementation is adhoc. We can move this function to the model definition so that it is model-agnostic. If the model doesn't implement this function, @@ -436,10 +436,7 @@ def default_tp_concat_fn(self, name, param, infer_params, model_config, convert_ q = torch.cat(q_lst, dim=0) k = torch.cat(k_lst, dim=0) v = torch.cat(v_lst, dim=0) - if not convert_qkv_gate_up_by_simple_split: - infer_params = torch.cat((q, k, v), dim=0) - else: - infer_params = [q, k, v] + infer_params = torch.cat((q, k, v), dim=0) if not convert_qkv_gate_up_by_simple_split else [q, k, v] elif self.layer_name_mapping.get("gate_proj_layer_name") in name: # if the tensor is gate and proj @@ -451,10 +448,7 @@ def default_tp_concat_fn(self, name, param, infer_params, model_config, convert_ up_lst.append(up) gate = torch.cat(gate_lst, dim=0) up = torch.cat(up_lst, dim=0) - if not convert_qkv_gate_up_by_simple_split: - infer_params = torch.cat((gate, up), dim=0) - else: - infer_params = [gate, up] + infer_params = torch.cat((gate, up), dim=0) if not convert_qkv_gate_up_by_simple_split else [gate, up] elif "mlp.experts.linear_fc2.weight" in name: # moe infer_params = torch.cat(infer_params, dim=1) @@ -467,7 +461,8 @@ def default_tp_concat_fn(self, name, param, infer_params, model_config, convert_ def _post_process_params(self, params, convert_qkv_gate_up_by_simple_split=False): """ - For each param, if it is a tp-splited param, we all-gather from train tp group (vllm 0.8.2) or micro-dp group (vllm <= 0.6.3) + For each param, if it is a tp-splited param, we all-gather from train + tp group (vllm 0.8.2) or micro-dp group (vllm <= 0.6.3) """ # here the params are in train tp format. 
we iterate params and all-gather # TODO(zhangchi.usc1992) We can consider copy non-tp weight to another infer buffer. @@ -491,20 +486,20 @@ def _post_process_params(self, params, convert_qkv_gate_up_by_simple_split=False ) else: infer_params = param - if vllm_version in ('0.4.2', '0.5.4', '0.6.3'): + if vllm_version in ("0.4.2", "0.5.4", "0.6.3"): converted_names, converted_params = convert_megatron_model_to_transformers_model( name, infer_params, self.model_config, self.train_tp_size, self.module.pp_models[0][0].config.num_query_groups, - convert_qkv_gate_up_by_trunk_concat=False) + convert_qkv_gate_up_by_trunk_concat=False, + ) else: if not isinstance(infer_params, list): infer_params = [infer_params] converted_names, converted_params = self.weight_converter.convert_param(name, infer_params) - for converted_name, infer_param in zip(converted_names, converted_params): - yield converted_name, infer_param + yield from zip(converted_names, converted_params) def __enter__(self): if vllm_version in ("0.4.2", "0.5.4", "0.6.3"): @@ -520,7 +515,8 @@ def __enter__(self): model = self.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model _patch_vllm_qwen2_moe_model_weight_loader(model) loaded_params = model.load_weights(per_tensor_param) - logger.info(f"vLLM load weights, loaded_params: {len(loaded_params)}") + info = f"vLLM load weights, loaded_params: {len(loaded_params)}" + logger.info(info) log_gpu_memory_usage("After load_weights sharding manager memory", logger=logger) if "tags" in inspect.signature(self.inference_engine.wake_up).parameters: @@ -597,6 +593,7 @@ def _patch_vllm_qwen2_moe_model_weight_loader(model): # (False, 'model.layers.0.mlp.experts.w13_weight') use mlp.experts.weight_loader # (False, 'model.layers.0.mlp.experts.w2_weight') use mlp.experts.weight_loader from vllm.model_executor.models.qwen2_moe import Qwen2MoeForCausalLM + if not isinstance(model, Qwen2MoeForCausalLM): return for layer in model.model.layers: From 7b66d82b4619820ddc324786b16318d2f874eab7 Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Fri, 18 Apr 2025 23:58:10 -0700 Subject: [PATCH 11/19] fix bug of merge --- .../workers/sharding_manager/megatron_vllm.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/verl/workers/sharding_manager/megatron_vllm.py b/verl/workers/sharding_manager/megatron_vllm.py index a2ea7b288b4..12bc569d78b 100644 --- a/verl/workers/sharding_manager/megatron_vllm.py +++ b/verl/workers/sharding_manager/megatron_vllm.py @@ -390,15 +390,19 @@ def tensor_generator(): else: infer_params = broad_pp_tensor - # change megatron tensor name to hf model name - converted_names, converted_params = convert_megatron_model_to_transformers_model( - cur_name, - infer_params, - self.model_config, - self.train_tp_size, - 0, # no impact - convert_qkv_gate_up_by_trunk_concat=False, - ) # defualt false + if vllm_version in ("0.4.2", "0.5.4", "0.6.3"): + converted_names, converted_params = convert_megatron_model_to_transformers_model( + cur_name, + infer_params, + self.model_config, + self.train_tp_size, + 0, # no impact + convert_qkv_gate_up_by_trunk_concat=False, + ) # defualt false + else: + if not isinstance(infer_params, list): + infer_params = [infer_params] + converted_names, converted_params = self.weight_converter.convert_param(cur_name, infer_params) yield from zip(converted_names, converted_params) From 941ab9589e471c4648b7bb7efc16c82cebd423fa Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Sat, 19 Apr 2025 01:00:01 -0700 Subject: [PATCH 
12/19] compatible to mcore 0.12

---
 verl/workers/actor/megatron_actor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/verl/workers/actor/megatron_actor.py b/verl/workers/actor/megatron_actor.py
index c7ce1161e97..49a76687402 100644
--- a/verl/workers/actor/megatron_actor.py
+++ b/verl/workers/actor/megatron_actor.py
@@ -344,7 +344,7 @@ def loss_func(output, data, meta_info):
 
             stats = {}
             if forward_only:
-                policy_loss = 1.0
+                policy_loss = torch.tensor(1.0, device=output.device)
             else:
                 if self.config.use_kl_loss:
                     ref_log_prob = data["ref_log_prob"]

From 267a119410eec1ab9778c407a0de0ab8fca86d19 Mon Sep 17 00:00:00 2001
From: Yan Bai
Date: Mon, 21 Apr 2025 08:29:28 -0700
Subject: [PATCH 13/19] WIP support moonlight

---
 scripts/converter_hf_to_mcore.py              |  76 ++++++++++-
 verl/models/mcore/config_converter.py         | 107 ++++++++++++++-
 verl/models/mcore/registry.py                 |   7 +-
 verl/models/mcore/saver.py                    |   5 +
 verl/models/mcore/weight_converter.py         | 128 ++++++++++++++++++
 verl/models/weight_loader_registry.py         |   7 +-
 .../single_controller/base/megatron/worker.py |  18 ++-
 verl/workers/megatron_workers.py              |   9 +-
 .../workers/sharding_manager/megatron_vllm.py |  21 ++-
 9 files changed, 347 insertions(+), 31 deletions(-)

diff --git a/scripts/converter_hf_to_mcore.py b/scripts/converter_hf_to_mcore.py
index aa4256b67a4..f917a9c1d58 100644
--- a/scripts/converter_hf_to_mcore.py
+++ b/scripts/converter_hf_to_mcore.py
@@ -53,7 +53,7 @@ def __init__(self):
         self.model = ModelConfig()
 
 
-def convert_checkpoint_from_transformers_to_megatron(hf_model, model, hf_config):
+def convert_checkpoint_from_transformers_to_megatron(hf_model, model, hf_config, tfconfig):
     num_attention_heads = hf_config.num_attention_heads
     hidden_dim = hf_config.hidden_size
     head_dim = hidden_dim // num_attention_heads
@@ -95,6 +95,70 @@ def convert_checkpoint_from_transformers_to_megatron(hf_model, model, hf_config)
     model.decoder.final_layernorm.weight.copy_(hf_model.model.norm.weight)
     model.output_layer.weight.copy_(hf_model.lm_head.weight)
 
 
+@torch.no_grad()
+def convert_checkpoint_from_transformers_to_megatron_dpskv3(hf_model, model, hf_config, tfconfig):
+    warnings.warn("MTP (multi-token prediction) layers are not supported yet")
+
+    def safe_copy(
+        src_tensor: torch.Tensor,
+        dst_tensor: torch.Tensor,
+        skip_dtype_assert: bool = False,
+    ):
+        if not skip_dtype_assert:
+            if src_tensor.dtype != dst_tensor.dtype:
+                raise ValueError(f"Get source dtype {src_tensor.dtype}, but target dtype {dst_tensor.dtype}")
+        assert src_tensor.shape == dst_tensor.shape
+        dst_tensor.data.copy_(src_tensor.data)
+        return src_tensor.numel()
+
+    model.embedding.word_embeddings.weight.copy_(hf_model.model.embed_tokens.weight)
+    for layer_idx, (layer, hf_layer) in enumerate(zip(model.decoder.layers, hf_model.model.layers)):
+        print(f"converting layer {layer_idx}")
+        layer.input_layernorm.weight.copy_(hf_layer.input_layernorm.weight)
+
+        if hf_config.q_lora_rank is None:
+            layer.self_attention.linear_q_proj.weight.copy_(hf_layer.self_attn.q_proj.weight)
+        else:
+            layer.self_attention.linear_q_down_proj.weight.copy_(hf_layer.self_attn.q_a_proj.weight)
+            layer.self_attention.linear_q_up_proj.weight.copy_(hf_layer.self_attn.q_b_proj.weight)
+            layer.self_attention.linear_q_up_proj.layer_norm_weight.copy_(hf_layer.self_attn.q_a_layernorm.weight)
+
+        layer.self_attention.linear_kv_down_proj.weight.copy_(hf_layer.self_attn.kv_a_proj_with_mqa.weight)
+        layer.self_attention.linear_kv_up_proj.weight.copy_(hf_layer.self_attn.kv_b_proj.weight)
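+        # The q_a_layernorm / kv_a_layernorm weights of the HF MLA attention are stored in mcore
+        # as the fused layer_norm_weight of linear_q_up_proj / linear_kv_up_proj respectively.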
+        layer.self_attention.linear_kv_up_proj.layer_norm_weight.copy_(hf_layer.self_attn.kv_a_layernorm.weight)
+        layer.self_attention.linear_proj.weight.copy_(hf_layer.self_attn.o_proj.weight)
+
+        if not hasattr(layer.mlp, "router"):
+            layer.mlp.linear_fc1.layer_norm_weight.copy_(hf_layer.post_attention_layernorm.weight)
+            layer.mlp.linear_fc1.weight.copy_(
+                torch.cat([hf_layer.mlp.gate_proj.weight, hf_layer.mlp.up_proj.weight]))
+            layer.mlp.linear_fc2.weight.copy_(hf_layer.mlp.down_proj.weight)
+        else:
+            layer.mlp.router.weight.copy_(hf_layer.mlp.gate.weight)
+            # NOTE: the e_score_correction_bias in the mcore model is initialized in bfloat16 and
+            # recovered to fp32 in the first forward, so there is always a small diff (~0.3%) in
+            # this bias between the two models.
+            safe_copy(hf_layer.mlp.gate.e_score_correction_bias, layer.mlp.router.expert_bias, skip_dtype_assert=True)
+            if tfconfig.moe_grouped_gemm:
+                for i, hf_expert in enumerate(hf_layer.mlp.experts):
+                    fc1_weight = torch.cat([hf_expert.gate_proj.weight, hf_expert.up_proj.weight])
+                    linear_fc1_weighti = getattr(layer.mlp.experts.linear_fc1, f"weight{i}")
+                    linear_fc1_weighti.copy_(fc1_weight)
+                    linear_fc2_weighti = getattr(layer.mlp.experts.linear_fc2, f"weight{i}")
+                    linear_fc2_weighti.copy_(hf_expert.down_proj.weight)
+            else:
+                for i, hf_expert in enumerate(hf_layer.mlp.experts):
+                    expert = layer.mlp.experts.local_experts[i]
+                    fc1_weight = torch.cat([hf_expert.gate_proj.weight, hf_expert.up_proj.weight])
+                    expert.linear_fc1.weight.copy_(fc1_weight)
+                    expert.linear_fc2.weight.copy_(hf_expert.down_proj.weight)
+            layer.pre_mlp_layernorm.weight.copy_(hf_layer.post_attention_layernorm.weight)
+            shared_fc1_weight = torch.cat(
+                [hf_layer.mlp.shared_experts.gate_proj.weight, hf_layer.mlp.shared_experts.up_proj.weight])
+            layer.mlp.shared_experts.linear_fc1.weight.copy_(shared_fc1_weight)
+            layer.mlp.shared_experts.linear_fc2.weight.copy_(hf_layer.mlp.shared_experts.down_proj.weight)
+
+    model.decoder.final_layernorm.weight.copy_(hf_model.model.norm.weight)
+    if not hf_config.tie_word_embeddings:
+        model.output_layer.weight.copy_(hf_model.lm_head.weight)
+
+
 def convert_hf_to_mcore(hf_model_path, output_path, test=False):
     os.makedirs(output_path, exist_ok=True)
@@ -117,7 +181,7 @@ def convert_hf_to_mcore(hf_model_path, output_path, test=False):
     model_parallel_cuda_manual_seed(0)
 
     # init hf config
-    hf_config = AutoConfig.from_pretrained(hf_model_path)
+    hf_config = AutoConfig.from_pretrained(hf_model_path, trust_remote_code=True)
     print(hf_config)
 
     cfg = Config()
@@ -147,12 +211,14 @@ def megatron_model_provider(pre_process, post_process):
         warnings.simplefilter("ignore")
 
         # init hf model
-        hf_model = AutoModelForCausalLM.from_pretrained(hf_model_path, torch_dtype=torch.bfloat16)
+        hf_model = AutoModelForCausalLM.from_pretrained(hf_model_path, torch_dtype=torch.bfloat16, trust_remote_code=True)
         ref_state_dict = hf_model.state_dict()
 
         # load hf state dict to megatron model
-        if "Qwen2MoeForCausalLM" in hf_config.architectures:
-            convert_checkpoint_from_transformers_to_megatron(hf_model, model[0].module, hf_config)
+        if "DeepseekV3ForCausalLM" in hf_config.architectures:
+            convert_checkpoint_from_transformers_to_megatron_dpskv3(hf_model, model[0].module, hf_config, tfconfig=tfconfig)
+        elif "Qwen2MoeForCausalLM" in hf_config.architectures:
+            convert_checkpoint_from_transformers_to_megatron(hf_model, model[0].module, hf_config, tfconfig)
         else:
             from verl.models.mcore.loader import load_state_dict_to_megatron_gptmodel
 
diff --git 
a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py index c43f7f75526..9602df1c0fb 100644 --- a/verl/models/mcore/config_converter.py +++ b/verl/models/mcore/config_converter.py @@ -17,7 +17,7 @@ import torch import torch.nn.functional as F -from megatron.core.transformer import TransformerConfig +from megatron.core.transformer import TransformerConfig, MLATransformerConfig from megatron.core.transformer.enums import AttnBackend from transformers import PretrainedConfig @@ -94,6 +94,7 @@ def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) bf16=dtype is torch.bfloat16, layernorm_epsilon=hf_config.rms_norm_eps, ffn_hidden_size=hf_config.intermediate_size, + # parallel config tensor_model_parallel_size=mpu.get_tensor_model_parallel_world_size(), pipeline_model_parallel_size=mpu.get_pipeline_model_parallel_world_size(), @@ -102,6 +103,7 @@ def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) overlap_p2p_comm=overlap_p2p_comm, batch_p2p_comm=batch_p2p_comm, sequence_parallel=mpu.get_tensor_model_parallel_world_size() > 1, + # moe specific moe_ffn_hidden_size=hf_config.moe_intermediate_size, moe_token_dispatcher_type="alltoall", @@ -116,15 +118,18 @@ def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) # moe_permute_fusion=True, # need TE 2.1+ moe_grouped_gemm=True, moe_router_score_function="softmax", + # # mcore 0.12 moe # moe_router_dtype="fp64", # disable_bf16_reduced_precision_matmul=True, + # other # deallocate_pipeline_outputs=True, # gradient_accumulation_fusion=True, persist_layer_norm=True, bias_activation_fusion=True, bias_dropout_fusion=True, + # qwen specific moe_router_pre_softmax=True, add_qkv_bias=True, @@ -132,9 +137,105 @@ def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) return transformer_config -def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: +def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype) -> MLATransformerConfig: # DeepseekV3ForCausalLM - raise NotImplementedError("DeepseekV3ForCausalLM is not supported yet") + from megatron.core import parallel_state as mpu + + overlap_p2p_comm = ( + mpu.get_virtual_pipeline_model_parallel_world_size() is not None + and mpu.get_virtual_pipeline_model_parallel_world_size() > 1 + ) + batch_p2p_comm = False + + mla_rope_config = { + "beta_fast": 32, + "beta_slow": 1, + "factor": 40, + "mscale": 1.0, + "mscale_all_dim": 1.0, + "original_max_position_embeddings": 4096, + } + if "rope_scaling" in hf_config and hf_config.rope_scaling is not None: + mla_rope_config.update(hf_config.rope_scaling) + moe_layer_freq = [1] * hf_config.num_hidden_layers + for i in range(hf_config.first_k_dense_replace): + moe_layer_freq[i] = 0 + transformer_config = MLATransformerConfig( + num_layers=hf_config.num_hidden_layers, + hidden_size=hf_config.hidden_size, + num_attention_heads=hf_config.num_attention_heads, + num_query_groups=hf_config.num_key_value_heads, + attention_dropout=hf_config.attention_dropout, + hidden_dropout=getattr(hf_config, "hidden_dropout", 0.0), + activation_func=F.silu, + normalization="RMSNorm", + gated_linear_unit=True, + use_cpu_initialization=False, + add_bias_linear=False, + pipeline_dtype=dtype, + params_dtype=dtype, + variable_seq_lengths=True, + masked_softmax_fusion=True, + # attention_backend=AttnBackend.flash, + attention_backend=AttnBackend.unfused, + bf16=dtype is torch.bfloat16, + 
layernorm_epsilon=hf_config.rms_norm_eps, + ffn_hidden_size=hf_config.intermediate_size, + qk_layernorm=True, + + # parallel config + tensor_model_parallel_size=mpu.get_tensor_model_parallel_world_size(), + pipeline_model_parallel_size=mpu.get_pipeline_model_parallel_world_size(), + virtual_pipeline_model_parallel_size=mpu.get_virtual_pipeline_model_parallel_world_size(), + context_parallel_size=mpu.get_context_parallel_world_size(), + overlap_p2p_comm=overlap_p2p_comm, + batch_p2p_comm=batch_p2p_comm, + sequence_parallel=mpu.get_tensor_model_parallel_world_size() > 1, + + # moe specific + moe_ffn_hidden_size=hf_config.moe_intermediate_size, + moe_token_dispatcher_type="alltoall", + moe_router_bias_update_rate=0.001, + moe_router_enable_expert_bias=True, + moe_router_topk=hf_config.num_experts_per_tok, + num_moe_experts=hf_config.n_routed_experts, + moe_shared_expert_intermediate_size=hf_config.moe_intermediate_size * hf_config.n_shared_experts, + moe_aux_loss_coeff=getattr(hf_config, "aux_loss_alpha", 0.001), + moe_router_load_balancing_type="seq_aux_loss", + moe_shared_expert_overlap=True, + # moe_permute_fusion=True, # need TE 2.1+ + moe_grouped_gemm=True, + moe_router_score_function="sigmoid", + moe_router_pre_softmax=True, + moe_router_topk_scaling_factor=hf_config.routed_scaling_factor, + moe_layer_freq=moe_layer_freq, + + # MLA + q_lora_rank=hf_config.q_lora_rank, + kv_lora_rank=hf_config.kv_lora_rank, + qk_head_dim=hf_config.qk_nope_head_dim, + qk_pos_emb_head_dim=hf_config.qk_rope_head_dim, + v_head_dim=hf_config.v_head_dim, + rotary_base=hf_config.rope_theta, + rotary_scaling_factor=mla_rope_config["factor"], + mscale=mla_rope_config["mscale"], + mscale_all_dim=mla_rope_config["mscale_all_dim"], + max_position_embeddings=mla_rope_config["original_max_position_embeddings"], + beta_fast=mla_rope_config["beta_fast"], + beta_slow=mla_rope_config["beta_slow"], + + # mcore 0.12 moe + # moe_router_dtype="fp64", + # disable_bf16_reduced_precision_matmul=True, + + # other + # deallocate_pipeline_outputs=True, + # gradient_accumulation_fusion=True, + persist_layer_norm=True, + bias_activation_fusion=True, + bias_dropout_fusion=True, + ) + return transformer_config def hf_to_mcore_config_qwen2_5_vl(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: diff --git a/verl/models/mcore/registry.py b/verl/models/mcore/registry.py index 19d8433db08..2fdab2160a4 100644 --- a/verl/models/mcore/registry.py +++ b/verl/models/mcore/registry.py @@ -39,7 +39,11 @@ init_mcore_model_qwen2_5_vl, init_mcore_model_qwen2_moe, ) -from .weight_converter import McoreToHFWeightConverterDense, McoreToHFWeightConverterQwen2Moe +from .weight_converter import ( + McoreToHFWeightConverterDense, + McoreToHFWeightConverterQwen2Moe, + McoreToHFWeightConverterDpskv3, +) def hf_to_mcore_config(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig: @@ -114,6 +118,7 @@ def get_mcore_weight_converter(hf_config: PretrainedConfig, dtype: torch.dtype): "LlamaForCausalLM": McoreToHFWeightConverterDense, "Qwen2ForCausalLM": McoreToHFWeightConverterDense, "Qwen2MoeForCausalLM": McoreToHFWeightConverterQwen2Moe, + "DeepseekV3ForCausalLM": McoreToHFWeightConverterDpskv3, } assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" arch = hf_config.architectures[0] diff --git a/verl/models/mcore/saver.py b/verl/models/mcore/saver.py index df8721aa56a..14c7e29278e 100644 --- a/verl/models/mcore/saver.py +++ b/verl/models/mcore/saver.py @@ -468,3 +468,8 @@ def 
merge_megatron_ckpt_gptmodel_qwen_moe( wrapped_models, config, dtype, is_value_model=False, tie_word_embeddings=False ): raise NotImplementedError("merge_megatron_ckpt_gptmodel_qwen_moe is not implemented") + +def merge_megatron_ckpt_gptmodel_dpskv3( + wrapped_models, config, dtype, is_value_model=False, tie_word_embeddings=False +): + raise NotImplementedError("merge_megatron_ckpt_gptmodel_dpskv3 is not implemented") diff --git a/verl/models/mcore/weight_converter.py b/verl/models/mcore/weight_converter.py index 6bdee51b3e9..ce8c9fb7eab 100644 --- a/verl/models/mcore/weight_converter.py +++ b/verl/models/mcore/weight_converter.py @@ -138,3 +138,131 @@ def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[lis else: raise NotImplementedError(f"Unsupported parameter name: {name}") return convert_names, params + +class McoreToHFWeightConverterDpskv3(McoreToHFWeightConverterBase): + + def _convert_attention_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]: + # mcore + # 'decoder.layers.0.input_layernorm.weight' + # 'decoder.layers.0.self_attention.linear_proj.weight' + # 'decoder.layers.0.self_attention.linear_q_proj.weight' + # 'decoder.layers.0.self_attention.linear_kv_down_proj.weight' + # 'decoder.layers.0.self_attention.linear_kv_up_proj.layer_norm_weight' + # 'decoder.layers.0.self_attention.linear_kv_up_proj.weight' + # 'decoder.layers.0.self_attention.linear_q_down_proj.weight' + # 'decoder.layers.0.self_attention.linear_q_up_proj.weight' + # 'decoder.layers.0.self_attention.linear_q_up_proj.layer_norm_weight' + # hf + # 'model.layers.0.input_layernorm.weight' + # 'model.layers.0.self_attn.o_proj.weight' + # 'model.layers.0.self_attn.q_proj.weight' + # 'model.layers.0.self_attn.kv_a_proj_with_mqa.weight' + # 'model.layers.0.self_attn.kv_a_layernorm.weight' + # 'model.layers.0.self_attn.kv_b_proj.weight' + # 'model.layers.0.self_attn.q_a_proj.weight' + # 'model.layers.0.self_attn.q_b_proj.weight' + # 'model.layers.0.self_attn.q_a_layernorm.weight' + name_map_after_layer = { + "input_layernorm.weight": "input_layernorm.weight", + "self_attention.linear_proj.weight": "self_attn.o_proj.weight", + "self_attention.linear_q_proj.weight": "self_attn.q_proj.weight", + "self_attention.linear_kv_down_proj.weight": "self_attn.kv_a_proj_with_mqa.weight", + "self_attention.linear_kv_up_proj.layer_norm_weight": "self_attn.kv_a_layernorm.weight", + "self_attention.linear_kv_up_proj.weight": "self_attn.kv_b_proj.weight", + "self_attention.linear_q_down_proj.weight": "self_attn.q_a_proj.weight", + "self_attention.linear_q_up_proj.weight": "self_attn.q_b_proj.weight", + "self_attention.linear_q_up_proj.layer_norm_weight": "self_attn.q_a_layernorm.weight", + } + assert len(params) == 1 + convert_names = [] + layer_number = name.split(".")[2] + name_after_layer = name.split(f'.{layer_number}.')[1] + convert_names.append(f"model.layers.{layer_number}.{name_map_after_layer[name_after_layer]}") + return convert_names, params + + def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]: + # mcore dense + # 'decoder.layers.0.mlp.linear_fc1.layer_norm_weight' + # 'decoder.layers.0.mlp.linear_fc2.weight' + # 'decoder.layers.0.mlp.linear_fc1.weight' + # --- + # 'decoder.layers.1.mlp.shared_experts.linear_fc1.weight' + # --- + # 'decoder.layers.1.mlp.shared_experts.linear_fc2.weight' + # hf dense + # 'model.layers.0.post_attention_layernorm.weight' + # 'model.layers.0.mlp.down_proj.weight' + # 
'model.layers.0.mlp.gate_proj.weight' + # 'model.layers.0.mlp.up_proj.weight' + # 'model.layers.1.mlp.shared_experts.gate_proj.weight' + # 'model.layers.1.mlp.shared_experts.up_proj.weight' + # 'model.layers.1.mlp.shared_experts.down_proj.weight' + + # mcore moe + # 'decoder.layers.1.pre_mlp_layernorm.weight' + # 'decoder.layers.1.mlp.router.weight' + # 'decoder.layers.1.mlp.router.expert_bias' + # 'decoder.layers.1.mlp.experts.linear_fc1.weight0' + # --- + # 'decoder.layers.1.mlp.experts.linear_fc2.weight0' + # hf moe + # 'model.layers.1.post_attention_layernorm.weight' + # 'model.layers.1.mlp.gate.weight' + # 'model.layers.1.mlp.gate.e_score_correction_bias' + # 'model.layers.1.mlp.experts.0.gate_proj.weight' + # 'model.layers.1.mlp.experts.0.up_proj.weight' + # 'model.layers.1.mlp.experts.0.down_proj.weight' + + name_map_after_layer = { + "mlp.linear_fc1.layer_norm_weight": "post_attention_layernorm.weight", + "mlp.linear_fc2.weight": "mlp.down_proj.weight", + "mlp.shared_experts.linear_fc2.weight": "mlp.shared_experts.down_proj.weight", + "mlp.linear_fc1.weight": ["mlp.gate_proj.weight", "mlp.up_proj.weight"], + "mlp.shared_experts.linear_fc1.weight": ["mlp.shared_experts.gate_proj.weight", "mlp.shared_experts.up_proj.weight"], + "pre_mlp_layernorm.weight": "post_attention_layernorm.weight", + "mlp.router.weight": "mlp.gate.weight", + "mlp.router.expert_bias": "mlp.gate.e_score_correction_bias", + } + convert_names = [] + layer_number = name.split(".")[2] + name_after_layer = name.split(f'.{layer_number}.')[1] + if name_after_layer in name_map_after_layer: + mapped_name = name_map_after_layer[name_after_layer] + if isinstance(mapped_name, list): + assert len(params) == len(mapped_name) + for one in mapped_name: + convert_names.append(f"model.layers.{layer_number}.{one}") + else: + assert len(params) == 1 + convert_names.append(f"model.layers.{layer_number}.{mapped_name}") + else: + if "mlp.experts.linear_fc1.weight" in name: + expert_id = name.split("weight")[-1] + convert_names.append(f"model.layers.{layer_number}.mlp.experts.{expert_id}.gate_proj.weight") + convert_names.append(f"model.layers.{layer_number}.mlp.experts.{expert_id}.up_proj.weight") + assert len(params) == 2 + elif "mlp.experts.linear_fc2.weight" in name: + expert_id = name.split("weight")[-1] + convert_names.append(f"model.layers.{layer_number}.mlp.experts.{expert_id}.down_proj.weight") + assert len(params) == 1 + else: + raise NotImplementedError(f"Unsupported parameter name: {name}") + + return convert_names, params + + + def convert_param(self, name: str, params_one_group: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]: + direct_name_mapping = { + "embedding.word_embeddings.weight": "model.embed_tokens.weight", + "decoder.final_layernorm.weight": "model.norm.weight", + "output_layer.weight": "lm_head.weight", + } + if name in direct_name_mapping: + return [direct_name_mapping[name]], [params_one_group[0]] + + if "self_attention" in name or "input_layernorm.weight" in name: + return self._convert_attention_param(name, params_one_group) + elif "mlp" in name: + return self._convert_mlp_param(name, params_one_group) + else: + raise NotImplementedError(f"Unsupported parameter name: {name}") \ No newline at end of file diff --git a/verl/models/weight_loader_registry.py b/verl/models/weight_loader_registry.py index 31942b7cfe3..5931a1c866f 100644 --- a/verl/models/weight_loader_registry.py +++ b/verl/models/weight_loader_registry.py @@ -30,12 +30,17 @@ def get_weight_loader(arch: str): def 
get_weight_saver(arch: str): - from verl.models.mcore.saver import merge_megatron_ckpt_gptmodel, merge_megatron_ckpt_gptmodel_qwen_moe + from verl.models.mcore.saver import ( + merge_megatron_ckpt_gptmodel, + merge_megatron_ckpt_gptmodel_qwen_moe, + merge_megatron_ckpt_gptmodel_dpskv3, + ) _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY = { "LlamaForCausalLM": merge_megatron_ckpt_gptmodel, "Qwen2ForCausalLM": merge_megatron_ckpt_gptmodel, "Qwen2MoeForCausalLM": merge_megatron_ckpt_gptmodel_qwen_moe, + "DeepseekV3ForCausalLM": merge_megatron_ckpt_gptmodel_dpskv3, } if arch in _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY: return _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY[arch] diff --git a/verl/single_controller/base/megatron/worker.py b/verl/single_controller/base/megatron/worker.py index 5fc71128169..01e493f9ebc 100644 --- a/verl/single_controller/base/megatron/worker.py +++ b/verl/single_controller/base/megatron/worker.py @@ -39,7 +39,7 @@ def get_megatron_rank_info(self): info = DistRankInfo(tp_rank=tp_rank, dp_rank=dp_rank, pp_rank=pp_rank, cp_rank=cp_rank) return info - def _init_hf_config_and_tf_config(self, model_path, dtype, override_model_config): + def _init_hf_config_and_tf_config(self, model_path, dtype, override_model_config, trust_remote_code=False): from transformers import AutoConfig from verl.models.mcore import hf_to_mcore_config @@ -49,10 +49,10 @@ def _init_hf_config_and_tf_config(self, model_path, dtype, override_model_config # Step 1: initialize the tokenizer self.local_path = copy_to_local(model_path) - self.tokenizer = hf_tokenizer(self.local_path) + self.tokenizer = hf_tokenizer(self.local_path, trust_remote_code=trust_remote_code) # Step 2: get the hf - hf_config = AutoConfig.from_pretrained(self.local_path) + hf_config = AutoConfig.from_pretrained(self.local_path, trust_remote_code=trust_remote_code) # Step 3: override the hf config override_config_kwargs = { @@ -68,17 +68,21 @@ def _init_hf_config_and_tf_config(self, model_path, dtype, override_model_config print(f"Model config after override: {hf_config}") tf_config = hf_to_mcore_config(hf_config, dtype) - def add_optimization_config_to_tf_config(tf_config, verl_model_config): + def add_optimization_config_to_tf_config(tf_config): # add optimization config to tf_config, e.g. 
checkpointing - if verl_model_config.get("enable_gradient_checkpointing", False): - gradient_checkpointing_cfg = dict(verl_model_config.get("gradient_checkpointing_kwargs", dict())) + if self.config.model.get("enable_gradient_checkpointing", False): + gradient_checkpointing_cfg = dict(self.config.model.get("gradient_checkpointing_kwargs", dict())) tf_config.recompute_method = gradient_checkpointing_cfg.get("activations_checkpoint_method", "full") tf_config.recompute_granularity = gradient_checkpointing_cfg.get( "activations_checkpoint_granularity", "full" ) tf_config.recompute_num_layers = gradient_checkpointing_cfg.get("activations_checkpoint_num_layers", -1) + if megatron_config:=self.config.get("megatron", {}): + if extra:=megatron_config.get("extra", {}): + for k, v in extra.items(): + setattr(tf_config, k, v) - add_optimization_config_to_tf_config(tf_config, self.config.model) + add_optimization_config_to_tf_config(tf_config) print(f"TF config: {tf_config}") self.hf_config = hf_config diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py index c9ae207a8e7..e452db19239 100644 --- a/verl/workers/megatron_workers.py +++ b/verl/workers/megatron_workers.py @@ -140,7 +140,8 @@ def _build_model_optimizer(self, model_path, optim_config, override_model_config from verl.utils.megatron_utils import get_model, init_megatron_optim_config from verl.utils.model import get_generation_config, print_model_size - self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config) + trust_remote_code = self.config.model.get("trust_remote_code", False) + self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config, trust_remote_code) self.generation_config = get_generation_config(self.local_path) def megatron_actor_model_provider(pre_process, post_process): @@ -504,7 +505,8 @@ def _build_critic_model_optimizer(self, model_path, optim_config, override_model from verl.utils.megatron_utils import get_model, init_megatron_optim_config from verl.utils.model import print_model_size - self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config) + trust_remote_code = self.config.model.get("trust_remote_code", False) + self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config, trust_remote_code) def megatron_critic_model_provider(pre_process, post_process): from verl.models.mcore import init_mcore_model @@ -680,7 +682,8 @@ def _build_rm_model(self, model_path, override_model_config): from verl.utils.megatron_utils import get_model - self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config) + trust_remote_code = self.config.model.get("trust_remote_code", False) + self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config, trust_remote_code) def megatron_rm_model_provider(pre_process, post_process): from verl.models.mcore import init_mcore_model diff --git a/verl/workers/sharding_manager/megatron_vllm.py b/verl/workers/sharding_manager/megatron_vllm.py index 12bc569d78b..f02ae3e49dc 100644 --- a/verl/workers/sharding_manager/megatron_vllm.py +++ b/verl/workers/sharding_manager/megatron_vllm.py @@ -530,7 +530,7 @@ def __enter__(self): self.inference_engine.wake_up() per_tensor_param = self.per_tensor_generator() model = self.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model - _patch_vllm_qwen2_moe_model_weight_loader(model) + _patch_vllm_moe_model_weight_loader(model) loaded_params = model.load_weights(per_tensor_param) info = f"vLLM 
load weights, loaded_params: {len(loaded_params)}" logger.info(info) @@ -595,7 +595,7 @@ def get_micro_data_parallel_rank(): return torch.distributed.get_rank(group=get_micro_data_parallel_group()) -def _patch_vllm_qwen2_moe_model_weight_loader(model): +def _patch_vllm_moe_model_weight_loader(model): # this is a work around to load the weight of vllm qwen2 moe model # it is from a bug from vllm 0.8.2 # all the weights are supposed to have a weight_loader, but the moe weights @@ -613,12 +613,11 @@ def _patch_vllm_qwen2_moe_model_weight_loader(model): # (False, 'model.layers.0.mlp.experts.w13_weight') use mlp.experts.weight_loader # (False, 'model.layers.0.mlp.experts.w2_weight') use mlp.experts.weight_loader from vllm.model_executor.models.qwen2_moe import Qwen2MoeForCausalLM - - if not isinstance(model, Qwen2MoeForCausalLM): - return - for layer in model.model.layers: - mlp = layer.mlp - param_dict = dict(mlp.named_parameters()) - for name, param in param_dict.items(): - if "w13_weight" in name or "w2_weight" in name: - param.weight_loader = mlp.experts.weight_loader + from vllm.model_executor.models.deepseek_v2 import DeepseekV3ForCausalLM + if isinstance(model, DeepseekV3ForCausalLM) or isinstance(model, Qwen2MoeForCausalLM): + for layer in model.model.layers: + mlp = layer.mlp + param_dict = dict(mlp.named_parameters()) + for name, param in param_dict.items(): + if "w13_weight" in name or "w2_weight" in name: + param.weight_loader = mlp.experts.weight_loader From 880184175d9e51d8c9a51bfa7fbd7886728053ad Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Sun, 27 Apr 2025 20:30:07 -0700 Subject: [PATCH 14/19] fix --- verl/models/mcore/config_converter.py | 6 +++-- verl/models/mcore/model_initializer.py | 31 ++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py index 9602df1c0fb..967d8510f20 100644 --- a/verl/models/mcore/config_converter.py +++ b/verl/models/mcore/config_converter.py @@ -150,10 +150,11 @@ def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype) - mla_rope_config = { "beta_fast": 32, "beta_slow": 1, - "factor": 40, + "factor": 1, "mscale": 1.0, "mscale_all_dim": 1.0, "original_max_position_embeddings": 4096, + "type": "rope", } if "rope_scaling" in hf_config and hf_config.rope_scaling is not None: mla_rope_config.update(hf_config.rope_scaling) @@ -177,7 +178,7 @@ def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype) - variable_seq_lengths=True, masked_softmax_fusion=True, # attention_backend=AttnBackend.flash, - attention_backend=AttnBackend.unfused, + attention_backend=AttnBackend.fused, bf16=dtype is torch.bfloat16, layernorm_epsilon=hf_config.rms_norm_eps, ffn_hidden_size=hf_config.intermediate_size, @@ -218,6 +219,7 @@ def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype) - v_head_dim=hf_config.v_head_dim, rotary_base=hf_config.rope_theta, rotary_scaling_factor=mla_rope_config["factor"], + rope_type=mla_rope_config["type"], mscale=mla_rope_config["mscale"], mscale_all_dim=mla_rope_config["mscale_all_dim"], max_position_embeddings=mla_rope_config["original_max_position_embeddings"], diff --git a/verl/models/mcore/model_initializer.py b/verl/models/mcore/model_initializer.py index 0be8c9eb7ac..e83ce4960a4 100644 --- a/verl/models/mcore/model_initializer.py +++ b/verl/models/mcore/model_initializer.py @@ -129,11 +129,38 @@ def init_mcore_model_dpskv3( post_process=None, 
share_embeddings_and_output_weights=False, value=False, + freeze_moe_router=True, **extra_kwargs, ): - return init_mcore_model_dense( - tfconfig, hf_config, pre_process, post_process, share_embeddings_and_output_weights, value, **extra_kwargs + from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec + from megatron.core.models.gpt.gpt_model import GPTModel + + use_te = True + if freeze_moe_router: + tfconfig.moe_router_load_balancing_type = "none" + + assert tfconfig.normalization == "RMSNorm", "only RMSNorm is supported for now" + transformer_layer_spec = get_gpt_decoder_block_spec(tfconfig, use_transformer_engine=use_te) + model = GPTModel( + config=tfconfig, + transformer_layer_spec=transformer_layer_spec, + vocab_size=hf_config.vocab_size, + max_sequence_length=hf_config.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + share_embeddings_and_output_weights=share_embeddings_and_output_weights, + position_embedding_type="rope", + rotary_base=hf_config.rope_theta, ) + if freeze_moe_router: + for layer in model.decoder.layers: + if hasattr(layer.mlp, "router"): + layer.mlp.router.weight.requires_grad = False + + if post_process and value: + from verl.models.llama.megatron.layers.parallel_linear import LinearForLastLayer + model.output_layer = LinearForLastLayer(input_size=tfconfig.hidden_size, output_size=1, config=tfconfig) + return model def init_mcore_model_qwen2_5_vl( From e5d6ca0f4acf75c6e2a084dc33f841d18487721d Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Sun, 27 Apr 2025 20:34:34 -0700 Subject: [PATCH 15/19] typo --- scripts/converter_hf_to_mcore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/converter_hf_to_mcore.py b/scripts/converter_hf_to_mcore.py index f917a9c1d58..77bd981249a 100644 --- a/scripts/converter_hf_to_mcore.py +++ b/scripts/converter_hf_to_mcore.py @@ -97,7 +97,7 @@ def convert_checkpoint_from_transformers_to_megatron(hf_model, model, hf_config, @torch.no_grad() def convert_checkpoint_from_transformers_to_megatron_dpskv3(hf_model, model, hf_config, tfconfig): - warnings.warn("MPT model is not supported yet") + warnings.warn("MTP model is not supported yet") def safe_copy( src_tensor: torch.Tensor, dst_tensor: torch.Tensor, From ae550a889a3a60d3ccc65304f035253b35cf4021 Mon Sep 17 00:00:00 2001 From: Yan Bai Date: Mon, 28 Apr 2025 05:38:21 -0700 Subject: [PATCH 16/19] add scripts --- .../run_moonlight16b_a3b_gsm8k_megatron.sh | 81 +++++++++++++++++++ verl/models/mcore/config_converter.py | 2 +- 2 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh diff --git a/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh b/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh new file mode 100644 index 00000000000..efd788f0eec --- /dev/null +++ b/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh @@ -0,0 +1,81 @@ +set -x + +# If you are using vllm<=0.6.3, you might need to set the following environment variable to avoid bugs: +# export VLLM_ATTENTION_BACKEND=XFORMERS +export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping + + +# 0. download the model +huggingface-cli download moonshotai/Moonlight-16B-A3B-Instruct + +# 1. 
convert the model to mcore format +# change the HF_MODEL_PATH and DIST_CKPT_PATH to your own path +HF_MODEL_PATH=/data/models/moonshotai/Moonlight-16B-A3B-Instruct +DIST_CKPT_PATH=/data/mcore_ckpt/Moonlight-16B-A3B-Instruct +python scripts/converter_hf_to_mcore.py --hf_model_path $HF_MODEL_PATH --output_path $DIST_CKPT_PATH + + +# 2. run the script +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +train_files=$gsm8k_train_path +test_files=$gsm8k_test_path + +NODES=4 +PP=2 +TP=4 +CP=1 +VLLM_TP=4 + +# RAY_ADDRESS='auto' ray job submit --working-dir . -- +python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=512 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + +data.trust_remote_code=True \ + actor_rollout_ref.model.path=$LLM \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \ + critic.optim.lr=1e-5 \ + critic.model.path=$LLM \ + critic.model.enable_gradient_checkpointing=False \ + critic.ppo_micro_batch_size_per_gpu=4 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger=['console','wandb'] \ + trainer.project_name='verl_megatron_gsm8k_examples' \ + trainer.experiment_name='moonlight_freeze_moe_router' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=$NODES \ + trainer.save_freq=-1 \ + trainer.test_freq=5 \ + +actor_rollout_ref.model.trust_remote_code=True \ + +critic.model.trust_remote_code=True \ + +actor_rollout_ref.megatron.extra.num_layers_in_last_pipeline_stage=13 \ + +critic.megatron.extra.num_layers_in_last_pipeline_stage=13 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=$VLLM_TP \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP \ + critic.megatron.pipeline_model_parallel_size=$PP \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP \ + critic.megatron.tensor_model_parallel_size=$TP \ + actor_rollout_ref.actor.megatron.use_dist_checkpointing=True \ + actor_rollout_ref.ref.megatron.use_dist_checkpointing=True \ + critic.megatron.use_dist_checkpointing=True \ + actor_rollout_ref.actor.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ + actor_rollout_ref.ref.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ + critic.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ + trainer.val_before_train=False \ + trainer.total_epochs=100 $@ + \ No newline at end of file diff --git a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py index 1ed2ed6f995..84bd1f30520 100644 --- a/verl/models/mcore/config_converter.py +++ b/verl/models/mcore/config_converter.py @@ -138,7 +138,7 @@ def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype) - "mscale": 1.0, "mscale_all_dim": 1.0, "original_max_position_embeddings": 4096, - "type": "rope", + # "type": "rope", } if "rope_scaling" in hf_config and hf_config.rope_scaling is not None: 
mla_rope_config.update(hf_config.rope_scaling) From dce2c402fd4f7a3286375659b700b9f9cad9052b Mon Sep 17 00:00:00 2001 From: spacegoing Date: Fri, 23 May 2025 03:20:17 +0000 Subject: [PATCH 17/19] [Fix] config_converter signature --- verl/models/mcore/config_converter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py index a926ef70a2a..f6bb361c133 100644 --- a/verl/models/mcore/config_converter.py +++ b/verl/models/mcore/config_converter.py @@ -191,6 +191,8 @@ def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype, * moe_layer_freq[i] = 0 base_config = _get_base_transformer_config( + hf_config=hf_config, + dtype=dtype, activation_func=F.silu, use_cpu_initialization=False, add_bias_linear=False, @@ -227,7 +229,7 @@ def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype, * "mscale": 1.0, "mscale_all_dim": 1.0, "original_max_position_embeddings": 4096, - # "type": "rope", + "type": "rope", } if "rope_scaling" in hf_config and hf_config.rope_scaling is not None: mla_rope_config.update(hf_config.rope_scaling) From 3e09ccf41823e8f72132d75c3c29918f1e7db58d Mon Sep 17 00:00:00 2001 From: spacegoing Date: Fri, 23 May 2025 03:20:53 +0000 Subject: [PATCH 18/19] [Fix] restore trust remote code arg --- .../run_moonlight16b_a3b_gsm8k_megatron.sh | 3 +-- scripts/converter_hf_to_mcore.py | 11 +++++++---- verl/single_controller/base/megatron/worker.py | 6 +++--- verl/workers/megatron_workers.py | 11 +++++++---- 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh b/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh index efd788f0eec..361d9e8061e 100644 --- a/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh +++ b/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh @@ -12,7 +12,7 @@ huggingface-cli download moonshotai/Moonlight-16B-A3B-Instruct # change the HF_MODEL_PATH and DIST_CKPT_PATH to your own path HF_MODEL_PATH=/data/models/moonshotai/Moonlight-16B-A3B-Instruct DIST_CKPT_PATH=/data/mcore_ckpt/Moonlight-16B-A3B-Instruct -python scripts/converter_hf_to_mcore.py --hf_model_path $HF_MODEL_PATH --output_path $DIST_CKPT_PATH +python scripts/converter_hf_to_mcore.py --hf_model_path $HF_MODEL_PATH --output_path $DIST_CKPT_PATH --trust_remote_code # 2. 
run the script @@ -78,4 +78,3 @@ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megat critic.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ trainer.val_before_train=False \ trainer.total_epochs=100 $@ - \ No newline at end of file diff --git a/scripts/converter_hf_to_mcore.py b/scripts/converter_hf_to_mcore.py index 27c47c8558c..f5048d24c8f 100644 --- a/scripts/converter_hf_to_mcore.py +++ b/scripts/converter_hf_to_mcore.py @@ -35,6 +35,7 @@ def _init_args(): parser.add_argument("--output_path", type=str, required=True, help="The path for the output mcore model") parser.add_argument("--use_cpu_initialization", action="store_true", help="Whether to use cpu initialization") parser.add_argument("--test", action="store_true", help="Whether to test the conversion") + parser.add_argument("--trust_remote_code", action="store_true", help="Whether to trust remote hf code") args = parser.parse_args() return args @@ -210,7 +211,7 @@ def safe_copy( model.output_layer.weight.copy_(hf_model.lm_head.weight) -def convert_hf_to_mcore(hf_model_path, output_path, use_cpu_initialization=False, test=False): +def convert_hf_to_mcore(hf_model_path, output_path, use_cpu_initialization=False, test=False, trust_remote_code=False): os.makedirs(output_path, exist_ok=True) if len(os.listdir(output_path)) > 0 and not test: print(f"Output path {output_path} is not empty, skipping conversion") @@ -231,7 +232,7 @@ def convert_hf_to_mcore(hf_model_path, output_path, use_cpu_initialization=False model_parallel_cuda_manual_seed(0) # init hf config - hf_config = AutoConfig.from_pretrained(hf_model_path) + hf_config = AutoConfig.from_pretrained(hf_model_path, trust_remote_code=trust_remote_code) print(hf_config, flush=True) cfg = Config() @@ -265,7 +266,9 @@ def megatron_model_provider(pre_process, post_process): warnings.simplefilter("ignore") # init hf model - hf_model = AutoModelForCausalLM.from_pretrained(hf_model_path, torch_dtype=torch.bfloat16) + hf_model = AutoModelForCausalLM.from_pretrained(hf_model_path, + torch_dtype=torch.bfloat16, + trust_remote_code=trust_remote_code) hf_state_dict = hf_model.state_dict() # load hf state dict to megatron model @@ -299,4 +302,4 @@ def megatron_model_provider(pre_process, post_process): if __name__ == "__main__": args = _init_args() - convert_hf_to_mcore(args.hf_model_path, args.output_path, args.use_cpu_initialization, args.test) + convert_hf_to_mcore(args.hf_model_path, args.output_path, args.use_cpu_initialization, args.test, args.trust_remote_code) diff --git a/verl/single_controller/base/megatron/worker.py b/verl/single_controller/base/megatron/worker.py index 7615367e9fe..251e9585a0b 100644 --- a/verl/single_controller/base/megatron/worker.py +++ b/verl/single_controller/base/megatron/worker.py @@ -39,7 +39,7 @@ def get_megatron_rank_info(self): info = DistRankInfo(tp_rank=tp_rank, dp_rank=dp_rank, pp_rank=pp_rank, cp_rank=cp_rank) return info - def _init_hf_config_and_tf_config(self, model_path, dtype, override_model_config, override_transformer_config): + def _init_hf_config_and_tf_config(self, model_path, dtype, override_model_config, override_transformer_config, trust_remote_code=False): from transformers import AutoConfig from verl.models.mcore import hf_to_mcore_config @@ -49,10 +49,10 @@ def _init_hf_config_and_tf_config(self, model_path, dtype, override_model_config # Step 1: initialize the tokenizer self.local_path = copy_to_local(model_path) - self.tokenizer = hf_tokenizer(self.local_path) + self.tokenizer = 
hf_tokenizer(self.local_path, trust_remote_code=trust_remote_code) # Step 2: get the hf - hf_config = AutoConfig.from_pretrained(self.local_path) + hf_config = AutoConfig.from_pretrained(self.local_path, trust_remote_code=trust_remote_code) # Step 3: override the hf config override_config_kwargs = { diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py index 01587c43dec..1c0af9cca00 100644 --- a/verl/workers/megatron_workers.py +++ b/verl/workers/megatron_workers.py @@ -142,8 +142,8 @@ def _build_model_optimizer(self, model_path, optim_config, override_model_config from verl.utils.megatron.optimizer import get_megatron_optimizer from verl.utils.megatron_utils import get_model, init_megatron_optim_config from verl.utils.model import get_generation_config, print_model_size - - self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config, override_transformer_config) + trust_remote_code=self.config.model.get("trust_remote_code", False) + self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config, override_transformer_config, trust_remote_code=trust_remote_code) self.generation_config = get_generation_config(self.local_path) def megatron_actor_model_provider(pre_process, post_process): @@ -237,6 +237,7 @@ def _build_rollout(self, trust_remote_code=False): tokenizer=self.tokenizer, model_hf_config=self.actor_model_config, device_mesh=rollout_device_mesh, + trust_remote_code=trust_remote_code, ) log_gpu_memory_usage("After building vllm rollout", logger=logger) @@ -560,7 +561,8 @@ def _build_critic_model_optimizer(self, model_path, optim_config, override_model from verl.utils.megatron_utils import get_model, init_megatron_optim_config from verl.utils.model import print_model_size - self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config, override_transformer_config) + trust_remote_code=self.config.model.get("trust_remote_code", False) + self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config, override_transformer_config, trust_remote_code=trust_remote_code) def megatron_critic_model_provider(pre_process, post_process): from verl.models.mcore import init_mcore_model @@ -752,7 +754,8 @@ def _build_rm_model(self, model_path, override_model_config, override_transforme from verl.utils.megatron_utils import get_model - self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config, override_transformer_config) + trust_remote_code=self.config.model.get("trust_remote_code", False) + self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config, override_transformer_config, trust_remote_code=trust_remote_code) def megatron_rm_model_provider(pre_process, post_process): from verl.models.mcore import init_mcore_model From b1df278d64545c999ff9afe20c01b7e077b13497 Mon Sep 17 00:00:00 2001 From: spacegoing Date: Fri, 23 May 2025 09:33:32 +0000 Subject: [PATCH 19/19] [Fix] adapt to use base class in config_converter --- verl/models/mcore/config_converter.py | 3 +++ verl/models/mcore/registry.py | 3 ++- verl/workers/megatron_workers.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py index f6bb361c133..e3fccb57683 100644 --- a/verl/models/mcore/config_converter.py +++ b/verl/models/mcore/config_converter.py @@ -222,6 +222,9 @@ def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype, * ) base_config_dict = asdict(base_config) + # transformer 
config default multi_latent_attention = False + base_config_dict.update({"multi_latent_attention": True}) + mla_rope_config = { "beta_fast": 32, "beta_slow": 1, diff --git a/verl/models/mcore/registry.py b/verl/models/mcore/registry.py index c32b78dd8dd..e20670774df 100644 --- a/verl/models/mcore/registry.py +++ b/verl/models/mcore/registry.py @@ -85,7 +85,7 @@ class SupportedModel(Enum): SupportedModel.QWEN2: DenseModel, SupportedModel.QWEN2_MOE: Qwen2MoEModel, SupportedModel.MIXTRAL: MixtralModel, - SupportedModel.DEEPSEEK_V3: DenseModel, + SupportedModel.DEEPSEEK_V3: Dpskv3Model, SupportedModel.QWEN2_5_VL: Qwen25VLModel, SupportedModel.LLAMA4: DenseModel, SupportedModel.QWEN3: DenseModel, @@ -113,6 +113,7 @@ class SupportedModel(Enum): SupportedModel.MIXTRAL: McoreToHFWeightConverterMixtral, SupportedModel.QWEN3: McoreToHFWeightConverterDense, SupportedModel.QWEN3_MOE: McoreToHFWeightConverterQwen3Moe, + SupportedModel.DEEPSEEK_V3: McoreToHFWeightConverterDpskv3, } diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py index 1c0af9cca00..2b7ea092661 100644 --- a/verl/workers/megatron_workers.py +++ b/verl/workers/megatron_workers.py @@ -142,7 +142,7 @@ def _build_model_optimizer(self, model_path, optim_config, override_model_config from verl.utils.megatron.optimizer import get_megatron_optimizer from verl.utils.megatron_utils import get_model, init_megatron_optim_config from verl.utils.model import get_generation_config, print_model_size - trust_remote_code=self.config.model.get("trust_remote_code", False) + trust_remote_code = self.config.model.get("trust_remote_code", False) self._init_hf_config_and_tf_config(model_path, self.dtype, override_model_config, override_transformer_config, trust_remote_code=trust_remote_code) self.generation_config = get_generation_config(self.local_path)
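Note on the fused MLP weights that appear throughout the DeepseekV3/Moonlight conversion in this series: Megatron's gated-SiLU MLP keeps gate_proj and up_proj in a single linear_fc1 tensor, which is why convert_checkpoint_from_transformers_to_megatron_dpskv3 concatenates the two HF tensors and why McoreToHFWeightConverterDpskv3 maps one mcore name back to two HF names. Below is a minimal PyTorch sketch of that round trip with made-up shapes (not Moonlight's real dimensions):

    import torch

    # Illustrative sizes only; real values come from the HF DeepseekV3 config.
    hidden_size, ffn_hidden_size = 16, 64
    gate_proj = torch.randn(ffn_hidden_size, hidden_size)  # hf: mlp.gate_proj.weight
    up_proj = torch.randn(ffn_hidden_size, hidden_size)    # hf: mlp.up_proj.weight

    # HF -> mcore: concatenate gate and up along dim 0 into the fused linear_fc1 weight.
    linear_fc1 = torch.cat([gate_proj, up_proj], dim=0)    # shape (2 * ffn_hidden_size, hidden_size)

    # mcore -> HF: split the fused tensor back into its two halves.
    gate_back, up_back = torch.chunk(linear_fc1, 2, dim=0)
    assert torch.equal(gate_back, gate_proj) and torch.equal(up_back, up_proj)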
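The megatron.extra passthrough added to _init_hf_config_and_tf_config copies arbitrary keys from the worker's megatron.extra config onto the transformer config via setattr, which appears to be how the Moonlight example script's +actor_rollout_ref.megatron.extra.num_layers_in_last_pipeline_stage=13 override reaches the config. A minimal sketch of that mechanism, using a SimpleNamespace as a stand-in for the real mcore TransformerConfig and a hand-built dict in place of the Hydra/OmegaConf worker config:

    from types import SimpleNamespace

    # Stand-in for the TransformerConfig returned by hf_to_mcore_config().
    tf_config = SimpleNamespace(num_layers=27, pipeline_model_parallel_size=2)

    # Fragment of the worker config as produced by an override such as
    #   +actor_rollout_ref.megatron.extra.num_layers_in_last_pipeline_stage=13
    megatron_cfg = {"extra": {"num_layers_in_last_pipeline_stage": 13}}

    # Same loop as inside add_optimization_config_to_tf_config: every key under
    # megatron.extra lands on the transformer config unchanged.
    for k, v in megatron_cfg.get("extra", {}).items():
        setattr(tf_config, k, v)

    assert tf_config.num_layers_in_last_pipeline_stage == 13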
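The grouped-GEMM expert weights are the one place in McoreToHFWeightConverterDpskv3 where the expert index is parsed out of the parameter name itself (mcore stores per-expert tensors as linear_fc1.weight0, weight1, ... while HF keys them as experts.0, experts.1, ...). A condensed restatement of just that naming rule as a standalone helper (dpskv3_expert_fc1_to_hf is an illustrative name, not a function defined in the patch):

    def dpskv3_expert_fc1_to_hf(mcore_name: str) -> list:
        """Map e.g. 'decoder.layers.1.mlp.experts.linear_fc1.weight0' to its HF names."""
        layer_number = mcore_name.split(".")[2]       # '1'
        expert_id = mcore_name.split("weight")[-1]    # '0'
        prefix = f"model.layers.{layer_number}.mlp.experts.{expert_id}"
        # linear_fc1 is the fused gate/up tensor, so one mcore name maps to two HF names.
        return [f"{prefix}.gate_proj.weight", f"{prefix}.up_proj.weight"]

    assert dpskv3_expert_fc1_to_hf("decoder.layers.1.mlp.experts.linear_fc1.weight0") == [
        "model.layers.1.mlp.experts.0.gate_proj.weight",
        "model.layers.1.mlp.experts.0.up_proj.weight",
    ]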