From 0350b4ef4ef8892615e209b36e8dde4ca42d9232 Mon Sep 17 00:00:00 2001 From: yaoyu-33 Date: Sat, 28 Feb 2026 11:37:13 -0700 Subject: [PATCH 1/4] remove deprecated providers Signed-off-by: yaoyu-33 --- src/megatron/bridge/models/__init__.py | 131 +---- .../bridge/models/conversion/model_bridge.py | 4 +- .../bridge/models/deepseek/__init__.py | 24 +- .../models/deepseek/deepseek_provider.py | 203 -------- src/megatron/bridge/models/glm/__init__.py | 8 - .../bridge/models/glm/glm45_provider.py | 135 ----- .../bridge/models/glm_vl/glm_45v_provider.py | 92 +++- .../bridge/models/gpt_oss/__init__.py | 8 - .../bridge/models/gpt_oss/gpt_oss_provider.py | 106 ---- src/megatron/bridge/models/kimi/__init__.py | 4 +- .../bridge/models/kimi/kimi_provider.py | 128 ----- src/megatron/bridge/models/llama/__init__.py | 43 +- .../bridge/models/llama/llama_provider.py | 383 --------------- .../bridge/models/llama_nemotron/__init__.py | 8 - .../llama_nemotron/llama_nemotron_bridge.py | 3 +- .../llama_nemotron/llama_nemotron_provider.py | 117 +---- .../bridge/models/nemotron/__init__.py | 14 - .../models/nemotron/nemotron_provider.py | 147 ------ .../nemotron_vl/nemotron_vl_provider.py | 85 ++-- .../bridge/models/nemotronh/__init__.py | 30 -- .../models/nemotronh/nemotron_h_provider.py | 271 ---------- src/megatron/bridge/recipes/kimi/kimi_k2.py | 89 +++- .../bridge/recipes/moonlight/moonlight_16b.py | 97 +++- .../recipes/nemotronh/nemotron_3_nano.py | 160 +++++- .../recipes/nemotronh/nemotron_nano_v2.py | 222 ++++++++- .../bridge/recipes/nemotronh/nemotronh.py | 445 +++++++++++++++-- tests/functional_tests/data/test_samplers.py | 20 +- .../models/gpt_oss/test_gpt_oss_provider.py | 81 --- .../training/test_callbacks.py | 28 +- .../training/test_decentralized_pg.py | 148 +++++- .../training/test_finetune_dora.py | 23 +- .../training/test_finetune_lora.py | 23 +- .../training/test_inprocess_restart.py | 29 +- .../training/test_megatron_fsdp.py | 25 +- .../training/test_nvrx_straggler.py | 29 +- .../training/test_pretrain.py | 54 +- .../training/test_pretrain_resume.py | 23 +- .../training/test_sample_based_training.py | 28 +- tests/functional_tests/training/test_sft.py | 23 +- .../training/test_tensor_inspect.py | 28 +- .../models/deepseek/test_deepseek_provider.py | 107 ---- .../models/glm/test_glm45_provider.py | 115 ----- .../models/glm_vl/test_glm_45v_provider.py | 13 +- .../models/gpt_oss/test_gpt_oss_provider.py | 58 --- .../models/kimi/test_kimi_provider.py | 107 ---- .../models/llama/test_llama_provider.py | 339 ------------- .../test_llama_nemotron_bridge.py | 111 ----- .../models/nemotron/test_nemotron_provider.py | 139 ------ .../nemotronh/test_nemotron_h_provider.py | 462 ------------------ .../unit_tests/models/test_models_imports.py | 21 +- tests/unit_tests/recipes/kimi/test_kimi_k2.py | 4 +- .../recipes/nemotronh/test_nemotron_3_nano.py | 12 +- .../nemotronh/test_nemotron_nano_v2.py | 13 +- .../recipes/nemotronh/test_nemotronh.py | 23 +- .../unit_tests/recipes/test_glm45_recipes.py | 22 +- tests/unit_tests/recipes/test_run_plugins.py | 28 +- tests/unit_tests/training/test_config.py | 8 +- .../training/test_log_non_default_values.py | 8 +- 58 files changed, 1607 insertions(+), 3502 deletions(-) delete mode 100644 src/megatron/bridge/models/deepseek/deepseek_provider.py delete mode 100644 src/megatron/bridge/models/glm/glm45_provider.py delete mode 100644 src/megatron/bridge/models/gpt_oss/gpt_oss_provider.py delete mode 100644 src/megatron/bridge/models/kimi/kimi_provider.py delete mode 100644 
src/megatron/bridge/models/llama/llama_provider.py delete mode 100644 src/megatron/bridge/models/nemotron/nemotron_provider.py delete mode 100644 src/megatron/bridge/models/nemotronh/nemotron_h_provider.py delete mode 100644 tests/functional_tests/models/gpt_oss/test_gpt_oss_provider.py delete mode 100644 tests/unit_tests/models/deepseek/test_deepseek_provider.py delete mode 100644 tests/unit_tests/models/glm/test_glm45_provider.py delete mode 100644 tests/unit_tests/models/gpt_oss/test_gpt_oss_provider.py delete mode 100644 tests/unit_tests/models/kimi/test_kimi_provider.py delete mode 100644 tests/unit_tests/models/llama/test_llama_provider.py delete mode 100644 tests/unit_tests/models/nemotron/test_nemotron_provider.py delete mode 100644 tests/unit_tests/models/nemotronh/test_nemotron_h_provider.py diff --git a/src/megatron/bridge/models/__init__.py b/src/megatron/bridge/models/__init__.py index 4fd486022c..b05607100c 100644 --- a/src/megatron/bridge/models/__init__.py +++ b/src/megatron/bridge/models/__init__.py @@ -25,18 +25,6 @@ ReplicatedMapping, RowParallelMapping, ) -from megatron.bridge.models.deepseek import ( - DeepSeekModelProvider, - DeepSeekProvider, - DeepSeekV2LiteModelProvider, - DeepSeekV2LiteProvider, - DeepSeekV2ModelProvider, - DeepSeekV2Provider, - DeepSeekV3ModelProvider, - DeepSeekV3Provider, - MoonlightModelProvider16B, - MoonlightProvider, -) from megatron.bridge.models.gemma import ( CodeGemmaModelProvider2B, CodeGemmaModelProvider7B, @@ -59,10 +47,7 @@ Gemma3VLModelProvider, ) from megatron.bridge.models.glm import ( - GLM45AirModelProvider106B, GLM45Bridge, - GLM45ModelProvider355B, - GLMMoEModelProvider, ) from megatron.bridge.models.glm_vl import ( GLM45VBridge, @@ -70,38 +55,12 @@ ) from megatron.bridge.models.gpt_oss import ( GPTOSSBridge, - GPTOSSProvider, - GPTOSSProvider20B, - GPTOSSProvider120B, ) from megatron.bridge.models.gpt_provider import GPTModelProvider from megatron.bridge.models.llama import ( - CodeLlamaModelProvider7B, - CodeLlamaModelProvider13B, - CodeLlamaModelProvider34B, - CodeLlamaModelProvider70B, - Llama2ModelProvider7B, - Llama2ModelProvider13B, - Llama2ModelProvider70B, - Llama3ModelProvider, - Llama3ModelProvider8B, - Llama3ModelProvider70B, - Llama4Experts16ModelProvider, - Llama4Experts128ModelProvider, - Llama4ModelProvider, - Llama31ModelProvider, - Llama31ModelProvider8B, - Llama31ModelProvider70B, - Llama31ModelProvider405B, - Llama32ModelProvider1B, - Llama32ModelProvider3B, - LlamaModelProvider, + LlamaBridge, ) from megatron.bridge.models.llama_nemotron import ( - Llama31Nemotron70BProvider, - Llama31NemotronNano8BProvider, - Llama31NemotronUltra253BProvider, - Llama33NemotronSuper49BProvider, LlamaNemotronBridge, LlamaNemotronHeterogeneousProvider, ) @@ -120,35 +79,15 @@ MistralSmall3ModelProvider24B, ) from megatron.bridge.models.nemotron import ( - Nemotron3ModelProvider4B, - Nemotron3ModelProvider8B, - Nemotron3ModelProvider22B, - Nemotron4ModelProvider15B, - Nemotron4ModelProvider340B, NemotronBridge, - NemotronModelProvider, ) from megatron.bridge.models.nemotron_vl import ( - NemotronNano12Bv2Provider, NemotronNano12Bv2VLModelProvider, NemotronVLBridge, NemotronVLModel, ) -from megatron.bridge.models.nemotronh.nemotron_h_provider import ( - Nemotron3NanoProvider, - NemotronHModel4BProvider, - NemotronHModel8BProvider, - NemotronHModel47BProvider, - NemotronHModel56BProvider, - NemotronHModelProvider, - NemotronHModelProvider4B, - NemotronHModelProvider8B, - NemotronHModelProvider47B, - NemotronHModelProvider56B, - 
NemotronNano9Bv2Provider, - NemotronNano12Bv2Provider, - NemotronNanoModelProvider9Bv2, - NemotronNanoModelProvider12Bv2, +from megatron.bridge.models.nemotronh import ( + NemotronHBridge, ) from megatron.bridge.models.olmoe import ( OlMoEBridge, @@ -218,43 +157,15 @@ "Gemma2ModelProvider2B", "Gemma2ModelProvider9B", "Gemma2ModelProvider27B", - "GLMMoEModelProvider", - "GLM45ModelProvider355B", - "GLM45AirModelProvider106B", "GLM45Bridge", "GLM45VBridge", "GLM45VModelProvider", "GPTModelProvider", "GPTOSSBridge", - "GPTOSSProvider", - "GPTOSSProvider20B", - "GPTOSSProvider120B", "T5ModelProvider", - "LlamaModelProvider", - "Llama2ModelProvider7B", - "Llama2ModelProvider13B", - "Llama2ModelProvider70B", - "Llama3ModelProvider", - "Llama3ModelProvider8B", - "Llama3ModelProvider70B", - "Llama31ModelProvider", - "Llama31ModelProvider8B", - "Llama31ModelProvider70B", - "Llama31ModelProvider405B", - "Llama32ModelProvider1B", - "Llama32ModelProvider3B", - "CodeLlamaModelProvider7B", - "CodeLlamaModelProvider13B", - "CodeLlamaModelProvider34B", - "CodeLlamaModelProvider70B", - "Llama4ModelProvider", - "Llama4Experts16ModelProvider", - "Llama4Experts128ModelProvider", + "LlamaBridge", "LlamaNemotronHeterogeneousProvider", - "Llama31NemotronNano8BProvider", - "Llama31Nemotron70BProvider", - "Llama31NemotronUltra253BProvider", - "Llama33NemotronSuper49BProvider", + "LlamaNemotronBridge", "MistralModelProvider", "MistralSmall3ModelProvider24B", # Ministral 3 Models @@ -288,40 +199,11 @@ "Qwen3MoEModelProvider", "Qwen3MoEModelProvider30B_A3B", "Qwen3MoEModelProvider235B_A22B", - "DeepSeekModelProvider", - "DeepSeekProvider", - "DeepSeekV2LiteModelProvider", - "DeepSeekV2LiteProvider", - "DeepSeekV2ModelProvider", - "DeepSeekV2Provider", - "DeepSeekV3ModelProvider", - "DeepSeekV3Provider", - "MoonlightModelProvider16B", - "MoonlightProvider", - "NemotronHModelProvider", - "NemotronHModelProvider4B", - "NemotronHModelProvider8B", - "NemotronHModelProvider47B", - "NemotronHModelProvider56B", - "NemotronNanoModelProvider9Bv2", - "NemotronNanoModelProvider12Bv2", - "NemotronHModel4BProvider", - "NemotronHModel8BProvider", - "NemotronHModel47BProvider", - "NemotronHModel56BProvider", - "NemotronNano9Bv2Provider", - "NemotronNano12Bv2Provider", - "Nemotron3NanoProvider", + "NemotronHBridge", "MambaModelProvider", "MimoBridge", # Nemotron Models "NemotronBridge", - "NemotronModelProvider", - "Nemotron3ModelProvider4B", - "Nemotron3ModelProvider8B", - "Nemotron3ModelProvider22B", - "Nemotron4ModelProvider15B", - "Nemotron4ModelProvider340B", # VL Models "Qwen25VLModel", "Qwen25VLBridge", @@ -336,6 +218,5 @@ "Gemma3VLModelProvider", "NemotronVLModel", "NemotronVLBridge", - "NemotronNano12Bv2Provider", "NemotronNano12Bv2VLModelProvider", ] diff --git a/src/megatron/bridge/models/conversion/model_bridge.py b/src/megatron/bridge/models/conversion/model_bridge.py index c5a6155b12..49029dfdd5 100644 --- a/src/megatron/bridge/models/conversion/model_bridge.py +++ b/src/megatron/bridge/models/conversion/model_bridge.py @@ -203,8 +203,8 @@ class MegatronCausalLlamaBridge(MegatronModelBridge): .. code-block:: python - def provider_bridge(self, hf_pretrained) -> LlamaModelProvider: - return LlamaModelProvider( + def provider_bridge(self, hf_pretrained) -> GPTModelProvider: + return GPTModelProvider( num_layers=hf_pretrained.config.num_hidden_layers, hidden_size=hf_pretrained.config.hidden_size, ... 
diff --git a/src/megatron/bridge/models/deepseek/__init__.py b/src/megatron/bridge/models/deepseek/__init__.py index 50d137fe02..af9620a389 100644 --- a/src/megatron/bridge/models/deepseek/__init__.py +++ b/src/megatron/bridge/models/deepseek/__init__.py @@ -12,31 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -from megatron.bridge.models.deepseek.deepseek_provider import ( - DeepSeekModelProvider, - DeepSeekProvider, - DeepSeekV2LiteModelProvider, - DeepSeekV2LiteProvider, - DeepSeekV2ModelProvider, - DeepSeekV2Provider, - DeepSeekV3ModelProvider, - DeepSeekV3Provider, - MoonlightModelProvider16B, - MoonlightProvider, -) from megatron.bridge.models.deepseek.deepseek_v2_bridge import DeepSeekV2Bridge # noqa: F401 from megatron.bridge.models.deepseek.deepseek_v3_bridge import DeepSeekV3Bridge # noqa: F401 __all__ = [ - "DeepSeekModelProvider", - "DeepSeekV2LiteModelProvider", - "DeepSeekV2ModelProvider", - "DeepSeekV3ModelProvider", - "MoonlightModelProvider16B", - "DeepSeekProvider", - "DeepSeekV2LiteProvider", - "DeepSeekV2Provider", - "DeepSeekV3Provider", - "MoonlightProvider", + "DeepSeekV2Bridge", + "DeepSeekV3Bridge", ] diff --git a/src/megatron/bridge/models/deepseek/deepseek_provider.py b/src/megatron/bridge/models/deepseek/deepseek_provider.py deleted file mode 100644 index 2e02d4cfe1..0000000000 --- a/src/megatron/bridge/models/deepseek/deepseek_provider.py +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import warnings -from dataclasses import dataclass, field -from typing import Callable, List, Optional, Union - -import torch -import torch.nn.functional as F - -from megatron.bridge.models.mla_provider import MLAModelProvider -from megatron.bridge.utils.common_utils import get_rank_safe - - -def _warn_deprecated(old_cls: str, new_cls: str = "MLAModelProvider") -> None: - if get_rank_safe() == 0: - warnings.warn( - f"{old_cls} is deprecated and will be removed in a future release. " - f"Use {new_cls} with MEGATRON_DEFAULTS in the bridge instead.", - DeprecationWarning, - stacklevel=3, - ) - - -@dataclass -class DeepSeekModelProvider(MLAModelProvider): - """Deprecated alias for ``MLAModelProvider``. - - Deprecated: - This alias remains for backward compatibility and will be removed in a - future release. Use ``MLAModelProvider`` instead. 
- """ - - # Common DeepSeek defaults - normalization: str = "RMSNorm" - activation_func: Callable = F.silu - gated_linear_unit: bool = True - position_embedding_type: str = "rope" - add_bias_linear: bool = False - share_embeddings_and_output_weights: bool = False - qk_layernorm: bool = True - bf16: bool = True - params_dtype: torch.dtype = torch.bfloat16 - moe_grouped_gemm: bool = True - moe_token_dispatcher_type: str = "alltoall" - # MLA defaults - q_lora_rank: Optional[int] = 1536 - kv_lora_rank: int = 512 - - def __post_init__(self) -> None: - _warn_deprecated("DeepSeekModelProvider") - super().__post_init__() - - -@dataclass -class DeepSeekV2ModelProvider(MLAModelProvider): - """ - DeepSeek-V2 Model: https://github.com/deepseek-ai/DeepSeek-V2 - """ - - num_layers: int = 60 - hidden_size: int = 5120 - ffn_hidden_size: int = 12288 - num_moe_experts: int = 160 - moe_ffn_hidden_size: int = 1536 - moe_shared_expert_intermediate_size: int = 3072 # 1536 * 2 shared experts - moe_layer_freq: Union[int, List[int]] = field(default_factory=lambda: [0] + [1] * 59) # first layer is dense - moe_router_topk: int = 6 - moe_router_num_groups: int = 8 - moe_router_group_topk: int = 3 - moe_router_topk_scaling_factor: float = 16.0 - moe_aux_loss_coeff: float = 1e-3 - mscale: float = 0.707 - mscale_all_dim: float = 0.707 - vocab_size: int = 102400 - - def __post_init__(self) -> None: - _warn_deprecated("DeepSeekV2ModelProvider") - super().__post_init__() - - -@dataclass -class DeepSeekV2LiteModelProvider(MLAModelProvider): - """ - DeepSeek-V2-Lite Model: https://github.com/deepseek-ai/DeepSeek-V2 - HuggingFace: https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite - """ - - num_layers: int = 27 - hidden_size: int = 2048 - ffn_hidden_size: int = 10944 - num_attention_heads: int = 16 - kv_channels: int = 16 - q_lora_rank: Optional[int] = None - num_moe_experts: int = 64 - moe_ffn_hidden_size: int = 1408 - moe_shared_expert_intermediate_size: int = 2816 # 1408 * 2 shared experts - moe_layer_freq: Union[int, List[int]] = field(default_factory=lambda: [0] + [1] * 26) # first layer is dense - moe_router_topk: int = 6 - moe_router_num_groups: int = 1 - moe_router_group_topk: int = 1 - moe_router_topk_scaling_factor: float = 1.0 - mscale: float = 0.707 - mscale_all_dim: float = 0.707 - vocab_size: int = 102400 - - def __post_init__(self) -> None: - _warn_deprecated("DeepSeekV2LiteModelProvider") - super().__post_init__() - - -@dataclass -class DeepSeekV3ModelProvider(MLAModelProvider): - """ - DeepSeek-V3 Model: https://github.com/deepseek-ai/DeepSeek-V3 - """ - - num_layers: int = 61 - hidden_size: int = 7168 - ffn_hidden_size: int = 18432 - kv_channels: int = 128 - num_moe_experts: int = 256 - moe_ffn_hidden_size: int = 2048 - moe_shared_expert_intermediate_size: int = 2048 # 2048 * 1 shared expert - moe_layer_freq: Union[int, List[int]] = field( - default_factory=lambda: [0] * 3 + [1] * 58 - ) # first three layers are dense - moe_router_topk: int = 8 - moe_router_num_groups: int = 8 - moe_router_group_topk: int = 4 - moe_router_topk_scaling_factor: float = 2.5 - moe_aux_loss_coeff: float = 1e-4 - make_vocab_size_divisible_by: int = 1280 - moe_router_score_function: str = "sigmoid" - moe_router_enable_expert_bias: bool = True - moe_router_bias_update_rate: float = 1e-3 - mscale: float = 1.0 - mscale_all_dim: float = 1.0 - vocab_size: int = 129280 - - def __post_init__(self) -> None: - _warn_deprecated("DeepSeekV3ModelProvider") - super().__post_init__() - - -@dataclass -class 
MoonlightModelProvider16B(MLAModelProvider): - """ - Moonlight-16B-A3B Model: https://github.com/moonshotai/Moonlight-16B-A3B - - Moonlight is based on DeepSeek-V3. - """ - - max_position_embeddings: int = 4096 - num_layers: int = 27 - hidden_size: int = 2048 - ffn_hidden_size: int = 11264 - num_attention_heads: int = 16 - kv_channels: int = 16 - num_moe_experts: int = 64 - moe_ffn_hidden_size: int = 1408 - moe_shared_expert_intermediate_size: int = 2816 # 1408 * 2 shared expert - moe_layer_freq: Union[int, List[int]] = field(default_factory=lambda: [0] * 1 + [1] * 26) # first layer is dense - moe_router_topk: int = 6 - moe_router_num_groups: int = 1 - moe_router_group_topk: int = 1 - moe_router_topk_scaling_factor: float = 2.446 - moe_aux_loss_coeff: float = 0.001 - make_vocab_size_divisible_by: int = 1280 - moe_router_score_function: str = "sigmoid" - moe_router_enable_expert_bias: bool = True - rotary_scaling_factor: float = 1.0 - mscale: float = 1.0 - mscale_all_dim: float = 1.0 - rotary_base: float = 50000 - layernorm_epsilon: float = 1e-5 - q_lora_rank: int = None - init_method_std: float = 0.02 - moe_router_bias_update_rate: float = 1e-3 - rotary_percent: float = 1.0 - vocab_size: int = 163842 - - def __post_init__(self) -> None: - _warn_deprecated("MoonlightModelProvider16B") - super().__post_init__() - - -# Legacy aliases for backward compatibility -DeepSeekProvider = DeepSeekModelProvider -DeepSeekV2Provider = DeepSeekV2ModelProvider -DeepSeekV2LiteProvider = DeepSeekV2LiteModelProvider -DeepSeekV3Provider = DeepSeekV3ModelProvider -MoonlightProvider = MoonlightModelProvider16B diff --git a/src/megatron/bridge/models/glm/__init__.py b/src/megatron/bridge/models/glm/__init__.py index e356463bc5..aba1a10f73 100644 --- a/src/megatron/bridge/models/glm/__init__.py +++ b/src/megatron/bridge/models/glm/__init__.py @@ -13,16 +13,8 @@ # limitations under the License. from megatron.bridge.models.glm.glm45_bridge import GLM45Bridge -from megatron.bridge.models.glm.glm45_provider import ( - GLM45AirModelProvider106B, - GLM45ModelProvider355B, - GLMMoEModelProvider, -) __all__ = [ - "GLMMoEModelProvider", - "GLM45ModelProvider355B", - "GLM45AirModelProvider106B", "GLM45Bridge", ] diff --git a/src/megatron/bridge/models/glm/glm45_provider.py b/src/megatron/bridge/models/glm/glm45_provider.py deleted file mode 100644 index 219d0ff591..0000000000 --- a/src/megatron/bridge/models/glm/glm45_provider.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -from dataclasses import dataclass, field -from functools import partial -from typing import TYPE_CHECKING, Callable, List, Optional, Union - -import torch -import torch.nn.functional as F -from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec - -from megatron.bridge.models.gpt_provider import GPTModelProvider - - -try: - import transformer_engine # type: ignore # noqa: F401 - - HAVE_TE = True -except (ImportError, ModuleNotFoundError): - HAVE_TE = False -if TYPE_CHECKING: - from megatron.core.transformer import ModuleSpec - - -logger = logging.getLogger(__name__) - - -@dataclass -class GLMMoEModelProvider(GPTModelProvider): - """Base provider for GLM MoE Models.""" - - transformer_layer_spec: Union["ModuleSpec", Callable[["GPTModelProvider"], "ModuleSpec"]] = partial( - get_gpt_decoder_block_spec, use_transformer_engine=HAVE_TE - ) - - normalization: str = "RMSNorm" - activation_func: Callable = F.silu - gated_linear_unit: bool = True - add_bias_linear: bool = False - add_qkv_bias: bool = True - seq_length: int = 131072 - init_method_std: int = 0.02 - hidden_dropout: float = 0.0 - vocab_size: int = 151552 - share_embeddings_and_output_weights: Optional[bool] = False - layernorm_epsilon: float = 1e-5 - autocast_dtype: torch.dtype = torch.bfloat16 - params_dtype: torch.dtype = torch.bfloat16 - bf16: bool = True - - # Attention - num_query_groups: int = 8 - num_attention_heads: int = 96 - attention_dropout: float = 0.0 - kv_channels: int = 128 - - # RoPE - position_embedding_type: str = "rope" - rotary_base: float = 1000000.0 - rotary_percent: float = 0.5 - - # MoE specific parameters - moe_router_topk: int = 8 - moe_shared_expert_overlap: bool = True - moe_token_dispatcher_type: str = "alltoall" - moe_router_load_balancing_type: str = "seq_aux_loss" - moe_aux_loss_coeff: float = 1e-3 - moe_router_pre_softmax: bool = False - moe_grouped_gemm: bool = True - moe_router_score_function: str = "sigmoid" - moe_permute_fusion: bool = True - moe_router_dtype: str = "fp32" - moe_router_enable_expert_bias: bool = True - moe_router_bias_update_rate: float = 0 - - # optimization - persist_layer_norm: bool = True - bias_activation_fusion: bool = True - bias_dropout_fusion: bool = True - - # MTP - mtp_num_layers: Optional[int] = 1 - mtp_loss_scaling_factor: Optional[float] = ( - 0.3 # https://arxiv.org/pdf/2508.06471 0.3 for the first 15T tokens, 0.1 for the remaining tokens. 
- ) - - -@dataclass -class GLM45ModelProvider355B(GLMMoEModelProvider): - """ - Provider for GLM 4.5 355B-A32B: https://huggingface.co/zai-org/GLM-4.5 - """ - - num_layers: int = 92 - num_moe_experts: int = 160 - hidden_size: int = 5120 - ffn_hidden_size: int = 12288 - moe_layer_freq: Union[int, List[int]] = field( - default_factory=lambda: [0] * 3 + [1] * 89 - ) # first three layers are dense - moe_ffn_hidden_size: int = 1536 - moe_shared_expert_intermediate_size: int = 1536 - qk_layernorm: bool = True - moe_router_topk_scaling_factor: float = 2.5 - - -@dataclass -class GLM45AirModelProvider106B(GLMMoEModelProvider): - """ - Provider for GLM 4.5 Air 106B-A12B: https://huggingface.co/zai-org/GLM-4.5-Air - """ - - num_layers: int = 46 - num_moe_experts: int = 128 - hidden_size: int = 4096 - ffn_hidden_size: int = 10944 - moe_layer_freq: Union[int, List[int]] = field( - default_factory=lambda: [0] * 1 + [1] * 45 - ) # first one layer is dense - moe_ffn_hidden_size: int = 1408 - moe_shared_expert_intermediate_size: int = 1408 - qk_layernorm: bool = False - moe_router_topk_scaling_factor: float = 1.0 diff --git a/src/megatron/bridge/models/glm_vl/glm_45v_provider.py b/src/megatron/bridge/models/glm_vl/glm_45v_provider.py index 9ee94c5c87..54545ebb5c 100644 --- a/src/megatron/bridge/models/glm_vl/glm_45v_provider.py +++ b/src/megatron/bridge/models/glm_vl/glm_45v_provider.py @@ -13,29 +13,104 @@ # limitations under the License. from dataclasses import dataclass, field -from typing import List +from functools import partial +from typing import Callable, List, Optional, Union +import torch +import torch.nn.functional as F from megatron.core.models.gpt import GPTModel as MCoreGPTModel +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec +from megatron.core.transformer import ModuleSpec from transformers.models.glm4v.configuration_glm4v import Glm4vVisionConfig -from megatron.bridge.models import GLM45AirModelProvider106B +from megatron.bridge.models.gpt_provider import GPTModelProvider from .modeling_glm_45v import GLM45VModel +try: + import transformer_engine # type: ignore # noqa: F401 + + HAVE_TE = True +except (ImportError, ModuleNotFoundError): + HAVE_TE = False + + @dataclass -class GLM45VModelProvider(GLM45AirModelProvider106B): +class GLM45VModelProvider(GPTModelProvider): """ Base model provider for GLM 4.5 Vision-Language (VL) Models. + Combines GLM 4.5 Air 106B language config with VL-specific settings. """ - # Language configuration inherited from GLM45ModelProvider (GLM 4.5 Air) - # VL models shouldn't scatter embeddings across sequence parallel regions because - # the vision embeddings are going to be inserted into the language embeddings. 
- scatter_embedding_sequence_parallel: bool = False + # GLM MoE base config (from former GLMMoEModelProvider + GLM45AirModelProvider106B) + transformer_layer_spec: Union[ModuleSpec, Callable[["GPTModelProvider"], ModuleSpec]] = partial( + get_gpt_decoder_block_spec, use_transformer_engine=HAVE_TE + ) + normalization: str = "RMSNorm" + activation_func: Callable = F.silu + gated_linear_unit: bool = True + add_bias_linear: bool = False + add_qkv_bias: bool = True + seq_length: int = 131072 + init_method_std: int = 0.02 + hidden_dropout: float = 0.0 + vocab_size: int = 151552 + share_embeddings_and_output_weights: Optional[bool] = False + layernorm_epsilon: float = 1e-5 + autocast_dtype: torch.dtype = torch.bfloat16 + params_dtype: torch.dtype = torch.bfloat16 + bf16: bool = True + + # Attention + num_query_groups: int = 8 + num_attention_heads: int = 96 + attention_dropout: float = 0.0 + kv_channels: int = 128 + + # RoPE position_embedding_type: str = "mrope" + rotary_base: float = 1000000.0 + rotary_percent: float = 0.5 mrope_section: List[int] = field(default_factory=lambda: [8, 12, 12]) + # MoE specific parameters + moe_router_topk: int = 8 + moe_shared_expert_overlap: bool = True + moe_token_dispatcher_type: str = "alltoall" + moe_router_load_balancing_type: str = "seq_aux_loss" + moe_aux_loss_coeff: float = 1e-3 + moe_router_pre_softmax: bool = False + moe_grouped_gemm: bool = True + moe_router_score_function: str = "sigmoid" + moe_permute_fusion: bool = True + moe_router_dtype: str = "fp32" + moe_router_enable_expert_bias: bool = True + moe_router_bias_update_rate: float = 0 + + # Optimization + persist_layer_norm: bool = True + bias_activation_fusion: bool = True + bias_dropout_fusion: bool = True + + # MTP + mtp_num_layers: Optional[int] = 1 + mtp_loss_scaling_factor: Optional[float] = 0.3 + + # GLM 4.5 Air 106B specifics + num_layers: int = 46 + num_moe_experts: int = 128 + hidden_size: int = 4096 + ffn_hidden_size: int = 10944 + moe_layer_freq: Union[int, List[int]] = field(default_factory=lambda: [0] * 1 + [1] * 45) + moe_ffn_hidden_size: int = 1408 + moe_shared_expert_intermediate_size: int = 1408 + qk_layernorm: bool = False + moe_router_topk_scaling_factor: float = 1.0 + + # VL configuration + scatter_embedding_sequence_parallel: bool = False + # Vision configuration vision_config: Glm4vVisionConfig = field(default_factory=Glm4vVisionConfig) return_dict: bool = True @@ -57,7 +132,6 @@ class GLM45VModelProvider(GLM45AirModelProvider106B): def provide(self, pre_process=None, post_process=None, vp_stage=None) -> GLM45VModel: model = GLM45VModel(self, pre_process=pre_process, post_process=post_process, vp_stage=vp_stage) - # Apply freeze options if any are enabled if self.freeze_language_model or self.freeze_vision_model or self.freeze_vision_projection: model.freeze( freeze_language_model=self.freeze_language_model, @@ -68,4 +142,4 @@ def provide(self, pre_process=None, post_process=None, vp_stage=None) -> GLM45VM return model def provide_language_model(self, pre_process=None, post_process=None, vp_stage=None) -> MCoreGPTModel: - return super().provide(pre_process=pre_process, post_process=post_process, vp_stage=vp_stage) + return GPTModelProvider.provide(self, pre_process=pre_process, post_process=post_process, vp_stage=vp_stage) diff --git a/src/megatron/bridge/models/gpt_oss/__init__.py b/src/megatron/bridge/models/gpt_oss/__init__.py index d07800ac8f..2b2709b6b2 100644 --- a/src/megatron/bridge/models/gpt_oss/__init__.py +++ b/src/megatron/bridge/models/gpt_oss/__init__.py @@ 
-13,16 +13,8 @@ # limitations under the License. from megatron.bridge.models.gpt_oss.gpt_oss_bridge import GPTOSSBridge -from megatron.bridge.models.gpt_oss.gpt_oss_provider import ( - GPTOSSProvider, - GPTOSSProvider20B, - GPTOSSProvider120B, -) __all__ = [ "GPTOSSBridge", - "GPTOSSProvider", - "GPTOSSProvider120B", - "GPTOSSProvider20B", ] diff --git a/src/megatron/bridge/models/gpt_oss/gpt_oss_provider.py b/src/megatron/bridge/models/gpt_oss/gpt_oss_provider.py deleted file mode 100644 index 0574ce7b57..0000000000 --- a/src/megatron/bridge/models/gpt_oss/gpt_oss_provider.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -from dataclasses import dataclass -from typing import Callable, List, Literal, Optional, Tuple, Union - -import torch -from megatron.core.models.gpt import GPTModel as MCoreGPTModel -from megatron.core.transformer.enums import AttnBackend -from megatron.core.utils import is_te_min_version - -from megatron.bridge.models.gpt_provider import GPTModelProvider - - -try: - from megatron.core.fusions.fused_bias_geglu import quick_gelu -except ImportError: - # TODO(yuya): remove backup path once versioning issue solved - # Fallback if fused_bias_geglu is not available - quick_gelu = torch.nn.functional.gelu - - -logger = logging.getLogger(__name__) - - -@dataclass -class GPTOSSProvider(GPTModelProvider): - """ - Base config for GPT-OSS - """ - - hidden_size: int = 2880 - num_attention_heads: int = 64 - num_query_groups: int = 8 - ffn_hidden_size: int = 2880 - kv_channels: Optional[int] = 64 - normalization: str = "RMSNorm" - gated_linear_unit: bool = True - add_bias_linear: bool = True - share_embeddings_and_output_weights: bool = False - vocab_size: int = 201088 - hidden_dropout: float = 0.0 - attention_dropout: float = 0.0 - bf16: bool = True - params_dtype: torch.dtype = torch.bfloat16 - - position_embedding_type: str = "yarn" - rotary_base: int = 150000 - yarn_rotary_scaling_factor: float = 32.0 - yarn_original_max_position_embeddings: int = 4096 - yarn_beta_fast: float = 32.0 - yarn_beta_slow: float = 1.0 - yarn_correction_range_round_to_int: bool = False - yarn_mscale: Optional[float] = None - yarn_mscale_all_dim: Optional[float] = None - - moe_router_topk: int = 4 - moe_router_pre_softmax: bool = False - moe_grouped_gemm: bool = True - moe_token_dispatcher_type: str = "alltoall" - moe_permute_fusion: bool = True - moe_ffn_hidden_size: int = 2880 - moe_router_load_balancing_type: str = "none" - seq_length: int = 131072 - window_size: Optional[Tuple[int, int]] = (128, 0) - softmax_type: Literal["vanilla", "off-by-one", "learnable"] = "learnable" - activation_func: Callable = quick_gelu - glu_linear_offset: float = 1.0 - bias_activation_fusion: bool = True - bias_dropout_fusion: bool = False - window_attn_skip_freq: Optional[Union[int, List[int]]] = 2 # alternative SWA/full - activation_func_clamp_value: Optional[float] = 7.0 - - def provide(self, pre_process=None, 
post_process=None, vp_stage=None) -> MCoreGPTModel: - if not is_te_min_version("2.8"): - logger.info("Fused sink attention requires TE >= 2.8. Falling back to MCore local sink attention.") - self.attention_backend = AttnBackend.local - return super().provide(pre_process, post_process, vp_stage) - - -@dataclass -class GPTOSSProvider120B(GPTOSSProvider): - """Config for GPT-OSS 120B""" - - num_layers: int = 36 - num_moe_experts: int = 128 - - -@dataclass -class GPTOSSProvider20B(GPTOSSProvider): - """Config for GPT-OSS 20B""" - - num_layers: int = 24 - num_moe_experts: int = 32 diff --git a/src/megatron/bridge/models/kimi/__init__.py b/src/megatron/bridge/models/kimi/__init__.py index fa0c184346..414d81bbc2 100644 --- a/src/megatron/bridge/models/kimi/__init__.py +++ b/src/megatron/bridge/models/kimi/__init__.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from megatron.bridge.models.kimi.kimi_provider import KimiK2Provider +from megatron.bridge.models.kimi.kimi_bridge import KimiBridge # noqa: F401 __all__ = [ - "KimiK2Provider", + "KimiBridge", ] diff --git a/src/megatron/bridge/models/kimi/kimi_provider.py b/src/megatron/bridge/models/kimi/kimi_provider.py deleted file mode 100644 index 8d23ea060e..0000000000 --- a/src/megatron/bridge/models/kimi/kimi_provider.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from dataclasses import dataclass, field -from functools import partial -from typing import TYPE_CHECKING, Callable, List, Optional, Union - -import torch -import torch.nn.functional as F -from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec - -from megatron.bridge.models.gpt_provider import GPTModelProvider -from megatron.bridge.models.transformer_config import MLATransformerConfig - - -try: - import transformer_engine # type: ignore # noqa: F401 - - HAVE_TE = True -except (ImportError, ModuleNotFoundError): - HAVE_TE = False - -if TYPE_CHECKING: - from megatron.core.transformer import ModuleSpec - -if HAVE_TE: - from megatron.core.utils import is_te_min_version - - -@dataclass -class KimiK2Provider(MLATransformerConfig, GPTModelProvider): - """ - https://moonshotai.github.io/Kimi-K2/ - """ - - transformer_layer_spec: Union["ModuleSpec", Callable[["GPTModelProvider"], "ModuleSpec"]] = partial( - get_gpt_decoder_block_spec, use_transformer_engine=HAVE_TE - ) - - # Model - num_layers: int = 61 - hidden_size: int = 7168 - ffn_hidden_size: int = 18432 - num_moe_experts: int = 384 - moe_ffn_hidden_size: int = 2048 - moe_shared_expert_intermediate_size: int = 2048 # 2048 * 1 shared expert - moe_layer_freq: Union[int, List[int]] = field(default_factory=lambda: [0] + [1] * 60) # first layer are dense - normalization: str = "RMSNorm" - activation_func: Callable = F.silu - gated_linear_unit: bool = True # swiglu - position_embedding_type: str = "rope" - add_bias_linear: bool = False - share_embeddings_and_output_weights: bool = False - num_attention_heads: int = 64 - kv_channels: int = 64 - max_position_embeddings: int = 4096 - seq_length: int = 4096 - rotary_base: float = 50000.0 - make_vocab_size_divisible_by: int = 1280 - mtp_num_layers: Optional[int] = None - mtp_loss_scaling_factor: Optional[float] = None - - # Regularization - attention_dropout: float = 0.0 - hidden_dropout: float = 0.0 - qk_layernorm: bool = True - - # MoE - moe_router_topk: int = 8 - moe_router_num_groups: int = 1 - moe_router_group_topk: int = 1 - moe_router_topk_scaling_factor: float = 2.827 - moe_aux_loss_coeff: float = 1e-3 - moe_router_score_function: str = "sigmoid" - moe_router_enable_expert_bias: bool = True - moe_router_bias_update_rate: float = 1e-3 - moe_grouped_gemm: bool = True - moe_router_pre_softmax: bool = True - moe_token_dispatcher_type: str = "alltoall" - moe_router_load_balancing_type: str = "seq_aux_loss" - moe_shared_expert_overlap: bool = True - moe_router_dtype: Optional[str] = "fp32" - - # MLA - multi_latent_attention: bool = True - q_lora_rank: int = 1536 - kv_lora_rank: int = 512 - qk_head_dim: int = 128 - qk_pos_emb_head_dim: int = 64 - v_head_dim: int = 128 - rotary_scaling_factor: float = 32 - beta_fast: float = 1.0 - beta_slow: float = 1.0 - mscale: float = 1.0 - mscale_all_dim: float = 1.0 - - # Miscellaneous - init_method_std: float = 0.006 - layernorm_epsilon: float = 1e-6 - bf16: bool = True - params_dtype: torch.dtype = torch.bfloat16 - attention_softmax_in_fp32: bool = False - persist_layer_norm: bool = True - num_layers_in_first_pipeline_stage: Optional[int] = None - num_layers_in_last_pipeline_stage: Optional[int] = None - account_for_embedding_in_pipeline_split: bool = False - account_for_loss_in_pipeline_split: bool = False - vocab_size: int = 163840 - - # fusions - apply_rope_fusion: bool = False - bias_activation_fusion: bool = True - bias_dropout_fusion: bool = True - masked_softmax_fusion: bool = True - gradient_accumulation_fusion: bool = True - 
cross_entropy_loss_fusion: bool = True - cross_entropy_fusion_impl: str = "te" - moe_permute_fusion: bool = is_te_min_version("2.1.0") if HAVE_TE else False diff --git a/src/megatron/bridge/models/llama/__init__.py b/src/megatron/bridge/models/llama/__init__.py index 62a5bfac9d..9a19d24eec 100644 --- a/src/megatron/bridge/models/llama/__init__.py +++ b/src/megatron/bridge/models/llama/__init__.py @@ -13,49 +13,8 @@ # limitations under the License. from megatron.bridge.models.llama.llama_bridge import LlamaBridge # noqa: F401 -from megatron.bridge.models.llama.llama_provider import ( - CodeLlamaModelProvider7B, - CodeLlamaModelProvider13B, - CodeLlamaModelProvider34B, - CodeLlamaModelProvider70B, - Llama2ModelProvider7B, - Llama2ModelProvider13B, - Llama2ModelProvider70B, - Llama3ModelProvider, - Llama3ModelProvider8B, - Llama3ModelProvider70B, - Llama4Experts16ModelProvider, - Llama4Experts128ModelProvider, - Llama4ModelProvider, - Llama31ModelProvider, - Llama31ModelProvider8B, - Llama31ModelProvider70B, - Llama31ModelProvider405B, - Llama32ModelProvider1B, - Llama32ModelProvider3B, - LlamaModelProvider, -) __all__ = [ - "LlamaModelProvider", - "Llama2ModelProvider7B", - "Llama2ModelProvider13B", - "Llama2ModelProvider70B", - "Llama3ModelProvider", - "Llama3ModelProvider8B", - "Llama3ModelProvider70B", - "Llama31ModelProvider", - "Llama31ModelProvider8B", - "Llama31ModelProvider70B", - "Llama31ModelProvider405B", - "Llama32ModelProvider1B", - "Llama32ModelProvider3B", - "CodeLlamaModelProvider7B", - "CodeLlamaModelProvider13B", - "CodeLlamaModelProvider34B", - "CodeLlamaModelProvider70B", - "Llama4ModelProvider", - "Llama4Experts16ModelProvider", - "Llama4Experts128ModelProvider", + "LlamaBridge", ] diff --git a/src/megatron/bridge/models/llama/llama_provider.py b/src/megatron/bridge/models/llama/llama_provider.py deleted file mode 100644 index d74783aa64..0000000000 --- a/src/megatron/bridge/models/llama/llama_provider.py +++ /dev/null @@ -1,383 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass, field -from typing import Callable, List, Optional, Union - -import torch.nn.functional as F -from megatron.core.transformer import ModuleSpec - -from megatron.bridge.models.gpt_provider import GPTModelProvider -from megatron.bridge.models.llama.llama4_utils import get_llama4_layer_spec - - -@dataclass -class LlamaModelProvider(GPTModelProvider): - """Configuration class for Llama models. - - Extends GPTConfig with specific settings optimized for Llama architectures. - Includes configurations for normalization, activation functions, and various - architecture-specific options. 
- """ - - # configs that are common across model sizes - normalization: str = "RMSNorm" - activation_func: Callable = F.silu - gated_linear_unit: bool = True - position_embedding_type: str = "rope" - add_bias_linear: bool = False - seq_length: int = 4096 - attention_dropout: float = 0.0 - hidden_dropout: float = 0.0 - share_embeddings_and_output_weights: bool = False - # Fusions - bias_activation_fusion: bool = True - masked_softmax_fusion: bool = True - persist_layer_norm: bool = True - bias_dropout_fusion: bool = True - apply_rope_fusion: bool = True - use_transformer_engine_op_fuser: Optional[bool] = None - - -@dataclass -class Llama2ModelProvider7B(LlamaModelProvider): - """Configuration for a 7B parameter Llama 2 model. - - Specific configuration for the 7B Llama 2 model with 32 layers, - 4096 hidden size, and 32 attention heads. - """ - - num_layers: int = 32 - hidden_size: int = 4096 - num_attention_heads: int = 32 - num_query_groups: int = 32 - ffn_hidden_size: int = 11008 - - -@dataclass -class Llama2ModelProvider13B(LlamaModelProvider): - """Configuration for a 13B parameter Llama 2 model. - - Specific configuration for the 13B Llama 2 model with 40 layers, - 5120 hidden size, and 40 attention heads. - """ - - num_layers: int = 40 - hidden_size: int = 5120 - num_attention_heads: int = 40 - num_query_groups: int = 40 - ffn_hidden_size: int = 13824 - - -@dataclass -class Llama2ModelProvider70B(LlamaModelProvider): - """Configuration for a 70B parameter Llama 2 model. - - Specific configuration for the 70B Llama 2 model with 80 layers, - 8192 hidden size, and 64 attention heads with 8 query groups. - """ - - num_layers: int = 80 - hidden_size: int = 8192 - num_attention_heads: int = 64 - num_query_groups: int = 8 - ffn_hidden_size: int = 28672 - - -@dataclass -class Llama3ModelProvider(LlamaModelProvider): - """Configuration for Llama 3 models. - - Base configuration for Llama 3 architecture with common settings - across different model sizes, including group query attention (GQA) - and architecture-specific settings. - """ - - num_query_groups: int = 8 - hidden_dropout: float = 0.0 - attention_dropout: float = 0.0 - normalization: str = "RMSNorm" - init_method_std: float = 0.01 - layernorm_epsilon: float = 1.0e-05 - add_bias_linear: bool = False - activation_func: Callable = F.silu - gated_linear_unit: bool = True - # Fusions - bias_activation_fusion: bool = True - masked_softmax_fusion: bool = True - persist_layer_norm: bool = True - bias_dropout_fusion: bool = True - apply_rope_fusion: bool = True - share_embeddings_and_output_weights: bool = False - position_embedding_type: str = "rope" - rotary_percent: float = 1.0 - - -@dataclass -class Llama31ModelProvider(Llama3ModelProvider): - """Configuration for Llama 3.1 models. - - Extends Llama3ModelProvider with specific settings for Llama 3.1 models, - including RoPE scaling via Megatron Core's built-in support. - """ - - # RoPE scaling is now handled by Megatron Core's RotaryEmbedding - rope_scaling: bool = True - rope_scaling_factor: float = 8.0 - init_method_std: float = 0.02 - - -@dataclass -class Llama3ModelProvider8B(Llama3ModelProvider): - """Configuration for an 8B parameter Llama 3 model. - - Specific configuration for the 8B Llama 3 model with 32 layers, - 4096 hidden size, and 32 attention heads. 
- """ - - rotary_base: int = 500_000 - seq_length: int = 8192 - num_layers: int = 32 - hidden_size: int = 4096 - ffn_hidden_size: int = 14336 - num_attention_heads: int = 32 - cross_entropy_fusion_impl: str = "te" - - -@dataclass -class Llama3ModelProvider70B(Llama3ModelProvider): - """Configuration for a 70B parameter Llama 3 model. - - Specific configuration for the 70B Llama 3 model with 80 layers, - 8192 hidden size, and 64 attention heads. - """ - - rotary_base: int = 500_000 - seq_length: int = 8192 - num_layers: int = 80 - hidden_size: int = 8192 - ffn_hidden_size: int = 28672 - num_attention_heads: int = 64 - init_method_std: float = 0.008944 - make_vocab_size_divisible_by: int = 128 - cross_entropy_fusion_impl: str = "te" - - -@dataclass -class Llama31ModelProvider8B(Llama31ModelProvider): - """Configuration for an 8B parameter Llama 3.1 model. - - Specific configuration for the 8B Llama 3.1 model with 32 layers, - 4096 hidden size, and 32 attention heads, supporting a longer context - length of 131K tokens. - """ - - rotary_base: int = 500_000 - seq_length: int = 131072 - num_layers: int = 32 - hidden_size: int = 4096 - ffn_hidden_size: int = 14336 - num_attention_heads: int = 32 - - -@dataclass -class Llama31ModelProvider70B(Llama31ModelProvider): - """Configuration for a 70B parameter Llama 3.1 model. - - Specific configuration for the 70B Llama 3.1 model with 80 layers, - 8192 hidden size, and 64 attention heads, supporting a longer context - length of 131K tokens. - """ - - rotary_base: int = 500_000 - seq_length: int = 131072 - num_layers: int = 80 - hidden_size: int = 8192 - ffn_hidden_size: int = 28672 - num_attention_heads: int = 64 - make_vocab_size_divisible_by: int = 128 - - -@dataclass -class Llama31ModelProvider405B(Llama31ModelProvider): - """Configuration for a 405B parameter Llama 3.1 model. - - Specific configuration for the 405B Llama 3.1 model with 126 layers, - 16384 hidden size, and 128 attention heads, supporting a longer context - length of 131K tokens. - """ - - rotary_base: int = 500_000 - seq_length: int = 131072 - num_layers: int = 126 - hidden_size: int = 16384 - ffn_hidden_size: int = 53248 - num_attention_heads: int = 128 - make_vocab_size_divisible_by: int = 128 - cross_entropy_fusion_impl: str = "te" - - -@dataclass -class Llama32ModelProvider1B(Llama31ModelProvider): - """Configuration for a 1B parameter Llama 3.2 model. - - Specific configuration for the 1B Llama 3.2 model with 16 layers, - 2048 hidden size, and 32 attention heads (8 query groups). - """ - - rope_scaling_factor: float = 32.0 - share_embeddings_and_output_weights: bool = True - rotary_base: int = 500_000 - seq_length: int = 131072 - num_layers: int = 16 - hidden_size: int = 2048 - ffn_hidden_size: int = 8192 - num_attention_heads: int = 32 - num_query_groups: int = 8 - make_vocab_size_divisible_by: int = 128 - - -@dataclass -class Llama32ModelProvider3B(Llama31ModelProvider): - """Configuration for a 3B parameter Llama 3.2 model. - - Specific configuration for the 3B Llama 3.2 model with 28 layers, - 3072 hidden size, and 24 attention heads (8 query groups). 
- """ - - rope_scaling_factor: float = 32.0 - share_embeddings_and_output_weights: bool = True - rotary_base: int = 500_000 - seq_length: int = 131072 - num_layers: int = 28 - hidden_size: int = 3072 - ffn_hidden_size: int = 8192 - num_attention_heads: int = 24 - num_query_groups: int = 8 - make_vocab_size_divisible_by: int = 128 - - -@dataclass -class CodeLlamaModelProvider7B(Llama2ModelProvider7B): - """Configuration for a 7B parameter CodeLlama model. - - Extends Llama2ModelProvider7B with modified settings specifically for code generation, - including longer context length and different rotary base. - """ - - rotary_base: int = 1_000_000 - seq_length: int = 16384 - - -@dataclass -class CodeLlamaModelProvider13B(Llama2ModelProvider13B): - """Configuration for a 13B parameter CodeLlama model. - - Extends Llama2ModelProvider13B with modified settings specifically for code generation, - including longer context length and different rotary base. - """ - - rotary_base: int = 1_000_000 - seq_length: int = 16384 - - -@dataclass -class CodeLlamaModelProvider34B(LlamaModelProvider): - """Configuration for a 34B parameter CodeLlama model. - - Specific configuration for the 34B CodeLlama model with 48 layers, - 8192 hidden size, and 64 attention heads (8 query groups). - """ - - num_layers: int = 48 - hidden_size: int = 8192 - num_attention_heads: int = 64 - num_query_groups: int = 8 - ffn_hidden_size: int = 22016 - rotary_base: int = 1_000_000 - seq_length: int = 16384 - - -@dataclass -class CodeLlamaModelProvider70B(Llama2ModelProvider70B): - """Configuration for a 70B parameter CodeLlama model. - - Extends Llama2ModelProvider70B with settings specifically for code generation. - """ - - pass - - -@dataclass -class Llama4ModelProvider(Llama3ModelProvider): - """ - Configuration for Llama4 language model. - """ - - rotary_base: int = 500_000 - seq_length: int = 8192 - num_layers: int = 48 - hidden_size: int = 5120 - ffn_hidden_size: int = 16384 - num_attention_heads: int = 40 - vocab_size: int = 25256 * 8 - add_bias_linear: bool = False - gated_linear_unit: bool = True - rotary_interleaved: bool = True - apply_rope_fusion: bool = False - nope_layer_interval: int = 4 - transformer_layer_spec: Union[ModuleSpec, Callable[["LlamaModelProvider"], ModuleSpec]] = field( - default_factory=lambda: get_llama4_layer_spec - ) - # MOE - moe_grouped_gemm: bool = True - moe_shared_expert_intermediate_size: int = 8192 - moe_ffn_hidden_size: int = 8192 - moe_router_topk: int = 1 - moe_router_pre_softmax: bool = False - moe_router_score_function: str = "sigmoid" - moe_token_dispatcher_type: str = "alltoall" - moe_router_dtype: Optional[str] = None - moe_apply_probs_on_input: bool = True - moe_shared_expert_overlap: bool = True - moe_permute_fusion: bool = False - # Configs that are overwritten in subclass models - qk_l2_norm: bool = True - rope_scaling: bool = True - rope_scaling_factor: float = 8.0 - attention_chunk_size: int = 8192 - - -@dataclass -class Llama4Experts16ModelProvider(Llama4ModelProvider): - """ - Configuration for llama4 16-experts model. - """ - - num_moe_experts: int = 16 - rope_scaling: bool = True - rope_scaling_factor: float = 8.0 - qk_l2_norm: bool = True - - -@dataclass -class Llama4Experts128ModelProvider(Llama4ModelProvider): - """ - Configuration for llama4 128-experts model. 
- """ - - num_moe_experts: int = 128 - rope_scaling: bool = False - moe_layer_freq: Union[int, List[int]] = field(default_factory=lambda: [0, 1] * 24) - qk_l2_norm: bool = False diff --git a/src/megatron/bridge/models/llama_nemotron/__init__.py b/src/megatron/bridge/models/llama_nemotron/__init__.py index c79e7dd5fb..62743d2a00 100644 --- a/src/megatron/bridge/models/llama_nemotron/__init__.py +++ b/src/megatron/bridge/models/llama_nemotron/__init__.py @@ -14,19 +14,11 @@ from megatron.bridge.models.llama_nemotron.llama_nemotron_bridge import LlamaNemotronBridge from megatron.bridge.models.llama_nemotron.llama_nemotron_provider import ( - Llama31Nemotron70BProvider, - Llama31NemotronNano8BProvider, - Llama31NemotronUltra253BProvider, - Llama33NemotronSuper49BProvider, LlamaNemotronHeterogeneousProvider, ) __all__ = [ "LlamaNemotronBridge", - "Llama31NemotronNano8BProvider", - "Llama31Nemotron70BProvider", - "Llama33NemotronSuper49BProvider", - "Llama31NemotronUltra253BProvider", "LlamaNemotronHeterogeneousProvider", ] diff --git a/src/megatron/bridge/models/llama_nemotron/llama_nemotron_bridge.py b/src/megatron/bridge/models/llama_nemotron/llama_nemotron_bridge.py index 55ea5a298a..ff4b4c3d15 100644 --- a/src/megatron/bridge/models/llama_nemotron/llama_nemotron_bridge.py +++ b/src/megatron/bridge/models/llama_nemotron/llama_nemotron_bridge.py @@ -24,7 +24,6 @@ ) from megatron.bridge.models.conversion.transformers_compat import rope_scaling_factor_from_hf, rope_theta_from_hf from megatron.bridge.models.hf_pretrained.causal_lm import PreTrainedCausalLM -from megatron.bridge.models.llama.llama_provider import Llama31ModelProvider from megatron.bridge.models.llama_nemotron.llama_nemotron_provider import LlamaNemotronHeterogeneousProvider @@ -59,7 +58,7 @@ class LlamaNemotronBridge(MegatronModelBridge): >>> provider = bridge.to_megatron_provider() """ - def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> Llama31ModelProvider: + def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> LlamaNemotronHeterogeneousProvider: hf_config = hf_pretrained.config # Validate heterogeneous DeciLM (NAS) config and select provider if not (hasattr(hf_config, "block_configs") and hf_config.block_configs): diff --git a/src/megatron/bridge/models/llama_nemotron/llama_nemotron_provider.py b/src/megatron/bridge/models/llama_nemotron/llama_nemotron_provider.py index 7a6af22faf..7027c0722a 100644 --- a/src/megatron/bridge/models/llama_nemotron/llama_nemotron_provider.py +++ b/src/megatron/bridge/models/llama_nemotron/llama_nemotron_provider.py @@ -17,17 +17,11 @@ from typing import Callable import torch - -# Import heterogeneous layer spec dependencies +import torch.nn.functional as F from megatron.core.models.gpt.heterogeneous.heterogeneous_layer_specs import get_gpt_heterogeneous_layer_spec from megatron.core.transformer.spec_utils import ModuleSpec -from megatron.bridge.models.llama.llama_provider import ( - Llama31ModelProvider, - Llama31ModelProvider8B, - Llama31ModelProvider70B, - Llama31ModelProvider405B, -) +from megatron.bridge.models.gpt_provider import GPTModelProvider from megatron.bridge.models.transformer_config import HeterogeneousTransformerConfig @@ -49,95 +43,7 @@ def heterogeneous_layer_spec(config) -> ModuleSpec: @dataclass -class Llama31NemotronNano8BProvider(Llama31ModelProvider8B): - """ - Configuration class for the Llama3.1-Nemotron-Nano-8B model. 
- Maps to: nvidia/Llama-3.1-Nemotron-Nano-8B-v1 - Based on Llama31Config8B with kv_channels=128 - """ - - kv_channels: int = 128 - # Data type settings to match HF models - bf16: bool = True - fp16: bool = False - params_dtype: torch.dtype = torch.bfloat16 - autocast_dtype: torch.dtype = torch.bfloat16 - - -@dataclass -class Llama31Nemotron70BProvider(Llama31ModelProvider70B): - """ - Configuration class for the Llama3.1-Nemotron-70B model. - Maps to: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF - Based on Llama31Config70B with kv_channels=128 - """ - - kv_channels: int = 128 - # Data type settings to match HF models - bf16: bool = True - fp16: bool = False - params_dtype: torch.dtype = torch.bfloat16 - autocast_dtype: torch.dtype = torch.bfloat16 - - -@dataclass -class Llama33NemotronSuper49BProvider(Llama31ModelProvider70B, HeterogeneousTransformerConfig): - """ - Configuration class for the Llama3.3-Nemotron-Super-49B model. - Maps to: nvidia/Llama-3_3-Nemotron-Super-49B-v1 - Based on Llama31Config70B with heterogeneous architecture and kv_channels=128 - - Developer Note: - For MRO, Llama31ModelProvider70B must come first to ensure proper provider functionality, - then HeterogeneousTransformerConfig for heterogeneous support. - """ - - hidden_size: int = 8192 - num_attention_heads: int = 64 - num_layers: int = 80 - kv_channels: int = 128 - # Data type settings to match HF models - bf16: bool = True - fp16: bool = False - params_dtype: torch.dtype = torch.bfloat16 - autocast_dtype: torch.dtype = torch.bfloat16 - - heterogeneous_layers_config_path: str | None = None - heterogeneous_layers_config_encoded_json: str = "" - transformer_layer_spec: ModuleSpec | Callable = heterogeneous_layer_spec - - -@dataclass -class Llama31NemotronUltra253BProvider(Llama31ModelProvider405B, HeterogeneousTransformerConfig): - """ - Configuration class for the Llama3.1-Nemotron-Ultra-253B model. - Maps to: nvidia/Llama-3_1-Nemotron-Ultra-253B-v1 - Based on Llama31Config405B with heterogeneous architecture and kv_channels=128 - - Developer Note: - For MRO, Llama31ModelProvider405B must come first to ensure proper provider functionality, - then HeterogeneousTransformerConfig for heterogeneous support. - """ - - # Override base config for Ultra model specifics - num_layers: int = 162 - hidden_size: int = 16384 - num_attention_heads: int = 128 - kv_channels: int = 128 - # Data type settings to match HF models - bf16: bool = True - fp16: bool = False - params_dtype: torch.dtype = torch.bfloat16 - autocast_dtype: torch.dtype = torch.bfloat16 - - # Heterogeneous configuration fields - heterogeneous_layers_config_path: str | None = None - heterogeneous_layers_config_encoded_json: str = "" - transformer_layer_spec: ModuleSpec | Callable = heterogeneous_layer_spec - - -@dataclass -class LlamaNemotronHeterogeneousProvider(Llama31ModelProvider, HeterogeneousTransformerConfig): +class LlamaNemotronHeterogeneousProvider(GPTModelProvider, HeterogeneousTransformerConfig): """ Generic provider for heterogeneous (NAS) Llama-Nemotron models using DeciLMForCausalLM. @@ -145,6 +51,23 @@ class LlamaNemotronHeterogeneousProvider(Llama31ModelProvider, HeterogeneousTran provided at runtime via kwargs (num_layers, hidden_size, heads, kv_channels, etc.). 
""" + normalization: str = "RMSNorm" + activation_func: Callable = F.silu + gated_linear_unit: bool = True + position_embedding_type: str = "rope" + add_bias_linear: bool = False + hidden_dropout: float = 0.0 + attention_dropout: float = 0.0 + share_embeddings_and_output_weights: bool = False + bias_activation_fusion: bool = True + masked_softmax_fusion: bool = True + persist_layer_norm: bool = True + bias_dropout_fusion: bool = True + apply_rope_fusion: bool = True + rotary_percent: float = 1.0 + num_query_groups: int = 8 + init_method_std: float = 0.02 + # Data type settings to match HF models (BF16) bf16: bool = True fp16: bool = False diff --git a/src/megatron/bridge/models/nemotron/__init__.py b/src/megatron/bridge/models/nemotron/__init__.py index 0ea0b3172e..6f01b173a4 100644 --- a/src/megatron/bridge/models/nemotron/__init__.py +++ b/src/megatron/bridge/models/nemotron/__init__.py @@ -13,22 +13,8 @@ # limitations under the License. from megatron.bridge.models.nemotron.nemotron_bridge import NemotronBridge -from megatron.bridge.models.nemotron.nemotron_provider import ( - Nemotron3ModelProvider4B, - Nemotron3ModelProvider8B, - Nemotron3ModelProvider22B, - Nemotron4ModelProvider15B, - Nemotron4ModelProvider340B, - NemotronModelProvider, -) __all__ = [ "NemotronBridge", - "NemotronModelProvider", - "Nemotron3ModelProvider4B", - "Nemotron3ModelProvider8B", - "Nemotron3ModelProvider22B", - "Nemotron4ModelProvider15B", - "Nemotron4ModelProvider340B", ] diff --git a/src/megatron/bridge/models/nemotron/nemotron_provider.py b/src/megatron/bridge/models/nemotron/nemotron_provider.py deleted file mode 100644 index 20bc8a9d4f..0000000000 --- a/src/megatron/bridge/models/nemotron/nemotron_provider.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -from dataclasses import dataclass -from typing import Callable, Optional - -import torch - -from megatron.bridge.models.gpt_provider import GPTModelProvider - - -logger = logging.getLogger(__name__) - - -def squared_relu(x): - """Squared ReLU activation function.""" - return torch.pow(torch.nn.functional.relu(x), 2) - - -@dataclass -class NemotronModelProvider(GPTModelProvider): - """Configuration class for Nemotron models.""" - - # configs that are common across model sizes - normalization: str = "LayerNorm" - activation_func: Callable = squared_relu - position_embedding_type: str = "rope" - share_embeddings_and_output_weights: bool = False - add_bias_linear: bool = False - - hidden_dropout: float = 0.0 - attention_dropout: float = 0.0 - rotary_percent: float = 0.5 - masked_softmax_fusion: bool = True - persist_layer_norm: bool = True - bias_dropout_add_fusion: bool = False - layernorm_zero_centered_gamma: bool = True - cross_entropy_loss_fusion: bool = True - apply_rope_fusion: bool = True - - # Nemotron3Config4B as default configs - num_layers: int = 32 - seq_length: int = 4096 - hidden_size: int = 3072 - ffn_hidden_size: int = 9216 - num_attention_heads: int = 24 - num_query_groups: Optional[int] = 8 - kv_channels: Optional[int] = 128 - init_method_std: float = 0.0134 - - # Data type settings to match HF models - bf16: bool = True - fp16: bool = False - params_dtype: torch.dtype = torch.bfloat16 - autocast_dtype: torch.dtype = torch.bfloat16 - - -@dataclass -class Nemotron3ModelProvider4B(NemotronModelProvider): - """ - Configuration class for the Nemotron3 4B model, inheriting from NemotronModelProvider. - """ - - num_layers: int = 32 - seq_length: int = 4096 - hidden_size: int = 3072 - ffn_hidden_size: int = 9216 - num_attention_heads: int = 24 - num_query_groups: int = 8 - kv_channels: Optional[int] = 128 - init_method_std: float = 0.0134 - - -@dataclass -class Nemotron3ModelProvider8B(NemotronModelProvider): - """ - Configuration class for the Nemotron3 8B model, inheriting from NemotronModelProvider. - """ - - num_layers: int = 32 - seq_length: int = 4096 - hidden_size: int = 4096 - ffn_hidden_size: int = 16384 - num_attention_heads: int = 32 - num_query_groups: Optional[int] = None - kv_channels: Optional[int] = None - init_method_std: float = 0.010 - - -@dataclass -class Nemotron3ModelProvider22B(NemotronModelProvider): - """ - Configuration class for the Nemotron3 22B model, inheriting from NemotronModelProvider. - """ - - num_layers: int = 40 - seq_length: int = 4096 - hidden_size: int = 6144 - ffn_hidden_size: int = 24576 - num_attention_heads: int = 48 - num_query_groups: Optional[int] = None - kv_channels: Optional[int] = None - init_method_std: float = 0.008 - - -@dataclass -class Nemotron4ModelProvider15B(NemotronModelProvider): - """ - Configuration class for the Nemotron4 15B model, inheriting from NemotronModelProvider. - """ - - num_layers: int = 32 - seq_length: int = 4096 - hidden_size: int = 6144 - ffn_hidden_size: int = 24576 - num_attention_heads: int = 48 - num_query_groups: Optional[int] = 8 - kv_channels: Optional[int] = None - init_method_std: float = 0.0134 - - -@dataclass -class Nemotron4ModelProvider340B(NemotronModelProvider): - """ - Configuration class for the Nemotron4 340B model, inheriting from NemotronModelProvider. 
- """ - - num_layers: int = 96 - seq_length: int = 4096 - hidden_size: int = 18432 - ffn_hidden_size: int = 73728 - num_attention_heads: int = 96 - num_query_groups: Optional[int] = 8 - kv_channels: Optional[int] = None - init_method_std: float = 0.0063 diff --git a/src/megatron/bridge/models/nemotron_vl/nemotron_vl_provider.py b/src/megatron/bridge/models/nemotron_vl/nemotron_vl_provider.py index e1c24603bc..141dfe95d3 100644 --- a/src/megatron/bridge/models/nemotron_vl/nemotron_vl_provider.py +++ b/src/megatron/bridge/models/nemotron_vl/nemotron_vl_provider.py @@ -14,58 +14,73 @@ import copy from dataclasses import dataclass +from typing import Callable -from megatron.core.activations import fast_gelu +from megatron.core.activations import fast_gelu, squared_relu from megatron.core.models.mamba.mamba_layer_specs import mamba_stack_spec from megatron.core.models.multimodal.llava_model import LLaVAModel from megatron.core.models.vision.vit_layer_specs import get_vit_layer_with_transformer_engine_spec -from megatron.bridge.models.nemotronh.nemotron_h_provider import NemotronNano12Bv2Provider +from megatron.bridge.models.mamba.mamba_provider import MambaModelProvider @dataclass -class NemotronNano12Bv2VLModelProvider(NemotronNano12Bv2Provider): - """Configuration provider for Nemotron-VL models.""" - - # ------------------------------------------------------------------ - # Language configuration – inherit sensible defaults from NemotronNano12Bv2Provider - # ------------------------------------------------------------------ - - # For VL models we do *not* scatter embeddings across the sequence - # parallel region because we need to splice vision embeddings later. +class NemotronNano12Bv2VLModelProvider(MambaModelProvider): + """Configuration provider for Nemotron-VL models. + + Inlines NemotronH + NemotronNano12Bv2 defaults directly. 
+ """ + + # NemotronH base defaults + mamba_num_groups: int = 8 + mamba_head_dim: int = 80 + num_query_groups: int = 8 + make_vocab_size_divisible_by: int = 128 + activation_func: Callable = squared_relu + masked_softmax_fusion: bool = True + apply_query_key_layer_scaling: bool = False + persist_layer_norm: bool = True + first_last_layers_bf16: bool = True + is_hybrid_model: bool = True + + # MoE + moe_aux_loss_coeff: float = 0.0001 + moe_router_score_function: str = "sigmoid" + moe_router_enable_expert_bias: bool = True + moe_router_load_balancing_type: str = "seq_aux_loss" + moe_router_dtype: str = "fp32" + moe_grouped_gemm: bool = True + moe_token_dispatcher_type: str = "alltoall" + moe_permute_fusion: bool = True + moe_shared_expert_overlap: bool = True + + # NemotronNano12Bv2 specifics + hybrid_override_pattern: str = "M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M-" + num_layers: int = 62 + hidden_size: int = 5120 + mamba_num_heads: int = 128 + kv_channels: int = 128 + mamba_state_dim: int = 128 + ffn_hidden_size: int = 20480 + num_attention_heads: int = 40 + seq_length: int = 131072 + + # VL overrides scatter_embedding_sequence_parallel: bool = False attention_softmax_in_fp32: bool = True vision_model_type: str = "radio" language_model_type: str = "nemotron5-hybrid-12b" - # Freeze knobs useful for transfer-learning scenarios freeze_language_model: bool = False freeze_vision_model: bool = False freeze_vision_projection: bool = False - # ------------------------------------------------------------------ - # Provider API - # ------------------------------------------------------------------ - def provide(self, pre_process=None, post_process=None, vp_stage=None): # noqa: D401 - """Assemble a full :class:`~megatron.core.models.multimodal.llava_model.LLaVAModel` and wrap it. - - This is a *very* trimmed-down version of the assembly code used in - `pretrain_vlm.py` – it relies only on parameters already stored in the - provider so that it works in any script (no Megatron-training CLI - required). - """ - - # ------------------------------------------------------------------ - # Build configs and layer specs - # ------------------------------------------------------------------ + """Assemble a full :class:`~megatron.core.models.multimodal.llava_model.LLaVAModel`.""" - # Language config is basically *self* (GPTModelProvider), but we make a - # shallow copy so tweaks do not leak back. language_cfg = copy.deepcopy(self) - # Vision transformer config – start from language_cfg but ensure SP/CP disabled vision_cfg = copy.deepcopy(language_cfg) vision_cfg.sequence_parallel = False vision_cfg.context_parallel_size = 1 @@ -73,7 +88,6 @@ def provide(self, pre_process=None, post_process=None, vp_stage=None): # noqa: vision_cfg.recompute_granularity = None vision_cfg.recompute_method = None vision_cfg.recompute_num_layers = None - # Overrides for vision_model_type = "radio" vision_cfg.num_layers = 32 vision_cfg.num_attention_heads = 16 vision_cfg.add_bias_linear = True @@ -91,8 +105,6 @@ def provide(self, pre_process=None, post_process=None, vp_stage=None): # noqa: vision_cfg.qk_layernorm = False vision_cfg.layernorm_epsilon = 1e-6 - # Vision-projection config/spec: a tiny two-layer MLP; for now just reuse - # the MLP sub-modules from the language layer spec if available. 
vision_proj_cfg = copy.deepcopy(language_cfg) vision_proj_cfg.sequence_parallel = False vision_proj_cfg.context_parallel_size = 1 @@ -100,7 +112,6 @@ def provide(self, pre_process=None, post_process=None, vp_stage=None): # noqa: vision_proj_cfg.recompute_granularity = None vision_proj_cfg.recompute_method = None vision_proj_cfg.recompute_num_layers = None - # Overrides for language_model_type = "nemotron5-hybrid-12b" vision_proj_cfg.ffn_hidden_size = 20480 vision_proj_cfg.bias_activation_fusion = False @@ -108,9 +119,6 @@ def provide(self, pre_process=None, post_process=None, vp_stage=None): # noqa: vision_spec = get_vit_layer_with_transformer_engine_spec() vision_proj_spec = copy.deepcopy(language_spec.submodules.mlp_layer.submodules.mlp.submodules) - # ------------------------------------------------------------------ - # Instantiate LLaVA - # ------------------------------------------------------------------ llava_model = LLaVAModel( language_transformer_config=language_cfg, language_transformer_layer_spec=language_spec, @@ -139,7 +147,7 @@ def provide(self, pre_process=None, post_process=None, vp_stage=None): # noqa: pixel_shuffle=True, max_num_tiles=12, tokenizer_type="nemotron-h-5p5-reasoning", - use_vision_backbone_fp8_arch=True, # Note: this is true in mlm code + use_vision_backbone_fp8_arch=True, ) from megatron.bridge.models.nemotron_vl.modeling_nemotron_vl import NemotronVLModel @@ -155,6 +163,5 @@ def provide(self, pre_process=None, post_process=None, vp_stage=None): # noqa: return model - # Alias that NemotronVLModel relies on to create the LM component def provide_language_model(self, pre_process=None, post_process=None, vp_stage=None): return super().provide(pre_process=pre_process, post_process=post_process, vp_stage=vp_stage) diff --git a/src/megatron/bridge/models/nemotronh/__init__.py b/src/megatron/bridge/models/nemotronh/__init__.py index 9fc2ba99ea..cf1c4d67c8 100644 --- a/src/megatron/bridge/models/nemotronh/__init__.py +++ b/src/megatron/bridge/models/nemotronh/__init__.py @@ -13,38 +13,8 @@ # limitations under the License. from megatron.bridge.models.nemotronh.nemotron_h_bridge import NemotronHBridge -from megatron.bridge.models.nemotronh.nemotron_h_provider import ( - Nemotron3NanoProvider, - NemotronHModel4BProvider, - NemotronHModel8BProvider, - NemotronHModel47BProvider, - NemotronHModel56BProvider, - NemotronHModelProvider, - NemotronHModelProvider4B, - NemotronHModelProvider8B, - NemotronHModelProvider47B, - NemotronHModelProvider56B, - NemotronNano9Bv2Provider, - NemotronNano12Bv2Provider, - NemotronNanoModelProvider9Bv2, - NemotronNanoModelProvider12Bv2, -) __all__ = [ "NemotronHBridge", - "NemotronHModelProvider", - "NemotronHModelProvider4B", - "NemotronHModelProvider8B", - "NemotronHModelProvider47B", - "NemotronHModelProvider56B", - "NemotronNanoModelProvider9Bv2", - "NemotronNanoModelProvider12Bv2", - "NemotronHModel4BProvider", - "NemotronHModel8BProvider", - "NemotronHModel47BProvider", - "NemotronHModel56BProvider", - "NemotronNano9Bv2Provider", - "NemotronNano12Bv2Provider", - "Nemotron3NanoProvider", ] diff --git a/src/megatron/bridge/models/nemotronh/nemotron_h_provider.py b/src/megatron/bridge/models/nemotronh/nemotron_h_provider.py deleted file mode 100644 index 004e441734..0000000000 --- a/src/megatron/bridge/models/nemotronh/nemotron_h_provider.py +++ /dev/null @@ -1,271 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import warnings -from dataclasses import dataclass -from typing import Callable - -from megatron.core.activations import squared_relu -from megatron.core.transformer.enums import AttnBackend - -from megatron.bridge.models.mamba.mamba_provider import MambaModelProvider -from megatron.bridge.utils.common_utils import get_rank_safe - - -logger = logging.getLogger(__name__) - - -@dataclass -class NemotronHModelProvider(MambaModelProvider): - """Configuration for Nemotron-H models.""" - - seq_length: int = 8192 - mamba_num_groups: int = 8 - mamba_head_dim: int = 64 - num_query_groups: int = 8 - make_vocab_size_divisible_by: int = 128 - activation_func: Callable = squared_relu - masked_softmax_fusion: bool = True - apply_query_key_layer_scaling: bool = False - persist_layer_norm: bool = True - attention_softmax_in_fp32: bool = False - first_last_layers_bf16: bool = True - is_hybrid_model: bool = True - - # MoE - moe_aux_loss_coeff: float = 0.0001 - moe_router_score_function: str = "sigmoid" - moe_router_enable_expert_bias: bool = True - moe_router_load_balancing_type: str = "seq_aux_loss" - moe_router_dtype: str = "fp32" - moe_grouped_gemm: bool = True - moe_token_dispatcher_type: str = "alltoall" - moe_permute_fusion: bool = True - moe_shared_expert_overlap: bool = True - - -@dataclass -class NemotronHModelProvider4B(NemotronHModelProvider): - """Configuration for a 4B parameter Nemotron-H model.""" - - hybrid_override_pattern: str = "M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-" - num_layers: int = 52 - hidden_size: int = 3072 - mamba_num_heads: int = 112 - kv_channels: int = 128 - mamba_state_dim: int = 128 - ffn_hidden_size: int = 12288 - num_attention_heads: int = 32 - use_mamba_mem_eff_path: bool = False - - -@dataclass -class NemotronHModelProvider8B(NemotronHModelProvider): - """Configuration for a 8B parameter Nemotron-H model.""" - - hybrid_override_pattern: str = "M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-" - num_layers: int = 52 - hidden_size: int = 4096 - mamba_state_dim: int = 128 - mamba_num_heads: int = 128 - ffn_hidden_size: int = 21504 - num_attention_heads: int = 32 - - -@dataclass -class NemotronHModelProvider47B(NemotronHModelProvider): - """Configuration for a 47B parameter Nemotron-H model.""" - - hybrid_override_pattern: str = ( - "M-M-M-M-M-M-M-M-M*-M-M-M-M-M-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-M-M---MM---M-M*-M-M-M-M-M-" - ) - num_layers: int = 98 - hidden_size: int = 8192 - mamba_state_dim: int = 256 - mamba_num_heads: int = 256 - ffn_hidden_size: int = 30720 - num_attention_heads: int = 64 - - -@dataclass -class NemotronHModelProvider56B(NemotronHModelProvider): - """Configuration for a 56B parameter Nemotron-H model.""" - - hybrid_override_pattern: str = ( - "M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-" - "M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-" - ) - num_layers: int = 118 - hidden_size: int = 8192 - mamba_state_dim: int = 256 - 
mamba_num_heads: int = 256 - ffn_hidden_size: int = 32768 - num_attention_heads: int = 64 - - attention_backend: AttnBackend = AttnBackend.auto - - -@dataclass -class NemotronNanoModelProvider9Bv2(NemotronHModelProvider): - """Configuration for a 9B parameter Nemotron Nano v2 model.""" - - hybrid_override_pattern: str = "M-M-M-MM-M-M-M*-M-M-M*-M-M-M-M*-M-M-M-M*-M-MM-M-M-M-M-M-" - num_layers: int = 56 - hidden_size: int = 4480 - mamba_num_heads: int = 128 - kv_channels: int = 128 - mamba_state_dim: int = 128 - ffn_hidden_size: int = 15680 - num_attention_heads: int = 40 - mamba_head_dim: int = 80 - seq_length: int = 131072 - - -@dataclass -class NemotronNanoModelProvider12Bv2(NemotronHModelProvider): - """Configuration for the Nemotron Nano v2 12B model.""" - - hybrid_override_pattern: str = "M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M-" - num_layers: int = 62 - hidden_size: int = 5120 - mamba_num_heads: int = 128 - kv_channels: int = 128 - mamba_state_dim: int = 128 - ffn_hidden_size: int = 20480 - num_attention_heads: int = 40 - mamba_head_dim: int = 80 - seq_length: int = 131072 - - -@dataclass -class Nemotron3NanoProvider(NemotronHModelProvider): - """Configuration for a 3B parameter Nemotron 3 Nano model.""" - - seq_length: int = 262144 - num_query_groups: int = 2 - hybrid_override_pattern: str = "MEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEMEM*EMEMEMEME" - num_layers: int = 52 - hidden_size: int = 2688 - mamba_num_heads: int = 64 - kv_channels: int = 128 - mamba_state_dim: int = 128 - ffn_hidden_size: int = 1856 - num_attention_heads: int = 32 - mamba_head_dim: int = 64 - num_moe_experts: int = 128 - moe_ffn_hidden_size: int = 1856 - moe_shared_expert_intermediate_size: int = 3712 # 1856 * 2 shared expert - moe_router_topk: int = 6 - moe_router_topk_scaling_factor: float = 2.5 - moe_router_num_groups: int = 1 - moe_router_group_topk: int = 1 - - -# ----------------------------------------------------------------------------- -# Deprecated aliases (to be removed in a future release) -# ----------------------------------------------------------------------------- - - -def _warn_deprecated(old_cls: str, new_cls: str) -> None: - if get_rank_safe() == 0: - warnings.warn( - f"{old_cls} is deprecated and will be removed in a future release. Use {new_cls} instead.", - DeprecationWarning, - stacklevel=2, - ) - - -@dataclass -class NemotronHModel4BProvider(NemotronHModelProvider4B): - """Deprecated alias for ``NemotronHModelProvider4B``. - - Deprecated: - This alias remains for backward compatibility and will be removed in a - future release. Import and use ``NemotronHModelProvider4B`` instead. - """ - - def __post_init__(self) -> None: - _warn_deprecated("NemotronHModel4BProvider", "NemotronHModelProvider4B") - super().__post_init__() - - -@dataclass -class NemotronHModel8BProvider(NemotronHModelProvider8B): - """Deprecated alias for ``NemotronHModelProvider8B``. - - Deprecated: - This alias remains for backward compatibility and will be removed in a - future release. Import and use ``NemotronHModelProvider8B`` instead. - """ - - def __post_init__(self) -> None: - _warn_deprecated("NemotronHModel8BProvider", "NemotronHModelProvider8B") - super().__post_init__() - - -@dataclass -class NemotronHModel47BProvider(NemotronHModelProvider47B): - """Deprecated alias for ``NemotronHModelProvider47B``. - - Deprecated: - This alias remains for backward compatibility and will be removed in a - future release. Import and use ``NemotronHModelProvider47B`` instead. 
- """ - - def __post_init__(self) -> None: - _warn_deprecated("NemotronHModel47BProvider", "NemotronHModelProvider47B") - super().__post_init__() - - -@dataclass -class NemotronHModel56BProvider(NemotronHModelProvider56B): - """Deprecated alias for ``NemotronHModelProvider56B``. - - Deprecated: - This alias remains for backward compatibility and will be removed in a - future release. Import and use ``NemotronHModelProvider56B`` instead. - """ - - def __post_init__(self) -> None: - _warn_deprecated("NemotronHModel56BProvider", "NemotronHModelProvider56B") - super().__post_init__() - - -@dataclass -class NemotronNano9Bv2Provider(NemotronNanoModelProvider9Bv2): - """Deprecated alias for ``NemotronNanoModelProvider9Bv2``. - - Deprecated: - This alias remains for backward compatibility and will be removed in a - future release. Import and use ``NemotronNanoModelProvider9Bv2`` instead. - """ - - def __post_init__(self) -> None: - _warn_deprecated("NemotronNano9Bv2Provider", "NemotronNanoModelProvider9Bv2") - super().__post_init__() - - -@dataclass -class NemotronNano12Bv2Provider(NemotronNanoModelProvider12Bv2): - """Deprecated alias for ``NemotronNanoModelProvider12Bv2``. - - Deprecated: - This alias remains for backward compatibility and will be removed in a - future release. Import and use ``NemotronNanoModelProvider12Bv2`` instead. - """ - - def __post_init__(self) -> None: - _warn_deprecated("NemotronNano12Bv2Provider", "NemotronNanoModelProvider12Bv2") - super().__post_init__() diff --git a/src/megatron/bridge/recipes/kimi/kimi_k2.py b/src/megatron/bridge/recipes/kimi/kimi_k2.py index 3450f91820..a8695b306c 100644 --- a/src/megatron/bridge/recipes/kimi/kimi_k2.py +++ b/src/megatron/bridge/recipes/kimi/kimi_k2.py @@ -12,9 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from functools import partial + import torch +import torch.nn.functional as F +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec + +from megatron.bridge.models.mla_provider import MLAModelProvider + -from megatron.bridge.models.kimi import KimiK2Provider +try: + import transformer_engine # noqa: F401 + + HAVE_TE = True +except (ImportError, ModuleNotFoundError): + HAVE_TE = False from megatron.bridge.recipes.common import _pretrain_common from megatron.bridge.recipes.utils.optimizer_utils import distributed_muon_with_cosine_annealing from megatron.bridge.training.comm_overlap import CommOverlapConfig @@ -54,8 +66,77 @@ def kimi_k2_pretrain_config() -> ConfigContainer: """ cfg = _pretrain_common() - # Model config - uses KimiK2Provider instead of AutoBridge - cfg.model = KimiK2Provider( + # Model config - uses MLAModelProvider with Kimi-K2 architecture + cfg.model = MLAModelProvider( + # Architecture + transformer_layer_spec=partial(get_gpt_decoder_block_spec, use_transformer_engine=HAVE_TE), + num_layers=61, + hidden_size=7168, + ffn_hidden_size=18432, + num_moe_experts=384, + moe_ffn_hidden_size=2048, + moe_shared_expert_intermediate_size=2048, + moe_layer_freq=[0] + [1] * 60, + normalization="RMSNorm", + activation_func=F.silu, + gated_linear_unit=True, + position_embedding_type="rope", + add_bias_linear=False, + share_embeddings_and_output_weights=False, + num_attention_heads=64, + kv_channels=64, + max_position_embeddings=4096, + seq_length=4096, + rotary_base=50000.0, + make_vocab_size_divisible_by=1280, + attention_dropout=0.0, + hidden_dropout=0.0, + qk_layernorm=True, + # MoE + moe_router_topk=8, + moe_router_num_groups=1, + moe_router_group_topk=1, + moe_router_topk_scaling_factor=2.827, + moe_aux_loss_coeff=1e-3, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + moe_router_bias_update_rate=1e-3, + moe_grouped_gemm=True, + moe_router_pre_softmax=True, + moe_token_dispatcher_type="alltoall", + moe_router_load_balancing_type="seq_aux_loss", + moe_shared_expert_overlap=True, + moe_router_dtype="fp32", + moe_permute_fusion=False, + # MLA + multi_latent_attention=True, + q_lora_rank=1536, + kv_lora_rank=512, + qk_head_dim=128, + qk_pos_emb_head_dim=64, + v_head_dim=128, + rotary_scaling_factor=32, + beta_fast=1.0, + beta_slow=1.0, + mscale=1.0, + mscale_all_dim=1.0, + # Miscellaneous + init_method_std=0.006, + layernorm_epsilon=1e-6, + bf16=True, + params_dtype=torch.bfloat16, + attention_softmax_in_fp32=False, + persist_layer_norm=True, + vocab_size=163840, + # Fusions + apply_rope_fusion=False, + bias_activation_fusion=True, + bias_dropout_fusion=True, + masked_softmax_fusion=True, + gradient_accumulation_fusion=True, + cross_entropy_loss_fusion=True, + cross_entropy_fusion_impl="te", + # Parallelism tensor_model_parallel_size=2, pipeline_model_parallel_size=16, pipeline_dtype=torch.bfloat16, @@ -129,7 +210,7 @@ def kimi_k2_pretrain_config() -> ConfigContainer: cfg.model.cross_entropy_loss_fusion = True cfg.model.cross_entropy_fusion_impl = "te" - # Memory saving (recompute & offloading) - already set in KimiK2Provider + # Memory saving (recompute & offloading) - already set in model provider # cfg.model.recompute_granularity = "selective" # cfg.model.recompute_modules = None cfg.model.fine_grained_activation_offloading = False diff --git a/src/megatron/bridge/recipes/moonlight/moonlight_16b.py b/src/megatron/bridge/recipes/moonlight/moonlight_16b.py index cb31571d67..1eecebee24 100644 --- 
a/src/megatron/bridge/recipes/moonlight/moonlight_16b.py +++ b/src/megatron/bridge/recipes/moonlight/moonlight_16b.py @@ -14,9 +14,10 @@ import torch +import torch.nn.functional as F from megatron.bridge import AutoBridge -from megatron.bridge.models.deepseek import MoonlightModelProvider16B +from megatron.bridge.models.mla_provider import MLAModelProvider from megatron.bridge.peft.base import PEFT from megatron.bridge.recipes.common import _peft_common, _pretrain_common, _sft_common from megatron.bridge.recipes.utils.finetune_utils import default_peft_config @@ -195,8 +196,51 @@ def moonlight_16b_sft_config() -> ConfigContainer: """ cfg = _sft_common() - # Model config - uses MoonlightModelProvider16B - cfg.model = MoonlightModelProvider16B( + # Model config - uses MLAModelProvider with Moonlight-16B architecture + cfg.model = MLAModelProvider( + # Architecture + num_layers=27, + hidden_size=2048, + ffn_hidden_size=11264, + num_attention_heads=16, + kv_channels=16, + q_lora_rank=None, + kv_lora_rank=512, + max_position_embeddings=4096, + num_moe_experts=64, + moe_ffn_hidden_size=1408, + moe_shared_expert_intermediate_size=2816, + moe_layer_freq=[0] * 1 + [1] * 26, + moe_router_topk=6, + moe_router_num_groups=1, + moe_router_group_topk=1, + moe_router_topk_scaling_factor=2.446, + moe_aux_loss_coeff=0.001, + make_vocab_size_divisible_by=1280, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + rotary_scaling_factor=1.0, + mscale=1.0, + mscale_all_dim=1.0, + rotary_base=50000, + layernorm_epsilon=1e-5, + init_method_std=0.02, + moe_router_bias_update_rate=1e-3, + rotary_percent=1.0, + vocab_size=163842, + # Common defaults + normalization="RMSNorm", + activation_func=F.silu, + gated_linear_unit=True, + position_embedding_type="rope", + add_bias_linear=False, + share_embeddings_and_output_weights=False, + qk_layernorm=True, + bf16=True, + params_dtype=torch.bfloat16, + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + # Parallelism tensor_model_parallel_size=2, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, @@ -267,7 +311,7 @@ def moonlight_16b_sft_config() -> ConfigContainer: cfg.model.cross_entropy_fusion_impl = "te" # Memory saving (recompute & offloading) - # recompute_granularity already set in MoonlightModelProvider16B + # recompute_granularity already set in model provider cfg.model.fine_grained_activation_offloading = False cfg.model.offload_modules = None @@ -367,7 +411,50 @@ def moonlight_16b_peft_config( cfg = _peft_common() # Model config - PEFT uses TP=1, EP=2 - cfg.model = MoonlightModelProvider16B( + cfg.model = MLAModelProvider( + # Architecture + num_layers=27, + hidden_size=2048, + ffn_hidden_size=11264, + num_attention_heads=16, + kv_channels=16, + q_lora_rank=None, + kv_lora_rank=512, + max_position_embeddings=4096, + num_moe_experts=64, + moe_ffn_hidden_size=1408, + moe_shared_expert_intermediate_size=2816, + moe_layer_freq=[0] * 1 + [1] * 26, + moe_router_topk=6, + moe_router_num_groups=1, + moe_router_group_topk=1, + moe_router_topk_scaling_factor=2.446, + moe_aux_loss_coeff=0.001, + make_vocab_size_divisible_by=1280, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + rotary_scaling_factor=1.0, + mscale=1.0, + mscale_all_dim=1.0, + rotary_base=50000, + layernorm_epsilon=1e-5, + init_method_std=0.02, + moe_router_bias_update_rate=1e-3, + rotary_percent=1.0, + vocab_size=163842, + # Common defaults + normalization="RMSNorm", + activation_func=F.silu, + gated_linear_unit=True, + 
position_embedding_type="rope", + add_bias_linear=False, + share_embeddings_and_output_weights=False, + qk_layernorm=True, + bf16=True, + params_dtype=torch.bfloat16, + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + # Parallelism tensor_model_parallel_size=1, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, diff --git a/src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py b/src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py index 30bcd12faf..3853913979 100644 --- a/src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py +++ b/src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py @@ -14,8 +14,9 @@ import torch +from megatron.core.activations import squared_relu -from megatron.bridge.models.nemotronh import Nemotron3NanoProvider +from megatron.bridge.models.mamba.mamba_provider import MambaModelProvider from megatron.bridge.peft.base import PEFT from megatron.bridge.peft.lora import LoRA from megatron.bridge.recipes.common import _peft_common, _pretrain_common, _sft_common @@ -37,7 +38,47 @@ def nemotron_3_nano_pretrain_config() -> ConfigContainer: cfg = _pretrain_common() # Model Configuration (MoE) - cfg.model = Nemotron3NanoProvider( + cfg.model = MambaModelProvider( + # Architecture (Nemotron 3 Nano 30B-A3B) + hybrid_override_pattern="MEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEMEM*EMEMEMEME", + num_layers=52, + hidden_size=2688, + mamba_num_heads=64, + kv_channels=128, + mamba_state_dim=128, + ffn_hidden_size=1856, + num_attention_heads=32, + mamba_head_dim=64, + seq_length=8192, + num_query_groups=2, + # MoE + num_moe_experts=128, + moe_ffn_hidden_size=1856, + moe_shared_expert_intermediate_size=3712, + moe_router_topk=6, + moe_router_topk_scaling_factor=2.5, + moe_router_num_groups=1, + moe_router_group_topk=1, + # NemotronH base + mamba_num_groups=8, + make_vocab_size_divisible_by=128, + activation_func=squared_relu, + masked_softmax_fusion=True, + apply_query_key_layer_scaling=False, + persist_layer_norm=True, + attention_softmax_in_fp32=False, + first_last_layers_bf16=True, + is_hybrid_model=True, + moe_aux_loss_coeff=0.0001, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + moe_router_load_balancing_type="seq_aux_loss", + moe_router_dtype="fp32", + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + moe_permute_fusion=True, + moe_shared_expert_overlap=True, + # Parallelism tensor_model_parallel_size=4, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, @@ -46,7 +87,6 @@ def nemotron_3_nano_pretrain_config() -> ConfigContainer: sequence_parallel=True, expert_tensor_parallel_size=1, expert_model_parallel_size=8, - seq_length=8192, ) # Tokenizer (--tokenizer-model) @@ -168,8 +208,55 @@ def nemotron_3_nano_sft_config() -> ConfigContainer: """ cfg = _sft_common() - # Model config - uses Nemotron3NanoProvider - cfg.model = Nemotron3NanoProvider( + # Model config - Nemotron 3 Nano + cfg.model = MambaModelProvider( + # Architecture (Nemotron 3 Nano 30B-A3B) + hybrid_override_pattern="MEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEMEM*EMEMEMEME", + num_layers=52, + hidden_size=2688, + mamba_num_heads=64, + kv_channels=128, + mamba_state_dim=128, + ffn_hidden_size=1856, + num_attention_heads=32, + mamba_head_dim=64, + seq_length=2048, + num_query_groups=2, + # MoE + num_moe_experts=128, + moe_ffn_hidden_size=1856, + moe_shared_expert_intermediate_size=3712, + moe_router_topk=6, + moe_router_topk_scaling_factor=2.5, + moe_router_num_groups=1, + moe_router_group_topk=1, + # NemotronH base + 
mamba_num_groups=8, + make_vocab_size_divisible_by=128, + activation_func=squared_relu, + masked_softmax_fusion=True, + apply_query_key_layer_scaling=False, + persist_layer_norm=True, + attention_softmax_in_fp32=False, + first_last_layers_bf16=True, + is_hybrid_model=True, + moe_aux_loss_coeff=0.0001, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + moe_router_load_balancing_type="seq_aux_loss", + moe_router_dtype="fp32", + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + moe_permute_fusion=True, + moe_shared_expert_overlap=True, + # Extra config + apply_rope_fusion=False, + attention_backend="fused", + gradient_accumulation_fusion=True, + init_method_std=0.0173, + use_fused_weighted_squared_relu=True, + calculate_per_token_loss=True, + # Parallelism tensor_model_parallel_size=1, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, @@ -178,13 +265,6 @@ def nemotron_3_nano_sft_config() -> ConfigContainer: sequence_parallel=False, expert_tensor_parallel_size=1, expert_model_parallel_size=8, - apply_rope_fusion=False, - attention_backend="fused", - gradient_accumulation_fusion=True, - init_method_std=0.0173, - use_fused_weighted_squared_relu=True, - seq_length=2048, - calculate_per_token_loss=True, ) # Parallelism settings @@ -319,7 +399,54 @@ def nemotron_3_nano_peft_config( cfg = _peft_common() # Model config - PEFT uses same parallelism as SFT - cfg.model = Nemotron3NanoProvider( + cfg.model = MambaModelProvider( + # Architecture (Nemotron 3 Nano 30B-A3B) + hybrid_override_pattern="MEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEMEM*EMEMEMEME", + num_layers=52, + hidden_size=2688, + mamba_num_heads=64, + kv_channels=128, + mamba_state_dim=128, + ffn_hidden_size=1856, + num_attention_heads=32, + mamba_head_dim=64, + seq_length=2048, + num_query_groups=2, + # MoE + num_moe_experts=128, + moe_ffn_hidden_size=1856, + moe_shared_expert_intermediate_size=3712, + moe_router_topk=6, + moe_router_topk_scaling_factor=2.5, + moe_router_num_groups=1, + moe_router_group_topk=1, + # NemotronH base + mamba_num_groups=8, + make_vocab_size_divisible_by=128, + activation_func=squared_relu, + masked_softmax_fusion=True, + apply_query_key_layer_scaling=False, + persist_layer_norm=True, + attention_softmax_in_fp32=False, + first_last_layers_bf16=True, + is_hybrid_model=True, + moe_aux_loss_coeff=0.0001, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + moe_router_load_balancing_type="seq_aux_loss", + moe_router_dtype="fp32", + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + moe_permute_fusion=True, + moe_shared_expert_overlap=True, + # Extra config + apply_rope_fusion=False, + attention_backend="fused", + gradient_accumulation_fusion=True, + init_method_std=0.0173, + use_fused_weighted_squared_relu=True, + calculate_per_token_loss=True, + # Parallelism tensor_model_parallel_size=1, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, @@ -328,13 +455,6 @@ def nemotron_3_nano_peft_config( sequence_parallel=False, expert_tensor_parallel_size=1, expert_model_parallel_size=8, - apply_rope_fusion=False, - attention_backend="fused", - gradient_accumulation_fusion=True, - init_method_std=0.0173, - use_fused_weighted_squared_relu=True, - seq_length=2048, - calculate_per_token_loss=True, ) # Parallelism settings diff --git a/src/megatron/bridge/recipes/nemotronh/nemotron_nano_v2.py b/src/megatron/bridge/recipes/nemotronh/nemotron_nano_v2.py index 4f8f744df0..19dc537112 100644 --- 
a/src/megatron/bridge/recipes/nemotronh/nemotron_nano_v2.py +++ b/src/megatron/bridge/recipes/nemotronh/nemotron_nano_v2.py @@ -14,11 +14,9 @@ import torch +from megatron.core.activations import squared_relu -from megatron.bridge.models.nemotronh import ( - NemotronNanoModelProvider9Bv2, - NemotronNanoModelProvider12Bv2, -) +from megatron.bridge.models.mamba.mamba_provider import MambaModelProvider from megatron.bridge.peft.base import PEFT from megatron.bridge.peft.lora import LoRA from megatron.bridge.recipes.common import _peft_common, _pretrain_common, _sft_common @@ -36,8 +34,40 @@ def nemotron_nano_9b_v2_pretrain_config() -> ConfigContainer: """ cfg = _pretrain_common() - # Model config - uses NemotronNanoModelProvider9Bv2 - cfg.model = NemotronNanoModelProvider9Bv2( + # Model config - Nemotron Nano 9B v2 + cfg.model = MambaModelProvider( + # Architecture (Nemotron Nano 9B v2) + hybrid_override_pattern="M-M-M-MM-M-M-M*-M-M-M*-M-M-M-M*-M-M-M-M*-M-MM-M-M-M-M-M-", + num_layers=56, + hidden_size=4480, + mamba_num_heads=128, + kv_channels=128, + mamba_state_dim=128, + ffn_hidden_size=15680, + num_attention_heads=40, + mamba_head_dim=80, + seq_length=131072, + # NemotronH base + mamba_num_groups=8, + num_query_groups=8, + make_vocab_size_divisible_by=128, + activation_func=squared_relu, + masked_softmax_fusion=True, + apply_query_key_layer_scaling=False, + persist_layer_norm=True, + attention_softmax_in_fp32=False, + first_last_layers_bf16=True, + is_hybrid_model=True, + moe_aux_loss_coeff=0.0001, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + moe_router_load_balancing_type="seq_aux_loss", + moe_router_dtype="fp32", + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + moe_permute_fusion=True, + moe_shared_expert_overlap=True, + # Parallelism tensor_model_parallel_size=2, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, @@ -139,8 +169,40 @@ def nemotron_nano_12b_v2_pretrain_config() -> ConfigContainer: """ cfg = _pretrain_common() - # Model config - uses NemotronNanoModelProvider12Bv2 - cfg.model = NemotronNanoModelProvider12Bv2( + # Model config - Nemotron Nano 12B v2 + cfg.model = MambaModelProvider( + # Architecture (Nemotron Nano 12B v2) + hybrid_override_pattern="M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M-", + num_layers=62, + hidden_size=5120, + mamba_num_heads=128, + kv_channels=128, + mamba_state_dim=128, + ffn_hidden_size=20480, + num_attention_heads=40, + mamba_head_dim=80, + seq_length=131072, + # NemotronH base + mamba_num_groups=8, + num_query_groups=8, + make_vocab_size_divisible_by=128, + activation_func=squared_relu, + masked_softmax_fusion=True, + apply_query_key_layer_scaling=False, + persist_layer_norm=True, + attention_softmax_in_fp32=False, + first_last_layers_bf16=True, + is_hybrid_model=True, + moe_aux_loss_coeff=0.0001, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + moe_router_load_balancing_type="seq_aux_loss", + moe_router_dtype="fp32", + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + moe_permute_fusion=True, + moe_shared_expert_overlap=True, + # Parallelism tensor_model_parallel_size=4, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, @@ -244,15 +306,46 @@ def nemotron_nano_9b_v2_sft_config() -> ConfigContainer: """ cfg = _sft_common() - # Model config - uses NemotronNanoModelProvider9Bv2 - cfg.model = NemotronNanoModelProvider9Bv2( + # Model config - Nemotron Nano 9B v2 + cfg.model = MambaModelProvider( + # Architecture 
(Nemotron Nano 9B v2) + hybrid_override_pattern="M-M-M-MM-M-M-M*-M-M-M*-M-M-M-M*-M-M-M-M*-M-MM-M-M-M-M-M-", + num_layers=56, + hidden_size=4480, + mamba_num_heads=128, + kv_channels=128, + mamba_state_dim=128, + ffn_hidden_size=15680, + num_attention_heads=40, + mamba_head_dim=80, + seq_length=2048, + # NemotronH base + mamba_num_groups=8, + num_query_groups=8, + make_vocab_size_divisible_by=128, + activation_func=squared_relu, + masked_softmax_fusion=True, + apply_query_key_layer_scaling=False, + persist_layer_norm=True, + attention_softmax_in_fp32=False, + first_last_layers_bf16=True, + is_hybrid_model=True, + moe_aux_loss_coeff=0.0001, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + moe_router_load_balancing_type="seq_aux_loss", + moe_router_dtype="fp32", + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + moe_permute_fusion=True, + moe_shared_expert_overlap=True, + # Parallelism tensor_model_parallel_size=2, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, virtual_pipeline_model_parallel_size=None, context_parallel_size=1, sequence_parallel=True, - seq_length=2048, ) # Parallelism settings @@ -339,15 +432,46 @@ def nemotron_nano_12b_v2_sft_config() -> ConfigContainer: """ cfg = _sft_common() - # Model config - uses NemotronNanoModelProvider12Bv2 - cfg.model = NemotronNanoModelProvider12Bv2( + # Model config - Nemotron Nano 12B v2 + cfg.model = MambaModelProvider( + # Architecture (Nemotron Nano 12B v2) + hybrid_override_pattern="M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M-", + num_layers=62, + hidden_size=5120, + mamba_num_heads=128, + kv_channels=128, + mamba_state_dim=128, + ffn_hidden_size=20480, + num_attention_heads=40, + mamba_head_dim=80, + seq_length=2048, + # NemotronH base + mamba_num_groups=8, + num_query_groups=8, + make_vocab_size_divisible_by=128, + activation_func=squared_relu, + masked_softmax_fusion=True, + apply_query_key_layer_scaling=False, + persist_layer_norm=True, + attention_softmax_in_fp32=False, + first_last_layers_bf16=True, + is_hybrid_model=True, + moe_aux_loss_coeff=0.0001, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + moe_router_load_balancing_type="seq_aux_loss", + moe_router_dtype="fp32", + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + moe_permute_fusion=True, + moe_shared_expert_overlap=True, + # Parallelism tensor_model_parallel_size=4, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, virtual_pipeline_model_parallel_size=None, context_parallel_size=1, sequence_parallel=True, - seq_length=2048, ) # Parallelism settings @@ -444,14 +568,45 @@ def nemotron_nano_9b_v2_peft_config( cfg = _peft_common() # Model config - PEFT uses TP=1, SP=False - cfg.model = NemotronNanoModelProvider9Bv2( + cfg.model = MambaModelProvider( + # Architecture (Nemotron Nano 9B v2) + hybrid_override_pattern="M-M-M-MM-M-M-M*-M-M-M*-M-M-M-M*-M-M-M-M*-M-MM-M-M-M-M-M-", + num_layers=56, + hidden_size=4480, + mamba_num_heads=128, + kv_channels=128, + mamba_state_dim=128, + ffn_hidden_size=15680, + num_attention_heads=40, + mamba_head_dim=80, + seq_length=2048, + # NemotronH base + mamba_num_groups=8, + num_query_groups=8, + make_vocab_size_divisible_by=128, + activation_func=squared_relu, + masked_softmax_fusion=True, + apply_query_key_layer_scaling=False, + persist_layer_norm=True, + attention_softmax_in_fp32=False, + first_last_layers_bf16=True, + is_hybrid_model=True, + moe_aux_loss_coeff=0.0001, + moe_router_score_function="sigmoid", + 
moe_router_enable_expert_bias=True, + moe_router_load_balancing_type="seq_aux_loss", + moe_router_dtype="fp32", + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + moe_permute_fusion=True, + moe_shared_expert_overlap=True, + # Parallelism tensor_model_parallel_size=1, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, virtual_pipeline_model_parallel_size=None, context_parallel_size=1, sequence_parallel=False, - seq_length=2048, ) # Parallelism settings @@ -560,14 +715,45 @@ def nemotron_nano_12b_v2_peft_config( cfg = _peft_common() # Model config - PEFT uses TP=1, SP=False - cfg.model = NemotronNanoModelProvider12Bv2( + cfg.model = MambaModelProvider( + # Architecture (Nemotron Nano 12B v2) + hybrid_override_pattern="M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M-", + num_layers=62, + hidden_size=5120, + mamba_num_heads=128, + kv_channels=128, + mamba_state_dim=128, + ffn_hidden_size=20480, + num_attention_heads=40, + mamba_head_dim=80, + seq_length=2048, + # NemotronH base + mamba_num_groups=8, + num_query_groups=8, + make_vocab_size_divisible_by=128, + activation_func=squared_relu, + masked_softmax_fusion=True, + apply_query_key_layer_scaling=False, + persist_layer_norm=True, + attention_softmax_in_fp32=False, + first_last_layers_bf16=True, + is_hybrid_model=True, + moe_aux_loss_coeff=0.0001, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + moe_router_load_balancing_type="seq_aux_loss", + moe_router_dtype="fp32", + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + moe_permute_fusion=True, + moe_shared_expert_overlap=True, + # Parallelism tensor_model_parallel_size=1, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, virtual_pipeline_model_parallel_size=None, context_parallel_size=1, sequence_parallel=False, - seq_length=2048, ) # Parallelism settings diff --git a/src/megatron/bridge/recipes/nemotronh/nemotronh.py b/src/megatron/bridge/recipes/nemotronh/nemotronh.py index fb3307875a..9baabe2f5c 100644 --- a/src/megatron/bridge/recipes/nemotronh/nemotronh.py +++ b/src/megatron/bridge/recipes/nemotronh/nemotronh.py @@ -14,13 +14,10 @@ import torch +from megatron.core.activations import squared_relu +from megatron.core.transformer.enums import AttnBackend -from megatron.bridge.models.nemotronh import ( - NemotronHModelProvider4B, - NemotronHModelProvider8B, - NemotronHModelProvider47B, - NemotronHModelProvider56B, -) +from megatron.bridge.models.mamba.mamba_provider import MambaModelProvider from megatron.bridge.peft.base import PEFT from megatron.bridge.peft.lora import LoRA from megatron.bridge.recipes.common import _peft_common, _pretrain_common, _sft_common @@ -39,7 +36,40 @@ def nemotronh_4b_pretrain_config() -> ConfigContainer: cfg = _pretrain_common() # Model config - cfg.model = NemotronHModelProvider4B( + cfg.model = MambaModelProvider( + # Architecture (NemotronH 4B) + hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-", + num_layers=52, + hidden_size=3072, + mamba_num_heads=112, + kv_channels=128, + mamba_state_dim=128, + ffn_hidden_size=12288, + num_attention_heads=32, + use_mamba_mem_eff_path=False, + # NemotronH base + seq_length=8192, + mamba_num_groups=8, + mamba_head_dim=64, + num_query_groups=8, + make_vocab_size_divisible_by=128, + activation_func=squared_relu, + masked_softmax_fusion=True, + apply_query_key_layer_scaling=False, + persist_layer_norm=True, + attention_softmax_in_fp32=False, + first_last_layers_bf16=True, + is_hybrid_model=True, + 
moe_aux_loss_coeff=0.0001, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + moe_router_load_balancing_type="seq_aux_loss", + moe_router_dtype="fp32", + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + moe_permute_fusion=True, + moe_shared_expert_overlap=True, + # Parallelism tensor_model_parallel_size=1, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, @@ -140,7 +170,38 @@ def nemotronh_8b_pretrain_config() -> ConfigContainer: cfg = _pretrain_common() # Model config - cfg.model = NemotronHModelProvider8B( + cfg.model = MambaModelProvider( + # Architecture (NemotronH 8B) + hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-", + num_layers=52, + hidden_size=4096, + mamba_state_dim=128, + mamba_num_heads=128, + ffn_hidden_size=21504, + num_attention_heads=32, + # NemotronH base + seq_length=8192, + mamba_num_groups=8, + mamba_head_dim=64, + num_query_groups=8, + make_vocab_size_divisible_by=128, + activation_func=squared_relu, + masked_softmax_fusion=True, + apply_query_key_layer_scaling=False, + persist_layer_norm=True, + attention_softmax_in_fp32=False, + first_last_layers_bf16=True, + is_hybrid_model=True, + moe_aux_loss_coeff=0.0001, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + moe_router_load_balancing_type="seq_aux_loss", + moe_router_dtype="fp32", + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + moe_permute_fusion=True, + moe_shared_expert_overlap=True, + # Parallelism tensor_model_parallel_size=2, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, @@ -243,7 +304,40 @@ def nemotronh_47b_pretrain_config() -> ConfigContainer: cfg = _pretrain_common() # Model config - cfg.model = NemotronHModelProvider47B( + cfg.model = MambaModelProvider( + # Architecture (NemotronH 47B) + hybrid_override_pattern=( + "M-M-M-M-M-M-M-M-M*-M-M-M-M-M-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-M-M---MM---M-M*-M-M-M-M-M-" + ), + num_layers=98, + hidden_size=8192, + mamba_state_dim=256, + mamba_num_heads=256, + ffn_hidden_size=30720, + num_attention_heads=64, + # NemotronH base + seq_length=8192, + mamba_num_groups=8, + mamba_head_dim=64, + num_query_groups=8, + make_vocab_size_divisible_by=128, + activation_func=squared_relu, + masked_softmax_fusion=True, + apply_query_key_layer_scaling=False, + persist_layer_norm=True, + attention_softmax_in_fp32=False, + first_last_layers_bf16=True, + is_hybrid_model=True, + moe_aux_loss_coeff=0.0001, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + moe_router_load_balancing_type="seq_aux_loss", + moe_router_dtype="fp32", + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + moe_permute_fusion=True, + moe_shared_expert_overlap=True, + # Parallelism tensor_model_parallel_size=8, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, @@ -346,7 +440,42 @@ def nemotronh_56b_pretrain_config() -> ConfigContainer: cfg = _pretrain_common() # Model config - cfg.model = NemotronHModelProvider56B( + cfg.model = MambaModelProvider( + # Architecture (NemotronH 56B) + hybrid_override_pattern=( + "M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-" + "M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-" + ), + num_layers=118, + hidden_size=8192, + mamba_state_dim=256, + mamba_num_heads=256, + ffn_hidden_size=32768, + num_attention_heads=64, + attention_backend=AttnBackend.auto, + # NemotronH base + seq_length=8192, + mamba_num_groups=8, + mamba_head_dim=64, + 
num_query_groups=8, + make_vocab_size_divisible_by=128, + activation_func=squared_relu, + masked_softmax_fusion=True, + apply_query_key_layer_scaling=False, + persist_layer_norm=True, + attention_softmax_in_fp32=False, + first_last_layers_bf16=True, + is_hybrid_model=True, + moe_aux_loss_coeff=0.0001, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + moe_router_load_balancing_type="seq_aux_loss", + moe_router_dtype="fp32", + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + moe_permute_fusion=True, + moe_shared_expert_overlap=True, + # Parallelism tensor_model_parallel_size=8, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, @@ -453,15 +582,47 @@ def nemotronh_4b_sft_config() -> ConfigContainer: """ cfg = _sft_common() - # Model config - uses NemotronHModelProvider4B - cfg.model = NemotronHModelProvider4B( + # Model config - NemotronH 4B + cfg.model = MambaModelProvider( + # Architecture (NemotronH 4B) + hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-", + num_layers=52, + hidden_size=3072, + mamba_num_heads=112, + kv_channels=128, + mamba_state_dim=128, + ffn_hidden_size=12288, + num_attention_heads=32, + use_mamba_mem_eff_path=False, + # NemotronH base + seq_length=8192, + mamba_num_groups=8, + mamba_head_dim=64, + num_query_groups=8, + make_vocab_size_divisible_by=128, + activation_func=squared_relu, + masked_softmax_fusion=True, + apply_query_key_layer_scaling=False, + persist_layer_norm=True, + attention_softmax_in_fp32=False, + first_last_layers_bf16=True, + is_hybrid_model=True, + moe_aux_loss_coeff=0.0001, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + moe_router_load_balancing_type="seq_aux_loss", + moe_router_dtype="fp32", + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + moe_permute_fusion=True, + moe_shared_expert_overlap=True, + # Parallelism tensor_model_parallel_size=1, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, virtual_pipeline_model_parallel_size=None, context_parallel_size=1, sequence_parallel=False, - seq_length=8192, ) # Parallelism settings @@ -551,15 +712,45 @@ def nemotronh_8b_sft_config() -> ConfigContainer: """ cfg = _sft_common() - # Model config - uses NemotronHModelProvider8B - cfg.model = NemotronHModelProvider8B( + # Model config - NemotronH 8B + cfg.model = MambaModelProvider( + # Architecture (NemotronH 8B) + hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-", + num_layers=52, + hidden_size=4096, + mamba_state_dim=128, + mamba_num_heads=128, + ffn_hidden_size=21504, + num_attention_heads=32, + # NemotronH base + seq_length=8192, + mamba_num_groups=8, + mamba_head_dim=64, + num_query_groups=8, + make_vocab_size_divisible_by=128, + activation_func=squared_relu, + masked_softmax_fusion=True, + apply_query_key_layer_scaling=False, + persist_layer_norm=True, + attention_softmax_in_fp32=False, + first_last_layers_bf16=True, + is_hybrid_model=True, + moe_aux_loss_coeff=0.0001, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + moe_router_load_balancing_type="seq_aux_loss", + moe_router_dtype="fp32", + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + moe_permute_fusion=True, + moe_shared_expert_overlap=True, + # Parallelism tensor_model_parallel_size=2, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, virtual_pipeline_model_parallel_size=None, context_parallel_size=1, sequence_parallel=True, - seq_length=8192, ) # Parallelism 
settings @@ -647,15 +838,47 @@ def nemotronh_47b_sft_config() -> ConfigContainer: """ cfg = _sft_common() - # Model config - uses NemotronHModelProvider47B - cfg.model = NemotronHModelProvider47B( + # Model config - NemotronH 47B + cfg.model = MambaModelProvider( + # Architecture (NemotronH 47B) + hybrid_override_pattern=( + "M-M-M-M-M-M-M-M-M*-M-M-M-M-M-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-M-M---MM---M-M*-M-M-M-M-M-" + ), + num_layers=98, + hidden_size=8192, + mamba_state_dim=256, + mamba_num_heads=256, + ffn_hidden_size=30720, + num_attention_heads=64, + # NemotronH base + seq_length=8192, + mamba_num_groups=8, + mamba_head_dim=64, + num_query_groups=8, + make_vocab_size_divisible_by=128, + activation_func=squared_relu, + masked_softmax_fusion=True, + apply_query_key_layer_scaling=False, + persist_layer_norm=True, + attention_softmax_in_fp32=False, + first_last_layers_bf16=True, + is_hybrid_model=True, + moe_aux_loss_coeff=0.0001, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + moe_router_load_balancing_type="seq_aux_loss", + moe_router_dtype="fp32", + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + moe_permute_fusion=True, + moe_shared_expert_overlap=True, + # Parallelism tensor_model_parallel_size=8, pipeline_model_parallel_size=2, pipeline_dtype=torch.bfloat16, virtual_pipeline_model_parallel_size=None, context_parallel_size=1, sequence_parallel=True, - seq_length=8192, ) # Parallelism settings @@ -743,15 +966,49 @@ def nemotronh_56b_sft_config() -> ConfigContainer: """ cfg = _sft_common() - # Model config - uses NemotronHModelProvider56B - cfg.model = NemotronHModelProvider56B( + # Model config - NemotronH 56B + cfg.model = MambaModelProvider( + # Architecture (NemotronH 56B) + hybrid_override_pattern=( + "M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-" + "M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-" + ), + num_layers=118, + hidden_size=8192, + mamba_state_dim=256, + mamba_num_heads=256, + ffn_hidden_size=32768, + num_attention_heads=64, + attention_backend=AttnBackend.auto, + # NemotronH base + seq_length=8192, + mamba_num_groups=8, + mamba_head_dim=64, + num_query_groups=8, + make_vocab_size_divisible_by=128, + activation_func=squared_relu, + masked_softmax_fusion=True, + apply_query_key_layer_scaling=False, + persist_layer_norm=True, + attention_softmax_in_fp32=False, + first_last_layers_bf16=True, + is_hybrid_model=True, + moe_aux_loss_coeff=0.0001, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + moe_router_load_balancing_type="seq_aux_loss", + moe_router_dtype="fp32", + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + moe_permute_fusion=True, + moe_shared_expert_overlap=True, + # Parallelism tensor_model_parallel_size=8, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, virtual_pipeline_model_parallel_size=None, context_parallel_size=1, sequence_parallel=True, - seq_length=8192, ) # Parallelism settings @@ -849,14 +1106,46 @@ def nemotronh_4b_peft_config( cfg = _peft_common() # Model config - PEFT uses same parallelism as SFT for 4B - cfg.model = NemotronHModelProvider4B( + cfg.model = MambaModelProvider( + # Architecture (NemotronH 4B) + hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-", + num_layers=52, + hidden_size=3072, + mamba_num_heads=112, + kv_channels=128, + mamba_state_dim=128, + ffn_hidden_size=12288, + num_attention_heads=32, + use_mamba_mem_eff_path=False, + # NemotronH base + seq_length=8192, + 
mamba_num_groups=8, + mamba_head_dim=64, + num_query_groups=8, + make_vocab_size_divisible_by=128, + activation_func=squared_relu, + masked_softmax_fusion=True, + apply_query_key_layer_scaling=False, + persist_layer_norm=True, + attention_softmax_in_fp32=False, + first_last_layers_bf16=True, + is_hybrid_model=True, + moe_aux_loss_coeff=0.0001, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + moe_router_load_balancing_type="seq_aux_loss", + moe_router_dtype="fp32", + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + moe_permute_fusion=True, + moe_shared_expert_overlap=True, + # Parallelism tensor_model_parallel_size=1, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, virtual_pipeline_model_parallel_size=None, context_parallel_size=1, sequence_parallel=False, - seq_length=8192, ) # Parallelism settings @@ -965,14 +1254,44 @@ def nemotronh_8b_peft_config( cfg = _peft_common() # Model config - PEFT uses TP=1, SP=False - cfg.model = NemotronHModelProvider8B( + cfg.model = MambaModelProvider( + # Architecture (NemotronH 8B) + hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-", + num_layers=52, + hidden_size=4096, + mamba_state_dim=128, + mamba_num_heads=128, + ffn_hidden_size=21504, + num_attention_heads=32, + # NemotronH base + seq_length=8192, + mamba_num_groups=8, + mamba_head_dim=64, + num_query_groups=8, + make_vocab_size_divisible_by=128, + activation_func=squared_relu, + masked_softmax_fusion=True, + apply_query_key_layer_scaling=False, + persist_layer_norm=True, + attention_softmax_in_fp32=False, + first_last_layers_bf16=True, + is_hybrid_model=True, + moe_aux_loss_coeff=0.0001, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + moe_router_load_balancing_type="seq_aux_loss", + moe_router_dtype="fp32", + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + moe_permute_fusion=True, + moe_shared_expert_overlap=True, + # Parallelism tensor_model_parallel_size=1, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, virtual_pipeline_model_parallel_size=None, context_parallel_size=1, sequence_parallel=False, - seq_length=8192, ) # Parallelism settings @@ -1081,14 +1400,46 @@ def nemotronh_47b_peft_config( cfg = _peft_common() # Model config - PEFT uses TP=4, PP=1 - cfg.model = NemotronHModelProvider47B( + cfg.model = MambaModelProvider( + # Architecture (NemotronH 47B) + hybrid_override_pattern=( + "M-M-M-M-M-M-M-M-M*-M-M-M-M-M-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-M-M---MM---M-M*-M-M-M-M-M-" + ), + num_layers=98, + hidden_size=8192, + mamba_state_dim=256, + mamba_num_heads=256, + ffn_hidden_size=30720, + num_attention_heads=64, + # NemotronH base + seq_length=8192, + mamba_num_groups=8, + mamba_head_dim=64, + num_query_groups=8, + make_vocab_size_divisible_by=128, + activation_func=squared_relu, + masked_softmax_fusion=True, + apply_query_key_layer_scaling=False, + persist_layer_norm=True, + attention_softmax_in_fp32=False, + first_last_layers_bf16=True, + is_hybrid_model=True, + moe_aux_loss_coeff=0.0001, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + moe_router_load_balancing_type="seq_aux_loss", + moe_router_dtype="fp32", + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + moe_permute_fusion=True, + moe_shared_expert_overlap=True, + # Parallelism tensor_model_parallel_size=4, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, virtual_pipeline_model_parallel_size=None, context_parallel_size=1, 
sequence_parallel=False, - seq_length=8192, ) # Parallelism settings @@ -1197,14 +1548,48 @@ def nemotronh_56b_peft_config( cfg = _peft_common() # Model config - PEFT uses TP=4, PP=1 - cfg.model = NemotronHModelProvider56B( + cfg.model = MambaModelProvider( + # Architecture (NemotronH 56B) + hybrid_override_pattern=( + "M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-" + "M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-" + ), + num_layers=118, + hidden_size=8192, + mamba_state_dim=256, + mamba_num_heads=256, + ffn_hidden_size=32768, + num_attention_heads=64, + attention_backend=AttnBackend.auto, + # NemotronH base + seq_length=8192, + mamba_num_groups=8, + mamba_head_dim=64, + num_query_groups=8, + make_vocab_size_divisible_by=128, + activation_func=squared_relu, + masked_softmax_fusion=True, + apply_query_key_layer_scaling=False, + persist_layer_norm=True, + attention_softmax_in_fp32=False, + first_last_layers_bf16=True, + is_hybrid_model=True, + moe_aux_loss_coeff=0.0001, + moe_router_score_function="sigmoid", + moe_router_enable_expert_bias=True, + moe_router_load_balancing_type="seq_aux_loss", + moe_router_dtype="fp32", + moe_grouped_gemm=True, + moe_token_dispatcher_type="alltoall", + moe_permute_fusion=True, + moe_shared_expert_overlap=True, + # Parallelism tensor_model_parallel_size=4, pipeline_model_parallel_size=1, pipeline_dtype=torch.bfloat16, virtual_pipeline_model_parallel_size=None, context_parallel_size=1, sequence_parallel=False, - seq_length=8192, ) # Parallelism settings diff --git a/tests/functional_tests/data/test_samplers.py b/tests/functional_tests/data/test_samplers.py index 475f72efde..6e8f2f96f4 100644 --- a/tests/functional_tests/data/test_samplers.py +++ b/tests/functional_tests/data/test_samplers.py @@ -58,9 +58,9 @@ def test_build_pretraining_data_loader_single(self): class _DummyBridge: def to_megatron_provider(self, load_weights=False): - from megatron.bridge.models.llama.llama_provider import Llama3ModelProvider + from megatron.bridge.models.gpt_provider import GPTModelProvider - return Llama3ModelProvider() + return GPTModelProvider() mock_from.return_value = _DummyBridge() cfg = pretrain_config() @@ -102,9 +102,9 @@ def test_build_pretraining_data_loader_cyclic(self): class _DummyBridge: def to_megatron_provider(self, load_weights=False): - from megatron.bridge.models.llama.llama_provider import Llama3ModelProvider + from megatron.bridge.models.gpt_provider import GPTModelProvider - return Llama3ModelProvider() + return GPTModelProvider() mock_from.return_value = _DummyBridge() cfg = pretrain_config() @@ -155,9 +155,9 @@ def test_build_pretraining_data_loader_external(self): class _DummyBridge: def to_megatron_provider(self, load_weights=False): - from megatron.bridge.models.llama.llama_provider import Llama3ModelProvider + from megatron.bridge.models.gpt_provider import GPTModelProvider - return Llama3ModelProvider() + return GPTModelProvider() mock_from.return_value = _DummyBridge() cfg = pretrain_config() @@ -579,9 +579,9 @@ def test_build_batch_dataloader_basic(self): class _DummyBridge: def to_megatron_provider(self, load_weights=False): - from megatron.bridge.models.llama.llama_provider import Llama3ModelProvider + from megatron.bridge.models.gpt_provider import GPTModelProvider - return Llama3ModelProvider() + return GPTModelProvider() mock_from.return_value = _DummyBridge() cfg = pretrain_config() @@ -617,9 +617,9 @@ def test_build_batch_dataloader_missing_global_batch_size(self): class _DummyBridge: def 
to_megatron_provider(self, load_weights=False): - from megatron.bridge.models.llama.llama_provider import Llama3ModelProvider + from megatron.bridge.models.gpt_provider import GPTModelProvider - return Llama3ModelProvider() + return GPTModelProvider() mock_from.return_value = _DummyBridge() cfg = pretrain_config() diff --git a/tests/functional_tests/models/gpt_oss/test_gpt_oss_provider.py b/tests/functional_tests/models/gpt_oss/test_gpt_oss_provider.py deleted file mode 100644 index 8cee92f27b..0000000000 --- a/tests/functional_tests/models/gpt_oss/test_gpt_oss_provider.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import pytest - -from megatron.bridge.models.conversion.auto_bridge import AutoBridge -from megatron.bridge.models.gpt_oss import ( - GPTOSSProvider20B, - GPTOSSProvider120B, -) -from tests.functional_tests.utils import compare_provider_configs - - -# These HF IDs are placeholders for tests in environments with pre-downloaded models. -# For CI or local runs without actual HF downloads, we allow mapping using a config-only load. -HF_MODEL_ID_TO_PROVIDER = { - # If your environment has these, point to actual repo ids or local cache paths - "openai/gpt-oss-20b": GPTOSSProvider20B, - "openai/gpt-oss-120b": GPTOSSProvider120B, -} - - -class TestGptOssProviderMapping: - """Test that bridge provider configs match predefined GPT-OSS providers.""" - - @pytest.mark.parametrize("provider_class", [GPTOSSProvider20B, GPTOSSProvider120B]) - def test_bridge_vs_predefined_provider_config_from_config_only(self, provider_class): - # Skip if transformers lacks GPT-OSS - transformers = pytest.importorskip("transformers") - GptOssConfig = getattr(transformers, "GptOssConfig", None) - if GptOssConfig is None: - pytest.skip("transformers installation does not include GptOssConfig") - - # Create a minimal config aligned with GPT-OSS; values don't need to match a real HF repo - # because we compare converted vs predefined providers for equality, not against a specific model ID. 
- cfg = GptOssConfig( - architectures=["GptOssForCausalLM"], - hidden_size=provider_class.hidden_size if hasattr(provider_class, "hidden_size") else 2880, - num_hidden_layers=getattr(provider_class, "num_layers", 24), - num_attention_heads=getattr(provider_class, "num_attention_heads", 64), - num_key_value_heads=getattr(provider_class, "num_key_value_heads", 8), - num_local_experts=getattr(provider_class, "num_moe_experts", 32), - vocab_size=201088, - ) - - bridge = AutoBridge.from_hf_config(cfg) - converted_provider = bridge.to_megatron_provider(load_weights=False) - converted_provider.finalize() - - predefined_provider = provider_class() - predefined_provider.finalize() - - compare_provider_configs(converted_provider, predefined_provider, "gpt-oss-config-only") - - @pytest.mark.parametrize("hf_model_id,provider_class", list(HF_MODEL_ID_TO_PROVIDER.items())) - def test_bridge_vs_predefined_provider_config_hf(self, hf_model_id, provider_class): - # Optional mapping test that uses from_hf_pretrained if available in the environment - if not HF_MODEL_ID_TO_PROVIDER: - pytest.skip("No HF model ids configured for GPT-OSS mapping test") - - bridge = AutoBridge.from_hf_pretrained(hf_model_id, trust_remote_code=True) - converted_provider = bridge.to_megatron_provider(load_weights=False) - converted_provider.finalize() - - predefined_provider = provider_class() - predefined_provider.finalize() - - compare_provider_configs(converted_provider, predefined_provider, hf_model_id) diff --git a/tests/functional_tests/training/test_callbacks.py b/tests/functional_tests/training/test_callbacks.py index 3877ecdbec..e92a8d17a1 100644 --- a/tests/functional_tests/training/test_callbacks.py +++ b/tests/functional_tests/training/test_callbacks.py @@ -16,8 +16,9 @@ import pytest import torch +import torch.nn.functional as F -from megatron.bridge.models.llama import Llama32ModelProvider1B +from megatron.bridge.models.gpt_provider import GPTModelProvider from megatron.bridge.training.callbacks import Callback, CallbackContext, CallbackManager from megatron.bridge.training.config import ( CheckpointConfig, @@ -153,7 +154,30 @@ def test_callbacks(self): eval_interval = 5 # Eval only at step 5 during training eval_iters = 2 - model_cfg = Llama32ModelProvider1B( + model_cfg = GPTModelProvider( + normalization="RMSNorm", + activation_func=F.silu, + gated_linear_unit=True, + position_embedding_type="rope", + add_bias_linear=False, + attention_dropout=0.0, + hidden_dropout=0.0, + bias_activation_fusion=True, + masked_softmax_fusion=True, + persist_layer_norm=True, + bias_dropout_fusion=True, + apply_rope_fusion=True, + num_query_groups=8, + init_method_std=0.02, + layernorm_epsilon=1e-05, + rotary_percent=1.0, + rope_scaling=True, + rope_scaling_factor=32.0, + share_embeddings_and_output_weights=True, + rotary_base=500_000, + hidden_size=2048, + ffn_hidden_size=8192, + num_attention_heads=32, tensor_model_parallel_size=1, pipeline_model_parallel_size=1, context_parallel_size=1, diff --git a/tests/functional_tests/training/test_decentralized_pg.py b/tests/functional_tests/training/test_decentralized_pg.py index 03c3325f2d..1bf04c4d47 100644 --- a/tests/functional_tests/training/test_decentralized_pg.py +++ b/tests/functional_tests/training/test_decentralized_pg.py @@ -25,8 +25,9 @@ import pytest import torch +import torch.nn.functional as F -from megatron.bridge.models.llama import Llama32ModelProvider1B +from megatron.bridge.models.gpt_provider import GPTModelProvider from megatron.bridge.training.config import ( 
CheckpointConfig, ConfigContainer, @@ -100,7 +101,29 @@ def test_pretrain_with_decentralized_pg(self, tmp_path): seq_length = 512 total_iters = 5 - model_cfg = Llama32ModelProvider1B( + model_cfg = GPTModelProvider( + normalization="RMSNorm", + activation_func=F.silu, + gated_linear_unit=True, + position_embedding_type="rope", + add_bias_linear=False, + attention_dropout=0.0, + hidden_dropout=0.0, + bias_activation_fusion=True, + masked_softmax_fusion=True, + persist_layer_norm=True, + bias_dropout_fusion=True, + apply_rope_fusion=True, + num_query_groups=8, + init_method_std=0.02, + layernorm_epsilon=1e-05, + rotary_percent=1.0, + rope_scaling=True, + rope_scaling_factor=32.0, + rotary_base=500_000, + hidden_size=2048, + ffn_hidden_size=8192, + num_attention_heads=32, tensor_model_parallel_size=1, pipeline_model_parallel_size=1, context_parallel_size=1, @@ -229,7 +252,30 @@ def test_pretrain_with_decentralized_pg_disabled(self, tmp_path): seq_length = 512 total_iters = 5 - model_cfg = Llama32ModelProvider1B( + model_cfg = GPTModelProvider( + normalization="RMSNorm", + activation_func=F.silu, + gated_linear_unit=True, + position_embedding_type="rope", + add_bias_linear=False, + attention_dropout=0.0, + hidden_dropout=0.0, + bias_activation_fusion=True, + masked_softmax_fusion=True, + persist_layer_norm=True, + bias_dropout_fusion=True, + apply_rope_fusion=True, + num_query_groups=8, + init_method_std=0.02, + layernorm_epsilon=1e-05, + rotary_percent=1.0, + rope_scaling=True, + rope_scaling_factor=32.0, + share_embeddings_and_output_weights=True, + rotary_base=500_000, + hidden_size=2048, + ffn_hidden_size=8192, + num_attention_heads=32, tensor_model_parallel_size=1, pipeline_model_parallel_size=1, context_parallel_size=1, @@ -361,7 +407,29 @@ def test_pretrain_with_decentralized_pg_and_pp(self, tmp_path): seq_length = 512 total_iters = 5 - model_cfg = Llama32ModelProvider1B( + model_cfg = GPTModelProvider( + normalization="RMSNorm", + activation_func=F.silu, + gated_linear_unit=True, + position_embedding_type="rope", + add_bias_linear=False, + attention_dropout=0.0, + hidden_dropout=0.0, + bias_activation_fusion=True, + masked_softmax_fusion=True, + persist_layer_norm=True, + bias_dropout_fusion=True, + apply_rope_fusion=True, + num_query_groups=8, + init_method_std=0.02, + layernorm_epsilon=1e-05, + rotary_percent=1.0, + rope_scaling=True, + rope_scaling_factor=32.0, + rotary_base=500_000, + hidden_size=2048, + ffn_hidden_size=8192, + num_attention_heads=32, tensor_model_parallel_size=1, pipeline_model_parallel_size=2, # Enable PP context_parallel_size=1, @@ -495,7 +563,29 @@ def test_pretrain_with_decentralized_pg_and_cp(self, tmp_path): seq_length = 512 total_iters = 5 - model_cfg = Llama32ModelProvider1B( + model_cfg = GPTModelProvider( + normalization="RMSNorm", + activation_func=F.silu, + gated_linear_unit=True, + position_embedding_type="rope", + add_bias_linear=False, + attention_dropout=0.0, + hidden_dropout=0.0, + bias_activation_fusion=True, + masked_softmax_fusion=True, + persist_layer_norm=True, + bias_dropout_fusion=True, + apply_rope_fusion=True, + num_query_groups=8, + init_method_std=0.02, + layernorm_epsilon=1e-05, + rotary_percent=1.0, + rope_scaling=True, + rope_scaling_factor=32.0, + rotary_base=500_000, + hidden_size=2048, + ffn_hidden_size=8192, + num_attention_heads=32, tensor_model_parallel_size=1, pipeline_model_parallel_size=1, context_parallel_size=2, # Enable CP @@ -629,7 +719,29 @@ def test_pretrain_with_decentralized_pg_combined_parallelism(self, tmp_path): 
seq_length = 512 total_iters = 5 - model_cfg = Llama32ModelProvider1B( + model_cfg = GPTModelProvider( + normalization="RMSNorm", + activation_func=F.silu, + gated_linear_unit=True, + position_embedding_type="rope", + add_bias_linear=False, + attention_dropout=0.0, + hidden_dropout=0.0, + bias_activation_fusion=True, + masked_softmax_fusion=True, + persist_layer_norm=True, + bias_dropout_fusion=True, + apply_rope_fusion=True, + num_query_groups=8, + init_method_std=0.02, + layernorm_epsilon=1e-05, + rotary_percent=1.0, + rope_scaling=True, + rope_scaling_factor=32.0, + rotary_base=500_000, + hidden_size=2048, + ffn_hidden_size=8192, + num_attention_heads=32, tensor_model_parallel_size=2, # Enable TP pipeline_model_parallel_size=2, # Enable PP context_parallel_size=1, @@ -763,7 +875,29 @@ def test_pretrain_with_decentralized_pg_and_tp(self, tmp_path): seq_length = 512 total_iters = 5 - model_cfg = Llama32ModelProvider1B( + model_cfg = GPTModelProvider( + normalization="RMSNorm", + activation_func=F.silu, + gated_linear_unit=True, + position_embedding_type="rope", + add_bias_linear=False, + attention_dropout=0.0, + hidden_dropout=0.0, + bias_activation_fusion=True, + masked_softmax_fusion=True, + persist_layer_norm=True, + bias_dropout_fusion=True, + apply_rope_fusion=True, + num_query_groups=8, + init_method_std=0.02, + layernorm_epsilon=1e-05, + rotary_percent=1.0, + rope_scaling=True, + rope_scaling_factor=32.0, + rotary_base=500_000, + hidden_size=2048, + ffn_hidden_size=8192, + num_attention_heads=32, tensor_model_parallel_size=2, # Enable TP pipeline_model_parallel_size=1, context_parallel_size=1, diff --git a/tests/functional_tests/training/test_finetune_dora.py b/tests/functional_tests/training/test_finetune_dora.py index 9563526152..a45dd36392 100644 --- a/tests/functional_tests/training/test_finetune_dora.py +++ b/tests/functional_tests/training/test_finetune_dora.py @@ -14,13 +14,15 @@ import os from dataclasses import dataclass +from typing import Callable import pytest import torch +import torch.nn.functional as F from megatron.bridge.data.builders.hf_dataset import HFDatasetConfig from megatron.bridge.data.hf_processors.squad import process_squad_example -from megatron.bridge.models.llama import Llama3ModelProvider +from megatron.bridge.models.gpt_provider import GPTModelProvider from megatron.bridge.peft.dora import DoRA from megatron.bridge.training.config import ( CheckpointConfig, @@ -48,7 +50,24 @@ @dataclass -class Llama3ModelProvider145M(Llama3ModelProvider): +class Llama3ModelProvider145M(GPTModelProvider): + normalization: str = "RMSNorm" + activation_func: Callable = F.silu + gated_linear_unit: bool = True + position_embedding_type: str = "rope" + add_bias_linear: bool = False + attention_dropout: float = 0.0 + hidden_dropout: float = 0.0 + share_embeddings_and_output_weights: bool = False + bias_activation_fusion: bool = True + masked_softmax_fusion: bool = True + persist_layer_norm: bool = True + bias_dropout_fusion: bool = True + apply_rope_fusion: bool = True + num_query_groups: int = 8 + init_method_std: float = 0.01 + layernorm_epsilon: float = 1e-05 + rotary_percent: float = 1.0 rotary_base: int = 500_000 num_layers: int = 2 hidden_size: int = 768 diff --git a/tests/functional_tests/training/test_finetune_lora.py b/tests/functional_tests/training/test_finetune_lora.py index 52c64c2fcb..a44703f907 100644 --- a/tests/functional_tests/training/test_finetune_lora.py +++ b/tests/functional_tests/training/test_finetune_lora.py @@ -14,14 +14,16 @@ import os from 
dataclasses import dataclass +from typing import Callable import pytest import torch +import torch.nn.functional as F from megatron.bridge.data.builders.hf_dataset import HFDatasetConfig from megatron.bridge.data.datasets.packed_sequence import PackedSequenceSpecs from megatron.bridge.data.hf_processors.squad import process_squad_example -from megatron.bridge.models.llama import Llama3ModelProvider +from megatron.bridge.models.gpt_provider import GPTModelProvider from megatron.bridge.peft.lora import LoRA from megatron.bridge.training.config import ( CheckpointConfig, @@ -49,9 +51,26 @@ @dataclass -class Llama3ModelProvider145M(Llama3ModelProvider): +class Llama3ModelProvider145M(GPTModelProvider): """Smaller Llama3 config used previously for functional tests.""" + normalization: str = "RMSNorm" + activation_func: Callable = F.silu + gated_linear_unit: bool = True + position_embedding_type: str = "rope" + add_bias_linear: bool = False + attention_dropout: float = 0.0 + hidden_dropout: float = 0.0 + share_embeddings_and_output_weights: bool = False + bias_activation_fusion: bool = True + masked_softmax_fusion: bool = True + persist_layer_norm: bool = True + bias_dropout_fusion: bool = True + apply_rope_fusion: bool = True + num_query_groups: int = 8 + init_method_std: float = 0.01 + layernorm_epsilon: float = 1e-05 + rotary_percent: float = 1.0 rotary_base: int = 500_000 num_layers: int = 2 hidden_size: int = 768 diff --git a/tests/functional_tests/training/test_inprocess_restart.py b/tests/functional_tests/training/test_inprocess_restart.py index 9ca9cf19a1..8e7e617e86 100644 --- a/tests/functional_tests/training/test_inprocess_restart.py +++ b/tests/functional_tests/training/test_inprocess_restart.py @@ -24,10 +24,11 @@ import pytest import torch +import torch.nn.functional as F from megatron.core.distributed import DistributedDataParallelConfig from megatron.core.optimizer import OptimizerConfig -from megatron.bridge.models.llama import Llama32ModelProvider1B +from megatron.bridge.models.gpt_provider import GPTModelProvider from megatron.bridge.training.config import ( CheckpointConfig, ConfigContainer, @@ -66,7 +67,31 @@ def build_test_config( Returns: Complete configuration for training with in-process restart """ - model_cfg = Llama32ModelProvider1B( + model_cfg = GPTModelProvider( + normalization="RMSNorm", + activation_func=F.silu, + gated_linear_unit=True, + position_embedding_type="rope", + add_bias_linear=False, + attention_dropout=0.0, + hidden_dropout=0.0, + bias_activation_fusion=True, + masked_softmax_fusion=True, + persist_layer_norm=True, + bias_dropout_fusion=True, + apply_rope_fusion=True, + num_query_groups=8, + init_method_std=0.02, + layernorm_epsilon=1e-05, + rotary_percent=1.0, + rope_scaling=True, + rope_scaling_factor=32.0, + share_embeddings_and_output_weights=True, + rotary_base=500_000, + hidden_size=2048, + ffn_hidden_size=8192, + num_attention_heads=32, + make_vocab_size_divisible_by=128, tensor_model_parallel_size=1, pipeline_model_parallel_size=1, context_parallel_size=1, diff --git a/tests/functional_tests/training/test_megatron_fsdp.py b/tests/functional_tests/training/test_megatron_fsdp.py index 3d8574ed30..f630595e2b 100644 --- a/tests/functional_tests/training/test_megatron_fsdp.py +++ b/tests/functional_tests/training/test_megatron_fsdp.py @@ -14,12 +14,13 @@ import os from dataclasses import dataclass -from typing import Optional +from typing import Callable, Optional import pytest import torch +import torch.nn.functional as F -from 
megatron.bridge.models.llama import Llama3ModelProvider +from megatron.bridge.models.gpt_provider import GPTModelProvider from megatron.bridge.training.config import ( CheckpointConfig, ConfigContainer, @@ -45,9 +46,26 @@ @dataclass -class Llama3ModelProviderFSDP145M(Llama3ModelProvider): +class Llama3ModelProviderFSDP145M(GPTModelProvider): """Small Llama3 model configuration for FSDP testing.""" + normalization: str = "RMSNorm" + activation_func: Callable = F.silu + gated_linear_unit: bool = True + position_embedding_type: str = "rope" + add_bias_linear: bool = False + attention_dropout: float = 0.0 + hidden_dropout: float = 0.0 + share_embeddings_and_output_weights: bool = False + bias_activation_fusion: bool = True + masked_softmax_fusion: bool = True + persist_layer_norm: bool = True + bias_dropout_fusion: bool = True + apply_rope_fusion: bool = True + num_query_groups: int = 8 + init_method_std: float = 0.01 + layernorm_epsilon: float = 1e-05 + rotary_percent: float = 1.0 rotary_base: int = 500_000 seq_length: int = 8192 num_layers: int = 2 @@ -55,7 +73,6 @@ class Llama3ModelProviderFSDP145M(Llama3ModelProvider): ffn_hidden_size: int = 2688 num_attention_heads: int = 16 vocab_size: int | None = None - # Disable gradient accumulation fusion for FSDP gradient_accumulation_fusion: bool = False diff --git a/tests/functional_tests/training/test_nvrx_straggler.py b/tests/functional_tests/training/test_nvrx_straggler.py index 13cc96bca5..416e076c72 100644 --- a/tests/functional_tests/training/test_nvrx_straggler.py +++ b/tests/functional_tests/training/test_nvrx_straggler.py @@ -27,8 +27,9 @@ import time import torch +import torch.nn.functional as F -from megatron.bridge.models.llama import Llama32ModelProvider1B +from megatron.bridge.models.gpt_provider import GPTModelProvider from megatron.bridge.training.config import ( CheckpointConfig, ConfigContainer, @@ -64,7 +65,31 @@ def create_functional_test_config(enable_nvrx: bool = True) -> ConfigContainer: eval_iters=0, ) - model_config = Llama32ModelProvider1B( + model_config = GPTModelProvider( + normalization="RMSNorm", + activation_func=F.silu, + gated_linear_unit=True, + position_embedding_type="rope", + add_bias_linear=False, + attention_dropout=0.0, + hidden_dropout=0.0, + bias_activation_fusion=True, + masked_softmax_fusion=True, + persist_layer_norm=True, + bias_dropout_fusion=True, + apply_rope_fusion=True, + num_query_groups=8, + init_method_std=0.02, + layernorm_epsilon=1e-05, + rotary_percent=1.0, + rope_scaling=True, + rope_scaling_factor=32.0, + share_embeddings_and_output_weights=True, + rotary_base=500_000, + num_layers=16, + hidden_size=2048, + ffn_hidden_size=8192, + num_attention_heads=32, tensor_model_parallel_size=1, pipeline_model_parallel_size=1, context_parallel_size=1, diff --git a/tests/functional_tests/training/test_pretrain.py b/tests/functional_tests/training/test_pretrain.py index ea1a9b1787..141315060d 100644 --- a/tests/functional_tests/training/test_pretrain.py +++ b/tests/functional_tests/training/test_pretrain.py @@ -16,8 +16,9 @@ import pytest import torch +import torch.nn.functional as F -from megatron.bridge.models.llama import Llama32ModelProvider1B +from megatron.bridge.models.gpt_provider import GPTModelProvider from megatron.bridge.training.config import ( CheckpointConfig, ConfigContainer, @@ -69,7 +70,30 @@ def test_pretrain_with_checkpoint(self, tmp_path): seq_length = 512 total_iters = 10 - model_cfg = Llama32ModelProvider1B( + model_cfg = GPTModelProvider( + normalization="RMSNorm", + 
activation_func=F.silu, + gated_linear_unit=True, + position_embedding_type="rope", + add_bias_linear=False, + attention_dropout=0.0, + hidden_dropout=0.0, + bias_activation_fusion=True, + masked_softmax_fusion=True, + persist_layer_norm=True, + bias_dropout_fusion=True, + apply_rope_fusion=True, + num_query_groups=8, + init_method_std=0.02, + layernorm_epsilon=1e-05, + rotary_percent=1.0, + rope_scaling=True, + rope_scaling_factor=32.0, + share_embeddings_and_output_weights=True, + rotary_base=500_000, + hidden_size=2048, + ffn_hidden_size=8192, + num_attention_heads=32, tensor_model_parallel_size=1, pipeline_model_parallel_size=1, context_parallel_size=1, @@ -192,7 +216,31 @@ def test_pretrain_vpp(self, tmp_path): total_iters = 10 # Create model config with VPP - model_cfg = Llama32ModelProvider1B( + model_cfg = GPTModelProvider( + normalization="RMSNorm", + activation_func=F.silu, + gated_linear_unit=True, + position_embedding_type="rope", + add_bias_linear=False, + attention_dropout=0.0, + hidden_dropout=0.0, + bias_activation_fusion=True, + masked_softmax_fusion=True, + persist_layer_norm=True, + bias_dropout_fusion=True, + apply_rope_fusion=True, + num_query_groups=8, + init_method_std=0.02, + layernorm_epsilon=1e-05, + rotary_percent=1.0, + rope_scaling=True, + rope_scaling_factor=32.0, + share_embeddings_and_output_weights=True, + rotary_base=500_000, + num_layers=16, + hidden_size=2048, + ffn_hidden_size=8192, + num_attention_heads=32, tensor_model_parallel_size=1, pipeline_model_parallel_size=2, virtual_pipeline_model_parallel_size=2, diff --git a/tests/functional_tests/training/test_pretrain_resume.py b/tests/functional_tests/training/test_pretrain_resume.py index 53b48f0284..31994a33a6 100644 --- a/tests/functional_tests/training/test_pretrain_resume.py +++ b/tests/functional_tests/training/test_pretrain_resume.py @@ -14,11 +14,13 @@ import os from dataclasses import dataclass +from typing import Callable import pytest import torch +import torch.nn.functional as F -from megatron.bridge.models.llama import Llama3ModelProvider +from megatron.bridge.models.gpt_provider import GPTModelProvider from megatron.bridge.training.config import ( CheckpointConfig, ConfigContainer, @@ -43,7 +45,24 @@ @dataclass -class Llama3ModelProvider145M(Llama3ModelProvider): +class Llama3ModelProvider145M(GPTModelProvider): + normalization: str = "RMSNorm" + activation_func: Callable = F.silu + gated_linear_unit: bool = True + position_embedding_type: str = "rope" + add_bias_linear: bool = False + attention_dropout: float = 0.0 + hidden_dropout: float = 0.0 + share_embeddings_and_output_weights: bool = False + bias_activation_fusion: bool = True + masked_softmax_fusion: bool = True + persist_layer_norm: bool = True + bias_dropout_fusion: bool = True + apply_rope_fusion: bool = True + num_query_groups: int = 8 + init_method_std: float = 0.01 + layernorm_epsilon: float = 1e-05 + rotary_percent: float = 1.0 rotary_base: int = 500_000 seq_length: int = 1024 num_layers: int = 1 diff --git a/tests/functional_tests/training/test_sample_based_training.py b/tests/functional_tests/training/test_sample_based_training.py index 1c7acec0c5..e7ab09d54b 100644 --- a/tests/functional_tests/training/test_sample_based_training.py +++ b/tests/functional_tests/training/test_sample_based_training.py @@ -17,8 +17,9 @@ import logging import torch +import torch.nn.functional as F -from megatron.bridge.models.llama import Llama32ModelProvider1B +from megatron.bridge.models.gpt_provider import GPTModelProvider from 
megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing_samples from megatron.bridge.training.config import ( CheckpointConfig, @@ -66,7 +67,30 @@ def test_sample_based_training_mini_run(self): eval_iters=2, skip_train=False, ), - model=Llama32ModelProvider1B( + model=GPTModelProvider( + normalization="RMSNorm", + activation_func=F.silu, + gated_linear_unit=True, + position_embedding_type="rope", + add_bias_linear=False, + attention_dropout=0.0, + hidden_dropout=0.0, + bias_activation_fusion=True, + masked_softmax_fusion=True, + persist_layer_norm=True, + bias_dropout_fusion=True, + apply_rope_fusion=True, + num_query_groups=8, + init_method_std=0.02, + layernorm_epsilon=1e-05, + rotary_percent=1.0, + rope_scaling=True, + rope_scaling_factor=32.0, + share_embeddings_and_output_weights=True, + rotary_base=500_000, + hidden_size=2048, + ffn_hidden_size=8192, + num_attention_heads=32, tensor_model_parallel_size=1, pipeline_model_parallel_size=1, context_parallel_size=1, diff --git a/tests/functional_tests/training/test_sft.py b/tests/functional_tests/training/test_sft.py index 9837a5a124..724859acb3 100644 --- a/tests/functional_tests/training/test_sft.py +++ b/tests/functional_tests/training/test_sft.py @@ -14,11 +14,13 @@ import os from dataclasses import dataclass +from typing import Callable import pytest import torch +import torch.nn.functional as F -from megatron.bridge.models.llama import Llama3ModelProvider +from megatron.bridge.models.gpt_provider import GPTModelProvider from megatron.bridge.training.config import ( CheckpointConfig, ConfigContainer, @@ -44,7 +46,24 @@ @dataclass -class Llama3ModelProvider145M(Llama3ModelProvider): +class Llama3ModelProvider145M(GPTModelProvider): + normalization: str = "RMSNorm" + activation_func: Callable = F.silu + gated_linear_unit: bool = True + position_embedding_type: str = "rope" + add_bias_linear: bool = False + attention_dropout: float = 0.0 + hidden_dropout: float = 0.0 + share_embeddings_and_output_weights: bool = False + bias_activation_fusion: bool = True + masked_softmax_fusion: bool = True + persist_layer_norm: bool = True + bias_dropout_fusion: bool = True + apply_rope_fusion: bool = True + num_query_groups: int = 8 + init_method_std: float = 0.01 + layernorm_epsilon: float = 1e-05 + rotary_percent: float = 1.0 rotary_base: int = 500_000 num_layers: int = 2 hidden_size: int = 768 diff --git a/tests/functional_tests/training/test_tensor_inspect.py b/tests/functional_tests/training/test_tensor_inspect.py index da9a70bee8..949ed231db 100644 --- a/tests/functional_tests/training/test_tensor_inspect.py +++ b/tests/functional_tests/training/test_tensor_inspect.py @@ -16,8 +16,9 @@ import pytest import torch +import torch.nn.functional as F -from megatron.bridge.models.llama import Llama32ModelProvider1B +from megatron.bridge.models.gpt_provider import GPTModelProvider from megatron.bridge.training.config import ( CheckpointConfig, ConfigContainer, @@ -68,7 +69,30 @@ def test_pretrain_with_bf16_tensor_stats(self, tmp_path): seq_length = 512 total_iters = 10 - model_cfg = Llama32ModelProvider1B( + model_cfg = GPTModelProvider( + normalization="RMSNorm", + activation_func=F.silu, + gated_linear_unit=True, + position_embedding_type="rope", + add_bias_linear=False, + attention_dropout=0.0, + hidden_dropout=0.0, + bias_activation_fusion=True, + masked_softmax_fusion=True, + persist_layer_norm=True, + bias_dropout_fusion=True, + apply_rope_fusion=True, + num_query_groups=8, + init_method_std=0.02, + 
layernorm_epsilon=1e-05, + rotary_percent=1.0, + rope_scaling=True, + rope_scaling_factor=32.0, + share_embeddings_and_output_weights=True, + rotary_base=500_000, + hidden_size=2048, + ffn_hidden_size=8192, + num_attention_heads=32, tensor_model_parallel_size=1, pipeline_model_parallel_size=1, context_parallel_size=1, diff --git a/tests/unit_tests/models/deepseek/test_deepseek_provider.py b/tests/unit_tests/models/deepseek/test_deepseek_provider.py deleted file mode 100644 index 0ad86d1e00..0000000000 --- a/tests/unit_tests/models/deepseek/test_deepseek_provider.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Unit tests for DeepSeek provider classes. -""" - -import torch - -from megatron.bridge.models.deepseek.deepseek_provider import ( - DeepSeekModelProvider, - DeepSeekV2LiteModelProvider, - DeepSeekV2ModelProvider, - DeepSeekV3ModelProvider, - MoonlightModelProvider16B, -) - - -class TestDeepSeekProviderDefaults: - """Test default configuration values for DeepSeek providers.""" - - def test_deepseek_provider_base_defaults(self): - # Provide minimal valid values to satisfy Megatron-Core post-init checks - provider = DeepSeekModelProvider(num_layers=1, hidden_size=1024, num_attention_heads=8) - - # Generic model defaults - assert provider.normalization == "RMSNorm" - assert provider.activation_func is not None - assert provider.gated_linear_unit is True - assert provider.position_embedding_type == "rope" - assert provider.add_bias_linear is False - assert provider.share_embeddings_and_output_weights is False - assert provider.qk_layernorm is True - - # DType defaults - assert provider.bf16 is True - assert provider.params_dtype == torch.bfloat16 - - # MoE and MLA flags - assert provider.moe_grouped_gemm is True - assert provider.moe_token_dispatcher_type == "alltoall" - assert provider.q_lora_rank is not None - assert provider.kv_lora_rank is not None - - def test_deepseek_v2_defaults(self): - provider = DeepSeekV2ModelProvider() - - assert provider.num_layers == 60 - assert provider.hidden_size == 5120 - assert provider.num_moe_experts == 160 - assert provider.moe_router_topk == 6 - # Note: qk_layernorm is set by the bridge, not the provider class - assert provider.mscale == 0.707 - assert provider.mscale_all_dim == 0.707 - - def test_deepseek_v2_lite_defaults(self): - provider = DeepSeekV2LiteModelProvider() - - assert provider.num_layers == 27 - assert provider.hidden_size == 2048 - assert provider.num_attention_heads == 16 - assert provider.num_moe_experts == 64 - assert provider.q_lora_rank is None - assert provider.mscale == 0.707 - assert provider.mscale_all_dim == 0.707 - - def test_deepseek_v3_defaults(self): - provider = DeepSeekV3ModelProvider() - - assert provider.num_layers == 61 - assert provider.hidden_size == 7168 - assert provider.num_moe_experts == 256 - assert provider.moe_router_topk == 8 - assert provider.kv_channels == 128 - assert provider.moe_router_score_function 
== "sigmoid" - assert provider.moe_router_enable_expert_bias is True - assert provider.moe_router_bias_update_rate == 1e-3 - assert provider.mscale == 1.0 - assert provider.mscale_all_dim == 1.0 - - def test_moonlight_defaults(self): - provider = MoonlightModelProvider16B() - - assert provider.num_layers == 27 - assert provider.hidden_size == 2048 - assert provider.ffn_hidden_size == 11264 - assert provider.num_moe_experts == 64 - assert provider.moe_ffn_hidden_size == 1408 - assert provider.moe_router_topk == 6 - assert provider.moe_router_num_groups == 1 - assert provider.moe_router_group_topk == 1 - assert provider.rotary_base == 50000 - assert provider.layernorm_epsilon == 1e-5 - assert provider.q_lora_rank is None - assert provider.mscale == 1.0 - assert provider.mscale_all_dim == 1.0 diff --git a/tests/unit_tests/models/glm/test_glm45_provider.py b/tests/unit_tests/models/glm/test_glm45_provider.py deleted file mode 100644 index 2a9771a317..0000000000 --- a/tests/unit_tests/models/glm/test_glm45_provider.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Unit tests for GLM 4.5 provider classes. -""" - -import torch -import torch.nn.functional as F - -from megatron.bridge.models.glm.glm45_provider import ( - GLM45AirModelProvider106B, - GLM45ModelProvider355B, - GLMMoEModelProvider, -) - - -class TestGLM45ProviderDefaults: - """Test default configuration values for GLM 4.5 providers.""" - - def test_glm_moe_provider_base_defaults(self): - # Provide minimal valid values to satisfy Megatron-Core post-init checks - provider = GLMMoEModelProvider(num_layers=1, hidden_size=1024) - - # Generic model defaults - assert provider.normalization == "RMSNorm" - assert provider.activation_func == F.silu - assert provider.gated_linear_unit is True - assert provider.add_bias_linear is False - assert provider.add_qkv_bias is True - assert provider.position_embedding_type == "rope" - assert provider.share_embeddings_and_output_weights is False - assert provider.layernorm_epsilon == 1e-5 - - # Sequence and vocab defaults - assert provider.seq_length == 131072 - assert provider.vocab_size == 151552 - assert provider.init_method_std == 0.02 - assert provider.hidden_dropout == 0.0 - - # DType defaults - assert provider.bf16 is True - assert provider.params_dtype == torch.bfloat16 - assert provider.autocast_dtype == torch.bfloat16 - - # Attention defaults - assert provider.num_query_groups == 8 - assert provider.num_attention_heads == 96 - assert provider.attention_dropout == 0.0 - assert provider.kv_channels == 128 - - # RoPE defaults - assert provider.rotary_base == 1000000.0 - assert provider.rotary_percent == 0.5 - - # MoE specific parameters - assert provider.moe_router_topk == 8 - assert provider.moe_shared_expert_overlap is True - assert provider.moe_token_dispatcher_type == "alltoall" - assert provider.moe_router_load_balancing_type == "seq_aux_loss" - assert provider.moe_aux_loss_coeff == 1e-3 - 
assert provider.moe_router_pre_softmax is False - assert provider.moe_grouped_gemm is True - assert provider.moe_router_score_function == "sigmoid" - assert provider.moe_permute_fusion is True - assert provider.moe_router_dtype == "fp32" - assert provider.moe_router_enable_expert_bias is True - assert provider.moe_router_bias_update_rate == 0 - - # Optimization defaults - assert provider.persist_layer_norm is True - assert provider.bias_activation_fusion is True - assert provider.bias_dropout_fusion is True - - def test_glm45_355b_defaults(self): - provider = GLM45ModelProvider355B() - - assert provider.num_layers == 92 - assert provider.num_moe_experts == 160 - assert provider.hidden_size == 5120 - assert provider.ffn_hidden_size == 12288 - assert provider.moe_ffn_hidden_size == 1536 - assert provider.moe_shared_expert_intermediate_size == 1536 - assert provider.qk_layernorm is True - assert provider.moe_router_topk_scaling_factor == 2.5 - - # Test moe_layer_freq (first 3 layers are dense, rest are MoE) - expected_freq = [0] * 3 + [1] * 89 - assert provider.moe_layer_freq == expected_freq - - def test_glm45_air_106b_defaults(self): - provider = GLM45AirModelProvider106B() - - assert provider.num_layers == 46 - assert provider.num_moe_experts == 128 - assert provider.hidden_size == 4096 - assert provider.ffn_hidden_size == 10944 - assert provider.moe_ffn_hidden_size == 1408 - assert provider.moe_shared_expert_intermediate_size == 1408 - assert provider.qk_layernorm is False - - # Test moe_layer_freq (first 1 layer is dense, rest are MoE) - expected_freq = [0] * 1 + [1] * 45 - assert provider.moe_layer_freq == expected_freq diff --git a/tests/unit_tests/models/glm_vl/test_glm_45v_provider.py b/tests/unit_tests/models/glm_vl/test_glm_45v_provider.py index 3aa6bf92d9..1da3965e28 100644 --- a/tests/unit_tests/models/glm_vl/test_glm_45v_provider.py +++ b/tests/unit_tests/models/glm_vl/test_glm_45v_provider.py @@ -41,8 +41,7 @@ def test_glm_45v_model_provider_initialization(self, mock_vision_config): vision_config=mock_vision_config, ) - # Check inherited defaults from GLM45AirModelProvider106B - # These are expected values for GLM-4.5 Air 106B base model + # Check defaults from GLM-4.5 Air 106B base model configuration assert provider.normalization == "RMSNorm" assert provider.gated_linear_unit is True assert provider.add_bias_linear is False @@ -239,17 +238,17 @@ def test_glm_45v_different_vision_configs(self): class TestGLM45VModelProviderInheritance: - """Test inheritance behavior from GLM45AirModelProvider106B.""" + """Test inheritance behavior from GPTModelProvider.""" - def test_glm_45v_inherits_from_air_provider(self, mock_vision_config): - """Test GLM45VModelProvider inherits from GLM45AirModelProvider106B.""" - from megatron.bridge.models import GLM45AirModelProvider106B + def test_glm_45v_inherits_from_gpt_provider(self, mock_vision_config): + """Test GLM45VModelProvider inherits from GPTModelProvider.""" + from megatron.bridge.models.gpt_provider import GPTModelProvider provider = GLM45VModelProvider( vision_config=mock_vision_config, ) - assert isinstance(provider, GLM45AirModelProvider106B) + assert isinstance(provider, GPTModelProvider) def test_glm_45v_overrides_position_embedding(self, mock_vision_config): """Test GLM45VModelProvider overrides position embedding type.""" diff --git a/tests/unit_tests/models/gpt_oss/test_gpt_oss_provider.py b/tests/unit_tests/models/gpt_oss/test_gpt_oss_provider.py deleted file mode 100644 index b3e6e4646f..0000000000 --- 
a/tests/unit_tests/models/gpt_oss/test_gpt_oss_provider.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -from megatron.bridge.models.gpt_oss.gpt_oss_provider import ( - GPTOSSProvider, - GPTOSSProvider20B, - GPTOSSProvider120B, -) - - -class TestGptOssProviderDefaults: - """Test default configuration values for GPT-OSS providers.""" - - def test_gpt_oss_base_defaults(self): - # Provide minimal required fields for post-init checks via base class - provider = GPTOSSProvider(num_layers=1, hidden_size=512, num_attention_heads=8) - - # Generic defaults - assert provider.normalization == "RMSNorm" - assert provider.gated_linear_unit is True - assert provider.position_embedding_type == "yarn" - assert provider.add_bias_linear is True - assert provider.share_embeddings_and_output_weights is False - - # DType defaults - assert provider.bf16 is True - assert provider.params_dtype == torch.bfloat16 - - # MoE and window attention flags - assert provider.moe_grouped_gemm is True - assert provider.moe_token_dispatcher_type == "alltoall" - assert provider.softmax_type in ("vanilla", "off-by-one", "learnable") - - def test_gpt_oss_20b_defaults(self): - provider = GPTOSSProvider20B() - - assert provider.num_layers == 24 - assert provider.num_moe_experts == 32 - - def test_gpt_oss_120b_defaults(self): - provider = GPTOSSProvider120B() - - assert provider.num_layers == 36 - assert provider.num_moe_experts == 128 diff --git a/tests/unit_tests/models/kimi/test_kimi_provider.py b/tests/unit_tests/models/kimi/test_kimi_provider.py deleted file mode 100644 index 357a48bec6..0000000000 --- a/tests/unit_tests/models/kimi/test_kimi_provider.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -import torch.nn.functional as F - -from megatron.bridge.models.kimi.kimi_provider import KimiK2Provider - - -class TestKimiK2Provider: - """Test cases for KimiK2Provider class.""" - - def test_kimi_k2_provider_initialization(self): - """Test KimiK2Provider can be initialized with default values.""" - provider = KimiK2Provider() - - # Check core model architecture - assert provider.num_layers == 61 - assert provider.hidden_size == 7168 - assert provider.num_attention_heads == 64 - assert provider.vocab_size == 163840 - - # Check key configuration - assert provider.normalization == "RMSNorm" - assert provider.activation_func == F.silu - assert provider.gated_linear_unit is True - assert provider.bf16 is True - assert provider.params_dtype == torch.bfloat16 - - def test_kimi_k2_moe_configuration(self): - """Test KimiK2Provider MoE-specific configuration.""" - provider = KimiK2Provider() - - # Check key MoE settings - assert provider.num_moe_experts == 384 - assert provider.moe_router_topk == 8 - assert provider.moe_router_score_function == "sigmoid" - assert provider.moe_token_dispatcher_type == "alltoall" - - # Check moe_layer_freq format - assert isinstance(provider.moe_layer_freq, list) - assert len(provider.moe_layer_freq) == 61 - assert provider.moe_layer_freq[0] == 0 # first layer is dense - assert all(freq == 1 for freq in provider.moe_layer_freq[1:]) # rest are MoE - - def test_kimi_k2_mla_configuration(self): - """Test KimiK2Provider MLA (Multi-Latent Attention) configuration.""" - provider = KimiK2Provider() - - # Check key MLA settings - assert provider.multi_latent_attention is True - assert provider.q_lora_rank == 1536 - assert provider.kv_lora_rank == 512 - assert provider.qk_head_dim == 128 - assert provider.v_head_dim == 128 - - def test_kimi_k2_custom_parameters(self): - """Test that KimiK2Provider can be initialized with custom parameters.""" - custom_provider = KimiK2Provider( - num_layers=30, - hidden_size=4096, - tensor_model_parallel_size=2, - pipeline_model_parallel_size=4, - expert_model_parallel_size=16, - sequence_parallel=False, - ) - - # Check custom values - assert custom_provider.num_layers == 30 - assert custom_provider.hidden_size == 4096 - assert custom_provider.tensor_model_parallel_size == 2 - assert custom_provider.pipeline_model_parallel_size == 4 - assert custom_provider.expert_model_parallel_size == 16 - assert custom_provider.sequence_parallel is False - - # Check defaults are still preserved - assert custom_provider.num_moe_experts == 384 - assert custom_provider.multi_latent_attention is True - - def test_kimi_k2_inheritance(self): - """Test that KimiK2Provider properly inherits from required base classes.""" - from megatron.bridge.models.gpt_provider import GPTModelProvider - from megatron.bridge.models.transformer_config import MLATransformerConfig - - provider = KimiK2Provider() - - # Check it's a dataclass - assert hasattr(provider, "__dataclass_fields__") - - # Check inheritance - assert isinstance(provider, GPTModelProvider) - assert isinstance(provider, MLATransformerConfig) - - # Check it has the provide method - assert hasattr(provider, "provide") - assert callable(getattr(provider, "provide")) diff --git a/tests/unit_tests/models/llama/test_llama_provider.py b/tests/unit_tests/models/llama/test_llama_provider.py deleted file mode 100644 index 2fceda8d06..0000000000 --- a/tests/unit_tests/models/llama/test_llama_provider.py +++ /dev/null @@ -1,339 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch.nn.functional as F - -from megatron.bridge.models.llama import ( - Llama2ModelProvider7B, - Llama3ModelProvider, - Llama3ModelProvider8B, - Llama3ModelProvider70B, - Llama4Experts16ModelProvider, - Llama4Experts128ModelProvider, - Llama4ModelProvider, - Llama31ModelProvider, - Llama31ModelProvider8B, - Llama31ModelProvider70B, - Llama31ModelProvider405B, - Llama32ModelProvider1B, - Llama32ModelProvider3B, - LlamaModelProvider, -) - - -class TestLlamaModelProvider: - """Test cases for base LlamaModelProvider class.""" - - def test_llama_model_provider_initialization(self): - """Test LlamaModelProvider can be initialized with default values.""" - provider = LlamaModelProvider( - num_layers=32, - hidden_size=4096, - num_attention_heads=32, - ) - - # Check required transformer config fields - assert provider.num_layers == 32 - assert provider.hidden_size == 4096 - assert provider.num_attention_heads == 32 - - # Check Llama-specific defaults - assert provider.position_embedding_type == "rope" - assert provider.rotary_base == 10000 - assert provider.activation_func is F.silu - assert provider.gated_linear_unit is True - assert provider.normalization == "RMSNorm" - assert provider.add_bias_linear is False - assert provider.share_embeddings_and_output_weights is False - - def test_llama_model_provider_with_custom_rope(self): - """Test LlamaModelProvider with custom RoPE configuration.""" - provider = LlamaModelProvider( - num_layers=32, - hidden_size=4096, - num_attention_heads=32, - rotary_base=500000, - rotary_percent=0.5, - ) - - assert provider.rotary_base == 500000 - assert provider.rotary_percent == 0.5 - - def test_llama_model_provider_ffn_hidden_size(self): - """Test LlamaModelProvider FFN hidden size calculation.""" - provider = LlamaModelProvider( - num_layers=32, - hidden_size=4096, - num_attention_heads=32, - ffn_hidden_size=11008, - ) - - assert provider.ffn_hidden_size == 11008 - - def test_llama_model_provider_group_query_attention(self): - """Test LlamaModelProvider with group query attention.""" - provider = LlamaModelProvider( - num_layers=32, - hidden_size=4096, - num_attention_heads=32, - num_query_groups=8, - ) - - assert provider.num_query_groups == 8 - - -class TestLlama2ModelProvider7B: - """Test cases for Llama2ModelProvider7B class.""" - - def test_llama2_7b_default_configuration(self): - """Test Llama2 7B model has correct default configuration.""" - provider = Llama2ModelProvider7B() - - # Check Llama2 7B specific configuration - assert provider.num_layers == 32 - assert provider.hidden_size == 4096 - assert provider.num_attention_heads == 32 - assert provider.ffn_hidden_size == 11008 - assert provider.normalization == "RMSNorm" - assert provider.rotary_base == 10000 - assert provider.seq_length == 4096 - assert provider.num_query_groups == 32 - - def test_llama2_7b_override_configuration(self): - """Test Llama2 7B model with overridden configuration.""" - provider = Llama2ModelProvider7B( - seq_length=8192, - 
hidden_dropout=0.1, - ) - - # Check overridden values - assert provider.seq_length == 8192 - assert provider.hidden_dropout == 0.1 - - # Check defaults remain - assert provider.num_layers == 32 - assert provider.hidden_size == 4096 - - -class TestLlama3ModelProvider8B: - """Test cases for Llama3ModelProvider8B class.""" - - def test_llama3_8b_default_configuration(self): - """Test Llama3 8B model has correct default configuration.""" - provider = Llama3ModelProvider8B() - - # Check Llama3 8B specific configuration - assert provider.num_layers == 32 - assert provider.hidden_size == 4096 - assert provider.num_attention_heads == 32 - assert provider.ffn_hidden_size == 14336 - assert provider.normalization == "RMSNorm" - assert provider.rotary_base == 500000 - assert provider.seq_length == 8192 - - def test_llama3_8b_group_query_attention(self): - """Test Llama3 8B uses default query groups from base class.""" - provider = Llama3ModelProvider8B() - - # Llama3 8B doesn't set num_query_groups explicitly - # It would use the default from Llama3ModelProvider which is 8 - assert provider.num_attention_heads == 32 - - def test_llama3_8b_override_configuration(self): - """Test Llama3 8B model with overridden configuration.""" - provider = Llama3ModelProvider8B( - seq_length=16384, - hidden_dropout=0.1, - ) - - # Check overridden values - assert provider.seq_length == 16384 - assert provider.hidden_dropout == 0.1 - - # Check critical defaults remain - assert provider.rotary_base == 500000 - - -class TestLlama31ModelProvider70B: - """Test cases for Llama31ModelProvider70B class.""" - - def test_llama31_70b_default_configuration(self): - """Test Llama3.1 70B model has correct default configuration.""" - provider = Llama31ModelProvider70B() - - # Check Llama3.1 70B specific configuration - assert provider.num_layers == 80 - assert provider.hidden_size == 8192 - assert provider.num_attention_heads == 64 - assert provider.ffn_hidden_size == 28672 - assert provider.normalization == "RMSNorm" - assert provider.rotary_base == 500000 - assert provider.seq_length == 131072 # 128k context - - def test_llama31_70b_large_context(self): - """Test Llama3.1 70B supports large context window.""" - provider = Llama31ModelProvider70B() - - # Llama3.1 70B supports 128k context window - assert provider.seq_length == 131072 - - def test_llama31_70b_rope_scaling(self): - """Test Llama3.1 70B RoPE configuration for long context.""" - provider = Llama31ModelProvider70B() - - # Check RoPE base for extended context - assert provider.rotary_base == 500000 - - # Check if rope scaling is configured - if hasattr(provider, "rope_scaling_type"): - assert provider.rope_scaling_type is not None - if hasattr(provider, "rope_scaling_factor"): - assert provider.rope_scaling_factor > 1.0 - - -class TestLlama3QueryGroupsInheritance: - """Test cases to verify that all configs extending from Llama3 have num_query_groups=8.""" - - def test_llama3_base_provider_has_correct_num_query_groups(self): - """Test that Llama3ModelProvider has num_query_groups=8.""" - provider = Llama3ModelProvider( - num_layers=32, - hidden_size=4096, - num_attention_heads=32, - ) - assert provider.num_query_groups == 8 - - def test_llama31_provider_inherits_num_query_groups(self): - """Test that Llama31ModelProvider inherits num_query_groups=8 from Llama3ModelProvider.""" - provider = Llama31ModelProvider( - num_layers=32, - hidden_size=4096, - num_attention_heads=32, - ) - assert provider.num_query_groups == 8 - - def 
test_llama3_8b_provider_inherits_num_query_groups(self): - """Test that Llama3ModelProvider8B inherits num_query_groups=8.""" - provider = Llama3ModelProvider8B() - assert provider.num_query_groups == 8 - - def test_llama3_70b_provider_inherits_num_query_groups(self): - """Test that Llama3ModelProvider70B inherits num_query_groups=8.""" - provider = Llama3ModelProvider70B() - assert provider.num_query_groups == 8 - - def test_llama31_8b_provider_inherits_num_query_groups(self): - """Test that Llama31ModelProvider8B inherits num_query_groups=8.""" - provider = Llama31ModelProvider8B() - assert provider.num_query_groups == 8 - - def test_llama31_70b_provider_inherits_num_query_groups(self): - """Test that Llama31ModelProvider70B inherits num_query_groups=8.""" - provider = Llama31ModelProvider70B() - assert provider.num_query_groups == 8 - - def test_llama31_405b_provider_inherits_num_query_groups(self): - """Test that Llama31ModelProvider405B inherits num_query_groups=8.""" - provider = Llama31ModelProvider405B() - assert provider.num_query_groups == 8 - - def test_llama32_1b_provider_has_correct_num_query_groups(self): - """Test that Llama32ModelProvider1B has num_query_groups=8.""" - provider = Llama32ModelProvider1B() - assert provider.num_query_groups == 8 - - def test_llama32_3b_provider_has_correct_num_query_groups(self): - """Test that Llama32ModelProvider3B has num_query_groups=8.""" - provider = Llama32ModelProvider3B() - assert provider.num_query_groups == 8 - - def test_llama4_provider_inherits_num_query_groups(self): - """Test that Llama4ModelProvider inherits num_query_groups=8 from Llama3ModelProvider.""" - provider = Llama4ModelProvider(num_moe_experts=16) - assert provider.num_query_groups == 8 - - def test_llama4_experts16_provider_inherits_num_query_groups(self): - """Test that Llama4Experts16ModelProvider inherits num_query_groups=8.""" - provider = Llama4Experts16ModelProvider() - assert provider.num_query_groups == 8 - - def test_llama4_experts128_provider_inherits_num_query_groups(self): - """Test that Llama4Experts128ModelProvider inherits num_query_groups=8.""" - provider = Llama4Experts128ModelProvider() - assert provider.num_query_groups == 8 - - -class TestLlamaProviderInheritance: - """Test inheritance relationships between Llama providers.""" - - def test_llama2_inherits_from_base(self): - """Test Llama2 providers inherit from LlamaModelProvider.""" - assert issubclass(Llama2ModelProvider7B, LlamaModelProvider) - - def test_llama3_inherits_from_base(self): - """Test Llama3 providers inherit from LlamaModelProvider.""" - assert issubclass(Llama3ModelProvider8B, LlamaModelProvider) - - def test_llama31_inherits_from_llama3(self): - """Test Llama3.1 providers inherit from Llama3ModelProvider.""" - # This depends on the actual implementation - assert issubclass(Llama31ModelProvider70B, LlamaModelProvider) - - def test_provide_method_inherited(self): - """Test that provide method works correctly in inherited classes.""" - # Test with Llama3 8B - provider = Llama3ModelProvider8B() - - # The provide method should be inherited from GPTModelProvider - assert hasattr(provider, "provide") - assert callable(provider.provide) - - -class TestLlamaProviderEdgeCases: - """Test edge cases and error conditions.""" - - def test_valid_num_query_groups(self): - """Test that valid num_query_groups configuration works.""" - # num_attention_heads must be divisible by num_query_groups - provider = LlamaModelProvider( - num_layers=32, - hidden_size=4096, - num_attention_heads=32, - 
num_query_groups=8, # 32 divisible by 8 - ) - assert provider.num_query_groups == 8 - - def test_vocabulary_size_divisibility(self): - """Test vocabulary size divisibility configuration.""" - provider = LlamaModelProvider( - num_layers=32, - hidden_size=4096, - num_attention_heads=32, - vocab_size=32000, - make_vocab_size_divisible_by=128, - ) - - # The actual vocab size should be adjusted if needed - assert provider.make_vocab_size_divisible_by == 128 - - def test_seq_length_override(self): - """Test sequence length configuration.""" - provider = LlamaModelProvider( - num_layers=32, - hidden_size=4096, - num_attention_heads=32, - seq_length=8192, - ) - - assert provider.seq_length == 8192 diff --git a/tests/unit_tests/models/llama_nemotron/test_llama_nemotron_bridge.py b/tests/unit_tests/models/llama_nemotron/test_llama_nemotron_bridge.py index 6b938e3de4..baba375755 100644 --- a/tests/unit_tests/models/llama_nemotron/test_llama_nemotron_bridge.py +++ b/tests/unit_tests/models/llama_nemotron/test_llama_nemotron_bridge.py @@ -23,10 +23,6 @@ from megatron.bridge.models.hf_pretrained.causal_lm import PreTrainedCausalLM from megatron.bridge.models.llama_nemotron.llama_nemotron_bridge import LlamaNemotronBridge from megatron.bridge.models.llama_nemotron.llama_nemotron_provider import ( - Llama31Nemotron70BProvider, - Llama31NemotronNano8BProvider, - Llama31NemotronUltra253BProvider, - Llama33NemotronSuper49BProvider, LlamaNemotronHeterogeneousProvider, ) @@ -205,110 +201,3 @@ def test_dtype_configuration(self, mock_pretrained_super): assert provider.bf16 is True assert provider.fp16 is False assert provider.params_dtype == torch.bfloat16 - - -class TestLlamaNemotronSpecificProviders: - """Test cases for specific Llama-Nemotron model provider configurations.""" - - def test_nano_8b_config(self): - """Test Llama3.1-Nemotron-Nano-8B provider configuration.""" - provider = Llama31NemotronNano8BProvider() - provider.finalize() - - assert provider.hidden_size == 4096 - assert provider.num_layers == 32 - assert provider.num_attention_heads == 32 - assert provider.num_query_groups == 8 - assert provider.ffn_hidden_size == 14336 - assert provider.kv_channels == 128 - assert provider.seq_length == 131072 - assert provider.rope_scaling_factor == 8.0 - assert provider.gated_linear_unit is True - - def test_70b_config(self): - """Test Llama3.1-Nemotron-70B provider configuration.""" - provider = Llama31Nemotron70BProvider() - provider.finalize() - - assert provider.hidden_size == 8192 - assert provider.num_layers == 80 - assert provider.num_attention_heads == 64 - assert provider.num_query_groups == 8 - assert provider.ffn_hidden_size == 28672 - assert provider.kv_channels == 128 - assert provider.seq_length == 131072 - assert provider.rope_scaling_factor == 8.0 - - def test_super_49b_config(self): - """Test Llama3.3-Nemotron-Super-49B provider configuration.""" - provider = Llama33NemotronSuper49BProvider() - # Provide minimal heterogeneous config JSON to avoid file access - provider.heterogeneous_layers_config_encoded_json = json.dumps( - { - "block_configs": [ - { - "attention": {"n_heads_in_group": 8, "no_op": False}, - "ffn": {"ffn_mult": 2.625, "no_op": False}, - } - ], - "rope_scaling": { - "factor": 8.0, - "low_freq_factor": 1.0, - "high_freq_factor": 4.0, - "original_max_position_embeddings": 8192, - "rope_type": "llama3", - }, - } - ) - provider.finalize() - - assert provider.hidden_size == 8192 - assert provider.num_layers == 80 - assert provider.num_attention_heads == 64 - assert 
provider.kv_channels == 128 - assert provider.seq_length == 131072 - - # The ffn_hidden_size comes from the base Llama31ModelProvider70B - # This is correct since heterogeneous models have varying FFN sizes per layer - assert provider.ffn_hidden_size == 28672 # From base class - - # Should have heterogeneous config processed - assert hasattr(provider, "per_block_parameters") - assert hasattr(provider, "heterogeneous_layers_config_encoded_json") - - def test_ultra_253b_config(self): - """Test Llama3.1-Nemotron-Ultra-253B provider configuration.""" - provider = Llama31NemotronUltra253BProvider() - # Provide minimal heterogeneous config JSON to avoid file access - provider.heterogeneous_layers_config_encoded_json = json.dumps( - { - "block_configs": [ - { - "attention": {"n_heads_in_group": 16, "no_op": False}, - "ffn": {"ffn_mult": 1.0, "no_op": False}, - } - ], - "rope_scaling": { - "factor": 16.0, - "low_freq_factor": 1.0, - "high_freq_factor": 4.0, - "original_max_position_embeddings": 8192, - "rope_type": "llama3", - }, - } - ) - provider.finalize() - - assert provider.hidden_size == 16384 - assert provider.num_layers == 162 - assert provider.num_attention_heads == 128 - assert provider.kv_channels == 128 - assert provider.seq_length == 131072 - - # The ffn_hidden_size comes from the base Llama31ModelProvider405B (53248) - # This is correct since heterogeneous models have varying FFN sizes per layer - assert provider.ffn_hidden_size == 53248 # From base class, not override - - # Should have heterogeneous config processed in finalize - assert hasattr(provider, "per_block_parameters") - assert hasattr(provider, "heterogeneous_layers_config_encoded_json") diff --git a/tests/unit_tests/models/nemotron/test_nemotron_provider.py b/tests/unit_tests/models/nemotron/test_nemotron_provider.py deleted file mode 100644 index 8ab01a3048..0000000000 --- a/tests/unit_tests/models/nemotron/test_nemotron_provider.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest - -from megatron.bridge.models.nemotron.nemotron_provider import ( - Nemotron3ModelProvider4B, - Nemotron3ModelProvider8B, - Nemotron3ModelProvider22B, - Nemotron4ModelProvider15B, - Nemotron4ModelProvider340B, - NemotronModelProvider, - squared_relu, -) - - -@pytest.mark.unit -class TestNemotronModelProvider: - """Test cases for base NemotronModelProvider class.""" - - def test_nemotron_model_provider_initialization(self): - """Test NemotronModelProvider can be initialized with default values.""" - provider = NemotronModelProvider() - - # Check Nemotron-specific defaults - assert provider.normalization == "LayerNorm" - assert provider.activation_func is squared_relu - assert provider.position_embedding_type == "rope" - assert provider.share_embeddings_and_output_weights is False - assert provider.add_bias_linear is False - assert provider.hidden_dropout == 0.0 - assert provider.attention_dropout == 0.0 - assert provider.rotary_percent == 0.5 - assert provider.bias_dropout_add_fusion is False - assert provider.layernorm_zero_centered_gamma is True - assert provider.cross_entropy_loss_fusion is True - - -class TestNemotronSpecificProviders: - """Test cases for specific Nemotron model provider configurations.""" - - def test_nemotron3_4b_config(self): - """Test Nemotron3 4B provider configuration matches HF model specs.""" - provider = Nemotron3ModelProvider4B() - - assert provider.hidden_size == 3072 - assert provider.num_layers == 32 - assert provider.num_attention_heads == 24 - assert provider.num_query_groups == 8 - assert provider.ffn_hidden_size == 9216 - assert provider.kv_channels == 128 - assert provider.seq_length == 4096 - assert provider.init_method_std == 0.0134 - - def test_nemotron3_8b_config(self): - """Test Nemotron3 8B provider configuration matches HF model specs.""" - provider = Nemotron3ModelProvider8B() - - assert provider.hidden_size == 4096 - assert provider.num_layers == 32 - assert provider.num_attention_heads == 32 - assert provider.num_query_groups is None - assert provider.ffn_hidden_size == 16384 - assert provider.kv_channels is None - assert provider.seq_length == 4096 - assert provider.init_method_std == 0.010 - - def test_nemotron3_22b_config(self): - """Test Nemotron3 22B provider configuration.""" - provider = Nemotron3ModelProvider22B() - - assert provider.hidden_size == 6144 - assert provider.num_layers == 40 - assert provider.num_attention_heads == 48 - assert provider.num_query_groups is None - assert provider.ffn_hidden_size == 24576 - assert provider.kv_channels is None - assert provider.seq_length == 4096 - assert provider.init_method_std == 0.008 - - def test_nemotron4_15b_config(self): - """Test Nemotron4 15B provider configuration.""" - provider = Nemotron4ModelProvider15B() - - assert provider.hidden_size == 6144 - assert provider.num_layers == 32 - assert provider.num_attention_heads == 48 - assert provider.num_query_groups == 8 - assert provider.ffn_hidden_size == 24576 - assert provider.kv_channels is None - assert provider.seq_length == 4096 - assert provider.init_method_std == 0.0134 - - def test_nemotron4_340b_config(self): - """Test Nemotron4 340B provider configuration.""" - provider = Nemotron4ModelProvider340B() - - # Should match nvidia/Nemotron-4-340B-Base/Instruct (if available) - assert provider.hidden_size == 18432 - assert provider.num_layers == 96 - assert provider.num_attention_heads == 96 - assert provider.num_query_groups == 8 - assert provider.ffn_hidden_size == 73728 - assert provider.kv_channels is None - 
assert provider.seq_length == 4096 - assert provider.init_method_std == 0.0063 - - def test_all_providers_have_nemotron_defaults(self): - """Test that all specific providers inherit Nemotron-specific defaults.""" - providers = [ - Nemotron3ModelProvider4B(), - Nemotron3ModelProvider8B(), - Nemotron3ModelProvider22B(), - Nemotron4ModelProvider15B(), - Nemotron4ModelProvider340B(), - ] - - for provider in providers: - # Check Nemotron-specific defaults - assert provider.normalization == "LayerNorm" - assert provider.position_embedding_type == "rope" - assert provider.share_embeddings_and_output_weights is False - assert provider.add_bias_linear is False - assert provider.hidden_dropout == 0.0 - assert provider.attention_dropout == 0.0 - assert provider.rotary_percent == 0.5 - assert provider.layernorm_zero_centered_gamma is True - assert provider.cross_entropy_loss_fusion is True diff --git a/tests/unit_tests/models/nemotronh/test_nemotron_h_provider.py b/tests/unit_tests/models/nemotronh/test_nemotron_h_provider.py deleted file mode 100644 index 956dc7f42a..0000000000 --- a/tests/unit_tests/models/nemotronh/test_nemotron_h_provider.py +++ /dev/null @@ -1,462 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -import torch.nn.functional as F - -from megatron.bridge.models.nemotronh.nemotron_h_provider import ( - Nemotron3NanoProvider, - NemotronHModel4BProvider, - NemotronHModel8BProvider, - NemotronHModel47BProvider, - NemotronHModel56BProvider, - NemotronHModelProvider, - NemotronNano9Bv2Provider, - NemotronNano12Bv2Provider, -) - - -class TestNemotronHModelProvider: - """Test cases for base NemotronHModelProvider class.""" - - def test_nemotron_h_model_provider_initialization(self): - """Test NemotronHModelProvider can be initialized with default values.""" - provider = NemotronHModelProvider( - num_layers=52, - hidden_size=4096, - num_attention_heads=32, - ) - - # Check required transformer config fields - assert provider.num_layers == 52 - assert provider.hidden_size == 4096 - assert provider.num_attention_heads == 32 - - # Check Nemotron-H specific defaults - assert provider.seq_length == 8192 - assert provider.mamba_num_groups == 8 - assert provider.mamba_head_dim == 64 - assert provider.num_query_groups == 8 - assert provider.make_vocab_size_divisible_by == 128 - assert provider.masked_softmax_fusion is True - assert provider.apply_query_key_layer_scaling is False - assert provider.persist_layer_norm is True - assert provider.attention_softmax_in_fp32 is False - assert provider.first_last_layers_bf16 is True - assert provider.is_hybrid_model is True - - def test_nemotron_h_custom_activation_function(self): - """Test NemotronHModelProvider with custom activation function.""" - - def custom_activation(x): - return torch.pow(F.relu(x), 2) - - provider = NemotronHModelProvider( - num_layers=52, - hidden_size=4096, - num_attention_heads=32, - activation_func=custom_activation, - ) - - # Test that the activation function is set correctly - test_input = torch.tensor([1.0, -1.0, 2.0]) - expected_output = torch.pow(F.relu(test_input), 2) - actual_output = provider.activation_func(test_input) - - assert torch.allclose(actual_output, expected_output) - - def test_nemotron_h_mamba_configuration(self): - """Test NemotronHModelProvider Mamba-specific configuration.""" - provider = NemotronHModelProvider( - num_layers=52, - hidden_size=4096, - num_attention_heads=32, - mamba_num_groups=16, - mamba_head_dim=128, - ) - - assert provider.mamba_num_groups == 16 - assert provider.mamba_head_dim == 128 - - def test_nemotron_h_moe_default_configuration(self): - """Test NemotronHModelProvider MoE default configuration.""" - provider = NemotronHModelProvider( - num_layers=52, - hidden_size=4096, - num_attention_heads=32, - ) - - # Check MoE default configurations - assert provider.moe_aux_loss_coeff == 0.0001 - assert provider.moe_router_score_function == "sigmoid" - assert provider.moe_router_enable_expert_bias is True - assert provider.moe_router_load_balancing_type == "seq_aux_loss" - assert provider.moe_router_dtype == "fp32" - assert provider.moe_grouped_gemm is True - assert provider.moe_token_dispatcher_type == "alltoall" - assert provider.moe_permute_fusion is True - assert provider.moe_shared_expert_overlap is True - - -class TestNemotronHModel4BProvider: - """Test cases for NemotronHModel4BProvider class.""" - - def test_nemotron_h_4b_default_configuration(self): - """Test Nemotron-H 4B model has correct default configuration.""" - provider = NemotronHModel4BProvider() - - # Check Nemotron-H 4B specific configuration - assert provider.num_layers == 52 - assert provider.hidden_size == 3072 - assert provider.num_attention_heads == 32 - assert provider.mamba_num_heads == 112 - assert 
provider.kv_channels == 128 - assert provider.mamba_state_dim == 128 - assert provider.ffn_hidden_size == 12288 - assert provider.hybrid_override_pattern == "M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-" - assert provider.use_mamba_mem_eff_path is False - - def test_nemotron_h_4b_override_configuration(self): - """Test Nemotron-H 4B model with overridden configuration.""" - provider = NemotronHModel4BProvider( - seq_length=16384, - hidden_dropout=0.1, - use_mamba_mem_eff_path=True, - ) - - # Check overridden values - assert provider.seq_length == 16384 - assert provider.hidden_dropout == 0.1 - assert provider.use_mamba_mem_eff_path is True - - # Check defaults remain - assert provider.num_layers == 52 - assert provider.hidden_size == 3072 - assert provider.mamba_num_heads == 112 - - -class TestNemotronHModel8BProvider: - """Test cases for NemotronHModel8BProvider class.""" - - def test_nemotron_h_8b_default_configuration(self): - """Test Nemotron-H 8B model has correct default configuration.""" - provider = NemotronHModel8BProvider() - - # Check Nemotron-H 8B specific configuration - assert provider.num_layers == 52 - assert provider.hidden_size == 4096 - assert provider.num_attention_heads == 32 - assert provider.mamba_state_dim == 128 - assert provider.ffn_hidden_size == 21504 - assert provider.hybrid_override_pattern == "M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-" - - def test_nemotron_h_8b_override_configuration(self): - """Test Nemotron-H 8B model with overridden configuration.""" - provider = NemotronHModel8BProvider( - seq_length=32768, - hidden_dropout=0.1, - ) - - # Check overridden values - assert provider.seq_length == 32768 - assert provider.hidden_dropout == 0.1 - - # Check critical defaults remain - assert provider.hidden_size == 4096 - assert provider.ffn_hidden_size == 21504 - - -class TestNemotronHModel47BProvider: - """Test cases for NemotronHModel47BProvider class.""" - - def test_nemotron_h_47b_default_configuration(self): - """Test Nemotron-H 47B model has correct default configuration.""" - provider = NemotronHModel47BProvider() - - # Check Nemotron-H 47B specific configuration - assert provider.num_layers == 98 - assert provider.hidden_size == 8192 - assert provider.num_attention_heads == 64 - assert provider.mamba_state_dim == 256 - assert provider.ffn_hidden_size == 30720 - assert ( - "M-M-M-M-M-M-M-M-M*-M-M-M-M-M-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-M-M---MM---M-M*-M-M-M-M-M-" - in provider.hybrid_override_pattern - ) - - def test_nemotron_h_47b_override_configuration(self): - """Test Nemotron-H 47B model with overridden configuration.""" - provider = NemotronHModel47BProvider( - seq_length=65536, - hidden_dropout=0.1, - ) - - # Check overridden values - assert provider.seq_length == 65536 - assert provider.hidden_dropout == 0.1 - - # Check critical defaults remain - assert provider.num_layers == 98 - assert provider.hidden_size == 8192 - - -class TestNemotronHModel56BProvider: - """Test cases for NemotronHModel56BProvider class.""" - - def test_nemotron_h_56b_default_configuration(self): - """Test Nemotron-H 56B model has correct default configuration.""" - provider = NemotronHModel56BProvider() - - # Check Nemotron-H 56B specific configuration - assert provider.num_layers == 118 - assert provider.hidden_size == 8192 - assert provider.num_attention_heads == 64 - assert provider.mamba_state_dim == 256 - assert provider.ffn_hidden_size == 32768 - assert ( - 
"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-" - in provider.hybrid_override_pattern - ) - - def test_nemotron_h_56b_override_configuration(self): - """Test Nemotron-H 56B model with overridden configuration.""" - provider = NemotronHModel56BProvider( - seq_length=131072, # 128k context - hidden_dropout=0.1, - ) - - # Check overridden values - assert provider.seq_length == 131072 - assert provider.hidden_dropout == 0.1 - - # Check critical defaults remain - assert provider.num_layers == 118 - assert provider.hidden_size == 8192 - - -class TestNemotronHProviderInheritance: - """Test inheritance relationships between Nemotron-H providers.""" - - def test_nemotron_h_4b_inherits_from_base(self): - """Test Nemotron-H 4B provider inherits from NemotronHModelProvider.""" - assert issubclass(NemotronHModel4BProvider, NemotronHModelProvider) - - def test_nemotron_h_8b_inherits_from_base(self): - """Test Nemotron-H 8B provider inherits from NemotronHModelProvider.""" - assert issubclass(NemotronHModel8BProvider, NemotronHModelProvider) - - def test_nemotron_h_47b_inherits_from_base(self): - """Test Nemotron-H 47B provider inherits from NemotronHModelProvider.""" - assert issubclass(NemotronHModel47BProvider, NemotronHModelProvider) - - def test_nemotron_h_56b_inherits_from_base(self): - """Test Nemotron-H 56B provider inherits from NemotronHModelProvider.""" - assert issubclass(NemotronHModel56BProvider, NemotronHModelProvider) - - def test_nemotron_nano_9b_v2_inherits_from_base(self): - """Test Nemotron Nano v2 9B provider inherits from NemotronHModelProvider.""" - assert issubclass(NemotronNano9Bv2Provider, NemotronHModelProvider) - - def test_nemotron_nano_12b_v2_inherits_from_base(self): - """Test Nemotron Nano v2 12B provider inherits from NemotronHModelProvider.""" - assert issubclass(NemotronNano12Bv2Provider, NemotronHModelProvider) - - def test_provide_method_inherited(self): - """Test that provide method works correctly in inherited classes.""" - # Test with Nemotron-H 4B - providers = [ - NemotronHModel4BProvider(), - NemotronHModel8BProvider(), - NemotronHModel47BProvider(), - NemotronHModel56BProvider(), - NemotronNano9Bv2Provider(), - NemotronNano12Bv2Provider(), - Nemotron3NanoProvider(), - ] - - for provider in providers: - # The provide method should be inherited from MambaModelProvider - assert hasattr(provider, "provide") - assert callable(provider.provide) - - -class TestNemotronNano9Bv2Provider: - """Test cases for NemotronNano9Bv2Provider class.""" - - def test_nemotron_nano_9b_v2_default_configuration(self): - """Test Nemotron Nano v2 9B model has correct default configuration.""" - provider = NemotronNano9Bv2Provider() - provider.finalize() - - # Check Nemotron Nano v2 9B specific configuration - assert provider.num_layers == 56 - assert provider.hidden_size == 4480 - assert provider.num_attention_heads == 40 - assert provider.mamba_num_heads == 128 - assert provider.kv_channels == 128 - assert provider.mamba_state_dim == 128 - assert provider.ffn_hidden_size == 15680 - assert provider.mamba_head_dim == 80 - assert provider.hybrid_override_pattern == "M-M-M-MM-M-M-M*-M-M-M*-M-M-M-M*-M-M-M-M*-M-MM-M-M-M-M-M-" - - def test_nemotron_nano_9b_v2_override_configuration(self): - """Test Nemotron Nano v2 9B model with overridden configuration.""" - provider = NemotronNano9Bv2Provider( - seq_length=16384, - hidden_dropout=0.1, - mamba_head_dim=96, - ) - - # Check overridden values - assert provider.seq_length == 16384 - 
assert provider.hidden_dropout == 0.1 - assert provider.mamba_head_dim == 96 - - # Check critical defaults remain - assert provider.num_layers == 56 - assert provider.hidden_size == 4480 - assert provider.mamba_num_heads == 128 - assert provider.ffn_hidden_size == 15680 - - -class TestNemotronNano12Bv2Provider: - """Test cases for NemotronNano12Bv2Provider class.""" - - def test_nemotron_nano_12b_v2_default_configuration(self): - """Test Nemotron Nano v2 12B model has correct default configuration.""" - provider = NemotronNano12Bv2Provider() - provider.finalize() - - # Check Nemotron Nano v2 12B specific configuration - assert provider.num_layers == 62 - assert provider.hidden_size == 5120 - assert provider.num_attention_heads == 40 - assert provider.mamba_num_heads == 128 - assert provider.kv_channels == 128 - assert provider.mamba_state_dim == 128 - assert provider.ffn_hidden_size == 20480 - assert provider.mamba_head_dim == 80 - assert provider.hybrid_override_pattern == "M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M-" - - def test_nemotron_nano_12b_v2_override_configuration(self): - """Test Nemotron Nano v2 12B model with overridden configuration.""" - provider = NemotronNano12Bv2Provider( - seq_length=32768, - hidden_dropout=0.1, - mamba_head_dim=96, - ) - - # Check overridden values - assert provider.seq_length == 32768 - assert provider.hidden_dropout == 0.1 - assert provider.mamba_head_dim == 96 - - # Check critical defaults remain - assert provider.num_layers == 62 - assert provider.hidden_size == 5120 - assert provider.mamba_num_heads == 128 - assert provider.ffn_hidden_size == 20480 - - -class TestNemotron3NanoProvider: - """Test cases for Nemotron3NanoProvider class.""" - - def test_nemotron_3_nano_default_configuration(self): - """Test Nemotron 3 Nano model has correct default configuration.""" - provider = Nemotron3NanoProvider() - - # Check Nemotron 3 Nano specific configuration - assert provider.seq_length == 262144 - assert provider.num_layers == 52 - assert provider.hidden_size == 2688 - assert provider.num_attention_heads == 32 - assert provider.num_query_groups == 2 - assert provider.mamba_num_heads == 64 - assert provider.kv_channels == 128 - assert provider.mamba_state_dim == 128 - assert provider.ffn_hidden_size == 1856 - assert provider.mamba_head_dim == 64 - assert provider.hybrid_override_pattern == "MEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEMEM*EMEMEMEME" - - def test_nemotron_3_nano_moe_configuration(self): - """Test Nemotron 3 Nano model MoE-specific configuration.""" - provider = Nemotron3NanoProvider() - - # Check MoE-specific configuration - assert provider.num_moe_experts == 128 - assert provider.moe_ffn_hidden_size == 1856 - assert provider.moe_shared_expert_intermediate_size == 3712 # 1856 * 2 shared expert - assert provider.moe_router_topk == 6 - assert provider.moe_router_topk_scaling_factor == 2.5 - assert provider.moe_router_num_groups == 1 - assert provider.moe_router_group_topk == 1 - - def test_nemotron_3_nano_override_configuration(self): - """Test Nemotron 3 Nano model with overridden configuration.""" - provider = Nemotron3NanoProvider( - seq_length=16384, - hidden_dropout=0.1, - num_moe_experts=64, - ) - - # Check overridden values - assert provider.seq_length == 16384 - assert provider.hidden_dropout == 0.1 - assert provider.num_moe_experts == 64 - - # Check critical defaults remain - assert provider.num_layers == 52 - assert provider.hidden_size == 2688 - assert provider.mamba_num_heads == 64 - - def 
test_nemotron_3_nano_inherits_from_base(self): - """Test Nemotron 3 Nano provider inherits from NemotronHModelProvider.""" - assert issubclass(Nemotron3NanoProvider, NemotronHModelProvider) - - def test_nemotron_3_nano_inherits_moe_defaults(self): - """Test Nemotron 3 Nano inherits MoE defaults from base class.""" - provider = Nemotron3NanoProvider() - - # Check inherited MoE defaults from NemotronHModelProvider - assert provider.moe_aux_loss_coeff == 0.0001 - assert provider.moe_router_score_function == "sigmoid" - assert provider.moe_router_enable_expert_bias is True - assert provider.moe_router_load_balancing_type == "seq_aux_loss" - assert provider.moe_router_dtype == "fp32" - assert provider.moe_grouped_gemm is True - assert provider.moe_token_dispatcher_type == "alltoall" - assert provider.moe_permute_fusion is True - assert provider.moe_shared_expert_overlap is True - - -class TestHybridPatterns: - """Test hybrid override patterns of Nemotron-H providers.""" - - def test_hybrid_patterns_contain_mamba_and_attention(self): - """Test that all hybrid patterns contain both Mamba and Attention layers.""" - providers = [ - NemotronHModel4BProvider(), - NemotronHModel8BProvider(), - NemotronHModel47BProvider(), - NemotronHModel56BProvider(), - NemotronNano9Bv2Provider(), - NemotronNano12Bv2Provider(), - Nemotron3NanoProvider(), - ] - - for provider in providers: - pattern = provider.hybrid_override_pattern - assert "M" in pattern # Mamba layers - assert "*" in pattern # Attention layers - assert len(pattern) > 0 diff --git a/tests/unit_tests/models/test_models_imports.py b/tests/unit_tests/models/test_models_imports.py index 9158b65012..432765ab53 100644 --- a/tests/unit_tests/models/test_models_imports.py +++ b/tests/unit_tests/models/test_models_imports.py @@ -32,19 +32,6 @@ def test_import_t5_provider(self): # Should be the same class assert T5ModelProvider is DirectImport - def test_import_llama_providers(self): - """Test importing all Llama model providers.""" - from megatron.bridge.models import ( - Llama2ModelProvider7B, - Llama3ModelProvider8B, - LlamaModelProvider, - ) - - # Verify all imports are successful and are classes - assert isinstance(LlamaModelProvider, type) - assert isinstance(Llama2ModelProvider7B, type) - assert isinstance(Llama3ModelProvider8B, type) - def test_models_package_all_exports(self): """Test that __all__ exports match available imports.""" import megatron.bridge.models as models @@ -71,21 +58,17 @@ def test_backwards_compatibility_imports(self): def test_model_provider(self): """Test that model providers inherit from ModelProviderMixin.""" - from megatron.bridge.models import GPTModelProvider, LlamaModelProvider, T5ModelProvider + from megatron.bridge.models import GPTModelProvider, T5ModelProvider from megatron.bridge.models.model_provider import ModelProviderMixin - # All providers should inherit from ModelProviderMixin assert issubclass(GPTModelProvider, ModelProviderMixin) assert issubclass(T5ModelProvider, ModelProviderMixin) - assert issubclass(LlamaModelProvider, ModelProviderMixin) def test_transformer_config_inheritance(self): """Test that model providers inherit from TransformerConfig.""" from megatron.core.transformer.transformer_config import TransformerConfig - from megatron.bridge.models import GPTModelProvider, LlamaModelProvider, T5ModelProvider + from megatron.bridge.models import GPTModelProvider, T5ModelProvider - # All providers should inherit from TransformerConfig assert issubclass(GPTModelProvider, TransformerConfig) assert 
issubclass(T5ModelProvider, TransformerConfig) - assert issubclass(LlamaModelProvider, TransformerConfig) diff --git a/tests/unit_tests/recipes/kimi/test_kimi_k2.py b/tests/unit_tests/recipes/kimi/test_kimi_k2.py index 007a0d4de5..07d53c53a9 100644 --- a/tests/unit_tests/recipes/kimi/test_kimi_k2.py +++ b/tests/unit_tests/recipes/kimi/test_kimi_k2.py @@ -15,7 +15,7 @@ import pytest import torch -from megatron.bridge.models.kimi import KimiK2Provider +from megatron.bridge.models.mla_provider import MLAModelProvider from megatron.bridge.recipes.kimi.kimi_k2 import _get_kimi_k2_pipeline_layout, kimi_k2_pretrain_config from megatron.bridge.training.config import ConfigContainer from megatron.bridge.training.mixed_precision import MixedPrecisionConfig @@ -56,7 +56,7 @@ def test_pretrain_config_basic_structure(self): # Check it returns a ConfigContainer with all required components assert isinstance(cfg, ConfigContainer) - assert isinstance(cfg.model, KimiK2Provider) + assert isinstance(cfg.model, MLAModelProvider) assert cfg.train is not None assert cfg.optimizer is not None assert cfg.scheduler is not None diff --git a/tests/unit_tests/recipes/nemotronh/test_nemotron_3_nano.py b/tests/unit_tests/recipes/nemotronh/test_nemotron_3_nano.py index f0f2735f7f..aeff5016e2 100644 --- a/tests/unit_tests/recipes/nemotronh/test_nemotron_3_nano.py +++ b/tests/unit_tests/recipes/nemotronh/test_nemotron_3_nano.py @@ -28,7 +28,7 @@ import pytest -from megatron.bridge.models.nemotronh import Nemotron3NanoProvider +from megatron.bridge.models.mamba.mamba_provider import MambaModelProvider from megatron.bridge.recipes.nemotronh.nemotron_3_nano import ( nemotron_3_nano_peft_config, nemotron_3_nano_pretrain_config, @@ -51,7 +51,7 @@ def test_pretrain_config_default_parameters(self): config = nemotron_3_nano_pretrain_config() assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Nemotron3NanoProvider) + assert isinstance(config.model, MambaModelProvider) # Check model configuration defaults assert config.model.tensor_model_parallel_size == 4 @@ -142,7 +142,7 @@ def test_sft_config_default_parameters(self): config = nemotron_3_nano_sft_config() assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Nemotron3NanoProvider) + assert isinstance(config.model, MambaModelProvider) # Check default parallelism for SFT assert config.model.tensor_model_parallel_size == 1 @@ -254,7 +254,7 @@ def test_peft_config_default_lora(self): config = nemotron_3_nano_peft_config() assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Nemotron3NanoProvider) + assert isinstance(config.model, MambaModelProvider) # Check default parallelism for LoRA/DoRA assert config.model.tensor_model_parallel_size == 1 @@ -388,7 +388,7 @@ def test_config_container_structure(self, recipe_fn): config = recipe_fn() assert isinstance(config, ConfigContainer) - assert isinstance(config.model, Nemotron3NanoProvider) + assert isinstance(config.model, MambaModelProvider) # Check required sections exist assert config.train is not None @@ -433,7 +433,7 @@ def test_moe_model_configuration(self, recipe_fn): # All configs use parameterless API (peft_config has optional peft_scheme) config = recipe_fn() - # Check MoE settings from Nemotron3NanoProvider + # Check MoE settings from MambaModelProvider assert config.model.num_moe_experts == 128 assert config.model.moe_ffn_hidden_size == 1856 assert config.model.moe_shared_expert_intermediate_size == 3712 diff --git 
a/tests/unit_tests/recipes/nemotronh/test_nemotron_nano_v2.py b/tests/unit_tests/recipes/nemotronh/test_nemotron_nano_v2.py index 2e492b93b0..0e14c5fe54 100644 --- a/tests/unit_tests/recipes/nemotronh/test_nemotron_nano_v2.py +++ b/tests/unit_tests/recipes/nemotronh/test_nemotron_nano_v2.py @@ -21,10 +21,7 @@ import pytest -from megatron.bridge.models.nemotronh import ( - NemotronNanoModelProvider9Bv2, - NemotronNanoModelProvider12Bv2, -) +from megatron.bridge.models.mamba.mamba_provider import MambaModelProvider from megatron.bridge.recipes.nemotronh import ( nemotron_nano_9b_v2_pretrain_config, nemotron_nano_12b_v2_pretrain_config, @@ -41,7 +38,7 @@ def test_pretrain_config_default_parameters(self): config = nemotron_nano_9b_v2_pretrain_config() assert isinstance(config, ConfigContainer) - assert isinstance(config.model, NemotronNanoModelProvider9Bv2) + assert isinstance(config.model, MambaModelProvider) # Check model configuration defaults assert config.model.tensor_model_parallel_size == 2 @@ -78,7 +75,7 @@ def test_pretrain_config_default_parameters(self): config = nemotron_nano_12b_v2_pretrain_config() assert isinstance(config, ConfigContainer) - assert isinstance(config.model, NemotronNanoModelProvider12Bv2) + assert isinstance(config.model, MambaModelProvider) # Check model configuration defaults assert config.model.tensor_model_parallel_size == 4 @@ -106,8 +103,8 @@ class TestNemotronNanoV2Common: @pytest.mark.parametrize( "recipe_fn,provider_cls", [ - (nemotron_nano_9b_v2_pretrain_config, NemotronNanoModelProvider9Bv2), - (nemotron_nano_12b_v2_pretrain_config, NemotronNanoModelProvider12Bv2), + (nemotron_nano_9b_v2_pretrain_config, MambaModelProvider), + (nemotron_nano_12b_v2_pretrain_config, MambaModelProvider), ], ) def test_config_container_structure(self, recipe_fn, provider_cls): diff --git a/tests/unit_tests/recipes/nemotronh/test_nemotronh.py b/tests/unit_tests/recipes/nemotronh/test_nemotronh.py index 44aa05af6e..efb2505933 100644 --- a/tests/unit_tests/recipes/nemotronh/test_nemotronh.py +++ b/tests/unit_tests/recipes/nemotronh/test_nemotronh.py @@ -21,12 +21,7 @@ import pytest -from megatron.bridge.models.nemotronh import ( - NemotronHModelProvider4B, - NemotronHModelProvider8B, - NemotronHModelProvider47B, - NemotronHModelProvider56B, -) +from megatron.bridge.models.mamba.mamba_provider import MambaModelProvider from megatron.bridge.recipes.nemotronh import ( nemotronh_4b_pretrain_config, nemotronh_8b_pretrain_config, @@ -45,7 +40,7 @@ def test_pretrain_config_default_parameters(self): config = nemotronh_4b_pretrain_config() assert isinstance(config, ConfigContainer) - assert isinstance(config.model, NemotronHModelProvider4B) + assert isinstance(config.model, MambaModelProvider) # Check model configuration defaults assert config.model.tensor_model_parallel_size == 1 @@ -80,7 +75,7 @@ def test_pretrain_config_default_parameters(self): config = nemotronh_8b_pretrain_config() assert isinstance(config, ConfigContainer) - assert isinstance(config.model, NemotronHModelProvider8B) + assert isinstance(config.model, MambaModelProvider) # Check model configuration defaults assert config.model.tensor_model_parallel_size == 2 @@ -106,7 +101,7 @@ def test_pretrain_config_default_parameters(self): config = nemotronh_47b_pretrain_config() assert isinstance(config, ConfigContainer) - assert isinstance(config.model, NemotronHModelProvider47B) + assert isinstance(config.model, MambaModelProvider) # Check model configuration defaults assert config.model.tensor_model_parallel_size == 8 @@ 
-138,7 +133,7 @@ def test_pretrain_config_default_parameters(self): config = nemotronh_56b_pretrain_config() assert isinstance(config, ConfigContainer) - assert isinstance(config.model, NemotronHModelProvider56B) + assert isinstance(config.model, MambaModelProvider) # Check model configuration defaults assert config.model.tensor_model_parallel_size == 8 @@ -168,10 +163,10 @@ class TestNemotronHCommon: @pytest.mark.parametrize( "recipe_fn,provider_cls", [ - (nemotronh_4b_pretrain_config, NemotronHModelProvider4B), - (nemotronh_8b_pretrain_config, NemotronHModelProvider8B), - (nemotronh_47b_pretrain_config, NemotronHModelProvider47B), - (nemotronh_56b_pretrain_config, NemotronHModelProvider56B), + (nemotronh_4b_pretrain_config, MambaModelProvider), + (nemotronh_8b_pretrain_config, MambaModelProvider), + (nemotronh_47b_pretrain_config, MambaModelProvider), + (nemotronh_56b_pretrain_config, MambaModelProvider), ], ) def test_config_container_structure(self, recipe_fn, provider_cls): diff --git a/tests/unit_tests/recipes/test_glm45_recipes.py b/tests/unit_tests/recipes/test_glm45_recipes.py index 6593900278..d8177ecace 100644 --- a/tests/unit_tests/recipes/test_glm45_recipes.py +++ b/tests/unit_tests/recipes/test_glm45_recipes.py @@ -135,26 +135,14 @@ def _assert_basic_config(cfg): @pytest.mark.parametrize("recipe_func", _GLM45_RECIPE_FUNCS) def test_each_glm45_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): """Test that each GLM 4.5 recipe function builds a valid configuration.""" - # Monkeypatch the provider classes to return fake model configs - from megatron.bridge.models.glm import glm45_provider - - # Create a fake provider class that returns a fake model config - class FakeProvider(_FakeModelCfg): - def __init__(self, *args, **kwargs): - super().__init__() - - # Monkeypatch all provider classes - monkeypatch.setattr(glm45_provider, "GLMMoEModelProvider", FakeProvider) - monkeypatch.setattr(glm45_provider, "GLM45ModelProvider355B", FakeProvider) - monkeypatch.setattr(glm45_provider, "GLM45AirModelProvider106B", FakeProvider) + # Monkeypatch AutoBridge to return fake model configs (avoids HF I/O) + module_name = recipe_func.__module__ + mod = importlib.import_module(module_name) + monkeypatch.setattr(mod, "AutoBridge", _FakeBridge) - # For SFT/PEFT recipes, also monkeypatch AutoBridge and AutoTokenizer + # For SFT/PEFT recipes, also monkeypatch AutoTokenizer is_sft_or_peft = "sft" in recipe_func.__name__.lower() or "peft" in recipe_func.__name__.lower() if is_sft_or_peft: - module_name = recipe_func.__module__ - mod = importlib.import_module(module_name) - monkeypatch.setattr(mod, "AutoBridge", _FakeBridge) - # Mock AutoTokenizer to avoid HF I/O import transformers diff --git a/tests/unit_tests/recipes/test_run_plugins.py b/tests/unit_tests/recipes/test_run_plugins.py index ca3c7034b0..c4db1c7867 100644 --- a/tests/unit_tests/recipes/test_run_plugins.py +++ b/tests/unit_tests/recipes/test_run_plugins.py @@ -17,6 +17,7 @@ import pytest import torch +import torch.nn.functional as F try: @@ -48,7 +49,7 @@ def create_test_config(**kwargs): from megatron.core.distributed import DistributedDataParallelConfig from megatron.core.optimizer import OptimizerConfig - from megatron.bridge.models.llama import Llama3ModelProvider8B + from megatron.bridge.models.gpt_provider import GPTModelProvider from megatron.bridge.training.config import ( CheckpointConfig, ConfigContainer, @@ -78,8 +79,31 @@ def create_test_config(**kwargs): min_lr = kwargs.pop("min_lr", 1e-5) # Create 
model config with apply_rope_fusion=False - model_cfg = Llama3ModelProvider8B( + model_cfg = GPTModelProvider( + normalization="RMSNorm", + activation_func=F.silu, + gated_linear_unit=True, + position_embedding_type="rope", + add_bias_linear=False, + attention_dropout=0.0, + hidden_dropout=0.0, + share_embeddings_and_output_weights=False, + bias_activation_fusion=True, + masked_softmax_fusion=True, + persist_layer_norm=True, + bias_dropout_fusion=True, apply_rope_fusion=False, # Disable to avoid TE/Apex requirement + num_query_groups=8, + init_method_std=0.01, + layernorm_epsilon=1e-05, + rotary_percent=1.0, + rotary_base=500_000, + seq_length=8192, + num_layers=32, + hidden_size=4096, + ffn_hidden_size=14336, + num_attention_heads=32, + cross_entropy_fusion_impl="te", tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=pipeline_model_parallel_size, pipeline_dtype=pipeline_dtype, diff --git a/tests/unit_tests/training/test_config.py b/tests/unit_tests/training/test_config.py index f230258334..8686152d43 100644 --- a/tests/unit_tests/training/test_config.py +++ b/tests/unit_tests/training/test_config.py @@ -19,8 +19,8 @@ import torch from megatron.core.transformer.enums import CudaGraphScope -from megatron.bridge.models.deepseek.deepseek_provider import DeepSeekModelProvider from megatron.bridge.models.gpt_provider import GPTModelProvider +from megatron.bridge.models.mla_provider import MLAModelProvider from megatron.bridge.models.t5_provider import T5ModelProvider from megatron.bridge.training.comm_overlap import CommOverlapConfig from megatron.bridge.training.config import ( @@ -76,8 +76,8 @@ def create_test_gpt_config(**kwargs: Any) -> GPTModelProvider: return GPTModelProvider(**defaults) -def create_test_deepseek_config(**kwargs: Any) -> DeepSeekModelProvider: - """Creates an instance of DeepSeekModelProvider for testing.""" +def create_test_deepseek_config(**kwargs: Any) -> MLAModelProvider: + """Creates an instance of MLAModelProvider for testing.""" defaults = { "num_layers": 1, "hidden_size": 128, @@ -86,7 +86,7 @@ def create_test_deepseek_config(**kwargs: Any) -> DeepSeekModelProvider: "apply_rope_fusion": False, } defaults.update(kwargs) - return DeepSeekModelProvider(**defaults) + return MLAModelProvider(**defaults) def create_test_t5_config(**kwargs: Any) -> T5ModelProvider: diff --git a/tests/unit_tests/training/test_log_non_default_values.py b/tests/unit_tests/training/test_log_non_default_values.py index f2649e289d..ba65f51133 100644 --- a/tests/unit_tests/training/test_log_non_default_values.py +++ b/tests/unit_tests/training/test_log_non_default_values.py @@ -22,8 +22,8 @@ from megatron.core.optimizer.optimizer_config import OptimizerConfig as MCoreOptimizerConfig from megatron.core.transformer.transformer_config import TransformerConfig as MCoreTransformerConfig -from megatron.bridge.models.deepseek.deepseek_provider import DeepSeekModelProvider from megatron.bridge.models.gpt_provider import GPTModelProvider +from megatron.bridge.models.mla_provider import MLAModelProvider from megatron.bridge.training.config import ( CheckpointConfig, ConfigContainer, @@ -56,12 +56,12 @@ def test_gpt_provider_returns_transformer_config(self): assert parent is MCoreTransformerConfig def test_deepseek_provider_returns_mla_transformer_config(self): - """DeepSeekModelProvider should return MCoreMLATransformerConfig as parent.""" + """MLAModelProvider should return MCoreMLATransformerConfig as parent.""" from megatron.core.transformer.transformer_config import 
( MLATransformerConfig as MCoreMLATransformerConfig, ) - config = DeepSeekModelProvider( + config = MLAModelProvider( num_layers=2, hidden_size=128, num_attention_heads=4, @@ -298,7 +298,7 @@ def test_logs_other_config_values(self, mock_print_rank_0): @patch("megatron.bridge.training.config.print_rank_0") def test_handles_deepseek_model_correctly(self, mock_print_rank_0): """Should use MLATransformerConfig for DeepSeek models.""" - deepseek_model = DeepSeekModelProvider( + deepseek_model = MLAModelProvider( num_layers=2, hidden_size=128, num_attention_heads=4, From 0bca980780d2e262d07556541f2f6f49cc10ad4f Mon Sep 17 00:00:00 2001 From: yaoyu-33 Date: Sat, 28 Feb 2026 11:58:41 -0700 Subject: [PATCH 2/4] fix: resolve import errors from removed model providers - Remove NemotronNano12Bv2Provider from nemotron_vl/__init__.py (was a deprecated alias from deleted nemotron_h_provider.py) - Remove invalid max_position_embeddings kwarg from kimi and moonlight recipes (not a field on MLAModelProvider) - Update moonlight test to monkeypatch MLAModelProvider instead of deleted MoonlightModelProvider16B Signed-off-by: yaoyu-33 Made-with: Cursor --- .../bridge/models/nemotron_vl/__init__.py | 2 - .../models/nemotron_vl/nemotron_vl_bridge.py | 2 +- src/megatron/bridge/recipes/kimi/kimi_k2.py | 114 +++--------------- .../bridge/recipes/moonlight/moonlight_16b.py | 2 - tests/unit_tests/recipes/kimi/test_kimi_k2.py | 36 +++++- .../recipes/test_moonlight_recipes.py | 18 +-- 6 files changed, 61 insertions(+), 113 deletions(-) diff --git a/src/megatron/bridge/models/nemotron_vl/__init__.py b/src/megatron/bridge/models/nemotron_vl/__init__.py index 5c16da4dc0..2cd3fb4e63 100644 --- a/src/megatron/bridge/models/nemotron_vl/__init__.py +++ b/src/megatron/bridge/models/nemotron_vl/__init__.py @@ -3,7 +3,6 @@ from megatron.bridge.models.nemotron_vl.modeling_nemotron_vl import NemotronVLModel from megatron.bridge.models.nemotron_vl.nemotron_vl_bridge import NemotronVLBridge from megatron.bridge.models.nemotron_vl.nemotron_vl_provider import ( - NemotronNano12Bv2Provider, NemotronNano12Bv2VLModelProvider, ) @@ -11,6 +10,5 @@ __all__ = [ "NemotronVLModel", "NemotronVLBridge", - "NemotronNano12Bv2Provider", "NemotronNano12Bv2VLModelProvider", ] diff --git a/src/megatron/bridge/models/nemotron_vl/nemotron_vl_bridge.py b/src/megatron/bridge/models/nemotron_vl/nemotron_vl_bridge.py index c750f67dd4..7246de5dc7 100644 --- a/src/megatron/bridge/models/nemotron_vl/nemotron_vl_bridge.py +++ b/src/megatron/bridge/models/nemotron_vl/nemotron_vl_bridge.py @@ -55,7 +55,7 @@ def provider_bridge(self, hf_pretrained: PreTrainedVLM) -> NemotronNano12Bv2VLMo provider = NemotronNano12Bv2VLModelProvider(**provider_kwargs) # Nemotron VL-specific settings - # Note: Most defaults come from the provider class hierarchy (NemotronNano12Bv2Provider) + # Note: Most defaults come from the provider class hierarchy (NemotronNano12Bv2VLModelProvider) provider.scatter_embedding_sequence_parallel = False provider.attention_softmax_in_fp32 = True diff --git a/src/megatron/bridge/recipes/kimi/kimi_k2.py b/src/megatron/bridge/recipes/kimi/kimi_k2.py index a8695b306c..21be245dfd 100644 --- a/src/megatron/bridge/recipes/kimi/kimi_k2.py +++ b/src/megatron/bridge/recipes/kimi/kimi_k2.py @@ -12,21 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from functools import partial - import torch -import torch.nn.functional as F -from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec - -from megatron.bridge.models.mla_provider import MLAModelProvider - - -try: - import transformer_engine # noqa: F401 - HAVE_TE = True -except (ImportError, ModuleNotFoundError): - HAVE_TE = False +from megatron.bridge import AutoBridge from megatron.bridge.recipes.common import _pretrain_common from megatron.bridge.recipes.utils.optimizer_utils import distributed_muon_with_cosine_annealing from megatron.bridge.training.comm_overlap import CommOverlapConfig @@ -66,90 +54,22 @@ def kimi_k2_pretrain_config() -> ConfigContainer: """ cfg = _pretrain_common() - # Model config - uses MLAModelProvider with Kimi-K2 architecture - cfg.model = MLAModelProvider( - # Architecture - transformer_layer_spec=partial(get_gpt_decoder_block_spec, use_transformer_engine=HAVE_TE), - num_layers=61, - hidden_size=7168, - ffn_hidden_size=18432, - num_moe_experts=384, - moe_ffn_hidden_size=2048, - moe_shared_expert_intermediate_size=2048, - moe_layer_freq=[0] + [1] * 60, - normalization="RMSNorm", - activation_func=F.silu, - gated_linear_unit=True, - position_embedding_type="rope", - add_bias_linear=False, - share_embeddings_and_output_weights=False, - num_attention_heads=64, - kv_channels=64, - max_position_embeddings=4096, - seq_length=4096, - rotary_base=50000.0, - make_vocab_size_divisible_by=1280, - attention_dropout=0.0, - hidden_dropout=0.0, - qk_layernorm=True, - # MoE - moe_router_topk=8, - moe_router_num_groups=1, - moe_router_group_topk=1, - moe_router_topk_scaling_factor=2.827, - moe_aux_loss_coeff=1e-3, - moe_router_score_function="sigmoid", - moe_router_enable_expert_bias=True, - moe_router_bias_update_rate=1e-3, - moe_grouped_gemm=True, - moe_router_pre_softmax=True, - moe_token_dispatcher_type="alltoall", - moe_router_load_balancing_type="seq_aux_loss", - moe_shared_expert_overlap=True, - moe_router_dtype="fp32", - moe_permute_fusion=False, - # MLA - multi_latent_attention=True, - q_lora_rank=1536, - kv_lora_rank=512, - qk_head_dim=128, - qk_pos_emb_head_dim=64, - v_head_dim=128, - rotary_scaling_factor=32, - beta_fast=1.0, - beta_slow=1.0, - mscale=1.0, - mscale_all_dim=1.0, - # Miscellaneous - init_method_std=0.006, - layernorm_epsilon=1e-6, - bf16=True, - params_dtype=torch.bfloat16, - attention_softmax_in_fp32=False, - persist_layer_norm=True, - vocab_size=163840, - # Fusions - apply_rope_fusion=False, - bias_activation_fusion=True, - bias_dropout_fusion=True, - masked_softmax_fusion=True, - gradient_accumulation_fusion=True, - cross_entropy_loss_fusion=True, - cross_entropy_fusion_impl="te", - # Parallelism - tensor_model_parallel_size=2, - pipeline_model_parallel_size=16, - pipeline_dtype=torch.bfloat16, - virtual_pipeline_model_parallel_size=None, - context_parallel_size=1, - expert_model_parallel_size=32, - sequence_parallel=True, - expert_tensor_parallel_size=1, - recompute_granularity="selective", - recompute_modules=None, - recompute_method=None, - recompute_num_layers=None, - ) + # Model config via AutoBridge (dispatches to KimiK2Bridge) + cfg.model = AutoBridge.from_hf_pretrained("moonshotai/Kimi-K2-Instruct").to_megatron_provider(load_weights=False) + + # Parallelism + cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 16 + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + 
cfg.model.expert_model_parallel_size = 32 + cfg.model.sequence_parallel = True + cfg.model.expert_tensor_parallel_size = 1 + cfg.model.recompute_granularity = "selective" + cfg.model.recompute_modules = None + cfg.model.recompute_method = None + cfg.model.recompute_num_layers = None # Pipeline split settings (asymmetric stages) cfg.model.account_for_embedding_in_pipeline_split = False diff --git a/src/megatron/bridge/recipes/moonlight/moonlight_16b.py b/src/megatron/bridge/recipes/moonlight/moonlight_16b.py index 1eecebee24..6a1530769f 100644 --- a/src/megatron/bridge/recipes/moonlight/moonlight_16b.py +++ b/src/megatron/bridge/recipes/moonlight/moonlight_16b.py @@ -206,7 +206,6 @@ def moonlight_16b_sft_config() -> ConfigContainer: kv_channels=16, q_lora_rank=None, kv_lora_rank=512, - max_position_embeddings=4096, num_moe_experts=64, moe_ffn_hidden_size=1408, moe_shared_expert_intermediate_size=2816, @@ -420,7 +419,6 @@ def moonlight_16b_peft_config( kv_channels=16, q_lora_rank=None, kv_lora_rank=512, - max_position_embeddings=4096, num_moe_experts=64, moe_ffn_hidden_size=1408, moe_shared_expert_intermediate_size=2816, diff --git a/tests/unit_tests/recipes/kimi/test_kimi_k2.py b/tests/unit_tests/recipes/kimi/test_kimi_k2.py index 07d53c53a9..5e3d8262b1 100644 --- a/tests/unit_tests/recipes/kimi/test_kimi_k2.py +++ b/tests/unit_tests/recipes/kimi/test_kimi_k2.py @@ -12,15 +12,47 @@ # See the License for the specific language governing permissions and # limitations under the License. +import importlib + import pytest import torch -from megatron.bridge.models.mla_provider import MLAModelProvider from megatron.bridge.recipes.kimi.kimi_k2 import _get_kimi_k2_pipeline_layout, kimi_k2_pretrain_config from megatron.bridge.training.config import ConfigContainer from megatron.bridge.training.mixed_precision import MixedPrecisionConfig +class _FakeKimiK2Provider: + """Fake provider for testing without HF Hub I/O.""" + + def __init__(self, **kwargs): + for key, value in kwargs.items(): + setattr(self, key, value) + self.vocab_size = 163840 + self.apply_rope_fusion = False + + def finalize(self): + return None + + +class _FakeAutoBridge: + """Fake AutoBridge that returns a _FakeKimiK2Provider without network access.""" + + @classmethod + def from_hf_pretrained(cls, *args, **kwargs): + return cls() + + def to_megatron_provider(self, *args, **kwargs): + return _FakeKimiK2Provider() + + +@pytest.fixture(autouse=True) +def _patch_autobridge(monkeypatch): + """Monkeypatch AutoBridge in the kimi_k2 recipe module to avoid HF Hub access.""" + mod = importlib.import_module("megatron.bridge.recipes.kimi.kimi_k2") + monkeypatch.setattr(mod, "AutoBridge", _FakeAutoBridge) + + class TestKimiK2PipelineLayout: """Test cases for _get_kimi_k2_pipeline_layout function.""" @@ -56,7 +88,7 @@ def test_pretrain_config_basic_structure(self): # Check it returns a ConfigContainer with all required components assert isinstance(cfg, ConfigContainer) - assert isinstance(cfg.model, MLAModelProvider) + assert cfg.model is not None assert cfg.train is not None assert cfg.optimizer is not None assert cfg.scheduler is not None diff --git a/tests/unit_tests/recipes/test_moonlight_recipes.py b/tests/unit_tests/recipes/test_moonlight_recipes.py index 82190ceb87..9b0db6e342 100644 --- a/tests/unit_tests/recipes/test_moonlight_recipes.py +++ b/tests/unit_tests/recipes/test_moonlight_recipes.py @@ -133,7 +133,7 @@ def test_each_moonlight_recipe_builds_config(recipe_func: Callable, monkeypatch: mod = importlib.import_module(module_name) # 
Monkeypatch the MoonlightModelProvider16B class - monkeypatch.setattr(mod, "MoonlightModelProvider16B", _FakeMoonlightModelProvider16B) + monkeypatch.setattr(mod, "MLAModelProvider", _FakeMoonlightModelProvider16B) func_name = recipe_func.__name__ is_peft = "peft" in func_name.lower() @@ -168,7 +168,7 @@ def test_moonlight_sft_config_builds(recipe_func: Callable, monkeypatch: pytest. """Test that each Moonlight SFT recipe builds a valid config.""" module_name = recipe_func.__module__ mod = importlib.import_module(module_name) - monkeypatch.setattr(mod, "MoonlightModelProvider16B", _FakeMoonlightModelProvider16B) + monkeypatch.setattr(mod, "MLAModelProvider", _FakeMoonlightModelProvider16B) cfg = recipe_func() _apply_test_overrides(cfg, recipe_func.__name__) @@ -193,7 +193,7 @@ def test_moonlight_peft_config_builds(recipe_func: Callable, monkeypatch: pytest """Test that each Moonlight PEFT recipe builds a valid config.""" module_name = recipe_func.__module__ mod = importlib.import_module(module_name) - monkeypatch.setattr(mod, "MoonlightModelProvider16B", _FakeMoonlightModelProvider16B) + monkeypatch.setattr(mod, "MLAModelProvider", _FakeMoonlightModelProvider16B) cfg = recipe_func(peft_scheme="lora") _apply_test_overrides(cfg, recipe_func.__name__) @@ -219,7 +219,7 @@ def test_moonlight_peft_schemes(recipe_func: Callable, peft_scheme: str, monkeyp """Test that PEFT configurations are correctly applied with different schemes.""" module_name = recipe_func.__module__ mod = importlib.import_module(module_name) - monkeypatch.setattr(mod, "MoonlightModelProvider16B", _FakeMoonlightModelProvider16B) + monkeypatch.setattr(mod, "MLAModelProvider", _FakeMoonlightModelProvider16B) cfg = recipe_func(peft_scheme=peft_scheme) _apply_test_overrides(cfg, recipe_func.__name__) @@ -235,7 +235,7 @@ def test_moonlight_16b_peft_lora_defaults(monkeypatch: pytest.MonkeyPatch): from megatron.bridge.recipes.moonlight import moonlight_16b_peft_config mod = importlib.import_module("megatron.bridge.recipes.moonlight.moonlight_16b") - monkeypatch.setattr(mod, "MoonlightModelProvider16B", _FakeMoonlightModelProvider16B) + monkeypatch.setattr(mod, "MLAModelProvider", _FakeMoonlightModelProvider16B) cfg = moonlight_16b_peft_config(peft_scheme="lora") _apply_test_overrides(cfg, "moonlight_16b_peft_config") @@ -258,7 +258,7 @@ def test_moonlight_16b_peft_dora_defaults(monkeypatch: pytest.MonkeyPatch): from megatron.bridge.recipes.moonlight import moonlight_16b_peft_config mod = importlib.import_module("megatron.bridge.recipes.moonlight.moonlight_16b") - monkeypatch.setattr(mod, "MoonlightModelProvider16B", _FakeMoonlightModelProvider16B) + monkeypatch.setattr(mod, "MLAModelProvider", _FakeMoonlightModelProvider16B) cfg = moonlight_16b_peft_config(peft_scheme="dora") _apply_test_overrides(cfg, "moonlight_16b_peft_config") @@ -281,7 +281,7 @@ def test_moonlight_16b_sft_full_defaults(monkeypatch: pytest.MonkeyPatch): from megatron.bridge.recipes.moonlight import moonlight_16b_sft_config mod = importlib.import_module("megatron.bridge.recipes.moonlight.moonlight_16b") - monkeypatch.setattr(mod, "MoonlightModelProvider16B", _FakeMoonlightModelProvider16B) + monkeypatch.setattr(mod, "MLAModelProvider", _FakeMoonlightModelProvider16B) cfg = moonlight_16b_sft_config() _apply_test_overrides(cfg, "moonlight_16b_sft_config") @@ -304,7 +304,7 @@ def test_moonlight_16b_sft_precision_aware_optimizer(monkeypatch: pytest.MonkeyP from megatron.bridge.recipes.moonlight import moonlight_16b_sft_config mod = 
importlib.import_module("megatron.bridge.recipes.moonlight.moonlight_16b") - monkeypatch.setattr(mod, "MoonlightModelProvider16B", _FakeMoonlightModelProvider16B) + monkeypatch.setattr(mod, "MLAModelProvider", _FakeMoonlightModelProvider16B) cfg = moonlight_16b_sft_config() _apply_test_overrides(cfg, "moonlight_16b_sft_config") @@ -326,7 +326,7 @@ def test_moonlight_16b_sft_tokenizer_with_trust_remote_code(monkeypatch: pytest. from megatron.bridge.recipes.moonlight import moonlight_16b_sft_config mod = importlib.import_module("megatron.bridge.recipes.moonlight.moonlight_16b") - monkeypatch.setattr(mod, "MoonlightModelProvider16B", _FakeMoonlightModelProvider16B) + monkeypatch.setattr(mod, "MLAModelProvider", _FakeMoonlightModelProvider16B) cfg = moonlight_16b_sft_config() From 1434650a58f30ff5bc74913904acdf2986e7d270 Mon Sep 17 00:00:00 2001 From: yaoyu-33 Date: Sat, 28 Feb 2026 18:20:46 -0700 Subject: [PATCH 3/4] [model] feat: add MiniMax-M2 MoE bridge Add Megatron Bridge for MiniMaxAI/MiniMax-M2, a sparse MoE model with 256 experts, top-8 sigmoid routing, and expert bias correction. Includes: - Bridge with config mapping and per-expert weight conversion (block_sparse_moe prefix, w1/w2/w3 format) - Partial RoPE support (rotary_dim -> rotary_percent) - QK layernorm intentionally disabled (full-dim vs per-head mismatch) - Functional test with toy model for TP/PP/EP parallelism - Example scripts for conversion, inference, and verification - compare.py fix: truncate Megatron logits to HF vocab size for proper comparison when Megatron pads vocab for kernel efficiency Signed-off-by: yaoyu-33 Made-with: Cursor --- .../compare_hf_and_megatron/compare.py | 47 +++-- examples/models/minimax_m2/conversion.sh | 48 +++++ examples/models/minimax_m2/inference.sh | 24 +++ .../models/minimax_m2/verify_toy_model.py | 176 +++++++++++++++++ src/megatron/bridge/models/__init__.py | 4 + .../bridge/models/hf_pretrained/utils.py | 1 + .../bridge/models/minimax_m2/__init__.py | 20 ++ .../models/minimax_m2/minimax_m2_bridge.py | 149 ++++++++++++++ .../models/minimax_m2/__init__.py | 0 .../minimax_m2/test_minimax_m2_conversion.py | 186 ++++++++++++++++++ 10 files changed, 634 insertions(+), 21 deletions(-) create mode 100644 examples/models/minimax_m2/conversion.sh create mode 100644 examples/models/minimax_m2/inference.sh create mode 100644 examples/models/minimax_m2/verify_toy_model.py create mode 100644 src/megatron/bridge/models/minimax_m2/__init__.py create mode 100644 src/megatron/bridge/models/minimax_m2/minimax_m2_bridge.py create mode 100644 tests/functional_tests/models/minimax_m2/__init__.py create mode 100644 tests/functional_tests/models/minimax_m2/test_minimax_m2_conversion.py diff --git a/examples/conversion/compare_hf_and_megatron/compare.py b/examples/conversion/compare_hf_and_megatron/compare.py index 60ce377cd1..b8ed909861 100644 --- a/examples/conversion/compare_hf_and_megatron/compare.py +++ b/examples/conversion/compare_hf_and_megatron/compare.py @@ -609,6 +609,8 @@ def _load_megatron_model(args): model_provider.finalize() megatron_model = model_provider.provide_distributed_model(wrap_with_ddp=False) + for m in megatron_model: + m.config.mtp_num_layers = None model_components = [m.eval() for m in megatron_model] # Register debug hooks if enabled @@ -715,17 +717,19 @@ def compare_models_one_step(args) -> None: ) del hf_model - # Reload Megatron model to ensure a fresh instance before comparison - megatron_model, _ = _load_megatron_model(args) + torch.cuda.empty_cache() # Broadcast HF results to 
all ranks after Megatron initialization # (following the pattern from generate_from_hf.py) if torch.distributed.is_initialized(): - # Create tensors for broadcasting if they don't exist on non-rank-0 + # Ensure consistent dtype across ranks: rank 0 has bfloat16 logits from the HF model, + # so all ranks must use the same dtype for NCCL broadcast to work correctly. + if hf_logits is not None: + hf_logits = hf_logits.float() + if hf_next_token is None: hf_next_token = torch.zeros(1, device=input_ids.device, dtype=torch.long) if hf_logits is None: - # Get vocab size from tokenizer for proper tensor size vocab_size = getattr( tokenizer, "vocab_size", len(tokenizer.vocab) if hasattr(tokenizer, "vocab") else 32000 ) @@ -734,6 +738,8 @@ def compare_models_one_step(args) -> None: # Broadcast from rank 0 to all ranks torch.distributed.broadcast(hf_next_token, 0) torch.distributed.broadcast(hf_logits, 0) + torch.distributed.barrier() + print_rank_0("HF results broadcast complete.") # Run Megatron model forward pass print_rank_0("=== RUNNING MEGATRON MODEL (1-STEP) ===") @@ -790,27 +796,26 @@ def compare_models_one_step(args) -> None: top5_tokens = [tokenizer.decode([idx]) for idx in top5_ids] print(f"Megatron Top 5: {list(zip(top5_tokens, top5_vals.tolist()))}") - # Compare outputs (only where we have valid Megatron results) + # Megatron may pad vocab_size for GPU kernel efficiency — truncate + # to the HF vocab size so logits are directly comparable. + hf_vocab_size = hf_logits.shape[0] + megatron_logits_cmp = megatron_logits[:hf_vocab_size] + megatron_next_token_cmp = torch.argmax(megatron_logits_cmp, dim=-1) + + # Compare outputs print("=== COMPARISON ===") - token_match = hf_next_token.item() == megatron_next_token.item() + token_match = hf_next_token.item() == megatron_next_token_cmp.item() token_status_emoji = "✅" if token_match else "❌" print(f"Token match: {token_match} {token_status_emoji}") - # Compare logits if shapes match - if hf_logits.shape == megatron_logits.shape: - diff = (hf_logits - megatron_logits).abs() - print(f"Logits diff - max: {diff.max():.6f}, mean: {diff.mean():.6f}") - cosine_sim = torch.cosine_similarity(hf_logits.unsqueeze(0), megatron_logits.unsqueeze(0)) - cos_val = cosine_sim.item() - percent = cos_val * 100.0 - status_emoji = "✅" if cos_val >= SIMILARITY_THRESHOLD else "❌" - tolerance_text = "within ±2%" if cos_val >= SIMILARITY_THRESHOLD else "outside ±2%" - print( - f"Cosine similarity: {cos_val:.6f} ({percent:.2f}%) {status_emoji} ({tolerance_text} tolerance)" - ) - else: - print(f"Shape mismatch: HF {hf_logits.shape} vs Megatron {megatron_logits.shape}") - print("Cannot compare logits directly due to shape mismatch") + diff = (hf_logits - megatron_logits_cmp).abs() + print(f"Logits diff - max: {diff.max():.6f}, mean: {diff.mean():.6f}") + cosine_sim = torch.cosine_similarity(hf_logits.unsqueeze(0), megatron_logits_cmp.unsqueeze(0)) + cos_val = cosine_sim.item() + percent = cos_val * 100.0 + status_emoji = "✅" if cos_val >= SIMILARITY_THRESHOLD else "❌" + tolerance_text = "within ±2%" if cos_val >= SIMILARITY_THRESHOLD else "outside ±2%" + print(f"Cosine similarity: {cos_val:.6f} ({percent:.2f}%) {status_emoji} ({tolerance_text} tolerance)") print("=== COMPARISON COMPLETE ===") else: diff --git a/examples/models/minimax_m2/conversion.sh b/examples/models/minimax_m2/conversion.sh new file mode 100644 index 0000000000..eff14c3864 --- /dev/null +++ b/examples/models/minimax_m2/conversion.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +# Copyright (c) 2025, NVIDIA 
CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -xeuo pipefail + +# MiniMax-M2 (MoE: 256 experts, top-8, ~230GB fp8) +# Due to model size, multi-node setup is recommended. +# Adjust TP, PP, EP according to available resources. + +WORKSPACE=${WORKSPACE:-/workspace} +MODEL_NAME=MiniMax-M2 +HF_MODEL_ID=MiniMaxAI/$MODEL_NAME + +# Single-GPU round-trip (only feasible with sufficient memory) +# uv run python examples/conversion/hf_megatron_roundtrip.py \ +# --hf-model-id $HF_MODEL_ID \ +# --trust-remote-code + +# Multi-GPU round-trip (TP=2, EP=8) +uv run python -m torch.distributed.run --nproc_per_node=8 \ + examples/conversion/hf_megatron_roundtrip_multi_gpu.py \ + --hf-model-id $HF_MODEL_ID \ + --tp 2 --ep 8 \ + --trust-remote-code + +# Import HF → Megatron checkpoint +uv run python examples/conversion/convert_checkpoints.py import \ + --hf-model $HF_MODEL_ID \ + --megatron-path ${WORKSPACE}/models/$MODEL_NAME \ + --trust-remote-code + +# Export Megatron → HF checkpoint +uv run python examples/conversion/convert_checkpoints.py export \ + --hf-model $HF_MODEL_ID \ + --megatron-path ${WORKSPACE}/models/$MODEL_NAME/iter_0000000 \ + --hf-path ${WORKSPACE}/models/$MODEL_NAME-hf-export diff --git a/examples/models/minimax_m2/inference.sh b/examples/models/minimax_m2/inference.sh new file mode 100644 index 0000000000..c64ec41fd7 --- /dev/null +++ b/examples/models/minimax_m2/inference.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# MiniMax-M2 (MoE: 256 experts, top-8, ~230GB fp8) +# Requires multi-node for full model. Adjust TP/EP for available resources. +uv run python -m torch.distributed.run --nproc_per_node=8 \ + examples/conversion/hf_to_megatron_generate_text.py \ + --hf_model_path MiniMaxAI/MiniMax-M2 \ + --prompt "What is artificial intelligence?" \ + --max_new_tokens 100 \ + --tp 2 --ep 4 \ + --trust-remote-code diff --git a/examples/models/minimax_m2/verify_toy_model.py b/examples/models/minimax_m2/verify_toy_model.py new file mode 100644 index 0000000000..8bd6de90c1 --- /dev/null +++ b/examples/models/minimax_m2/verify_toy_model.py @@ -0,0 +1,176 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Create a toy MiniMax-M2 model and verify HF ↔ Megatron conversion + forward pass. + +The toy model is saved to /tmp/minimax_m2_toy and reused across runs. + +Usage: + uv run python examples/models/minimax_m2/verify_toy_model.py + uv run python examples/models/minimax_m2/verify_toy_model.py --tp 2 + uv run python examples/models/minimax_m2/verify_toy_model.py --ep 2 + uv run python examples/models/minimax_m2/verify_toy_model.py --tp 2 --ep 2 +""" + +import argparse +import json +import os +import subprocess +import sys +import tempfile +from pathlib import Path + +import torch + + +HF_MODEL_ID = "MiniMaxAI/MiniMax-M2" + + +def create_toy_model(output_dir: str): + """Create and save a toy MiniMax-M2 model with random weights.""" + from transformers import AutoTokenizer, MiniMaxM2Config, MiniMaxM2ForCausalLM + + tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_ID, trust_remote_code=True) + vocab_size = len(tokenizer) + print(f" Tokenizer vocab_size: {vocab_size}") + + config_dict = { + "architectures": ["MiniMaxM2ForCausalLM"], + "model_type": "minimax_m2", + "hidden_size": 512, + "intermediate_size": 256, + "num_hidden_layers": 2, + "num_attention_heads": 8, + "num_key_value_heads": 4, + "head_dim": 64, + "hidden_act": "silu", + "max_position_embeddings": 4096, + "rms_norm_eps": 1e-06, + "rope_theta": 5000000, + "rotary_dim": 32, + "vocab_size": vocab_size, + "tie_word_embeddings": False, + "attention_dropout": 0.0, + "num_local_experts": 4, + "num_experts_per_tok": 2, + "scoring_func": "sigmoid", + "use_routing_bias": True, + "use_qk_norm": True, + "qk_norm_type": "per_layer", + "router_aux_loss_coef": 0.001, + "router_jitter_noise": 0.0, + "output_router_logits": False, + "torch_dtype": "bfloat16", + } + + os.makedirs(output_dir, exist_ok=True) + + print(f"Creating toy MiniMax-M2 model at {output_dir} ...") + config = MiniMaxM2Config(**config_dict) + config.torch_dtype = torch.bfloat16 + model = MiniMaxM2ForCausalLM(config).bfloat16() + + model.save_pretrained(output_dir, safe_serialization=True) + tokenizer.save_pretrained(output_dir) + + config_path = os.path.join(output_dir, "config.json") + with open(config_path, "w") as f: + json.dump(config_dict, f, indent=2) + + param_count = sum(p.numel() for p in model.parameters()) + print( + f" Params: {param_count:,} | Experts: {config_dict['num_local_experts']} | Top-K: {config_dict['num_experts_per_tok']}" + ) + print(f" Saved to: {output_dir}") + + +def run_compare(model_dir: str, tp: int, pp: int, ep: int): + """Run compare.py against the toy model.""" + script = str( + Path(__file__).resolve().parent.parent.parent / "conversion" / "compare_hf_and_megatron" / "compare.py" + ) + + nproc = tp * pp * ep + prompt = "Hello" + + if nproc == 1: + cmd = [ + sys.executable, + script, + "--hf_model_path", + model_dir, + "--prompt", + prompt, + ] + else: + cmd = [ + sys.executable, + "-m", + "torch.distributed.run", + f"--nproc_per_node={nproc}", + script, + "--hf_model_path", + model_dir, + "--prompt", + prompt, + "--tp", + str(tp), + "--pp", + str(pp), + "--ep", + str(ep), + ] + + print(f"\n{'=' * 60}") + print(f"Running compare.py 
TP={tp} PP={pp} EP={ep} (nproc={nproc})") + print(f"{'=' * 60}") + print(f" cmd: {' '.join(cmd)}\n") + + result = subprocess.run(cmd, cwd=str(Path(__file__).resolve().parent.parent.parent.parent)) + if result.returncode != 0: + print(f"\n[FAIL] compare.py exited with code {result.returncode}") + sys.exit(result.returncode) + print(f"\n[OK] TP={tp} PP={pp} EP={ep} comparison passed") + + +def main(): + """Verify MiniMax-M2 toy model conversion + forward pass.""" + parser = argparse.ArgumentParser(description="Verify MiniMax-M2 toy model conversion + forward pass") + parser.add_argument("--tp", type=int, default=1, help="Tensor parallelism") + parser.add_argument("--pp", type=int, default=1, help="Pipeline parallelism") + parser.add_argument("--ep", type=int, default=1, help="Expert parallelism") + parser.add_argument( + "--model-dir", + type=str, + default=None, + help="Reuse an existing toy model directory instead of creating a new one", + ) + args = parser.parse_args() + + if args.model_dir: + model_dir = args.model_dir + else: + model_dir = os.path.join(tempfile.gettempdir(), "minimax_m2_toy") + + if not os.path.exists(os.path.join(model_dir, "config.json")): + create_toy_model(model_dir) + else: + print(f"Reusing existing toy model at: {model_dir}") + + run_compare(model_dir, args.tp, args.pp, args.ep) + + +if __name__ == "__main__": + main() diff --git a/src/megatron/bridge/models/__init__.py b/src/megatron/bridge/models/__init__.py index b05607100c..680dcbc71b 100644 --- a/src/megatron/bridge/models/__init__.py +++ b/src/megatron/bridge/models/__init__.py @@ -66,6 +66,9 @@ ) from megatron.bridge.models.mamba.mamba_provider import MambaModelProvider from megatron.bridge.models.mimo.mimo_bridge import MimoBridge +from megatron.bridge.models.minimax_m2 import ( + MiniMaxM2Bridge, +) from megatron.bridge.models.ministral3 import ( Ministral3Bridge, Ministral3Model, @@ -175,6 +178,7 @@ "Ministral3ModelProvider3B", "Ministral3ModelProvider8B", "Ministral3ModelProvider14B", + "MiniMaxM2Bridge", "OlMoEBridge", "OlMoEModelProvider", "Qwen2ModelProvider", diff --git a/src/megatron/bridge/models/hf_pretrained/utils.py b/src/megatron/bridge/models/hf_pretrained/utils.py index 529deff16b..58fd16fcc1 100644 --- a/src/megatron/bridge/models/hf_pretrained/utils.py +++ b/src/megatron/bridge/models/hf_pretrained/utils.py @@ -24,6 +24,7 @@ "google", "llava-hf", "meta-llama", + "MiniMaxAI", "mistralai", "moonshotai", "nvidia", diff --git a/src/megatron/bridge/models/minimax_m2/__init__.py b/src/megatron/bridge/models/minimax_m2/__init__.py new file mode 100644 index 0000000000..542d0b761c --- /dev/null +++ b/src/megatron/bridge/models/minimax_m2/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from megatron.bridge.models.minimax_m2.minimax_m2_bridge import MiniMaxM2Bridge # noqa: F401 + + +__all__ = [ + "MiniMaxM2Bridge", +] diff --git a/src/megatron/bridge/models/minimax_m2/minimax_m2_bridge.py b/src/megatron/bridge/models/minimax_m2/minimax_m2_bridge.py new file mode 100644 index 0000000000..30e6158cd6 --- /dev/null +++ b/src/megatron/bridge/models/minimax_m2/minimax_m2_bridge.py @@ -0,0 +1,149 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from megatron.core.models.gpt.gpt_model import GPTModel + +from megatron.bridge.models.conversion.mapping_registry import MegatronMappingRegistry +from megatron.bridge.models.conversion.model_bridge import MegatronModelBridge +from megatron.bridge.models.conversion.param_mapping import ( + AutoMapping, + GatedMLPMapping, + QKVMapping, +) + + +@MegatronModelBridge.register_bridge( + source="MiniMaxM2ForCausalLM", + target=GPTModel, + model_type="minimax_m2", +) +class MiniMaxM2Bridge(MegatronModelBridge): + """ + Megatron Bridge for MiniMax-M2 MoE Causal LM. + + MiniMax-M2 is a sparse MoE model (256 experts, top-8 routing with sigmoid + scoring and expert bias correction). HF weights use per-expert format + with block_sparse_moe prefix (w1/w2/w3). + + Known limitations: + - QK layernorm: MiniMax-M2 uses full-dimension QK norm (q_norm weight + shape = num_heads * head_dim), whereas Megatron uses per-head QK norm + (weight shape = head_dim). These are computationally different and + cannot be losslessly converted. QK norm is disabled in this bridge. + - MTP (Multi-Token Prediction) modules are not mapped. + + Example: + >>> from megatron.bridge import AutoBridge + >>> bridge = AutoBridge.from_hf_pretrained("MiniMaxAI/MiniMax-M2") + >>> provider = bridge.to_megatron_provider() + """ + + def provider_bridge(self, hf_pretrained): + """Convert HuggingFace MiniMax-M2 config to GPTModelProvider.""" + provider = super().provider_bridge(hf_pretrained) + + hf_config = hf_pretrained.config + + provider.normalization = "RMSNorm" + provider.gated_linear_unit = True + provider.position_embedding_type = "rope" + provider.add_bias_linear = False + provider.add_qkv_bias = False + provider.hidden_dropout = 0.0 + provider.autocast_dtype = torch.bfloat16 + + # MiniMax-M2 uses rotary_dim instead of partial_rotary_factor + rotary_dim = getattr(hf_config, "rotary_dim", None) + head_dim = getattr(hf_config, "head_dim", None) + if rotary_dim is not None and head_dim is not None: + provider.rotary_percent = rotary_dim / head_dim + + # TODO: MiniMax-M2 uses full-dimension QK norm (q_norm weight shape = num_heads * head_dim) + # while Megatron uses per-head QK norm (weight shape = head_dim). These are + # mathematically different (different normalization denominators), so the weights + # cannot be losslessly converted. Disabled here, which means q_norm.weight and + # k_norm.weight are dropped during conversion. 
This is acceptable for fine-tuning + # but will cause forward-pass divergence from HF for inference. + # Fix: add full-dimension QK norm support to Megatron-Core, or write a custom + # layer spec for this model. + provider.qk_layernorm = False + + # MoE settings — sigmoid routing with expert bias (same pattern as DeepSeek V3) + provider.moe_grouped_gemm = True + provider.moe_router_pre_softmax = False + provider.moe_router_load_balancing_type = "aux_loss" + provider.moe_aux_loss_coeff = 1e-3 + provider.moe_token_dispatcher_type = "alltoall" + provider.moe_permute_fusion = True + provider.moe_router_score_function = "sigmoid" + provider.moe_router_enable_expert_bias = True + + return provider + + def mapping_registry(self) -> MegatronMappingRegistry: + param_mappings = { + # Global weights + "embedding.word_embeddings.weight": "model.embed_tokens.weight", + "output_layer.weight": "lm_head.weight", + "decoder.final_layernorm.weight": "model.norm.weight", + # Per-layer layernorms (TE backend) + "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight", + "decoder.layers.*.pre_mlp_layernorm.weight": "model.layers.*.post_attention_layernorm.weight", + # Attention o_proj + "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight", + # MoE router and expert bias + "decoder.layers.*.mlp.router.weight": "model.layers.*.block_sparse_moe.gate.weight", + "decoder.layers.*.mlp.router.expert_bias": "model.layers.*.block_sparse_moe.e_score_correction_bias", + } + + mapping_list = [] + for megatron_param, hf_param in param_mappings.items(): + mapping_list.append(AutoMapping(megatron_param=megatron_param, hf_param=hf_param)) + + # QKV + mapping_list.append( + QKVMapping( + megatron_param="decoder.layers.*.self_attention.linear_qkv.weight", + q="model.layers.*.self_attn.q_proj.weight", + k="model.layers.*.self_attn.k_proj.weight", + v="model.layers.*.self_attn.v_proj.weight", + ) + ) + + # MoE expert weights (per-expert w1/w2/w3 with block_sparse_moe prefix) + mapping_list.extend( + [ + GatedMLPMapping( + megatron_param="decoder.layers.*.mlp.experts.linear_fc1.weight*", + gate="model.layers.*.block_sparse_moe.experts.*.w1.weight", + up="model.layers.*.block_sparse_moe.experts.*.w3.weight", + ), + AutoMapping( + megatron_param="decoder.layers.*.mlp.experts.linear_fc2.weight*", + hf_param="model.layers.*.block_sparse_moe.experts.*.w2.weight", + ), + GatedMLPMapping( + megatron_param="decoder.layers.*.mlp.experts.local_experts.*.linear_fc1.weight", + gate="model.layers.*.block_sparse_moe.experts.*.w1.weight", + up="model.layers.*.block_sparse_moe.experts.*.w3.weight", + ), + AutoMapping( + megatron_param="decoder.layers.*.mlp.experts.local_experts.*.linear_fc2.weight", + hf_param="model.layers.*.block_sparse_moe.experts.*.w2.weight", + ), + ] + ) + + return MegatronMappingRegistry(*mapping_list) diff --git a/tests/functional_tests/models/minimax_m2/__init__.py b/tests/functional_tests/models/minimax_m2/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/functional_tests/models/minimax_m2/test_minimax_m2_conversion.py b/tests/functional_tests/models/minimax_m2/test_minimax_m2_conversion.py new file mode 100644 index 0000000000..9596789174 --- /dev/null +++ b/tests/functional_tests/models/minimax_m2/test_minimax_m2_conversion.py @@ -0,0 +1,186 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import subprocess +from pathlib import Path + +import pytest +import torch + + +try: + from transformers import MiniMaxM2Config, MiniMaxM2ForCausalLM +except ImportError: + pytest.skip( + "MiniMaxM2 not available in current transformers version (requires >= 4.57)", + allow_module_level=True, + ) + +# Toy config: reduced dims for fast testing. +# Keeps architectural properties: MoE, partial RoPE, QK norm, sigmoid routing. +HF_MINIMAX_M2_TOY_MODEL_CONFIG = { + "architectures": ["MiniMaxM2ForCausalLM"], + "model_type": "minimax_m2", + "hidden_size": 512, + "intermediate_size": 256, + "num_hidden_layers": 2, + "num_attention_heads": 8, + "num_key_value_heads": 4, + "head_dim": 64, + "hidden_act": "silu", + "max_position_embeddings": 4096, + "rms_norm_eps": 1e-06, + "rope_theta": 5000000, + "rotary_dim": 32, + "vocab_size": 1024, + "tie_word_embeddings": False, + "attention_dropout": 0.0, + "num_local_experts": 4, + "num_experts_per_tok": 2, + "scoring_func": "sigmoid", + "use_routing_bias": True, + "use_qk_norm": True, + "qk_norm_type": "per_layer", + "router_aux_loss_coef": 0.001, + "router_jitter_noise": 0.0, + "output_router_logits": False, + "torch_dtype": "bfloat16", +} + + +class TestMiniMaxM2Conversion: + """ + Test MiniMax-M2 MoE model conversion with different parallelism configurations. + Uses a toy model (2 layers, 4 experts) with random weights. 
+ """ + + @pytest.fixture(scope="class") + def toy_model_path(self, tmp_path_factory): + """Create and save a toy MiniMax-M2 model to a temporary directory.""" + temp_dir = tmp_path_factory.mktemp("minimax_m2_toy_model") + model_dir = temp_dir / "minimax_m2_toy" + + config = MiniMaxM2Config(**HF_MINIMAX_M2_TOY_MODEL_CONFIG) + config.torch_dtype = torch.bfloat16 + + model = MiniMaxM2ForCausalLM(config).bfloat16() + + model.save_pretrained(model_dir, safe_serialization=True) + + config_path = model_dir / "config.json" + with open(config_path, "w") as f: + json.dump(HF_MINIMAX_M2_TOY_MODEL_CONFIG, f, indent=2) + + return str(model_dir) + + def test_toy_model_creation(self, toy_model_path): + """Verify the toy model was created correctly.""" + model_path = Path(toy_model_path) + assert model_path.exists() + + config_file = model_path / "config.json" + assert config_file.exists() + + weights_file = model_path / "model.safetensors" + if not weights_file.exists(): + sharded_files = list(model_path.glob("model-*-of-*.safetensors")) + assert len(sharded_files) > 0, "No model weight files found" + + with open(config_file) as f: + config_data = json.load(f) + + assert config_data["model_type"] == "minimax_m2" + assert config_data["hidden_size"] == 512 + assert config_data["num_hidden_layers"] == 2 + assert config_data["num_local_experts"] == 4 + assert config_data["num_experts_per_tok"] == 2 + + model = MiniMaxM2ForCausalLM.from_pretrained( + toy_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=False + ) + + assert len(model.model.layers) == 2 + first_layer = model.model.layers[0] + # Native transformers uses "mlp" for the MoE block + assert hasattr(first_layer, "mlp"), f"Expected 'mlp' attribute, got: {list(first_layer._modules.keys())}" + moe_block = first_layer.mlp + assert hasattr(moe_block, "experts"), f"MoE block missing 'experts', got: {list(moe_block._modules.keys())}" + + @pytest.mark.run_only_on("GPU") + @pytest.mark.parametrize( + "tp,pp,ep,test_name", + [ + (2, 1, 1, "TP"), + (1, 2, 1, "PP"), + (1, 1, 2, "EP"), + ], + ) + def test_minimax_m2_conversion_parallelism(self, toy_model_path, tmp_path, tp, pp, ep, test_name): + """ + Test MiniMax-M2 model conversion with different parallelism configurations. 
+ """ + test_output_dir = tmp_path / f"minimax_m2_{test_name}" + test_output_dir.mkdir(exist_ok=True) + + cmd = [ + "python", + "-m", + "torch.distributed.run", + "--nproc_per_node=2", + "--nnodes=1", + "-m", + "coverage", + "run", + "--data-file=/opt/Megatron-Bridge/.coverage", + "--source=/opt/Megatron-Bridge/", + "--parallel-mode", + "examples/conversion/hf_megatron_roundtrip_multi_gpu.py", + "--hf-model-id", + toy_model_path, + "--output-dir", + str(test_output_dir), + "--tp", + str(tp), + "--pp", + str(pp), + "--ep", + str(ep), + ] + + result = subprocess.run( + cmd, + capture_output=True, + text=True, + cwd=Path(__file__).parent.parent.parent.parent, + ) + + if result.returncode != 0: + print(f"STDOUT: {result.stdout}") + print(f"STDERR: {result.stderr}") + assert False, f"MiniMax-M2 {test_name} conversion failed with return code {result.returncode}" + + model_name = Path(toy_model_path).name + converted_dir = test_output_dir / model_name + assert converted_dir.exists(), f"Converted model directory not found at {converted_dir}" + + config_file = converted_dir / "config.json" + assert config_file.exists() + + with open(config_file) as f: + saved_config = json.load(f) + + assert saved_config["model_type"] == "minimax_m2" + assert saved_config["hidden_size"] == 512 + assert saved_config["num_local_experts"] == 4 From b1303696fdb72475149d30342d75f36d874105e9 Mon Sep 17 00:00:00 2001 From: yaoyu-33 Date: Mon, 2 Mar 2026 10:16:56 -0700 Subject: [PATCH 4/4] [model] feat: add full-dimension QK norm, FP8 dequantization, and multi-node support for MiniMax-M2 Add custom full-dimension QK normalization (minimax_m2_provider.py) since MiniMax-M2 applies RMSNorm over the entire Q/K projection rather than per-head. The implementation uses sum-of-squares all-reduce across TP ranks and provides sharded_state_dict for distributed checkpointing. Add on-the-fly FP8 block-wise dequantization in the bridge via maybe_modify_loaded_hf_weight, converting float8_e4m3fn weights to bfloat16 using per-block scale_inv factors during HF->Megatron conversion. Add multi-node Slurm scripts (slurm_conversion.sh, slurm_inference.sh) for configurations requiring TP*EP*PP > 8 GPUs. Update verify_toy_model.py to extract real pretrained weights (N layers) from the FP8 model, dequantize to bf16, and verify round-trip accuracy. Fix dtype mismatch handling in hf_megatron_roundtrip_multi_gpu.py for FP8 source models. 
Signed-off-by: yaoyu-33 Made-with: Cursor --- .../hf_megatron_roundtrip_multi_gpu.py | 10 +- examples/models/minimax_m2/conversion.sh | 18 +- examples/models/minimax_m2/inference.sh | 5 +- .../models/minimax_m2/slurm_conversion.sh | 105 +++++++ examples/models/minimax_m2/slurm_inference.sh | 113 ++++++++ .../models/minimax_m2/verify_toy_model.py | 265 +++++++++++------- .../models/minimax_m2/minimax_m2_bridge.py | 126 ++++++++- .../models/minimax_m2/minimax_m2_provider.py | 154 ++++++++++ 8 files changed, 665 insertions(+), 131 deletions(-) create mode 100755 examples/models/minimax_m2/slurm_conversion.sh create mode 100755 examples/models/minimax_m2/slurm_inference.sh create mode 100644 src/megatron/bridge/models/minimax_m2/minimax_m2_provider.py diff --git a/examples/conversion/hf_megatron_roundtrip_multi_gpu.py b/examples/conversion/hf_megatron_roundtrip_multi_gpu.py index eebb8af8e2..d216651556 100644 --- a/examples/conversion/hf_megatron_roundtrip_multi_gpu.py +++ b/examples/conversion/hf_megatron_roundtrip_multi_gpu.py @@ -163,14 +163,12 @@ def main( original_param = bridge.hf_pretrained.state[name] compare_param = param compare_original = original_param - # Cast to float32 for params with known dtype mismatches between Megatron and HF - # (e.g. Megatron keeps expert_bias in float32 while HF may use bfloat16) - if any(p in name for p in IGNORE_PRECISION_PARAMS): + # Cast to float32 when dtypes differ (e.g. fp8 HF weights vs bf16 Megatron, + # or Megatron keeping expert_bias in float32 while HF uses bfloat16) + if compare_param.dtype != compare_original.dtype or any(p in name for p in IGNORE_PRECISION_PARAMS): compare_param = param.float() compare_original = original_param.float() - match = torch.allclose( - compare_param, compare_original.to(compare_param.device), atol=1e-1 - ) # Increased tolerance for bfloat16 + match = torch.allclose(compare_param, compare_original.to(compare_param.device), atol=1e-1) all_match = all_match and match table.add_row( name, diff --git a/examples/models/minimax_m2/conversion.sh b/examples/models/minimax_m2/conversion.sh index eff14c3864..884aa9d018 100644 --- a/examples/models/minimax_m2/conversion.sh +++ b/examples/models/minimax_m2/conversion.sh @@ -16,32 +16,28 @@ set -xeuo pipefail # MiniMax-M2 (MoE: 256 experts, top-8, ~230GB fp8) -# Due to model size, multi-node setup is recommended. -# Adjust TP, PP, EP according to available resources. +# +# Single-node (8 GPUs): use this script with TP*EP*PP <= 8. +# Multi-node (TP*EP*PP > 8): use slurm_conversion.sh instead. 
WORKSPACE=${WORKSPACE:-/workspace} MODEL_NAME=MiniMax-M2 HF_MODEL_ID=MiniMaxAI/$MODEL_NAME -# Single-GPU round-trip (only feasible with sufficient memory) -# uv run python examples/conversion/hf_megatron_roundtrip.py \ -# --hf-model-id $HF_MODEL_ID \ -# --trust-remote-code - -# Multi-GPU round-trip (TP=2, EP=8) +# Multi-GPU round-trip on a single 8-GPU node (TP=2, EP=4 → 8 GPUs) uv run python -m torch.distributed.run --nproc_per_node=8 \ examples/conversion/hf_megatron_roundtrip_multi_gpu.py \ --hf-model-id $HF_MODEL_ID \ - --tp 2 --ep 8 \ + --tp 2 --ep 4 \ --trust-remote-code -# Import HF → Megatron checkpoint +# Import HF → Megatron checkpoint (single-process) uv run python examples/conversion/convert_checkpoints.py import \ --hf-model $HF_MODEL_ID \ --megatron-path ${WORKSPACE}/models/$MODEL_NAME \ --trust-remote-code -# Export Megatron → HF checkpoint +# Export Megatron → HF checkpoint (single-process) uv run python examples/conversion/convert_checkpoints.py export \ --hf-model $HF_MODEL_ID \ --megatron-path ${WORKSPACE}/models/$MODEL_NAME/iter_0000000 \ diff --git a/examples/models/minimax_m2/inference.sh b/examples/models/minimax_m2/inference.sh index c64ec41fd7..bbf5121e71 100644 --- a/examples/models/minimax_m2/inference.sh +++ b/examples/models/minimax_m2/inference.sh @@ -14,7 +14,10 @@ # limitations under the License. # MiniMax-M2 (MoE: 256 experts, top-8, ~230GB fp8) -# Requires multi-node for full model. Adjust TP/EP for available resources. +# +# Single-node (8 GPUs): use this script with TP*EP*PP <= 8. +# Multi-node (TP*EP*PP > 8): use slurm_inference.sh instead. + uv run python -m torch.distributed.run --nproc_per_node=8 \ examples/conversion/hf_to_megatron_generate_text.py \ --hf_model_path MiniMaxAI/MiniMax-M2 \ diff --git a/examples/models/minimax_m2/slurm_conversion.sh b/examples/models/minimax_m2/slurm_conversion.sh new file mode 100755 index 0000000000..92ce4b1647 --- /dev/null +++ b/examples/models/minimax_m2/slurm_conversion.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ============================================================================== +# MiniMax-M2 Checkpoint Conversion (Multi-Node via Slurm) +# +# MiniMax-M2 (MoE: 256 experts, top-8, ~230GB fp8) +# Use this script when TP * EP * PP > 8 (requires more than one 8-GPU node). +# For single-node (TP * EP * PP <= 8), use conversion.sh instead. +# +# Usage: +# 1. Modify the #SBATCH directives and CONFIGURATION section for your cluster +# 2. Submit: sbatch slurm_conversion.sh +# 3. 
Submit inference after conversion:
+#      sbatch --dependency=afterok:<JOB_ID> slurm_inference.sh
+# ==============================================================================
+
+#SBATCH --job-name=minimax-m2-convert
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=1
+#SBATCH --gpus-per-node=8
+#SBATCH --time=4:00:00
+#SBATCH --account=<ACCOUNT>
+#SBATCH --output=logs/minimax_m2_convert_%j.out
+#SBATCH --error=logs/minimax_m2_convert_%j.err
+#SBATCH --exclusive
+
+# ==============================================================================
+# CONFIGURATION — edit these for your environment
+# ==============================================================================
+
+WORKSPACE=${WORKSPACE:-/workspace}
+PROJECT_DIR=${PROJECT_DIR:-.}
+MODEL_NAME=MiniMax-M2
+HF_MODEL_ID=MiniMaxAI/$MODEL_NAME
+GPUS_PER_NODE=8
+
+TP=2
+EP=8
+PP=1
+
+CONTAINER_IMAGE=${CONTAINER_IMAGE:?Set CONTAINER_IMAGE to your container path}
+CONTAINER_MOUNTS="/lustre:/lustre,${PROJECT_DIR}:/opt/Megatron-Bridge"
+CONTAINER_WORKDIR=/opt/Megatron-Bridge
+
+# ==============================================================================
+# Environment Setup
+# ==============================================================================
+
+export TORCH_NCCL_AVOID_RECORD_STREAMS=1
+export NCCL_NVLS_ENABLE=0
+
+# ==============================================================================
+# Job Execution
+# ==============================================================================
+
+MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+MASTER_PORT=${MASTER_PORT:-29500}
+
+echo "======================================"
+echo "MiniMax-M2 Checkpoint Conversion"
+echo "======================================"
+echo "Job ID: $SLURM_JOB_ID"
+echo "Nodes: $SLURM_JOB_NUM_NODES"
+echo "GPUs per node: $GPUS_PER_NODE"
+echo "Parallelism: TP=$TP, EP=$EP, PP=$PP"
+echo "Total GPUs: $((TP * EP * PP))"
+echo "Master: $MASTER_ADDR:$MASTER_PORT"
+echo "======================================"
+
+mkdir -p logs
+
+SRUN_CMD="srun --ntasks-per-node=1 --no-container-mount-home \
+    --container-image=$CONTAINER_IMAGE \
+    --container-mounts=$CONTAINER_MOUNTS"
+
+echo ""
+echo "Importing HF -> Megatron checkpoint ..."
+$SRUN_CMD bash -c "cd $CONTAINER_WORKDIR && \
+    if [ \$SLURM_LOCALID -eq 0 ]; then uv sync; else sleep 10; fi && \
+    uv run --no-sync python examples/conversion/convert_checkpoints.py import \
+        --hf-model $HF_MODEL_ID \
+        --megatron-path ${WORKSPACE}/models/$MODEL_NAME \
+        --trust-remote-code"
+IMPORT_EXIT=$?
+if [ $IMPORT_EXIT -ne 0 ]; then
+    echo "ERROR: Import failed (exit $IMPORT_EXIT)"
+    exit $IMPORT_EXIT
+fi
+
+echo "======================================"
+echo "Conversion completed successfully"
+echo "======================================"
diff --git a/examples/models/minimax_m2/slurm_inference.sh b/examples/models/minimax_m2/slurm_inference.sh
new file mode 100755
index 0000000000..673c8b9ade
--- /dev/null
+++ b/examples/models/minimax_m2/slurm_inference.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ==============================================================================
+# MiniMax-M2 Inference (Multi-Node via Slurm)
+#
+# MiniMax-M2 (MoE: 256 experts, top-8, ~230GB fp8)
+# Use this script when TP * EP * PP > 8 (requires more than one 8-GPU node).
+# For single-node (TP * EP * PP <= 8), use inference.sh instead.
+#
+# Usage:
+#   1. Modify the #SBATCH directives and CONFIGURATION section for your cluster
+#   2. Run conversion first: sbatch slurm_conversion.sh
+#   3. Submit with dependency: sbatch --dependency=afterok:<JOB_ID> slurm_inference.sh
+# ==============================================================================
+
+#SBATCH --job-name=minimax-m2-inference
+#SBATCH --nodes=8
+#SBATCH --ntasks-per-node=1
+#SBATCH --gpus-per-node=8
+#SBATCH --time=4:00:00
+#SBATCH --account=<ACCOUNT>
+#SBATCH --output=logs/minimax_m2_inference_%j.out
+#SBATCH --error=logs/minimax_m2_inference_%j.err
+#SBATCH --exclusive
+
+# ==============================================================================
+# CONFIGURATION — edit these for your environment
+# ==============================================================================
+
+WORKSPACE=${WORKSPACE:-/workspace}
+PROJECT_DIR=${PROJECT_DIR:-.}
+MODEL_NAME=MiniMax-M2
+HF_MODEL_ID=MiniMaxAI/$MODEL_NAME
+MEGATRON_CKPT=${WORKSPACE}/models/${MODEL_NAME}/iter_0000000
+GPUS_PER_NODE=8
+PROMPT="What is artificial intelligence?"
+MAX_NEW_TOKENS=100
+
+# MiniMax-M2 needs EP=32 (8 nodes) to fit 256 experts in memory.
+# Increasing TP does NOT reduce expert memory — increase EP instead.
+TP=2
+EP=32
+PP=1
+
+CONTAINER_IMAGE=${CONTAINER_IMAGE:?Set CONTAINER_IMAGE to your container path}
+CONTAINER_MOUNTS="/lustre:/lustre,${PROJECT_DIR}:/opt/Megatron-Bridge"
+CONTAINER_WORKDIR=/opt/Megatron-Bridge
+
+# ==============================================================================
+# Environment Setup
+# ==============================================================================
+
+export TORCH_NCCL_AVOID_RECORD_STREAMS=1
+export NCCL_NVLS_ENABLE=0
+
+# ==============================================================================
+# Job Execution
+# ==============================================================================
+
+MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+MASTER_PORT=${MASTER_PORT:-29500}
+
+echo "======================================"
+echo "MiniMax-M2 Inference"
+echo "======================================"
+echo "Job ID: $SLURM_JOB_ID"
+echo "Nodes: $SLURM_JOB_NUM_NODES"
+echo "GPUs per node: $GPUS_PER_NODE"
+echo "Parallelism: TP=$TP, EP=$EP, PP=$PP"
+echo "Total GPUs: $((TP * EP * PP))"
+echo "Master: $MASTER_ADDR:$MASTER_PORT"
+echo "======================================"
+
+mkdir -p logs
+
+SRUN_CMD="srun --ntasks-per-node=1 --no-container-mount-home \
+    --container-image=$CONTAINER_IMAGE \
+    --container-mounts=$CONTAINER_MOUNTS"
+
+echo ""
+echo "Running inference ..."
+$SRUN_CMD bash -c "cd $CONTAINER_WORKDIR && \ + if [ \$SLURM_LOCALID -eq 0 ]; then uv sync; else sleep 10; fi && \ + uv run --no-sync python -m torch.distributed.run \ + --nnodes=\$SLURM_JOB_NUM_NODES \ + --nproc_per_node=$GPUS_PER_NODE \ + --node_rank=\$SLURM_NODEID \ + --master_addr=$MASTER_ADDR \ + --master_port=$MASTER_PORT \ + examples/conversion/hf_to_megatron_generate_text.py \ + --hf_model_path $HF_MODEL_ID \ + --megatron_model_path $MEGATRON_CKPT \ + --prompt '$PROMPT' \ + --max_new_tokens $MAX_NEW_TOKENS \ + --tp $TP --ep $EP \ + --trust-remote-code" + +echo "======================================" +echo "Inference completed" +echo "======================================" diff --git a/examples/models/minimax_m2/verify_toy_model.py b/examples/models/minimax_m2/verify_toy_model.py index 8bd6de90c1..57e96c059f 100644 --- a/examples/models/minimax_m2/verify_toy_model.py +++ b/examples/models/minimax_m2/verify_toy_model.py @@ -13,125 +13,189 @@ # limitations under the License. """ -Create a toy MiniMax-M2 model and verify HF ↔ Megatron conversion + forward pass. +Verify MiniMax-M2 HF <-> Megatron conversion using real pretrained weights. -The toy model is saved to /tmp/minimax_m2_toy and reused across runs. +Extracts the first N layers from the full MiniMax-M2 FP8 checkpoint, +dequantizes to bf16 using block-wise scale factors, and saves a clean +bf16 model. Then runs compare.py to verify conversion + forward pass. Usage: uv run python examples/models/minimax_m2/verify_toy_model.py uv run python examples/models/minimax_m2/verify_toy_model.py --tp 2 - uv run python examples/models/minimax_m2/verify_toy_model.py --ep 2 - uv run python examples/models/minimax_m2/verify_toy_model.py --tp 2 --ep 2 + uv run python examples/models/minimax_m2/verify_toy_model.py --num-layers 1 """ import argparse import json import os +import re import subprocess import sys import tempfile +from collections import defaultdict from pathlib import Path import torch HF_MODEL_ID = "MiniMaxAI/MiniMax-M2" - - -def create_toy_model(output_dir: str): - """Create and save a toy MiniMax-M2 model with random weights.""" - from transformers import AutoTokenizer, MiniMaxM2Config, MiniMaxM2ForCausalLM - - tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_ID, trust_remote_code=True) - vocab_size = len(tokenizer) - print(f" Tokenizer vocab_size: {vocab_size}") - - config_dict = { - "architectures": ["MiniMaxM2ForCausalLM"], - "model_type": "minimax_m2", - "hidden_size": 512, - "intermediate_size": 256, - "num_hidden_layers": 2, - "num_attention_heads": 8, - "num_key_value_heads": 4, - "head_dim": 64, - "hidden_act": "silu", - "max_position_embeddings": 4096, - "rms_norm_eps": 1e-06, - "rope_theta": 5000000, - "rotary_dim": 32, - "vocab_size": vocab_size, - "tie_word_embeddings": False, - "attention_dropout": 0.0, - "num_local_experts": 4, - "num_experts_per_tok": 2, - "scoring_func": "sigmoid", - "use_routing_bias": True, - "use_qk_norm": True, - "qk_norm_type": "per_layer", - "router_aux_loss_coef": 0.001, - "router_jitter_noise": 0.0, - "output_router_logits": False, - "torch_dtype": "bfloat16", - } +DEFAULT_NUM_LAYERS = 1 +FP8_BLOCK_SIZE = 128 + + +def _dequant_blockwise(weight: torch.Tensor, scale_inv: torch.Tensor) -> torch.Tensor: + """Block-wise FP8 dequantization: weight_bf16 = fp8_val * scale_inv per block.""" + if weight.ndim == 1: + return weight.float().to(torch.bfloat16) + + M, N = weight.shape + B = FP8_BLOCK_SIZE + w = weight.float() + out = torch.empty_like(w) + + sM, sN = scale_inv.shape + for bi in 
range(sM):
+        for bj in range(sN):
+            r0, r1 = bi * B, min((bi + 1) * B, M)
+            c0, c1 = bj * B, min((bj + 1) * B, N)
+            out[r0:r1, c0:c1] = w[r0:r1, c0:c1] * scale_inv[bi, bj]
+
+    return out.to(torch.bfloat16)
+
+
+def create_toy_model(output_dir: str, num_layers: int = DEFAULT_NUM_LAYERS):
+    """Extract the first N layers from MiniMax-M2, dequantize FP8 -> bf16, and save."""
+    from huggingface_hub import snapshot_download
+    from safetensors.torch import load_file, save_file
+    from transformers import AutoConfig, AutoTokenizer
+
+    print(f"Creating sliced MiniMax-M2 ({num_layers} layers) with FP8 dequantization...")
+
+    config = AutoConfig.from_pretrained(HF_MODEL_ID, trust_remote_code=True)
+    print(f"  Original: {config.num_hidden_layers} layers, {config.num_local_experts} experts")
+
+    cache_dir = snapshot_download(HF_MODEL_ID, allow_patterns=["*.json", "*.safetensors"])
+    print(f"  Cache: {cache_dir}")
+
+    with open(os.path.join(cache_dir, "model.safetensors.index.json")) as f:
+        weight_map = json.load(f)["weight_map"]
+
+    layer_re = re.compile(r"^model\.layers\.(\d+)\.")
+
+    needed = set()
+    for key in weight_map:
+        m = layer_re.match(key)
+        if m is None or int(m.group(1)) < num_layers:
+            needed.add(key)
+
+    for key in list(needed):
+        sinv = key + "_scale_inv"
+        if sinv in weight_map:
+            needed.add(sinv)
+
+    files_to_keys = defaultdict(list)
+    for key in needed:
+        files_to_keys[weight_map[key]].append(key)
+
+    print(f"  Loading {len(needed)} tensors from {len(files_to_keys)} shard(s)...")
+    raw = {}
+    for fn, keys in sorted(files_to_keys.items()):
+        print(f"    {fn} ({len(keys)} tensors)")
+        data = load_file(os.path.join(cache_dir, fn))
+        for k in keys:
+            raw[k] = data[k]
+
+    state_dict = {}
+    n_dequant = 0
+    for key, t in raw.items():
+        if key.endswith("_scale_inv"):
+            continue
+        if t.dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
+            sinv_key = key + "_scale_inv"
+            if sinv_key in raw and t.ndim == 2:
+                t = _dequant_blockwise(t, raw[sinv_key])
+                n_dequant += 1
+            else:
+                t = t.float().to(torch.bfloat16)
+        state_dict[key] = t
+
+    print(f"  Dequantized {n_dequant} FP8 tensors (block {FP8_BLOCK_SIZE}x{FP8_BLOCK_SIZE})")
     os.makedirs(output_dir, exist_ok=True)
-    print(f"Creating toy MiniMax-M2 model at {output_dir} ...")
-    config = MiniMaxM2Config(**config_dict)
-    config.torch_dtype = torch.bfloat16
-    model = MiniMaxM2ForCausalLM(config).bfloat16()
+    cfg = config.__class__.from_dict(config.to_dict())
+    cfg.num_hidden_layers = num_layers
+    for attr in ("quantization_config", "auto_map"):
+        try:
+            delattr(cfg, attr)
+        except AttributeError:
+            pass
+    cfg.torch_dtype = "bfloat16"
+    if not hasattr(cfg, "rope_parameters") or cfg.rope_parameters is None:
+        cfg.rope_parameters = {
+            "rope_type": "default",
+            "rope_theta": getattr(cfg, "rope_theta", 10000.0),
+        }
+    cfg.save_pretrained(output_dir)
+
+    AutoTokenizer.from_pretrained(HF_MODEL_ID, trust_remote_code=True).save_pretrained(output_dir)
+
+    save_file(state_dict, os.path.join(output_dir, "model.safetensors"))
+
+    n_params = sum(t.numel() for t in state_dict.values())
+    print(f"  Params: {n_params:,} | Layers: {num_layers} | Experts: {config.num_local_experts}")
+    _verify_qk_norm(state_dict, cfg)
+    _print_shapes(state_dict)
+    print(f"  Saved to: {output_dir}")
+
-    model.save_pretrained(output_dir, safe_serialization=True)
-    tokenizer.save_pretrained(output_dir)
+def _verify_qk_norm(sd, cfg):
+    hd, nh, nkv = cfg.head_dim, cfg.num_attention_heads, cfg.num_key_value_heads
+    for i in range(cfg.num_hidden_layers):
+        qk = f"model.layers.{i}.self_attn.q_norm.weight"
+        kk = f"model.layers.{i}.self_attn.k_norm.weight"
+        assert qk in sd, f"Missing {qk}"
+        assert kk in sd, f"Missing {kk}"
+        assert sd[qk].shape[0] == nh * hd, f"q_norm shape {sd[qk].shape[0]} != {nh * hd}"
+        assert sd[kk].shape[0] == nkv * hd, f"k_norm shape {sd[kk].shape[0]} != {nkv * hd}"
+    print(f"  QK norm OK: q=[{nh}*{hd}={nh * hd}], k=[{nkv}*{hd}={nkv * hd}]")
-    config_path = os.path.join(output_dir, "config.json")
-    with open(config_path, "w") as f:
-        json.dump(config_dict, f, indent=2)
-    param_count = sum(p.numel() for p in model.parameters())
-    print(
-        f"  Params: {param_count:,} | Experts: {config_dict['num_local_experts']} | Top-K: {config_dict['num_experts_per_tok']}"
-    )
-    print(f"  Saved to: {output_dir}")
+def _print_shapes(sd):
+    print("  Key shapes (layer 0):")
+    for k in [
+        "model.layers.0.self_attn.q_proj.weight",
+        "model.layers.0.self_attn.q_norm.weight",
+        "model.layers.0.self_attn.k_norm.weight",
+        "model.layers.0.block_sparse_moe.gate.weight",
+        "model.layers.0.block_sparse_moe.experts.0.w1.weight",
+    ]:
+        if k in sd:
+            print(f"    {k}: {list(sd[k].shape)} {sd[k].dtype}")
 
 
 def run_compare(model_dir: str, tp: int, pp: int, ep: int):
-    """Run compare.py against the toy model."""
+    """Run compare.py to verify HF <-> Megatron round-trip conversion."""
     script = str(
         Path(__file__).resolve().parent.parent.parent / "conversion" / "compare_hf_and_megatron" / "compare.py"
     )
 
     nproc = tp * pp * ep
-    prompt = "Hello"
+    args = ["--hf_model_path", model_dir, "--prompt", "Hello", "--trust_remote_code"]
 
     if nproc == 1:
-        cmd = [
-            sys.executable,
-            script,
-            "--hf_model_path",
-            model_dir,
-            "--prompt",
-            prompt,
-        ]
+        cmd = [sys.executable, script] + args
     else:
-        cmd = [
-            sys.executable,
-            "-m",
-            "torch.distributed.run",
-            f"--nproc_per_node={nproc}",
-            script,
-            "--hf_model_path",
-            model_dir,
-            "--prompt",
-            prompt,
-            "--tp",
-            str(tp),
-            "--pp",
-            str(pp),
-            "--ep",
-            str(ep),
-        ]
+        cmd = (
+            [
+                sys.executable,
+                "-m",
+                "torch.distributed.run",
+                f"--nproc_per_node={nproc}",
+                script,
+            ]
+            + args
+            + ["--tp", str(tp), "--pp", str(pp), "--ep", str(ep)]
+        )
 
     print(f"\n{'=' * 60}")
     print(f"Running compare.py TP={tp} PP={pp} EP={ep} (nproc={nproc})")
@@ -146,28 +210,27 @@ def run_compare(model_dir: str, tp: int, pp: int, ep: int):
 
 
 def main():
-    """Verify MiniMax-M2 toy model conversion + forward pass."""
-    parser = argparse.ArgumentParser(description="Verify MiniMax-M2 toy model conversion + forward pass")
-    parser.add_argument("--tp", type=int, default=1, help="Tensor parallelism")
-    parser.add_argument("--pp", type=int, default=1, help="Pipeline parallelism")
-    parser.add_argument("--ep", type=int, default=1, help="Expert parallelism")
-    parser.add_argument(
-        "--model-dir",
-        type=str,
-        default=None,
-        help="Reuse an existing toy model directory instead of creating a new one",
-    )
+    """CLI entrypoint: create a toy model and run compare.py."""
+    parser = argparse.ArgumentParser(description="Verify MiniMax-M2 conversion with real weights")
+    parser.add_argument("--tp", type=int, default=1)
+    parser.add_argument("--pp", type=int, default=1)
+    parser.add_argument("--ep", type=int, default=1)
+    parser.add_argument("--num-layers", type=int, default=DEFAULT_NUM_LAYERS)
+    parser.add_argument("--model-dir", type=str, default=None)
+    parser.add_argument("--force-recreate", action="store_true")
     args = parser.parse_args()
 
-    if args.model_dir:
-        model_dir = args.model_dir
-    else:
-        model_dir = os.path.join(tempfile.gettempdir(), "minimax_m2_toy")
+    model_dir = args.model_dir or os.path.join(tempfile.gettempdir(), "minimax_m2_toy")
+
+    if args.force_recreate and os.path.exists(model_dir):
+        import shutil
+
+        shutil.rmtree(model_dir)
 
     if not os.path.exists(os.path.join(model_dir, "config.json")):
-        create_toy_model(model_dir)
+        create_toy_model(model_dir, num_layers=args.num_layers)
     else:
-        print(f"Reusing existing toy model at: {model_dir}")
+        print(f"Reusing existing model at: {model_dir}")
 
     run_compare(model_dir, args.tp, args.pp, args.ep)
 
diff --git a/src/megatron/bridge/models/minimax_m2/minimax_m2_bridge.py b/src/megatron/bridge/models/minimax_m2/minimax_m2_bridge.py
index 30e6158cd6..92016dd7a3 100644
--- a/src/megatron/bridge/models/minimax_m2/minimax_m2_bridge.py
+++ b/src/megatron/bridge/models/minimax_m2/minimax_m2_bridge.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from collections.abc import Mapping
+from typing import Dict, Optional
+
 import torch
+import torch.nn as nn
 from megatron.core.models.gpt.gpt_model import GPTModel
 
 from megatron.bridge.models.conversion.mapping_registry import MegatronMappingRegistry
@@ -20,8 +24,69 @@
 from megatron.bridge.models.conversion.param_mapping import (
     AutoMapping,
     GatedMLPMapping,
+    MegatronParamMapping,
     QKVMapping,
 )
+from megatron.bridge.models.minimax_m2.minimax_m2_provider import minimax_m2_layer_spec
+
+
+_FP8_BLOCK_SIZE = 128
+
+
+def _dequant_fp8_blockwise(weight: torch.Tensor, scale_inv: torch.Tensor) -> torch.Tensor:
+    """Block-wise FP8 dequantization: out = fp8_val * scale_inv per 128x128 block."""
+    M, N = weight.shape
+    B = _FP8_BLOCK_SIZE
+    w = weight.float()
+    out = torch.empty_like(w)
+    sM, sN = scale_inv.shape
+    for bi in range(sM):
+        for bj in range(sN):
+            r0, r1 = bi * B, min((bi + 1) * B, M)
+            c0, c1 = bj * B, min((bj + 1) * B, N)
+            out[r0:r1, c0:c1] = w[r0:r1, c0:c1] * scale_inv[bi, bj]
+    return out.to(torch.bfloat16)
+
+
+class _FullDimQKNormMapping(MegatronParamMapping[torch.Tensor]):
+    """TP-sharded mapping for full-dimension QK norm weights.
+
+    HF weight shape: ``[num_heads * head_dim]``
+    Megatron weight shape per rank: ``[num_heads_per_partition * head_dim]``
+
+    Uses broadcast-then-slice instead of ``scatter_to_tp_ranks`` because the
+    ``_FullDimRMSNorm`` module may reside on CPU / meta device where NCCL
+    scatter is not available.
+ """ + + def hf_to_megatron(self, hf_weights: torch.Tensor, megatron_module: nn.Module) -> torch.Tensor: + target_param = megatron_module.weight + shard_size = target_param.shape[0] + + if self.tp_size == 1: + return hf_weights.to(device=target_param.device, dtype=target_param.dtype) + + device = torch.device("cuda", torch.cuda.current_device()) + hf_weights = hf_weights.to(device=device, dtype=target_param.dtype) + + if self.tp_rank > 0: + hf_weights = torch.empty_like(hf_weights) + + full_weight = self.broadcast_tensor_to_tp_ranks(hf_weights, src_rank=0) + start = self.tp_rank * shard_size + return full_weight[start : start + shard_size] + + def megatron_to_hf( + self, megatron_weights: Optional[torch.Tensor], megatron_module: Optional[nn.Module] + ) -> Dict[str, torch.Tensor]: + megatron_weights = self.broadcast_from_pp_rank(megatron_weights, cache_key=str(self.hf_param)) + if megatron_weights is None: + return {} + megatron_weights = self.maybe_dequantize(megatron_weights) + if self.tp_size == 1: + return {str(self.hf_param): megatron_weights} + gathered = self.gather_from_tp_ranks(megatron_weights) + return {str(self.hf_param): torch.cat(gathered, dim=0)} @MegatronModelBridge.register_bridge( @@ -37,11 +102,16 @@ class MiniMaxM2Bridge(MegatronModelBridge): scoring and expert bias correction). HF weights use per-expert format with block_sparse_moe prefix (w1/w2/w3). + QK normalization: + MiniMax-M2 applies full-dimension RMSNorm to Q/K (weight shape = + num_heads * head_dim) before splitting into heads. Megatron's built-in + QK norm is per-head (weight shape = head_dim). This bridge uses a custom + layer spec (``minimax_m2_layer_spec``) with ``FullDimQNorm``/``FullDimKNorm`` + that normalizes over the full partition dimension. With TP > 1 the + sum-of-squares is all-reduced across TP ranks so the RMS denominator + matches the single-GPU case. + Known limitations: - - QK layernorm: MiniMax-M2 uses full-dimension QK norm (q_norm weight - shape = num_heads * head_dim), whereas Megatron uses per-head QK norm - (weight shape = head_dim). These are computationally different and - cannot be losslessly converted. QK norm is disabled in this bridge. - MTP (Multi-Token Prediction) modules are not mapped. Example: @@ -70,15 +140,11 @@ def provider_bridge(self, hf_pretrained): if rotary_dim is not None and head_dim is not None: provider.rotary_percent = rotary_dim / head_dim - # TODO: MiniMax-M2 uses full-dimension QK norm (q_norm weight shape = num_heads * head_dim) - # while Megatron uses per-head QK norm (weight shape = head_dim). These are - # mathematically different (different normalization denominators), so the weights - # cannot be losslessly converted. Disabled here, which means q_norm.weight and - # k_norm.weight are dropped during conversion. This is acceptable for fine-tuning - # but will cause forward-pass divergence from HF for inference. - # Fix: add full-dimension QK norm support to Megatron-Core, or write a custom - # layer spec for this model. + # Full-dimension QK norm via custom layer spec (see minimax_m2_provider.py). + # qk_layernorm stays False to avoid the default per-head TENorm; our custom + # spec injects FullDimQNorm/FullDimKNorm directly into SelfAttention. 
         provider.qk_layernorm = False
+        provider.transformer_layer_spec = minimax_m2_layer_spec
 
         # MoE settings — sigmoid routing with expert bias (same pattern as DeepSeek V3)
         provider.moe_grouped_gemm = True
@@ -92,6 +158,27 @@ def provider_bridge(self, hf_pretrained):
 
         return provider
 
+    def maybe_modify_loaded_hf_weight(
+        self, hf_param: str | dict[str, str], hf_state_dict: Mapping[str, torch.Tensor]
+    ) -> torch.Tensor | dict[str, torch.Tensor]:
+        """Load HF weights with FP8 block-wise dequantization when needed.
+
+        MiniMax-M2 stores linear weights as float8_e4m3fn with per-block scale
+        factors in ``_scale_inv`` tensors (128x128 blocks).
+        """
+        if isinstance(hf_param, dict):
+            return {k: self._load_and_dequant(v, hf_state_dict) for k, v in hf_param.items()}
+        return self._load_and_dequant(hf_param, hf_state_dict)
+
+    def _load_and_dequant(self, key: str, hf_state_dict: Mapping[str, torch.Tensor]) -> torch.Tensor:
+        w = hf_state_dict[key]
+        if w.dtype not in (torch.float8_e4m3fn, torch.float8_e5m2):
+            return w
+        sinv_key = key + "_scale_inv"
+        if w.ndim == 2 and sinv_key in hf_state_dict:
+            return _dequant_fp8_blockwise(w, hf_state_dict[sinv_key])
+        return w.float().to(torch.bfloat16)
+
     def mapping_registry(self) -> MegatronMappingRegistry:
         param_mappings = {
             # Global weights
@@ -112,6 +199,21 @@ def mapping_registry(self) -> MegatronMappingRegistry:
         for megatron_param, hf_param in param_mappings.items():
             mapping_list.append(AutoMapping(megatron_param=megatron_param, hf_param=hf_param))
 
+        # QK norm — FullDimQNorm/FullDimKNorm weight is [num_heads_per_partition * head_dim],
+        # which is a TP shard of the HF [num_heads * head_dim] weight.
+        mapping_list.append(
+            _FullDimQKNormMapping(
+                megatron_param="decoder.layers.*.self_attention.q_layernorm.weight",
+                hf_param="model.layers.*.self_attn.q_norm.weight",
+            )
+        )
+        mapping_list.append(
+            _FullDimQKNormMapping(
+                megatron_param="decoder.layers.*.self_attention.k_layernorm.weight",
+                hf_param="model.layers.*.self_attn.k_norm.weight",
+            )
+        )
+
         # QKV
         mapping_list.append(
             QKVMapping(
diff --git a/src/megatron/bridge/models/minimax_m2/minimax_m2_provider.py b/src/megatron/bridge/models/minimax_m2/minimax_m2_provider.py
new file mode 100644
index 0000000000..a149353eb5
--- /dev/null
+++ b/src/megatron/bridge/models/minimax_m2/minimax_m2_provider.py
@@ -0,0 +1,154 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""MiniMax-M2 custom layer spec with full-dimension QK normalization.
+
+MiniMax-M2 applies RMSNorm to the entire Q/K projection (weight shape =
+num_heads * head_dim) before splitting into heads. Megatron's built-in
+QK norm applies per-head (weight shape = head_dim). This module bridges
+the gap by applying full-partition-dimension RMSNorm inside the standard
+SelfAttention flow.
+""" + +from typing import Dict, Optional, Tuple + +import torch +import torch.distributed as dist +import torch.nn as nn +from megatron.core.transformer import ModuleSpec, TransformerConfig +from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint + + +class _FullDimRMSNorm(nn.Module): + """RMSNorm applied across all attention heads (full Q/K dimension). + + Standard per-head QK norm normalizes over ``head_dim`` independently per head. + This module normalizes over the *full* ``num_heads * head_dim`` dimension, + matching HuggingFace models that use ``nn.RMSNorm(num_heads * head_dim)`` on + the full Q/K vector before reshaping into heads. + + With TP > 1 each rank holds only ``num_heads_per_partition`` heads, so the + sum-of-squares is all-reduced across the TP group before computing the RMS. + This keeps the normalization denominator identical to the single-GPU case. + """ + + def __init__(self, local_dim: int, global_dim: int, tp_group_getter, eps: float = 1e-6): + super().__init__() + self.local_dim = local_dim + self.global_dim = global_dim + self._tp_group_getter = tp_group_getter + self.eps = eps + self.weight = nn.Parameter(torch.ones(local_dim)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # x: [sq, b, num_heads_per_partition, head_dim] + orig_shape = x.shape + x = x.reshape(*orig_shape[:2], -1) # [sq, b, local_dim] + dtype = x.dtype + x_fp32 = x.to(torch.float32) + + # sum-of-squares over the local partition + local_ss = x_fp32.pow(2).sum(-1, keepdim=True) # [sq, b, 1] + + # all-reduce across TP ranks so every rank sees the global sum-of-squares + tp_group = self._tp_group_getter() + if tp_group is not None and dist.get_world_size(tp_group) > 1: + dist.all_reduce(local_ss, op=dist.ReduceOp.SUM, group=tp_group) + + variance = local_ss / self.global_dim + x = x_fp32 * torch.rsqrt(variance + self.eps) + return (self.weight * x.to(dtype)).reshape(orig_shape) + + def sharded_state_dict( + self, + prefix: str = "", + sharded_offsets: Tuple[Tuple[int, int, int], ...] = (), + metadata: Optional[Dict] = None, + ) -> Dict[str, "ShardedTensor"]: # noqa: F821 + """Weight is TP-sharded along axis 0 (same as ColumnParallelLinear).""" + state_dict = self.state_dict(prefix="", keep_vars=True) + tp_group = self._tp_group_getter() + if metadata is None: + from megatron.core.transformer.utils import ensure_metadata_has_dp_cp_group + + metadata = ensure_metadata_has_dp_cp_group(metadata) + return make_sharded_tensors_for_checkpoint( + state_dict, + prefix, + {"weight": 0}, + sharded_offsets, + tp_group=tp_group, + dp_cp_group=metadata["dp_cp_group"], + ) + + +def _get_tp_group(): + """Lazy accessor for the TP process group (not available at module init time).""" + from megatron.core.parallel_state import get_tensor_model_parallel_group + + return get_tensor_model_parallel_group(check_initialized=False) + + +class FullDimQNorm: + """Factory callable that creates a full-dimension RMSNorm for Q heads. + + Passed as ``q_layernorm`` in the layer spec. The ``SelfAttention`` constructor + calls ``submodules.q_layernorm(hidden_size=head_dim, config=..., eps=...)``; + this factory ignores the per-head ``hidden_size`` and computes the correct + full partition dimension from ``config``. 
+ """ + + def __new__(cls, hidden_size: int, config: TransformerConfig, eps: float = 1e-6): + tp = config.tensor_model_parallel_size + num_heads = config.num_attention_heads + local_dim = (num_heads // tp) * hidden_size + global_dim = num_heads * hidden_size + return _FullDimRMSNorm(local_dim, global_dim, _get_tp_group, eps) + + +class FullDimKNorm: + """Factory callable that creates a full-dimension RMSNorm for K heads. + + Same as ``FullDimQNorm`` but uses ``num_query_groups`` (GQA key-value heads) + instead of ``num_attention_heads``. + """ + + def __new__(cls, hidden_size: int, config: TransformerConfig, eps: float = 1e-6): + tp = config.tensor_model_parallel_size + num_kv_heads = config.num_query_groups or config.num_attention_heads + local_dim = (num_kv_heads // tp) * hidden_size + global_dim = num_kv_heads * hidden_size + return _FullDimRMSNorm(local_dim, global_dim, _get_tp_group, eps) + + +def minimax_m2_layer_spec(config: "GPTModelProvider") -> ModuleSpec: # noqa: F821 + """Build a TE layer spec for MiniMax-M2 with full-dimension QK norm. + + Starts from the standard TE MoE spec (which handles grouped-gemm experts, + router, etc.) and replaces the per-head ``TENorm`` Q/K layernorm with + ``FullDimQNorm`` / ``FullDimKNorm``. + """ + from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_layer_with_transformer_engine_spec, + ) + + spec = get_gpt_layer_with_transformer_engine_spec( + num_experts=config.num_moe_experts, + moe_grouped_gemm=config.moe_grouped_gemm, + qk_layernorm=True, + ) + attn_sub = spec.submodules.self_attention.submodules + attn_sub.q_layernorm = FullDimQNorm + attn_sub.k_layernorm = FullDimKNorm + return spec