From 0b3834dc3c1ed76eae7ba48f2de01ef4cbb59b4a Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 20 Mar 2026 16:43:01 +0800 Subject: [PATCH 1/5] support multimodal MTP --- docs/source/BestPractices/Qwen3_5-Best-Practice.md | 1 - docs/source_en/BestPractices/Qwen3_5-Best-Practice.md | 1 - examples/models/qwen3_5/mcore_full.sh | 1 - examples/models/qwen3_5/packing.sh | 1 - swift/megatron/model/mm_gpt_model.py | 4 ---- 5 files changed, 8 deletions(-) diff --git a/docs/source/BestPractices/Qwen3_5-Best-Practice.md b/docs/source/BestPractices/Qwen3_5-Best-Practice.md index 68e84e27e8..9df312035a 100644 --- a/docs/source/BestPractices/Qwen3_5-Best-Practice.md +++ b/docs/source/BestPractices/Qwen3_5-Best-Practice.md @@ -309,7 +309,6 @@ swift infer \ Megatron-SWIFT训练Qwen3.5的提示: - 全参数训练:参考[这个例子](https://github.com/modelscope/ms-swift/tree/main/examples/models/qwen3_5/mcore_full.sh)。 -- 关于MTP训练:ms-swift暂不支持多模态MTP的训练。如果你只训练纯文本数据,请设置`SKIP_MULTIMODAL_MTP_VALIDATION=1`环境变量,忽略检查。 - TP 限制解除:使用 "megatron-core>=0.16" 可解除 TP 受到的 `num_query_groups` 限制。 - 默认 `GatedDeltaNet` 使用 transformers 实现(为保证稳定性,暂时保持默认行为不变)。使用 "megatron-core>=0.16"并设置环境变量 `SWIFT_USE_MCORE_GDN=1`可切换至 mcore 实现,支持 GDN 的 TP 并降低显存。 - padding_free/packing的支持:packing可以提升训练速度,你需要设置`SWIFT_USE_MCORE_GDN=1`环境变量。参考[这个例子](https://github.com/modelscope/ms-swift/tree/main/examples/models/qwen3_5/packing.sh)。 diff --git a/docs/source_en/BestPractices/Qwen3_5-Best-Practice.md b/docs/source_en/BestPractices/Qwen3_5-Best-Practice.md index 1c04c1c93f..48687daa50 100644 --- a/docs/source_en/BestPractices/Qwen3_5-Best-Practice.md +++ b/docs/source_en/BestPractices/Qwen3_5-Best-Practice.md @@ -307,7 +307,6 @@ swift infer \ Tips for training Qwen3.5 with Megatron-SWIFT: - Full parameter training: Refer to [this example](https://github.com/modelscope/ms-swift/tree/main/examples/models/qwen3_5/mcore_full.sh). -- Regarding MTP training: ms-swift currently does not support multimodal MTP training. If you are only training on pure text data, please set the `SKIP_MULTIMODAL_MTP_VALIDATION=1` environment variable to skip the validation check. - TP Limitation Removed: Using `megatron-core>=0.16` removes the `num_query_groups` limitation on TP. - By default, `GatedDeltaNet` uses the transformers implementation (to ensure stability, the default behavior remains unchanged for now). Using `megatron-core>=0.16` and setting the environment variable `SWIFT_USE_MCORE_GDN=1` switches to the mcore implementation, which supports TP for GDN and reduces memory usage. - Support for padding_free/packing: Packing can improve training speed. You need to set the `SWIFT_USE_MCORE_GDN=1` environment variable. Refer to [this example](https://github.com/modelscope/ms-swift/tree/main/examples/models/qwen3_5/packing.sh). diff --git a/examples/models/qwen3_5/mcore_full.sh b/examples/models/qwen3_5/mcore_full.sh index c54ba63a26..9cca80a4d9 100644 --- a/examples/models/qwen3_5/mcore_full.sh +++ b/examples/models/qwen3_5/mcore_full.sh @@ -5,7 +5,6 @@ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ MAX_PIXELS=1003520 \ VIDEO_MAX_PIXELS=50176 \ FPS_MAX_FRAMES=12 \ -SKIP_MULTIMODAL_MTP_VALIDATION=1 \ megatron sft \ --model Qwen/Qwen3.5-35B-A3B \ --save_safetensors true \ diff --git a/examples/models/qwen3_5/packing.sh b/examples/models/qwen3_5/packing.sh index 9686249234..4fee6e76a0 100644 --- a/examples/models/qwen3_5/packing.sh +++ b/examples/models/qwen3_5/packing.sh @@ -5,7 +5,6 @@ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ MAX_PIXELS=1003520 \ VIDEO_MAX_PIXELS=50176 \ FPS_MAX_FRAMES=12 \ -SKIP_MULTIMODAL_MTP_VALIDATION=1 \ SWIFT_USE_MCORE_GDN=1 \ megatron sft \ --model Qwen/Qwen3.5-35B-A3B \ diff --git a/swift/megatron/model/mm_gpt_model.py b/swift/megatron/model/mm_gpt_model.py index 962d7bb282..5f2dcbdf44 100644 --- a/swift/megatron/model/mm_gpt_model.py +++ b/swift/megatron/model/mm_gpt_model.py @@ -36,10 +36,6 @@ def __init__(self, self.share_embeddings_and_output_weights = self.language_model.share_embeddings_and_output_weights self.megatron_model_meta = get_megatron_model_meta(self.args.model_type) self.visual = None - if self.args.mtp_num_layers: - skip_validation = get_env_args('SKIP_MULTIMODAL_MTP_VALIDATION', bool, False) - if not skip_validation: - raise ValueError('MTP currently does not support multimodal models.') if pre_process and self.megatron_model_meta.visual_cls is not None: self.visual = self.megatron_model_meta.visual_cls(config) From 9e880028c8cb4a44ade2ba9df0182bfb31a1c78f Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 31 Mar 2026 11:05:30 +0800 Subject: [PATCH 2/5] fix qwen3_vl router_logits --- swift/model/models/qwen.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/swift/model/models/qwen.py b/swift/model/models/qwen.py index 9d88f1139b..1dd8805e45 100644 --- a/swift/model/models/qwen.py +++ b/swift/model/models/qwen.py @@ -1059,7 +1059,8 @@ def get_model(self, model_dir: str, config, processor, model_kwargs) -> PreTrain from transformers import Qwen3VLForConditionalGeneration self.auto_model_cls = self.auto_model_cls or Qwen3VLForConditionalGeneration model = super().get_model(model_dir, config, processor, model_kwargs) - _compat_qwen3_vl_mixed_data(model.model, processor) + is_moe = getattr(self, 'is_moe', False) + _compat_qwen3_vl_mixed_data(model.model, processor, is_moe=is_moe) return model @@ -1093,6 +1094,7 @@ def get_model(self, model_dir: str, config, processor, model_kwargs) -> PreTrain class Qwen3VLMoeLoader(Qwen3VLLoader): + is_moe = True def get_model(self, model_dir: str, config, processor, model_kwargs) -> PreTrainedModel: from transformers import Qwen3VLMoeForConditionalGeneration From e0fa5b235bd07b86d49258af70aa558b33138d01 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 5 Apr 2026 17:27:42 +0800 Subject: [PATCH 3/5] update --- docs/source/BestPractices/Qwen3_5-Best-Practice.md | 1 + docs/source/GetStarted/SWIFT-installation.md | 4 +++- docs/source/Megatron-SWIFT/Command-line-parameters.md | 1 + docs/source/Megatron-SWIFT/Quick-start.md | 2 +- docs/source_en/BestPractices/Qwen3_5-Best-Practice.md | 1 + docs/source_en/GetStarted/SWIFT-installation.md | 4 +++- docs/source_en/Megatron-SWIFT/Command-line-parameters.md | 1 + docs/source_en/Megatron-SWIFT/Quick-start.md | 2 +- swift/megatron/init.py | 2 +- 9 files changed, 13 insertions(+), 5 deletions(-) diff --git a/docs/source/BestPractices/Qwen3_5-Best-Practice.md b/docs/source/BestPractices/Qwen3_5-Best-Practice.md index 049524f157..2329bf8bbb 100644 --- a/docs/source/BestPractices/Qwen3_5-Best-Practice.md +++ b/docs/source/BestPractices/Qwen3_5-Best-Practice.md @@ -309,6 +309,7 @@ swift infer \ Megatron-SWIFT训练Qwen3.5的提示: - 全参数训练:参考[这个例子](https://github.com/modelscope/ms-swift/tree/main/examples/models/qwen3_5/mcore_full.sh)。 +- 关于MTP训练:"mcore-bridge>=1.1.0"支持了多模态MTP的训练(暂时需安装[main分支](https://github.com/modelscope/mcore-bridge/pull/14)),请安装对应版本。 - TP 限制解除:使用 "megatron-core>=0.16" 可解除 TP 受到的 `num_query_groups` 限制。 - 默认 `GatedDeltaNet` 使用 Megatron 实现,需使用 "megatron-core>=0.16"(ms-swift>=4.1.0,之前版本默认使用transformers实现)。设置环境变量 `USE_MCORE_GDN=0`可切换至 transformers 实现,transformers实现不支持packing和GDN的TP。 - padding_free/packing的支持:packing可以提升训练速度。参考[这个例子](https://github.com/modelscope/ms-swift/tree/main/examples/models/qwen3_5/packing.sh)。 diff --git a/docs/source/GetStarted/SWIFT-installation.md b/docs/source/GetStarted/SWIFT-installation.md index 29b4a4ff51..e47b5a6a1d 100644 --- a/docs/source/GetStarted/SWIFT-installation.md +++ b/docs/source/GetStarted/SWIFT-installation.md @@ -7,7 +7,9 @@ ```shell # 推荐 pip install 'ms-swift' -U -# 使用评测 +# 额外安装megatron依赖 +pip install 'ms-swift[megatron]' -U +# 额外安装评测依赖 pip install 'ms-swift[eval]' -U # 全能力 pip install 'ms-swift[all]' -U diff --git a/docs/source/Megatron-SWIFT/Command-line-parameters.md b/docs/source/Megatron-SWIFT/Command-line-parameters.md index d1a2b4e03d..e5aec46ec8 100644 --- a/docs/source/Megatron-SWIFT/Command-line-parameters.md +++ b/docs/source/Megatron-SWIFT/Command-line-parameters.md @@ -206,6 +206,7 @@ **MTP参数** - mtp_num_layers: 多token预测(MTP)层的数量。MTP将每个位置的预测范围扩展到多个未来token。此MTP实现使用D个顺序模块依次预测D个额外的token。默认为None。(需要"megatron-core>=0.14") - 注意:mtp_num_layers的值,将不自动从config.json获取,需手动设置。你可以参考config.json中的`num_nextn_predict_layers`字段填写该值。使用mcore-bridge时,将优先从safetensors文件中加载MTP权重,若无法找到,则进行随机初始化。(若要使用blockwise fp8 + mtp,请使用mcore>=0.15) + - 多模态MTP的支持: 选择安装"mcore-bridge>=1.1.0"。 - mtp_loss_scaling_factor: 多token预测(MTP)损失的缩放因子。我们计算所有深度上MTP损失的平均值,然后乘以该缩放因子得到总体MTP损失,它将作为一个额外的训练目标。默认为0.1。 **Tuner参数**: diff --git a/docs/source/Megatron-SWIFT/Quick-start.md b/docs/source/Megatron-SWIFT/Quick-start.md index 1b7407bb2f..b1cca1c1af 100644 --- a/docs/source/Megatron-SWIFT/Quick-start.md +++ b/docs/source/Megatron-SWIFT/Quick-start.md @@ -67,7 +67,7 @@ modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu2 | transformer-engine | >=2.3 | 2.12.0 | | | apex | | 0.1 | | | megatron-core | >=0.12,<0.17 | 0.16 | | -| mcore-bridge | >=1.0.1 | | | +| mcore-bridge | >=1.0.2 | | | | flash-attn | | 2.8.3/3.0.0b1 | | | transformers | >=4.33 | 4.57.6/5.2.0 | | | modelscope | >=1.23 | | | diff --git a/docs/source_en/BestPractices/Qwen3_5-Best-Practice.md b/docs/source_en/BestPractices/Qwen3_5-Best-Practice.md index 88f58d7a10..3d00899dfd 100644 --- a/docs/source_en/BestPractices/Qwen3_5-Best-Practice.md +++ b/docs/source_en/BestPractices/Qwen3_5-Best-Practice.md @@ -307,6 +307,7 @@ swift infer \ Tips for training Qwen3.5 with Megatron-SWIFT: - Full parameter training: Refer to [this example](https://github.com/modelscope/ms-swift/tree/main/examples/models/qwen3_5/mcore_full.sh). +- Regarding MTP training: `mcore-bridge>=1.1.0` supports multimodal MTP training (currently requires installing the [main branch](https://github.com/modelscope/mcore-bridge/pull/14)). Please install the corresponding version. - TP Limitation Removed: Using `megatron-core>=0.16` removes the `num_query_groups` limitation on TP. - By default, `GatedDeltaNet` uses the Megatron implementation, which requires "megatron-core>=0.16" (ms-swift>=4.1.0; previous versions defaulted to the transformers implementation). Set the environment variable `USE_MCORE_GDN=0` to switch to the transformers implementation. Note that the transformers implementation does not support packing and GDN's TP. - Support for padding_free/packing: Packing can improve training speed. Refer to [this example](https://github.com/modelscope/ms-swift/tree/main/examples/models/qwen3_5/packing.sh). diff --git a/docs/source_en/GetStarted/SWIFT-installation.md b/docs/source_en/GetStarted/SWIFT-installation.md index de2f8f5ae6..588d75e677 100644 --- a/docs/source_en/GetStarted/SWIFT-installation.md +++ b/docs/source_en/GetStarted/SWIFT-installation.md @@ -7,7 +7,9 @@ You can install it using pip: ```shell # recommend pip install 'ms-swift' -U -# For evaluation usage +# Install additional Megatron dependencies +pip install 'ms-swift[megatron]' -U +# Install additional evaluation dependencies pip install 'ms-swift[eval]' -U # Full capabilities pip install 'ms-swift[all]' -U diff --git a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md index 5a2de8ac4d..a740a070d5 100644 --- a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md +++ b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md @@ -218,6 +218,7 @@ For guidance on selecting parallelization strategies, please refer to the [Train **MTP Parameters** - mtp_num_layers: Number of Multi-Token Prediction (MTP) layers. MTP extends the prediction scope at each position to multiple future tokens. This MTP implementation uses D sequential modules to sequentially predict D additional tokens. Default is None. (requires "megatron-core>=0.14") - Note: The value of mtp_num_layers will not be automatically retrieved from config.json and must be set manually. You can refer to the `num_nextn_predict_layers` field in config.json to fill in this value. When using mcore-bridge, MTP weights will be loaded from safetensors files first. If not found, random initialization will be performed. (To use blockwise fp8 + mtp, please use mcore>=0.15) + - Multimodal MTP support: Optionally install "mcore-bridge>=1.1.0". - mtp_loss_scaling_factor: Scaling factor of Multi-Token Prediction (MTP) loss. We compute the average of MTP losses across all depths, then multiply it by this scaling factor to obtain the overall MTP loss, which serves as an additional training objective. Default is 0.1. **Tuner Parameters**: diff --git a/docs/source_en/Megatron-SWIFT/Quick-start.md b/docs/source_en/Megatron-SWIFT/Quick-start.md index 12e3eacda2..0f8a01fd24 100644 --- a/docs/source_en/Megatron-SWIFT/Quick-start.md +++ b/docs/source_en/Megatron-SWIFT/Quick-start.md @@ -67,7 +67,7 @@ Recommended Operating Environment: | transformer-engine | >=2.3 | 2.12.0 | | | apex | | 0.1 | | | megatron-core | >=0.12,<0.17 | 0.16 | | -| mcore-bridge | >=1.0.1 | | | +| mcore-bridge | >=1.0.2 | | | | flash-attn | | 2.8.3/3.0.0b1 | | | transformers | >=4.33 | 4.57.6/5.2.0 | | | modelscope | >=1.23 | | | diff --git a/swift/megatron/init.py b/swift/megatron/init.py index 4e01e45b3f..b44be31d32 100644 --- a/swift/megatron/init.py +++ b/swift/megatron/init.py @@ -139,7 +139,7 @@ def _new_load_inline(*args, **kwargs): def _patch_mcore_bridge(): - require_version('mcore-bridge>=1.0.1.dev', 'please install mcore-bridge via `pip install mcore-bridge -U`') + require_version('mcore-bridge>=1.0.2', 'please install mcore-bridge via `pip install mcore-bridge -U`') import mcore_bridge from mcore_bridge import GPTBridge logger.info(f'mcore_bridge.__version__: {mcore_bridge.__version__}') From 97cbb808a3efa68b4300554836e9986198e518ce Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 5 Apr 2026 17:32:07 +0800 Subject: [PATCH 4/5] update --- docs/source/Megatron-SWIFT/Command-line-parameters.md | 2 +- docs/source/Megatron-SWIFT/Quick-start.md | 9 +++++++-- docs/source_en/Megatron-SWIFT/Command-line-parameters.md | 2 +- docs/source_en/Megatron-SWIFT/Quick-start.md | 9 +++++++-- setup.py | 1 + 5 files changed, 17 insertions(+), 6 deletions(-) diff --git a/docs/source/Megatron-SWIFT/Command-line-parameters.md b/docs/source/Megatron-SWIFT/Command-line-parameters.md index e5aec46ec8..089e73ebd3 100644 --- a/docs/source/Megatron-SWIFT/Command-line-parameters.md +++ b/docs/source/Megatron-SWIFT/Command-line-parameters.md @@ -206,7 +206,7 @@ **MTP参数** - mtp_num_layers: 多token预测(MTP)层的数量。MTP将每个位置的预测范围扩展到多个未来token。此MTP实现使用D个顺序模块依次预测D个额外的token。默认为None。(需要"megatron-core>=0.14") - 注意:mtp_num_layers的值,将不自动从config.json获取,需手动设置。你可以参考config.json中的`num_nextn_predict_layers`字段填写该值。使用mcore-bridge时,将优先从safetensors文件中加载MTP权重,若无法找到,则进行随机初始化。(若要使用blockwise fp8 + mtp,请使用mcore>=0.15) - - 多模态MTP的支持: 选择安装"mcore-bridge>=1.1.0"。 + - 多模态MTP的支持: 需安装"mcore-bridge>=1.1.0"。 - mtp_loss_scaling_factor: 多token预测(MTP)损失的缩放因子。我们计算所有深度上MTP损失的平均值,然后乘以该缩放因子得到总体MTP损失,它将作为一个额外的训练目标。默认为0.1。 **Tuner参数**: diff --git a/docs/source/Megatron-SWIFT/Quick-start.md b/docs/source/Megatron-SWIFT/Quick-start.md index b1cca1c1af..803aa72b59 100644 --- a/docs/source/Megatron-SWIFT/Quick-start.md +++ b/docs/source/Megatron-SWIFT/Quick-start.md @@ -32,8 +32,13 @@ git clone https://github.com/NVIDIA/apex cd apex pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ -# mcore-bridge megatron-core -pip install "megatron-core==0.16.*" mcore-bridge -U +# mcore-bridge +pip install mcore-bridge -U +# 安装main分支 +# pip install git+https://github.com/modelscope/mcore-bridge.git + +# megatron-core +pip install "megatron-core==0.16.*" -U # 若使用多机训练,请额外设置`MODELSCOPE_CACHE`环境变量为共享存储路径 # 这将确保数据集缓存共享,而加速预处理速度。 diff --git a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md index a740a070d5..fab4d00ea3 100644 --- a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md +++ b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md @@ -218,7 +218,7 @@ For guidance on selecting parallelization strategies, please refer to the [Train **MTP Parameters** - mtp_num_layers: Number of Multi-Token Prediction (MTP) layers. MTP extends the prediction scope at each position to multiple future tokens. This MTP implementation uses D sequential modules to sequentially predict D additional tokens. Default is None. (requires "megatron-core>=0.14") - Note: The value of mtp_num_layers will not be automatically retrieved from config.json and must be set manually. You can refer to the `num_nextn_predict_layers` field in config.json to fill in this value. When using mcore-bridge, MTP weights will be loaded from safetensors files first. If not found, random initialization will be performed. (To use blockwise fp8 + mtp, please use mcore>=0.15) - - Multimodal MTP support: Optionally install "mcore-bridge>=1.1.0". + - Multimodal MTP support: Requires installing "mcore-bridge>=1.1.0". - mtp_loss_scaling_factor: Scaling factor of Multi-Token Prediction (MTP) loss. We compute the average of MTP losses across all depths, then multiply it by this scaling factor to obtain the overall MTP loss, which serves as an additional training objective. Default is 0.1. **Tuner Parameters**: diff --git a/docs/source_en/Megatron-SWIFT/Quick-start.md b/docs/source_en/Megatron-SWIFT/Quick-start.md index 0f8a01fd24..68693afcbd 100644 --- a/docs/source_en/Megatron-SWIFT/Quick-start.md +++ b/docs/source_en/Megatron-SWIFT/Quick-start.md @@ -31,8 +31,13 @@ git clone https://github.com/NVIDIA/apex cd apex pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ -# mcore-bridge megatron-core -pip install "megatron-core==0.16.*" mcore-bridge -U +# mcore-bridge +pip install mcore-bridge -U +# Install from main branch +# pip install git+https://github.com/modelscope/mcore-bridge.git + +# megatron-core +pip install "megatron-core==0.16.*" -U # If you are using multi-node training, please additionally set the `MODELSCOPE_CACHE` environment variable to a shared storage path. # This will ensure that the dataset cache is shared, thereby speeding up preprocessing. diff --git a/setup.py b/setup.py index a3a1c193a7..eab2091b04 100644 --- a/setup.py +++ b/setup.py @@ -120,6 +120,7 @@ def gen_packages_items(): install_requires, deps_link = parse_requirements('requirements.txt') extra_requires = {} all_requires = [] + extra_requires['megatron'], _ = parse_requirements('requirements/megatron.txt') extra_requires['eval'], _ = parse_requirements('requirements/eval.txt') extra_requires['swanlab'], _ = parse_requirements('requirements/swanlab.txt') extra_requires['ray'], _ = parse_requirements('requirements/ray.txt') From 5943443c53d56f61f1ad6c012c242f41c0d26834 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 5 Apr 2026 17:42:05 +0800 Subject: [PATCH 5/5] update --- requirements/megatron.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 requirements/megatron.txt diff --git a/requirements/megatron.txt b/requirements/megatron.txt new file mode 100644 index 0000000000..3b58956250 --- /dev/null +++ b/requirements/megatron.txt @@ -0,0 +1,3 @@ +mcore-bridge>=1.0.2 +megatron-core>=0.12 +peft>=0.15