From 238b9afe6bf818ca62607d3fc903d5839d3a1d6e Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 2 Mar 2026 10:06:36 -0800 Subject: [PATCH 1/9] fix test Signed-off-by: Chen Cui --- .../models/qwen_vl/test_qwen35_vl_bridge.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/tests/unit_tests/models/qwen_vl/test_qwen35_vl_bridge.py b/tests/unit_tests/models/qwen_vl/test_qwen35_vl_bridge.py index d9d8a52521..605e6627e1 100644 --- a/tests/unit_tests/models/qwen_vl/test_qwen35_vl_bridge.py +++ b/tests/unit_tests/models/qwen_vl/test_qwen35_vl_bridge.py @@ -33,7 +33,7 @@ def _make_dense_text_config(): """Create a mock text config matching Qwen3.5-27B dense architecture.""" - cfg = Mock() + cfg = Mock(spec=[]) cfg.num_hidden_layers = 64 cfg.hidden_size = 5120 cfg.intermediate_size = 17408 @@ -58,18 +58,14 @@ def _make_dense_text_config(): cfg.linear_num_value_heads = 48 cfg.bos_token_id = 248045 cfg.eos_token_id = 248044 - cfg.q_lora_rank = None - cfg.kv_lora_rank = None - cfg.qk_nope_head_dim = None - cfg.qk_rope_head_dim = None - cfg.v_head_dim = None cfg.num_nextn_predict_layers = None + cfg.torch_dtype = "bfloat16" return cfg def _make_moe_text_config(): """Create a mock text config matching Qwen3.5-397B-A17B MoE architecture.""" - cfg = Mock() + cfg = Mock(spec=[]) cfg.num_hidden_layers = 60 cfg.hidden_size = 4096 cfg.intermediate_size = 1024 @@ -98,12 +94,8 @@ def _make_moe_text_config(): cfg.shared_expert_intermediate_size = 4096 cfg.bos_token_id = 248045 cfg.eos_token_id = 248046 - cfg.q_lora_rank = None - cfg.kv_lora_rank = None - cfg.qk_nope_head_dim = None - cfg.qk_rope_head_dim = None - cfg.v_head_dim = None cfg.num_nextn_predict_layers = None + cfg.torch_dtype = "bfloat16" return cfg From cfd9d90f48cf4e5df1bfe278e0e09533a72a4239 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Wed, 4 Mar 2026 15:55:37 -0800 Subject: [PATCH 2/9] add 3 new dense models and training recipes Signed-off-by: Chen Cui --- examples/models/vlm/qwen35_vl/conversion.sh | 22 +- examples/models/vlm/qwen35_vl/inference.sh | 28 +- examples/models/vlm/qwen35_vl/slurm_peft.sh | 194 +++++ examples/models/vlm/qwen35_vl/slurm_sft.sh | 194 +++++ .../bridge/models/qwen_vl/qwen35_vl_bridge.py | 7 + .../models/qwen_vl/qwen35_vl_provider.py | 26 +- .../bridge/recipes/qwen_vl/__init__.py | 22 + .../bridge/recipes/qwen_vl/qwen35_vl.py | 691 ++++++++++++++++ .../test_qwen35_vl_recipes_finetune.py | 199 +++++ .../recipes/qwen_vl/test_qwen35_vl_recipes.py | 756 ++++++++++++++++++ 10 files changed, 2126 insertions(+), 13 deletions(-) create mode 100755 examples/models/vlm/qwen35_vl/slurm_peft.sh create mode 100644 examples/models/vlm/qwen35_vl/slurm_sft.sh create mode 100644 src/megatron/bridge/recipes/qwen_vl/qwen35_vl.py create mode 100644 tests/functional_tests/recipes/test_qwen35_vl_recipes_finetune.py create mode 100644 tests/unit_tests/recipes/qwen_vl/test_qwen35_vl_recipes.py diff --git a/examples/models/vlm/qwen35_vl/conversion.sh b/examples/models/vlm/qwen35_vl/conversion.sh index b7bcd54ad3..e24d6b1c3c 100755 --- a/examples/models/vlm/qwen35_vl/conversion.sh +++ b/examples/models/vlm/qwen35_vl/conversion.sh @@ -12,15 +12,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+set -e # Workspace directory for checkpoints and results WORKSPACE=${WORKSPACE:-/workspace} -MODEL_NAME=Qwen3.5-35B-A3B # Qwen3.5-35B-A3B, Qwen3.5-122B-A10B, Qwen3.5-397B-A17B, Qwen3.5-27B +# Supported model variants are: +# Qwen3.5-0.8B, Qwen3.5-2B, Qwen3.5-4B, Qwen3.5-9B, Qwen3.5-27B, Qwen3.5-35B-A3B, Qwen3.5-122B-A10B, Qwen3.5-397B-A17B +MODEL_NAME=Qwen3.5-35B-A3B -if [ "${MODEL_NAME}" = "Qwen3.5-27B" ]; then +if [ "${MODEL_NAME}" = "Qwen3.5-0.8B" ] || [ "${MODEL_NAME}" = "Qwen3.5-2B" ] || [ "${MODEL_NAME}" = "Qwen3.5-4B" ] || [ "${MODEL_NAME}" = "Qwen3.5-9B" ] || [ "${MODEL_NAME}" = "Qwen3.5-27B" ]; then HF_MODEL_CLASS="Qwen3_5ForConditionalGeneration" -else + EP=1 + PP=8 + TP=1 +elif [ "${MODEL_NAME}" = "Qwen3.5-35B-A3B" ] || [ "${MODEL_NAME}" = "Qwen3.5-122B-A10B" ] || [ "${MODEL_NAME}" = "Qwen3.5-397B-A17B" ]; then HF_MODEL_CLASS="Qwen3_5MoeForConditionalGeneration" + EP=8 + PP=1 + TP=1 +else + echo "Unsupported model variant: ${MODEL_NAME}" + exit 1 fi # Make sure to upgrade to transformers >= 5.2.0 @@ -39,7 +51,7 @@ uv run python -m torch.distributed.run --nproc_per_node=8 examples/conversion/co --model_class "${HF_MODEL_CLASS}" \ --image_path "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" \ --prompt "Describe this image." \ - --tp 1 --pp 1 --ep 8 + --tp ${TP} --pp ${PP} --ep ${EP} # Export Megatron → HF uv run python examples/conversion/convert_checkpoints.py export \ @@ -49,4 +61,4 @@ uv run python examples/conversion/convert_checkpoints.py export \ # Round-trip validation uv run python -m torch.distributed.run --nproc_per_node=8 examples/conversion/hf_megatron_roundtrip_multi_gpu.py \ - --hf-model-id Qwen/${MODEL_NAME} --tp 1 --pp 2 --ep 4 --trust-remote-code + --hf-model-id Qwen/${MODEL_NAME} --tp ${TP} --pp ${PP} --ep ${EP} diff --git a/examples/models/vlm/qwen35_vl/inference.sh b/examples/models/vlm/qwen35_vl/inference.sh index 17cfbec635..41e80720d7 100755 --- a/examples/models/vlm/qwen35_vl/inference.sh +++ b/examples/models/vlm/qwen35_vl/inference.sh @@ -13,9 +13,29 @@ # See the License for the specific language governing permissions and # limitations under the License. +set -e + # Workspace directory for checkpoints and results WORKSPACE=${WORKSPACE:-/workspace} -MODEL_NAME=Qwen3.5-35B-A3B # Qwen3.5-35B-A3B, Qwen3.5-122B-A10B, Qwen3.5-27B +# Set the model name to any of the supported dense or MoE Qwen3.5-VL models: +# Dense: Qwen3.5-0.8B, Qwen3.5-2B, Qwen3.5-4B, Qwen3.5-9B, Qwen3.5-27B +# MoE: Qwen3.5-35B-A3B, Qwen3.5-122B-A10B, Qwen3.5-397B-A17B +# For Qwen3.5-397B-A17B, please use the slurm_inference.sh script for multinode inference. +MODEL_NAME=Qwen3.5-35B-A3B + +# Set EP (Expert Parallelism) to 1 for dense models, 4 for MoE models +case "$MODEL_NAME" in + Qwen3.5-0.8B|Qwen3.5-2B|Qwen3.5-4B|Qwen3.5-9B|Qwen3.5-27B) + EP=1 + ;; + Qwen3.5-35B-A3B|Qwen3.5-122B-A10B|Qwen3.5-397B-A17B) + EP=4 + ;; + *) + echo "ERROR: Unknown model type for \$MODEL_NAME: $MODEL_NAME" + exit 1 + ;; +esac # Inference with Hugging Face checkpoints uv run python -m torch.distributed.run --nproc_per_node=8 examples/conversion/hf_to_megatron_generate_vlm.py \ @@ -23,7 +43,7 @@ uv run python -m torch.distributed.run --nproc_per_node=8 examples/conversion/hf --image_path "https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16/resolve/main/images/table.png" \ --prompt "Describe this image." 
\ --max_new_tokens 50 \ - --tp 2 --pp 2 --ep 4 + --tp 2 --pp 2 --ep ${EP} # Inference with imported Megatron checkpoints uv run python -m torch.distributed.run --nproc_per_node=8 examples/conversion/hf_to_megatron_generate_vlm.py \ @@ -32,7 +52,7 @@ uv run python -m torch.distributed.run --nproc_per_node=8 examples/conversion/hf --image_path "https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16/resolve/main/images/table.png" \ --prompt "Describe this image." \ --max_new_tokens 50 \ - --tp 2 --pp 2 --ep 4 + --tp 2 --pp 2 --ep ${EP} # Inference with exported HF checkpoints uv run python -m torch.distributed.run --nproc_per_node=8 examples/conversion/hf_to_megatron_generate_vlm.py \ @@ -40,4 +60,4 @@ uv run python -m torch.distributed.run --nproc_per_node=8 examples/conversion/hf --image_path "https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16/resolve/main/images/table.png" \ --prompt "Describe this image." \ --max_new_tokens 50 \ - --tp 2 --pp 2 --ep 4 + --tp 2 --pp 2 --ep ${EP} diff --git a/examples/models/vlm/qwen35_vl/slurm_peft.sh b/examples/models/vlm/qwen35_vl/slurm_peft.sh new file mode 100755 index 0000000000..f2b9b66791 --- /dev/null +++ b/examples/models/vlm/qwen35_vl/slurm_peft.sh @@ -0,0 +1,194 @@ +#!/bin/bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ============================================================================== +# Qwen3.5 VL Parameter-Efficient Fine-Tuning (PEFT) with LoRA +# +# Supports all Qwen3.5 VL models (dense and MoE). +# LoRA/DoRA significantly reduces memory requirements. 
+# +# Usage: +# sbatch slurm_peft.sh +# +# model: 0.8B | 2B | 4B | 9B | 27B | 35B-A3B | 122B-A10B | 397B-A17B +# +# Recommended parallelism (recipe defaults for LoRA): +# 0.8B (dense): TP=1, PP=1 (1 node) +# 2B (dense): TP=1, PP=1 (1 node) +# 4B (dense): TP=1, PP=1 (1 node) +# 9B (dense): TP=2, PP=1 (1 node) +# 27B (dense): TP=2, PP=1 (1 node) +# 35B-A3B (MoE): TP=2, PP=1, EP=4 (1 node) +# 122B-A10B (MoE): TP=2, PP=1, EP=8 (1 node) +# 397B-A17B (MoE): TP=2, PP=1, EP=32 (4 nodes) +# +# Examples: +# sbatch slurm_peft.sh 4B +# sbatch --nodes=4 slurm_peft.sh 397B-A17B +# ============================================================================== + +#SBATCH --job-name=qwen35vl-lora +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --gpus-per-node=8 +#SBATCH --time=08:00:00 +#SBATCH --partition=gpu +#SBATCH --account=my_account +#SBATCH --output=logs/qwen35vl_lora_%j.out +#SBATCH --error=logs/qwen35vl_lora_%j.err +#SBATCH --exclusive + +# ============================================================================== +# Parse arguments +# ============================================================================== + +MODEL_SIZE="${1:?Usage: sbatch $0 (model: 0.8B|2B|4B|9B|27B|35B-A3B|122B-A10B|397B-A17B)}" + +# Map model size to HF name and recipe +case "$MODEL_SIZE" in + 0.8B) + HF_MODEL_NAME="Qwen3.5-0.8B" + RECIPE="qwen35_vl_800m_finetune_config" + ;; + 2B) + HF_MODEL_NAME="Qwen3.5-2B" + RECIPE="qwen35_vl_2b_finetune_config" + ;; + 4B) + HF_MODEL_NAME="Qwen3.5-4B" + RECIPE="qwen35_vl_4b_finetune_config" + ;; + 9B) + HF_MODEL_NAME="Qwen3.5-9B" + RECIPE="qwen35_vl_9b_finetune_config" + ;; + 27B) + HF_MODEL_NAME="Qwen3.5-27B" + RECIPE="qwen35_vl_27b_finetune_config" + ;; + 35B-A3B) + HF_MODEL_NAME="Qwen3.5-35B-A3B" + RECIPE="qwen35_vl_35b_a3b_finetune_config" + ;; + 122B-A10B) + HF_MODEL_NAME="Qwen3.5-122B-A10B" + RECIPE="qwen35_vl_122b_a10b_finetune_config" + ;; + 397B-A17B) + HF_MODEL_NAME="Qwen3.5-397B-A17B" + RECIPE="qwen35_vl_397b_a17b_finetune_config" + ;; + *) + echo "ERROR: Unknown model '$MODEL_SIZE'. 
Must be one of: 0.8B, 2B, 4B, 9B, 27B, 35B-A3B, 122B-A10B, 397B-A17B" + exit 1 + ;; +esac + +# ============================================================================== +# CONFIGURATION +# ============================================================================== + +WORKSPACE=${WORKSPACE:-/workspace} + +PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen/${HF_MODEL_NAME} +DATASET_NAME=cord_v2 +SEQ_LENGTH=4096 +TRAIN_ITERS=500 +GLOBAL_BATCH_SIZE=32 +MICRO_BATCH_SIZE=1 +EVAL_ITERS=10 +LOG_INTERVAL=1 +WANDB_PROJECT=megatron-bridge-${DATASET_NAME} + +# Container image (required) +CONTAINER_IMAGE="" +# CONTAINER_IMAGE="/path/to/container.sqsh" + +# Container mounts (optional, space-separated) +CONTAINER_MOUNTS="" +# CONTAINER_MOUNTS="/data:/data /workspace:/workspace" + +# ============================================================================== +# Environment Setup +# ============================================================================== + +export TORCH_NCCL_AVOID_RECORD_STREAMS=1 +export NCCL_NVLS_ENABLE=0 + +# export UV_CACHE_DIR="/path/to/shared/uv_cache" +# export HF_HOME="/path/to/shared/HF_HOME" +# export HF_TOKEN="hf_your_token_here" +# export WANDB_API_KEY="your_wandb_key_here" +# export WANDB_MODE=disabled + +# ============================================================================== +# Job Execution +# ============================================================================== + +echo "======================================" +echo "Qwen3.5-VL LoRA Fine-Tuning Job" +echo "======================================" +echo "Job ID: $SLURM_JOB_ID" +echo "Nodes: $SLURM_JOB_NUM_NODES" +echo "GPUs per node: $SLURM_GPUS_PER_NODE" +echo "Model: $HF_MODEL_NAME" +echo "Recipe: $RECIPE" +echo "PEFT: LoRA" +echo "Checkpoint: $PRETRAINED_CHECKPOINT" +echo "======================================" + +mkdir -p logs + +CLI_OVERRIDES="\ + checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ + model.seq_length=$SEQ_LENGTH \ + train.train_iters=$TRAIN_ITERS \ + train.global_batch_size=$GLOBAL_BATCH_SIZE \ + train.micro_batch_size=$MICRO_BATCH_SIZE \ + train.eval_iters=$EVAL_ITERS \ + checkpoint.save=${WORKSPACE}/results/${RECIPE}_lora \ + logger.log_interval=$LOG_INTERVAL \ + logger.wandb_project=$WANDB_PROJECT \ + logger.wandb_exp_name=${RECIPE}_${DATASET_NAME}_lora \ + dataset.maker_name=make_${DATASET_NAME}_dataset \ + dataset.seq_length=$SEQ_LENGTH" + +CMD="uv run --no-sync python scripts/training/run_recipe.py \ + --recipe $RECIPE \ + --step_func vlm_step \ + --peft_scheme lora \ + $CLI_OVERRIDES" + +echo "Executing command..." +echo "======================================" + +if [ -z "$CONTAINER_IMAGE" ]; then + echo "ERROR: CONTAINER_IMAGE must be set. Please specify a valid container image." + exit 1 +fi + +SRUN_CMD="srun --mpi=pmix --container-image=$CONTAINER_IMAGE" + +if [ -n "$CONTAINER_MOUNTS" ]; then + for mount in $CONTAINER_MOUNTS; do + SRUN_CMD="$SRUN_CMD --container-mounts=$mount" + done +fi + +$SRUN_CMD bash -c "$CMD" + +echo "======================================" +echo "Job completed" +echo "======================================" diff --git a/examples/models/vlm/qwen35_vl/slurm_sft.sh b/examples/models/vlm/qwen35_vl/slurm_sft.sh new file mode 100644 index 0000000000..176990efd9 --- /dev/null +++ b/examples/models/vlm/qwen35_vl/slurm_sft.sh @@ -0,0 +1,194 @@ +#!/bin/bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ============================================================================== +# Qwen3.5 VL Full Supervised Fine-Tuning (SFT) +# +# Supports all Qwen3.5 VL models (dense and MoE). +# For smaller setups, use LoRA/DoRA instead (see slurm_peft.sh). +# +# Usage: +# sbatch slurm_sft.sh +# +# model: 0.8B | 2B | 4B | 9B | 27B | 35B-A3B | 122B-A10B | 397B-A17B +# +# Recommended parallelism (recipe defaults for full SFT): +# 0.8B (dense): TP=1, PP=1 (1 node) +# 2B (dense): TP=1, PP=1 (1 node) +# 4B (dense): TP=2, PP=1 (1 node) +# 9B (dense): TP=4, PP=1 (1 node) +# 27B (dense): TP=4, PP=4 (2 nodes) +# 35B-A3B (MoE): TP=2, PP=1, EP=16 (2 nodes) +# 122B-A10B (MoE): TP=2, PP=1, EP=32 (4 nodes) +# 397B-A17B (MoE): TP=2, PP=4, EP=32 (16 nodes) +# +# Examples: +# sbatch slurm_sft.sh 4B +# sbatch --nodes=2 slurm_sft.sh 27B +# sbatch --nodes=16 slurm_sft.sh 397B-A17B +# ============================================================================== + +#SBATCH --job-name=qwen35vl-sft +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --gpus-per-node=8 +#SBATCH --time=24:00:00 +#SBATCH --partition=gpu +#SBATCH --account=my_account +#SBATCH --output=logs/qwen35vl_sft_%j.out +#SBATCH --error=logs/qwen35vl_sft_%j.err +#SBATCH --exclusive + +# ============================================================================== +# Parse arguments +# ============================================================================== + +MODEL_SIZE="${1:?Usage: sbatch $0 (model: 0.8B|2B|4B|9B|27B|35B-A3B|122B-A10B|397B-A17B)}" + +# Map model size to HF name and recipe +case "$MODEL_SIZE" in + 0.8B) + HF_MODEL_NAME="Qwen3.5-0.8B" + RECIPE="qwen35_vl_800m_finetune_config" + ;; + 2B) + HF_MODEL_NAME="Qwen3.5-2B" + RECIPE="qwen35_vl_2b_finetune_config" + ;; + 4B) + HF_MODEL_NAME="Qwen3.5-4B" + RECIPE="qwen35_vl_4b_finetune_config" + ;; + 9B) + HF_MODEL_NAME="Qwen3.5-9B" + RECIPE="qwen35_vl_9b_finetune_config" + ;; + 27B) + HF_MODEL_NAME="Qwen3.5-27B" + RECIPE="qwen35_vl_27b_finetune_config" + ;; + 35B-A3B) + HF_MODEL_NAME="Qwen3.5-35B-A3B" + RECIPE="qwen35_vl_35b_a3b_finetune_config" + ;; + 122B-A10B) + HF_MODEL_NAME="Qwen3.5-122B-A10B" + RECIPE="qwen35_vl_122b_a10b_finetune_config" + ;; + 397B-A17B) + HF_MODEL_NAME="Qwen3.5-397B-A17B" + RECIPE="qwen35_vl_397b_a17b_finetune_config" + ;; + *) + echo "ERROR: Unknown model '$MODEL_SIZE'. 
Must be one of: 0.8B, 2B, 4B, 9B, 27B, 35B-A3B, 122B-A10B, 397B-A17B" + exit 1 + ;; +esac + +# ============================================================================== +# CONFIGURATION +# ============================================================================== + +WORKSPACE=${WORKSPACE:-/workspace} + +PRETRAINED_CHECKPOINT=${WORKSPACE}/models/Qwen/${HF_MODEL_NAME} +DATASET_NAME=cord_v2 +SEQ_LENGTH=4096 +TRAIN_ITERS=500 +GLOBAL_BATCH_SIZE=32 +MICRO_BATCH_SIZE=1 +EVAL_ITERS=10 +LOG_INTERVAL=1 +WANDB_PROJECT=megatron-bridge-${DATASET_NAME} + +# Container image (required) +CONTAINER_IMAGE="" +# CONTAINER_IMAGE="/path/to/container.sqsh" + +# Container mounts (optional, space-separated) +CONTAINER_MOUNTS="" +# CONTAINER_MOUNTS="/data:/data /workspace:/workspace" + +# ============================================================================== +# Environment Setup +# ============================================================================== + +export TORCH_NCCL_AVOID_RECORD_STREAMS=1 +export NCCL_NVLS_ENABLE=0 + +# export UV_CACHE_DIR="/path/to/shared/uv_cache" +# export HF_HOME="/path/to/shared/HF_HOME" +# export HF_TOKEN="hf_your_token_here" +# export WANDB_API_KEY="your_wandb_key_here" +# export WANDB_MODE=disabled + +# ============================================================================== +# Job Execution +# ============================================================================== + +echo "======================================" +echo "Qwen3.5-VL Full SFT Training Job" +echo "======================================" +echo "Job ID: $SLURM_JOB_ID" +echo "Nodes: $SLURM_JOB_NUM_NODES" +echo "GPUs per node: $SLURM_GPUS_PER_NODE" +echo "Total GPUs: $((SLURM_JOB_NUM_NODES * SLURM_GPUS_PER_NODE))" +echo "Model: $HF_MODEL_NAME" +echo "Recipe: $RECIPE" +echo "Checkpoint: $PRETRAINED_CHECKPOINT" +echo "======================================" + +mkdir -p logs + +CLI_OVERRIDES="\ + checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \ + model.seq_length=$SEQ_LENGTH \ + train.train_iters=$TRAIN_ITERS \ + train.global_batch_size=$GLOBAL_BATCH_SIZE \ + train.micro_batch_size=$MICRO_BATCH_SIZE \ + train.eval_iters=$EVAL_ITERS \ + checkpoint.save=${WORKSPACE}/results/${RECIPE}_sft \ + logger.log_interval=$LOG_INTERVAL \ + logger.wandb_project=$WANDB_PROJECT \ + logger.wandb_exp_name=${RECIPE}_${DATASET_NAME}_sft \ + dataset.maker_name=make_${DATASET_NAME}_dataset \ + dataset.seq_length=$SEQ_LENGTH" + +CMD="uv run --no-sync python scripts/training/run_recipe.py \ + --recipe $RECIPE \ + --step_func vlm_step \ + $CLI_OVERRIDES" + +echo "Executing command..." +echo "======================================" + +if [ -z "$CONTAINER_IMAGE" ]; then + echo "ERROR: CONTAINER_IMAGE must be set. Please specify a valid container image." 
+ exit 1 +fi + +SRUN_CMD="srun --mpi=pmix --container-image=$CONTAINER_IMAGE" + +if [ -n "$CONTAINER_MOUNTS" ]; then + for mount in $CONTAINER_MOUNTS; do + SRUN_CMD="$SRUN_CMD --container-mounts=$mount" + done +fi + +$SRUN_CMD bash -c "$CMD" + +echo "======================================" +echo "Job completed" +echo "======================================" diff --git a/src/megatron/bridge/models/qwen_vl/qwen35_vl_bridge.py b/src/megatron/bridge/models/qwen_vl/qwen35_vl_bridge.py index fd26808bc5..a2c9feb136 100644 --- a/src/megatron/bridge/models/qwen_vl/qwen35_vl_bridge.py +++ b/src/megatron/bridge/models/qwen_vl/qwen35_vl_bridge.py @@ -118,6 +118,9 @@ def provider_bridge(self, hf_pretrained: PreTrainedVLM) -> Qwen35VLMoEModelProvi provider = Qwen35VLMoEModelProvider(**provider_kwargs) + # For VLMs, tie_word_embeddings lives on the top-level config, not text_config. + provider.share_embeddings_and_output_weights = getattr(hf_config, "tie_word_embeddings", False) + # --- Common Qwen3 LLM settings --- provider.normalization = "RMSNorm" provider.gated_linear_unit = True @@ -438,6 +441,10 @@ def provider_bridge(self, hf_pretrained: PreTrainedVLM) -> Qwen35VLModelProvider provider = Qwen35VLModelProvider(**provider_kwargs) + # For VLMs, tie_word_embeddings lives on the top-level config, not text_config. + # text_config inherits PretrainedConfig's default of True which is wrong for 9B/27B. + provider.share_embeddings_and_output_weights = getattr(hf_config, "tie_word_embeddings", False) + # --- Common Qwen3 LLM settings --- provider.normalization = "RMSNorm" provider.gated_linear_unit = True diff --git a/src/megatron/bridge/models/qwen_vl/qwen35_vl_provider.py b/src/megatron/bridge/models/qwen_vl/qwen35_vl_provider.py index cac4042a24..458a4e81cb 100644 --- a/src/megatron/bridge/models/qwen_vl/qwen35_vl_provider.py +++ b/src/megatron/bridge/models/qwen_vl/qwen35_vl_provider.py @@ -179,11 +179,20 @@ def __post_init__(self): _check_qwen3_5_available() if self.vision_config is None: self.vision_config = Qwen3_5VisionConfig() + self.validate_parallelism() + super().__post_init__() + + def validate_parallelism(self): + """Validate that parallelism settings are compatible with this model's architecture. + + Call this after mutating parallelism attributes (e.g. tensor_model_parallel_size) + on an already-constructed provider, since __post_init__ only runs at construction time. + """ if self.num_query_groups < self.tensor_model_parallel_size: raise ValueError( - f"TP size {self.tensor_model_parallel_size} should be less than or equal to num_query_groups {self.num_query_groups}. Please use a smaller TP size." + f"TP size {self.tensor_model_parallel_size} should be less than or equal to " + f"num_query_groups {self.num_query_groups}. Please use a smaller TP size." ) - super().__post_init__() def provide(self, pre_process=None, post_process=None, vp_stage=None) -> Qwen3VLModel: """Provide a Qwen3.5 VL dense model instance with vision and language components.""" @@ -346,11 +355,20 @@ def __post_init__(self): _check_qwen3_5_moe_available() if self.vision_config is None: self.vision_config = Qwen3_5MoeVisionConfig() + self.validate_parallelism() + super().__post_init__() + + def validate_parallelism(self): + """Validate that parallelism settings are compatible with this model's architecture. + + Call this after mutating parallelism attributes (e.g. tensor_model_parallel_size) + on an already-constructed provider, since __post_init__ only runs at construction time. 
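+
+        A minimal sketch of the intended call pattern (illustrative only)::
+
+            provider.tensor_model_parallel_size = 2
+            provider.validate_parallelism()  # re-check TP against num_query_groups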
+ """ if self.num_query_groups < self.tensor_model_parallel_size: raise ValueError( - f"TP size {self.tensor_model_parallel_size} should be less than or equal to num_query_groups {self.num_query_groups}. Please use a smaller TP size." + f"TP size {self.tensor_model_parallel_size} should be less than or equal to " + f"num_query_groups {self.num_query_groups}. Please use a smaller TP size." ) - super().__post_init__() def provide(self, pre_process=None, post_process=None, vp_stage=None) -> Qwen3VLModel: """Provide a Qwen3.5 VL model instance with vision and language components. diff --git a/src/megatron/bridge/recipes/qwen_vl/__init__.py b/src/megatron/bridge/recipes/qwen_vl/__init__.py index afb6ac1048..99bacb6b4c 100644 --- a/src/megatron/bridge/recipes/qwen_vl/__init__.py +++ b/src/megatron/bridge/recipes/qwen_vl/__init__.py @@ -22,6 +22,18 @@ qwen3_vl_235b_a22b_pretrain_config, ) +# Qwen3.5 models +from .qwen35_vl import ( + qwen35_vl_2b_finetune_config, + qwen35_vl_4b_finetune_config, + qwen35_vl_9b_finetune_config, + qwen35_vl_27b_finetune_config, + qwen35_vl_35b_a3b_finetune_config, + qwen35_vl_122b_a10b_finetune_config, + qwen35_vl_397b_a17b_finetune_config, + qwen35_vl_800m_finetune_config, +) + __all__ = [ # Qwen3-VL pretrain configs @@ -32,4 +44,14 @@ "qwen3_vl_8b_finetune_config", "qwen3_vl_30b_a3b_finetune_config", "qwen3_vl_235b_a22b_finetune_config", + # Qwen3.5-VL finetune configs — dense (with PEFT support) + "qwen35_vl_800m_finetune_config", + "qwen35_vl_2b_finetune_config", + "qwen35_vl_4b_finetune_config", + "qwen35_vl_9b_finetune_config", + "qwen35_vl_27b_finetune_config", + # Qwen3.5-VL finetune configs — MoE (with PEFT support) + "qwen35_vl_35b_a3b_finetune_config", + "qwen35_vl_122b_a10b_finetune_config", + "qwen35_vl_397b_a17b_finetune_config", ] diff --git a/src/megatron/bridge/recipes/qwen_vl/qwen35_vl.py b/src/megatron/bridge/recipes/qwen_vl/qwen35_vl.py new file mode 100644 index 0000000000..15d60d2bf6 --- /dev/null +++ b/src/megatron/bridge/recipes/qwen_vl/qwen35_vl.py @@ -0,0 +1,691 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Fine-tuning recipes for Qwen3.5 Vision-Language Models. + +Qwen3.5 is a family of VLMs that combine a hybrid Gated DeltaNet (GDN) + Gated +Attention language model with a vision encoder. Two variants are supported: + +- **Dense** (e.g., Qwen3.5-27B): standard dense MLP +- **MoE** (e.g., Qwen3.5-35B-A3B, 122B-A10B, 397B-A17B): Mixture of Experts + with shared experts + +Each public function returns a ready-to-use :class:`ConfigContainer` for +fine-tuning. Pass ``peft="lora"`` for parameter-efficient +fine-tuning, or leave ``peft=None`` for full supervised fine-tuning (SFT). 
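+
+A typical invocation (illustrative; every exported recipe accepts the same
+``Qwen35VLCommonKwargs`` overrides)::
+
+    from megatron.bridge.recipes.qwen_vl import qwen35_vl_27b_finetune_config
+
+    # LoRA fine-tuning with the recipe's recommended defaults
+    cfg = qwen35_vl_27b_finetune_config(peft="lora", dataset_type="mock")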
+""" + +import os +from typing import List, Optional, Union + +import torch +from typing_extensions import TypedDict, Unpack + +from megatron.bridge import AutoBridge +from megatron.bridge.data.vlm_datasets import ( + HFDatasetConversationProvider, + MockVLMConversationProvider, + PreloadedVLMConversationProvider, +) +from megatron.bridge.peft.base import PEFT +from megatron.bridge.recipes.utils.finetune_utils import default_peft_config as _default_peft_config +from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing +from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE +from megatron.bridge.training.comm_overlap import CommOverlapConfig +from megatron.bridge.training.config import ( + CheckpointConfig, + ConfigContainer, + DatasetProvider, + DistributedDataParallelConfig, + LoggerConfig, + RNGConfig, + TokenizerConfig, + TrainingConfig, + ValidationConfig, +) +from megatron.bridge.training.mixed_precision import MixedPrecisionConfig, bf16_mixed + + +class Qwen35VLCommonKwargs(TypedDict, total=False): + """Typed options accepted by Qwen3.5 VL recipe helpers.""" + + # Core identifiers + hf_path: str + dir: Optional[str] + name: str + # Dataset configuration + train_data_path: Optional[List[str]] + valid_data_path: Optional[List[str]] + test_data_path: Optional[List[str]] + dataset_type: Optional[str] + image_folder: Optional[str] + tokenizer_model: Optional[str] + mock: bool + # Model configuration + tensor_model_parallel_size: int + pipeline_model_parallel_size: int + pipeline_dtype: Optional[torch.dtype] + virtual_pipeline_model_parallel_size: Optional[int] + context_parallel_size: int + expert_model_parallel_size: Optional[int] + expert_tensor_parallel_size: int + sequence_parallel: bool + use_megatron_fsdp: bool + enable_recompute: bool + account_for_embedding_in_pipeline_split: bool + account_for_loss_in_pipeline_split: bool + # Training hyperparameters + train_iters: int + global_batch_size: int + micro_batch_size: int + seq_length: int + lr: float + min_lr: float + lr_warmup_iters: int + lr_decay_iters: Optional[int] + eval_interval: int + save_interval: int + use_null_tokenizer: bool + # Precision / overlap configs + precision_config: Optional[Union[MixedPrecisionConfig, str]] + comm_overlap_config: Optional[CommOverlapConfig] + # Freeze options + pretrained_checkpoint: Optional[str] + freeze_language_model: bool + freeze_vision_model: bool + freeze_vision_projection: bool + # PEFT options + peft: Optional[Union[str, PEFT]] + finetune_lr: float + # W&B logging + wandb_project: Optional[str] + wandb_entity: Optional[str] + wandb_exp_name: Optional[str] + + +# --------------------------------------------------------------------------- +# Dense variant: Qwen3.5-800M +# --------------------------------------------------------------------------- + + +def qwen35_vl_800m_finetune_config(**user_kwargs: Unpack[Qwen35VLCommonKwargs]) -> ConfigContainer: + """Return a fine-tuning config for Qwen3.5-800M (dense). + + Default configuration: + - LoRA/DoRA: TP=1, PP=1 (1 node), LR=1e-4 + - Full SFT: TP=1, PP=1 (1 node), LR=5e-6 + + Note: num_kv_heads=2, so max TP=2. + + See `_qwen35_vl_common` for the full list of parameters. 
+ """ + peft_value = user_kwargs.get("peft", None) + is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") + + recommended_kwargs: Qwen35VLCommonKwargs = { + "hf_path": "Qwen/Qwen3.5-0.8B", + "tensor_model_parallel_size": 1, + "pipeline_model_parallel_size": 1, + "expert_model_parallel_size": 1, + "peft": peft_value, + "finetune_lr": 5e-6 if is_full_sft else 1e-4, + "freeze_language_model": False, + "freeze_vision_model": False, + "freeze_vision_projection": False, + "lr_warmup_iters": 200, + "micro_batch_size": 1, + "global_batch_size": 32, + } + combined_kwargs: Qwen35VLCommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen35_vl_common(**combined_kwargs) + + +# --------------------------------------------------------------------------- +# Dense variant: Qwen3.5-2B +# --------------------------------------------------------------------------- + + +def qwen35_vl_2b_finetune_config(**user_kwargs: Unpack[Qwen35VLCommonKwargs]) -> ConfigContainer: + """Return a fine-tuning config for Qwen3.5-2B (dense). + + Default configuration: + - LoRA/DoRA: TP=1, PP=1 (1 node), LR=1e-4 + - Full SFT: TP=1, PP=1 (1 node), LR=5e-6 + + Note: num_kv_heads=2, so max TP=2. + + See `_qwen35_vl_common` for the full list of parameters. + """ + peft_value = user_kwargs.get("peft", None) + is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") + + recommended_kwargs: Qwen35VLCommonKwargs = { + "hf_path": "Qwen/Qwen3.5-2B", + "tensor_model_parallel_size": 1, + "pipeline_model_parallel_size": 1, + "expert_model_parallel_size": 1, + "peft": peft_value, + "finetune_lr": 5e-6 if is_full_sft else 1e-4, + "freeze_language_model": False, + "freeze_vision_model": False, + "freeze_vision_projection": False, + "lr_warmup_iters": 200, + "micro_batch_size": 1, + "global_batch_size": 32, + } + combined_kwargs: Qwen35VLCommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen35_vl_common(**combined_kwargs) + + +# --------------------------------------------------------------------------- +# Dense variant: Qwen3.5-4B +# --------------------------------------------------------------------------- + + +def qwen35_vl_4b_finetune_config(**user_kwargs: Unpack[Qwen35VLCommonKwargs]) -> ConfigContainer: + """Return a fine-tuning config for Qwen3.5-4B (dense). + + Default configuration: + - LoRA/DoRA: TP=1, PP=1 (1 node), LR=1e-4 + - Full SFT: TP=2, PP=1 (1 node), LR=5e-6 + + Note: num_kv_heads=4, so max TP=4. + + See `_qwen35_vl_common` for the full list of parameters. 
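+
+    Example (illustrative; ``peft=None`` selects the full-SFT defaults)::
+
+        cfg = qwen35_vl_4b_finetune_config(peft=None, seq_length=8192)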
+ """ + peft_value = user_kwargs.get("peft", None) + is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") + + recommended_kwargs: Qwen35VLCommonKwargs = { + "hf_path": "Qwen/Qwen3.5-4B", + "tensor_model_parallel_size": 2 if is_full_sft else 1, + "pipeline_model_parallel_size": 1, + "expert_model_parallel_size": 1, + "peft": peft_value, + "finetune_lr": 5e-6 if is_full_sft else 1e-4, + "freeze_language_model": False, + "freeze_vision_model": False, + "freeze_vision_projection": False, + "lr_warmup_iters": 200, + "micro_batch_size": 1, + "global_batch_size": 32, + } + combined_kwargs: Qwen35VLCommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen35_vl_common(**combined_kwargs) + + +# --------------------------------------------------------------------------- +# Dense variant: Qwen3.5-9B +# --------------------------------------------------------------------------- + + +def qwen35_vl_9b_finetune_config(**user_kwargs: Unpack[Qwen35VLCommonKwargs]) -> ConfigContainer: + """Return a fine-tuning config for Qwen3.5-9B (dense). + + Default configuration: + - LoRA/DoRA: TP=1, PP=1 (1 node), LR=1e-4 + - Full SFT: TP=4, PP=1 (1 node), LR=5e-6 + + Note: num_kv_heads=4, so max TP=4. + + See `_qwen35_vl_common` for the full list of parameters. + """ + peft_value = user_kwargs.get("peft", None) + is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") + + recommended_kwargs: Qwen35VLCommonKwargs = { + "hf_path": "Qwen/Qwen3.5-9B", + "tensor_model_parallel_size": 4 if is_full_sft else 1, + "pipeline_model_parallel_size": 1, + "expert_model_parallel_size": 1, + "peft": peft_value, + "finetune_lr": 5e-6 if is_full_sft else 1e-4, + "freeze_language_model": False, + "freeze_vision_model": False, + "freeze_vision_projection": False, + "lr_warmup_iters": 200, + "micro_batch_size": 1, + "global_batch_size": 32, + } + combined_kwargs: Qwen35VLCommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen35_vl_common(**combined_kwargs) + + +# --------------------------------------------------------------------------- +# Dense variant: Qwen3.5-27B +# --------------------------------------------------------------------------- + + +def qwen35_vl_27b_finetune_config(**user_kwargs: Unpack[Qwen35VLCommonKwargs]) -> ConfigContainer: + """Return a fine-tuning config for Qwen3.5-27B (dense). + + Default configuration: + - LoRA/DoRA: TP=2, PP=1 (1 node), LR=1e-4 + - Full SFT: TP=4, PP=4 (2 nodes), LR=5e-6 + + See `_qwen35_vl_common` for the full list of parameters. 
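+
+    Example (illustrative; explicit overrides replace the recommended defaults)::
+
+        cfg = qwen35_vl_27b_finetune_config(
+            peft="lora",
+            tensor_model_parallel_size=4,  # override the LoRA default of TP=2
+        )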
+    """
+    peft_value = user_kwargs.get("peft", None)
+    is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none")
+
+    recommended_kwargs: Qwen35VLCommonKwargs = {
+        "hf_path": "Qwen/Qwen3.5-27B",
+        "tensor_model_parallel_size": 4 if is_full_sft else 2,
+        "pipeline_model_parallel_size": 4 if is_full_sft else 1,
+        "pipeline_dtype": torch.bfloat16 if is_full_sft else None,
+        "expert_model_parallel_size": 1,
+        "peft": peft_value,
+        "finetune_lr": 5e-6 if is_full_sft else 1e-4,
+        "freeze_language_model": False,
+        "freeze_vision_model": False,
+        "freeze_vision_projection": False,
+        "lr_warmup_iters": 200,
+        "micro_batch_size": 1,
+        "global_batch_size": 32,
+    }
+    combined_kwargs: Qwen35VLCommonKwargs = {**recommended_kwargs, **user_kwargs}
+    return _qwen35_vl_common(**combined_kwargs)
+
+
+# ---------------------------------------------------------------------------
+# MoE variant: Qwen3.5-35B-A3B
+# ---------------------------------------------------------------------------
+
+
+def qwen35_vl_35b_a3b_finetune_config(**user_kwargs: Unpack[Qwen35VLCommonKwargs]) -> ConfigContainer:
+    """Return a fine-tuning config for Qwen3.5-35B-A3B (MoE).
+
+    This is a small Mixture-of-Experts model. Expert parallelism (EP) is
+    recommended for efficient training.
+
+    Default configuration:
+    - LoRA/DoRA: TP=2, PP=1, EP=4 (1 node), LR=2e-4
+    - Full SFT: TP=2, PP=1, EP=16 (2 nodes), LR=2e-5
+
+    See `_qwen35_vl_common` for the full list of parameters.
+    """
+    peft_value = user_kwargs.get("peft", None)
+    is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none")
+
+    recommended_kwargs: Qwen35VLCommonKwargs = {
+        "hf_path": "Qwen/Qwen3.5-35B-A3B",
+        "tensor_model_parallel_size": 2,
+        "pipeline_model_parallel_size": 1,
+        "pipeline_dtype": torch.bfloat16,
+        "expert_model_parallel_size": 16 if is_full_sft else 4,
+        "expert_tensor_parallel_size": 1,
+        "peft": peft_value,
+        "finetune_lr": 2e-5 if is_full_sft else 2e-4,
+        "freeze_language_model": False,
+        "freeze_vision_model": False,
+        "freeze_vision_projection": False,
+        "min_lr": 2e-6 if is_full_sft else 1e-4,
+        "lr_warmup_iters": 200,
+        "micro_batch_size": 1,
+        "global_batch_size": 32,
+    }
+    combined_kwargs: Qwen35VLCommonKwargs = {**recommended_kwargs, **user_kwargs}
+    return _qwen35_vl_common(**combined_kwargs)
+
+
+# ---------------------------------------------------------------------------
+# MoE variant: Qwen3.5-122B-A10B
+# ---------------------------------------------------------------------------
+
+
+def qwen35_vl_122b_a10b_finetune_config(**user_kwargs: Unpack[Qwen35VLCommonKwargs]) -> ConfigContainer:
+    """Return a fine-tuning config for Qwen3.5-122B-A10B (MoE).
+
+    This is a medium-sized Mixture-of-Experts model. Expert parallelism (EP)
+    is recommended for efficient training.
+
+    Default configuration:
+    - LoRA/DoRA: TP=2, PP=1, EP=8 (1 node), LR=2e-4
+    - Full SFT: TP=2, PP=4, EP=8 (4 nodes), LR=2e-5
+
+    See `_qwen35_vl_common` for the full list of parameters.
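+
+    Example (illustrative; uses the preloaded conversation dataset with
+    placeholder paths)::
+
+        cfg = qwen35_vl_122b_a10b_finetune_config(
+            dataset_type="preloaded",
+            train_data_path=["/data/train.json"],  # placeholder path
+            image_folder="/data/images",           # placeholder path
+        )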
+    """
+    peft_value = user_kwargs.get("peft", None)
+    is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none")
+
+    recommended_kwargs: Qwen35VLCommonKwargs = {
+        "hf_path": "Qwen/Qwen3.5-122B-A10B",
+        "tensor_model_parallel_size": 2,
+        "pipeline_model_parallel_size": 4 if is_full_sft else 1,
+        "pipeline_dtype": torch.bfloat16,
+        "expert_model_parallel_size": 8,
+        "expert_tensor_parallel_size": 1,
+        "peft": peft_value,
+        "enable_recompute": is_full_sft,
+        "finetune_lr": 2e-5 if is_full_sft else 2e-4,
+        "freeze_language_model": False,
+        "freeze_vision_model": False,
+        "freeze_vision_projection": False,
+        "lr_warmup_iters": 200,
+        "micro_batch_size": 1,
+        "global_batch_size": 36,
+    }
+    combined_kwargs: Qwen35VLCommonKwargs = {**recommended_kwargs, **user_kwargs}
+    return _qwen35_vl_common(**combined_kwargs)
+
+
+# ---------------------------------------------------------------------------
+# MoE variant: Qwen3.5-397B-A17B
+# ---------------------------------------------------------------------------
+# NOTE: For multinode training, if you encounter a file-lock issue, replace ``hf_path`` with the
+# local path to the model, e.g. the hf_home/hub/models--Qwen--Qwen3.5-397B-A17B/snapshots/... directory.
+
+
+def qwen35_vl_397b_a17b_finetune_config(**user_kwargs: Unpack[Qwen35VLCommonKwargs]) -> ConfigContainer:
+    """Return a fine-tuning config for Qwen3.5-397B-A17B (MoE).
+
+    This is a Mixture-of-Experts model with 512 experts and top-10 routing.
+    Expert parallelism (EP) is recommended for efficient training.
+
+    Default configuration:
+    - LoRA/DoRA: TP=2, PP=1, EP=32 (4 nodes), LR=2e-4
+    - Full SFT: TP=2, PP=4, EP=32 (16 nodes), LR=2e-5
+
+    See `_qwen35_vl_common` for the full list of parameters.
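+
+    Example (illustrative; a local snapshot path avoids the HF cache
+    file-lock issue noted above)::
+
+        cfg = qwen35_vl_397b_a17b_finetune_config(
+            hf_path="/path/to/hf_home/hub/models--Qwen--Qwen3.5-397B-A17B/snapshots/...",
+        )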
+ """ + peft_value = user_kwargs.get("peft", None) + is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") + + recommended_kwargs: Qwen35VLCommonKwargs = { + "hf_path": "Qwen/Qwen3.5-397B-A17B", + "tensor_model_parallel_size": 2, + "pipeline_model_parallel_size": 4 if is_full_sft else 1, + "pipeline_dtype": torch.bfloat16, + "expert_model_parallel_size": 32, + "expert_tensor_parallel_size": 1, + "peft": peft_value, + "enable_recompute": is_full_sft, + "finetune_lr": 2e-5 if is_full_sft else 2e-4, + "freeze_language_model": False, + "freeze_vision_model": False, + "freeze_vision_projection": False, + "lr_warmup_iters": 200, + "micro_batch_size": 1, + "global_batch_size": 32, + } + combined_kwargs: Qwen35VLCommonKwargs = {**recommended_kwargs, **user_kwargs} + return _qwen35_vl_common(**combined_kwargs) + + +# --------------------------------------------------------------------------- +# Shared implementation +# --------------------------------------------------------------------------- + + +def _qwen35_vl_common( + hf_path: str, + dir: Optional[str] = None, + name: str = "qwen35_vl_finetune", + # Dataset configuration + train_data_path: Optional[List[str]] = None, + valid_data_path: Optional[List[str]] = None, + test_data_path: Optional[List[str]] = None, + dataset_type: Optional[str] = None, + image_folder: Optional[str] = None, + tokenizer_model: Optional[str] = None, + mock: bool = False, + # Model configuration + tensor_model_parallel_size: int = 4, + pipeline_model_parallel_size: int = 1, + pipeline_dtype: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_model_parallel_size: Optional[int] = None, + context_parallel_size: int = 1, + expert_model_parallel_size: Optional[int] = 1, + expert_tensor_parallel_size: int = 1, + sequence_parallel: bool = False, + use_megatron_fsdp: bool = False, + enable_recompute: bool = False, + account_for_embedding_in_pipeline_split: bool = False, + account_for_loss_in_pipeline_split: bool = False, + # Training hyperparameters + train_iters: int = 300000, + global_batch_size: int = 32, + micro_batch_size: int = 1, + seq_length: int = 4096, + lr: float = 3e-4, + min_lr: float = 3e-5, + lr_warmup_iters: int = 500, + lr_decay_iters: Optional[int] = None, + eval_interval: int = 500, + save_interval: int = 500, + use_null_tokenizer: bool = False, + # Precision recipe + precision_config: Optional[Union[MixedPrecisionConfig, str]] = None, + comm_overlap_config: Optional[CommOverlapConfig] = None, + # Freeze options + pretrained_checkpoint: Optional[str] = None, + freeze_language_model: bool = True, + freeze_vision_model: bool = True, + freeze_vision_projection: bool = False, + # PEFT options + peft: Optional[Union[str, PEFT]] = None, + finetune_lr: Optional[float] = None, + # W&B logging + wandb_project: Optional[str] = None, + wandb_entity: Optional[str] = None, + wandb_exp_name: Optional[str] = None, +) -> ConfigContainer: + """Create a fine-tuning configuration for Qwen3.5 VL models. + + Supports the dense (Qwen3.5-27B) and MoE (Qwen3.5-35B-A3B, + Qwen3.5-122B-A10B, Qwen3.5-397B-A17B) variants. The model + architecture is automatically determined from ``hf_path`` via + :class:`AutoBridge`. + + Args: + hf_path: HuggingFace model path. + dir: Base directory for logs and checkpoints. + name: Name of the training run. + train_data_path: Training data paths. + valid_data_path: Validation data paths. + test_data_path: Test data paths. + dataset_type: One of ``"mock"``, ``"hf"``, ``"preloaded"``. 
+ image_folder: Path to image folder (for preloaded datasets). + tokenizer_model: Path or HF name for the tokenizer/processor. + mock: If *True*, equivalent to ``dataset_type="mock"``. + tensor_model_parallel_size: Tensor parallelism degree. + pipeline_model_parallel_size: Pipeline parallelism degree. + pipeline_dtype: Data type for pipeline parallelism. + virtual_pipeline_model_parallel_size: Virtual pipeline parallelism. + context_parallel_size: Context parallelism degree. + expert_model_parallel_size: Expert parallelism degree (MoE). + expert_tensor_parallel_size: Expert tensor parallelism (MoE). + sequence_parallel: Whether to use sequence parallelism. + use_megatron_fsdp: Whether to use Megatron FSDP. + enable_recompute: Whether to enable activation recomputation. + account_for_embedding_in_pipeline_split: Account for embedding in PP split. + account_for_loss_in_pipeline_split: Account for loss in PP split. + train_iters: Total training iterations. + global_batch_size: Global batch size. + micro_batch_size: Micro batch size. + seq_length: Sequence length. + lr: Learning rate. + min_lr: Minimum learning rate for cosine decay. + lr_warmup_iters: Warmup iterations. + lr_decay_iters: LR decay iterations (defaults to *train_iters*). + eval_interval: Evaluation interval. + save_interval: Checkpoint save interval. + use_null_tokenizer: Use NullTokenizer instead of HuggingFace tokenizer. + precision_config: Precision configuration (default: bf16 mixed). + comm_overlap_config: Communication overlap configuration. + pretrained_checkpoint: Path to a pretrained checkpoint. + freeze_language_model: Freeze the language model weights. + freeze_vision_model: Freeze the vision encoder weights. + freeze_vision_projection: Freeze the vision projection weights. + peft: PEFT configuration (``"lora"``, ``"dora"``, or a PEFT object). + finetune_lr: Learning rate override for fine-tuning. + wandb_project: W&B project name. + wandb_entity: W&B entity name. + wandb_exp_name: W&B experiment name. + + Returns: + ConfigContainer ready for training. 
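+
+    Example (illustrative; the public recipes merge their recommended defaults
+    under any user overrides before delegating here)::
+
+        cfg = _qwen35_vl_common(
+            hf_path="Qwen/Qwen3.5-27B",
+            dataset_type="mock",
+            peft="lora",
+        )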
+ """ + base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") + run_output_dir = os.path.join(base_output_dir, name) + checkpoint_dir = os.path.join(run_output_dir, "checkpoints") + tensorboard_dir = os.path.join(run_output_dir, "tb_logs") + + bridge = AutoBridge.from_hf_pretrained(hf_path) + model_cfg = bridge.to_megatron_provider(load_weights=False) + model_cfg.tensor_model_parallel_size = tensor_model_parallel_size + model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size + model_cfg.pipeline_dtype = pipeline_dtype + model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size + model_cfg.context_parallel_size = context_parallel_size + model_cfg.expert_model_parallel_size = expert_model_parallel_size + model_cfg.expert_tensor_parallel_size = expert_tensor_parallel_size + if not sequence_parallel and tensor_model_parallel_size > 1 and (expert_model_parallel_size or 1) > 1: + sequence_parallel = True + model_cfg.sequence_parallel = sequence_parallel + model_cfg.freeze_language_model = freeze_language_model + model_cfg.freeze_vision_model = freeze_vision_model + model_cfg.freeze_vision_projection = freeze_vision_projection + model_cfg.seq_length = seq_length + + if precision_config is None: + precision_config = bf16_mixed() + + if account_for_embedding_in_pipeline_split: + model_cfg.account_for_embedding_in_pipeline_split = True + if account_for_loss_in_pipeline_split: + model_cfg.account_for_loss_in_pipeline_split = True + + if enable_recompute: + model_cfg.recompute_granularity = "full" + model_cfg.recompute_method = "uniform" + model_cfg.recompute_num_layers = 1 + + model_cfg.validate_parallelism() + + # Optimizer and scheduler + effective_lr = finetune_lr if finetune_lr is not None else lr + if min_lr > effective_lr: + min_lr = effective_lr * 0.1 + opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=lr_warmup_iters, + lr_decay_iters=lr_decay_iters if lr_decay_iters is not None else train_iters, + max_lr=effective_lr, + min_lr=min_lr, + ) + + peft_config = _default_peft_config(peft) + + # Dataset selection + _processor_model = tokenizer_model or hf_path + _dataset_choice = dataset_type or ("mock" if mock else "hf") + + if _dataset_choice == "mock": + dataset_cfg: DatasetProvider = MockVLMConversationProvider( + seq_length=seq_length, + hf_processor_path=_processor_model, + prompt="Describe this image.", + num_workers=1, + dataloader_type="single", + data_sharding=True, + pin_memory=True, + persistent_workers=False, + create_attention_mask=True, + pad_to_max_length=True, + ) + elif _dataset_choice == "preloaded": + dataset_cfg = PreloadedVLMConversationProvider( + seq_length=seq_length, + hf_processor_path=_processor_model, + train_data_path=train_data_path[0] if isinstance(train_data_path, list) else train_data_path, + valid_data_path=valid_data_path[0] if isinstance(valid_data_path, list) else valid_data_path, + test_data_path=test_data_path[0] if isinstance(test_data_path, list) else test_data_path, + image_folder=image_folder, + num_workers=2, + dataloader_type="single", + data_sharding=True, + pin_memory=True, + persistent_workers=False, + ) + elif _dataset_choice == "hf": + dataset_cfg = HFDatasetConversationProvider( + seq_length=seq_length, + hf_processor_path=_processor_model, + maker_name="make_cord_v2_dataset", + num_workers=2, + dataloader_type="single", + data_sharding=True, + pin_memory=True, + persistent_workers=False, + ) + else: + raise 
ValueError(f"Unsupported dataset_type '{_dataset_choice}'. Expected one of ['mock', 'preloaded', 'hf'].")
+
+    cfg = ConfigContainer(
+        model=model_cfg,
+        train=TrainingConfig(
+            train_iters=train_iters,
+            global_batch_size=global_batch_size,
+            micro_batch_size=micro_batch_size,
+            manual_gc=True,
+            manual_gc_interval=100,
+            manual_gc_eval=100,
+        ),
+        validation=ValidationConfig(
+            eval_interval=eval_interval,
+            eval_iters=32,
+        ),
+        optimizer=opt_config,
+        scheduler=scheduler,
+        ddp=DistributedDataParallelConfig(
+            check_for_nan_in_grad=True,
+            grad_reduce_in_fp32=True,
+            overlap_grad_reduce=False,
+            overlap_param_gather=False,
+            average_in_collective=True,
+            data_parallel_sharding_strategy="optim_grads_params",
+            use_distributed_optimizer=True,
+            use_megatron_fsdp=use_megatron_fsdp,
+        ),
+        dataset=dataset_cfg,
+        logger=LoggerConfig(
+            log_interval=10,
+            tensorboard_dir=tensorboard_dir,
+            log_timers_to_tensorboard=True,
+            wandb_project=wandb_project,
+            wandb_entity=wandb_entity,
+            wandb_exp_name=wandb_exp_name,
+        ),
+        tokenizer=TokenizerConfig(
+            tokenizer_type="NullTokenizer" if use_null_tokenizer else "HuggingFaceTokenizer",
+            tokenizer_model=hf_path if not use_null_tokenizer else None,
+            vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE if use_null_tokenizer else None,
+        ),
+        checkpoint=CheckpointConfig(
+            pretrained_checkpoint=pretrained_checkpoint,
+            save_interval=save_interval,
+            save=checkpoint_dir,
+            load=checkpoint_dir,
+            ckpt_format="torch_dist",
+            fully_parallel_save=True,
+        ),
+        rng=RNGConfig(seed=1234),
+        peft=peft_config,
+        comm_overlap=comm_overlap_config,
+        mixed_precision=precision_config,
+    )
+
+    return cfg
diff --git a/tests/functional_tests/recipes/test_qwen35_vl_recipes_finetune.py b/tests/functional_tests/recipes/test_qwen35_vl_recipes_finetune.py
new file mode 100644
index 0000000000..5d78889e77
--- /dev/null
+++ b/tests/functional_tests/recipes/test_qwen35_vl_recipes_finetune.py
@@ -0,0 +1,199 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Functional smoke tests for Qwen3.5-VL finetuning recipes.
+
+Covers four training scenarios:
+1. SFT with nothing frozen (all modules trainable)
+2. SFT with language model frozen (train vision + projection)
+3. SFT with vision + language frozen (train projection only)
+4.
SFT with activation recomputation + +Run with: + torchrun --nproc_per_node=2 -m pytest tests/functional_tests/recipes/test_qwen35_vl_recipes_finetune.py -v +""" + +import pytest + +from megatron.bridge.recipes.qwen_vl.qwen35_vl import qwen35_vl_27b_finetune_config +from tests.functional_tests.recipes.utils import run_pretrain_vl_recipe_test + + +_TP2_PP1 = {"tensor_model_parallel_size": 2, "pipeline_model_parallel_size": 1} +_TINY_MODEL = {"num_layers": 4} + + +# --------------------------------------------------------------------------- +# Scenario 1: SFT — nothing frozen +# --------------------------------------------------------------------------- + +QWEN35_VL_SFT_NONE_FROZEN = [ + ( + qwen35_vl_27b_finetune_config, + "qwen35_vl_27b_sft_none_frozen", + _TP2_PP1, + { + **_TINY_MODEL, + "freeze_language_model": False, + "freeze_vision_model": False, + "freeze_vision_projection": False, + }, + ), +] + +# --------------------------------------------------------------------------- +# Scenario 2: SFT — language model frozen +# --------------------------------------------------------------------------- + +QWEN35_VL_SFT_LM_FROZEN = [ + ( + qwen35_vl_27b_finetune_config, + "qwen35_vl_27b_sft_lm_frozen", + _TP2_PP1, + { + **_TINY_MODEL, + "freeze_language_model": True, + "freeze_vision_model": False, + "freeze_vision_projection": False, + }, + ), +] + +# --------------------------------------------------------------------------- +# Scenario 3: SFT — vision + language frozen (train projection only) +# --------------------------------------------------------------------------- + +QWEN35_VL_SFT_PROJ_ONLY = [ + ( + qwen35_vl_27b_finetune_config, + "qwen35_vl_27b_sft_projection_only", + _TP2_PP1, + { + **_TINY_MODEL, + "freeze_language_model": True, + "freeze_vision_model": True, + "freeze_vision_projection": False, + }, + ), +] + +# --------------------------------------------------------------------------- +# Scenario 4: SFT — activation recomputation +# --------------------------------------------------------------------------- + +QWEN35_VL_SFT_RECOMPUTE = [ + ( + qwen35_vl_27b_finetune_config, + "qwen35_vl_27b_sft_recompute", + _TP2_PP1, + { + **_TINY_MODEL, + "recompute_granularity": "full", + "recompute_method": "uniform", + "recompute_num_layers": 1, + }, + ), +] + + +class TestQwen35VLFinetuneRecipes: + """Functional tests covering SFT freeze combos and recompute.""" + + @pytest.fixture(autouse=True) + def _reset_microbatch_calculator(self): + """Ensure the global microbatch calculator is cleared between tests. + + If a previous test fails mid-pretrain, destroy_global_state() never + runs and the calculator leaks into the next test. 
+ """ + yield + from megatron.core.num_microbatches_calculator import ( + _GLOBAL_NUM_MICROBATCHES_CALCULATOR, + destroy_num_microbatches_calculator, + ) + + if _GLOBAL_NUM_MICROBATCHES_CALCULATOR is not None: + destroy_num_microbatches_calculator() + + # ----------------------------------------------------------------------- + # SFT scenarios + # ----------------------------------------------------------------------- + + @pytest.mark.run_only_on("GPU") + @pytest.mark.parametrize( + "config_func,recipe_name,parallelism_overrides,model_overrides", + QWEN35_VL_SFT_NONE_FROZEN, + ) + def test_sft_nothing_frozen(self, config_func, recipe_name, parallelism_overrides, model_overrides, tmp_path): + """Scenario 1: all modules trainable.""" + run_pretrain_vl_recipe_test( + config_func, + recipe_name, + tmp_path, + model_overrides=model_overrides, + **parallelism_overrides, + ) + + @pytest.mark.run_only_on("GPU") + @pytest.mark.parametrize( + "config_func,recipe_name,parallelism_overrides,model_overrides", + QWEN35_VL_SFT_LM_FROZEN, + ) + def test_sft_language_model_frozen( + self, config_func, recipe_name, parallelism_overrides, model_overrides, tmp_path + ): + """Scenario 2: language model frozen, train vision + projection.""" + run_pretrain_vl_recipe_test( + config_func, + recipe_name, + tmp_path, + model_overrides=model_overrides, + **parallelism_overrides, + ) + + @pytest.mark.run_only_on("GPU") + @pytest.mark.parametrize( + "config_func,recipe_name,parallelism_overrides,model_overrides", + QWEN35_VL_SFT_PROJ_ONLY, + ) + def test_sft_vision_and_language_frozen( + self, config_func, recipe_name, parallelism_overrides, model_overrides, tmp_path + ): + """Scenario 3: vision + language frozen, train projection only.""" + run_pretrain_vl_recipe_test( + config_func, + recipe_name, + tmp_path, + model_overrides=model_overrides, + **parallelism_overrides, + ) + + # ----------------------------------------------------------------------- + # Recompute + # ----------------------------------------------------------------------- + + @pytest.mark.run_only_on("GPU") + @pytest.mark.parametrize( + "config_func,recipe_name,parallelism_overrides,model_overrides", + QWEN35_VL_SFT_RECOMPUTE, + ) + def test_recompute(self, config_func, recipe_name, parallelism_overrides, model_overrides, tmp_path): + """SFT with activation recomputation.""" + run_pretrain_vl_recipe_test( + config_func, + recipe_name, + tmp_path, + model_overrides=model_overrides, + **parallelism_overrides, + ) diff --git a/tests/unit_tests/recipes/qwen_vl/test_qwen35_vl_recipes.py b/tests/unit_tests/recipes/qwen_vl/test_qwen35_vl_recipes.py new file mode 100644 index 0000000000..8a9effa514 --- /dev/null +++ b/tests/unit_tests/recipes/qwen_vl/test_qwen35_vl_recipes.py @@ -0,0 +1,756 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# Test purpose: +# - Parametrize over all exported Qwen3.5-VL recipe functions. 
+# - Monkeypatch AutoBridge and the provider to avoid I/O and heavy model init. +# - Build a config with small, safe overrides and assert it forms a valid ConfigContainer. +# - Verify dataset provider selection, parallelism fields, freeze options, and PEFT defaults. +# + +import importlib +from typing import Callable + +import pytest +import torch + + +_qwen35_vl_module = importlib.import_module("megatron.bridge.recipes.qwen_vl.qwen35_vl") +_QWEN35_VL_RECIPE_FUNCS = [ + _qwen35_vl_module.qwen35_vl_800m_finetune_config, + _qwen35_vl_module.qwen35_vl_2b_finetune_config, + _qwen35_vl_module.qwen35_vl_4b_finetune_config, + _qwen35_vl_module.qwen35_vl_9b_finetune_config, + _qwen35_vl_module.qwen35_vl_27b_finetune_config, + _qwen35_vl_module.qwen35_vl_35b_a3b_finetune_config, + _qwen35_vl_module.qwen35_vl_122b_a10b_finetune_config, + _qwen35_vl_module.qwen35_vl_397b_a17b_finetune_config, +] + + +def _safe_overrides_for(name: str) -> dict: + """Create safe test overrides for a given recipe function name.""" + overrides = { + "name": f"unit_{name}", + "dir": ".", + "dataset_type": "mock", + "train_iters": 10, + "global_batch_size": 2, + "micro_batch_size": 1, + "seq_length": 64, + "lr": 1e-4, + "min_lr": 1e-5, + "lr_warmup_iters": 2, + "tensor_model_parallel_size": 1, + "pipeline_model_parallel_size": 1, + "context_parallel_size": 1, + "expert_model_parallel_size": 1, + "use_null_tokenizer": True, + } + return overrides + + +class _FakeModelCfg: + """Fake model configuration for testing.""" + + def __init__(self): + self.tensor_model_parallel_size = 1 + self.pipeline_model_parallel_size = 1 + self.pipeline_dtype = None + self.virtual_pipeline_model_parallel_size = None + self.context_parallel_size = 1 + self.expert_model_parallel_size = 1 + self.expert_tensor_parallel_size = 1 + self.sequence_parallel = False + self.seq_length = 64 + self.freeze_language_model = False + self.freeze_vision_model = False + self.freeze_vision_projection = False + self.account_for_embedding_in_pipeline_split = False + self.account_for_loss_in_pipeline_split = False + self.recompute_granularity = None + self.recompute_method = None + self.recompute_num_layers = None + + def validate_parallelism(self): + return None + + def finalize(self): + return None + + +class _FakeAutoBridge: + """Fake AutoBridge for testing.""" + + @staticmethod + def from_hf_pretrained(hf_path: str, **kwargs): + return _FakeAutoBridge() + + def to_megatron_provider(self, load_weights: bool = False): + return _FakeModelCfg() + + +def _assert_basic_config(cfg): + """Assert that a config has all required components.""" + from megatron.bridge.training.config import ConfigContainer + + assert isinstance(cfg, ConfigContainer) + assert cfg.model is not None + assert cfg.train is not None + assert cfg.optimizer is not None + assert cfg.scheduler is not None + assert cfg.dataset is not None + assert cfg.logger is not None + assert cfg.tokenizer is not None + assert cfg.checkpoint is not None + assert cfg.rng is not None + + assert cfg.train.global_batch_size >= 1 + assert cfg.train.micro_batch_size >= 1 + + if hasattr(cfg.dataset, "seq_length"): + assert cfg.dataset.seq_length >= 1 + + +# --------------------------------------------------------------------------- +# Basic recipe building tests +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("recipe_func", _QWEN35_VL_RECIPE_FUNCS) +def test_each_qwen35_vl_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Each 
Qwen3.5-VL recipe function should build a valid ConfigContainer.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for(recipe_func.__name__) + cfg = recipe_func(**overrides) + + _assert_basic_config(cfg) + + assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 + assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 + assert hasattr(cfg.model, "freeze_language_model") + assert hasattr(cfg.model, "freeze_vision_model") + assert hasattr(cfg.model, "freeze_vision_projection") + + +# --------------------------------------------------------------------------- +# Dataset type selection +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("dataset_type", ["mock", "hf", "preloaded"]) +def test_qwen35_vl_dataset_type_selection(dataset_type: str, monkeypatch: pytest.MonkeyPatch): + """Different dataset_type values should produce the correct dataset provider.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") + overrides["dataset_type"] = dataset_type + + if dataset_type == "preloaded": + overrides["train_data_path"] = ["/fake/train.json"] + overrides["valid_data_path"] = ["/fake/valid.json"] + overrides["test_data_path"] = ["/fake/test.json"] + overrides["image_folder"] = "/fake/images" + + cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides) + + from megatron.bridge.data.vlm_datasets import ( + HFDatasetConversationProvider, + MockVLMConversationProvider, + PreloadedVLMConversationProvider, + ) + + if dataset_type == "mock": + assert isinstance(cfg.dataset, MockVLMConversationProvider) + elif dataset_type == "hf": + assert isinstance(cfg.dataset, HFDatasetConversationProvider) + elif dataset_type == "preloaded": + assert isinstance(cfg.dataset, PreloadedVLMConversationProvider) + + +# --------------------------------------------------------------------------- +# Training scenarios: SFT freeze combinations +# --------------------------------------------------------------------------- + + +def test_sft_nothing_frozen(monkeypatch: pytest.MonkeyPatch): + """Scenario 1: Full SFT with nothing frozen — all modules trainable.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") + overrides["peft"] = None + overrides["freeze_language_model"] = False + overrides["freeze_vision_model"] = False + overrides["freeze_vision_projection"] = False + + cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides) + + _assert_basic_config(cfg) + assert cfg.peft is None + assert cfg.model.freeze_language_model is False + assert cfg.model.freeze_vision_model is False + assert cfg.model.freeze_vision_projection is False + + +def test_sft_language_model_frozen(monkeypatch: pytest.MonkeyPatch): + """Scenario 2: SFT with language model frozen — train vision + projection.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") + overrides["peft"] = None + overrides["freeze_language_model"] = True + overrides["freeze_vision_model"] = False + overrides["freeze_vision_projection"] = False + + cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides) + + _assert_basic_config(cfg) + assert cfg.peft is None + assert cfg.model.freeze_language_model is True + assert cfg.model.freeze_vision_model is False + 
assert cfg.model.freeze_vision_projection is False
+
+
+def test_sft_vision_and_language_frozen(monkeypatch: pytest.MonkeyPatch):
+    """Scenario 3: SFT with vision + language frozen — train projection only."""
+    monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge)
+
+    overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config")
+    overrides["peft"] = None
+    overrides["freeze_language_model"] = True
+    overrides["freeze_vision_model"] = True
+    overrides["freeze_vision_projection"] = False
+
+    cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides)
+
+    _assert_basic_config(cfg)
+    assert cfg.peft is None
+    assert cfg.model.freeze_language_model is True
+    assert cfg.model.freeze_vision_model is True
+    assert cfg.model.freeze_vision_projection is False
+
+
+# ---------------------------------------------------------------------------
+# Training scenarios: PEFT + freeze combinations
+# ---------------------------------------------------------------------------
+
+
+def test_peft_lora_language_only(monkeypatch: pytest.MonkeyPatch):
+    """Scenario 4: LoRA adapters on all modules, all base weights frozen.
+
+    Default LoRA targets linear_qkv/proj/fc1/fc2 in both the vision and
+    language towers. The freeze flags gate base weights only; LoRA adapter
+    weights are always trainable regardless of them. With the language
+    model, vision model, and vision projection all frozen, only the adapter
+    deltas train, which is the standard adapters-only PEFT setup.
+    """
+    monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge)
+
+    overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config")
+    overrides["peft"] = "lora"
+    overrides["freeze_language_model"] = True
+    overrides["freeze_vision_model"] = True
+    overrides["freeze_vision_projection"] = True
+
+    cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides)
+
+    _assert_basic_config(cfg)
+    assert cfg.peft is not None
+    assert cfg.peft.dim == 32
+    assert cfg.model.freeze_language_model is True
+    assert cfg.model.freeze_vision_model is True
+    assert cfg.model.freeze_vision_projection is True
+
+
+def test_peft_lora_vision_and_language(monkeypatch: pytest.MonkeyPatch):
+    """Scenario 5: LoRA adapters with nothing frozen — adapters on all modules.
+
+    LoRA targets linear_qkv/proj/fc1/fc2 in both vision and language.
+    With nothing frozen, all base weights and all adapter weights are trainable.
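+    The default adapter size (dim=32) is asserted below.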
+ """ + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") + overrides["peft"] = "lora" + overrides["freeze_language_model"] = False + overrides["freeze_vision_model"] = False + overrides["freeze_vision_projection"] = False + + cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides) + + _assert_basic_config(cfg) + assert cfg.peft is not None + assert cfg.peft.dim == 32 + assert cfg.model.freeze_language_model is False + assert cfg.model.freeze_vision_model is False + assert cfg.model.freeze_vision_projection is False + + +# --------------------------------------------------------------------------- +# PEFT vs full SFT (parametrized across all recipes) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("recipe_func", _QWEN35_VL_RECIPE_FUNCS) +@pytest.mark.parametrize("peft", ["lora", "dora", None]) +def test_qwen35_vl_finetune_peft_vs_full_sft(recipe_func, peft, monkeypatch: pytest.MonkeyPatch): + """PEFT and full SFT configurations should be correctly applied.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for(recipe_func.__name__) + overrides["peft"] = peft + + cfg = recipe_func(**overrides) + + _assert_basic_config(cfg) + + if peft in ["lora", "dora"]: + assert cfg.peft is not None + assert hasattr(cfg.peft, "dim") + assert hasattr(cfg.peft, "alpha") + elif peft is None: + assert cfg.peft is None + + +# --------------------------------------------------------------------------- +# 800M dense defaults +# --------------------------------------------------------------------------- + + +def test_qwen35_vl_800m_lora_defaults(monkeypatch: pytest.MonkeyPatch): + """800M LoRA should have correct default parallelism and learning rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_800m_finetune_config") + overrides["peft"] = "lora" + overrides.pop("tensor_model_parallel_size", None) + overrides.pop("pipeline_model_parallel_size", None) + + cfg = _qwen35_vl_module.qwen35_vl_800m_finetune_config(**overrides) + + _assert_basic_config(cfg) + + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is not None + assert cfg.optimizer.lr == 1e-4 + + +def test_qwen35_vl_800m_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """800M full SFT should have correct default parallelism and learning rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_800m_finetune_config") + overrides["peft"] = None + overrides.pop("tensor_model_parallel_size", None) + overrides.pop("pipeline_model_parallel_size", None) + + cfg = _qwen35_vl_module.qwen35_vl_800m_finetune_config(**overrides) + + _assert_basic_config(cfg) + + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is None + assert cfg.optimizer.lr == 5e-6 + + +# --------------------------------------------------------------------------- +# 2B dense defaults +# --------------------------------------------------------------------------- + + +def test_qwen35_vl_2b_lora_defaults(monkeypatch: pytest.MonkeyPatch): + """2B LoRA should have correct default parallelism and learning rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = 
_safe_overrides_for("qwen35_vl_2b_finetune_config") + overrides["peft"] = "lora" + overrides.pop("tensor_model_parallel_size", None) + overrides.pop("pipeline_model_parallel_size", None) + + cfg = _qwen35_vl_module.qwen35_vl_2b_finetune_config(**overrides) + + _assert_basic_config(cfg) + + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is not None + assert cfg.optimizer.lr == 1e-4 + + +def test_qwen35_vl_2b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """2B full SFT should have correct default parallelism and learning rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_2b_finetune_config") + overrides["peft"] = None + overrides.pop("tensor_model_parallel_size", None) + overrides.pop("pipeline_model_parallel_size", None) + + cfg = _qwen35_vl_module.qwen35_vl_2b_finetune_config(**overrides) + + _assert_basic_config(cfg) + + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is None + assert cfg.optimizer.lr == 5e-6 + + +# --------------------------------------------------------------------------- +# 4B dense defaults +# --------------------------------------------------------------------------- + + +def test_qwen35_vl_4b_lora_defaults(monkeypatch: pytest.MonkeyPatch): + """4B LoRA should have correct default parallelism and learning rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_4b_finetune_config") + overrides["peft"] = "lora" + overrides.pop("tensor_model_parallel_size", None) + overrides.pop("pipeline_model_parallel_size", None) + + cfg = _qwen35_vl_module.qwen35_vl_4b_finetune_config(**overrides) + + _assert_basic_config(cfg) + + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is not None + assert cfg.optimizer.lr == 1e-4 + + +def test_qwen35_vl_4b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """4B full SFT should have correct default parallelism and learning rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_4b_finetune_config") + overrides["peft"] = None + overrides.pop("tensor_model_parallel_size", None) + overrides.pop("pipeline_model_parallel_size", None) + + cfg = _qwen35_vl_module.qwen35_vl_4b_finetune_config(**overrides) + + _assert_basic_config(cfg) + + assert cfg.model.tensor_model_parallel_size == 2 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is None + assert cfg.optimizer.lr == 5e-6 + + +# --------------------------------------------------------------------------- +# 9B dense defaults +# --------------------------------------------------------------------------- + + +def test_qwen35_vl_9b_lora_defaults(monkeypatch: pytest.MonkeyPatch): + """9B LoRA should have correct default parallelism and learning rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_9b_finetune_config") + overrides["peft"] = "lora" + overrides.pop("tensor_model_parallel_size", None) + overrides.pop("pipeline_model_parallel_size", None) + + cfg = _qwen35_vl_module.qwen35_vl_9b_finetune_config(**overrides) + + _assert_basic_config(cfg) + + assert cfg.model.tensor_model_parallel_size == 2 + assert cfg.model.pipeline_model_parallel_size == 1 + assert 
cfg.peft is not None + assert cfg.optimizer.lr == 1e-4 + + +def test_qwen35_vl_9b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """9B full SFT should have correct default parallelism and learning rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_9b_finetune_config") + overrides["peft"] = None + overrides.pop("tensor_model_parallel_size", None) + overrides.pop("pipeline_model_parallel_size", None) + + cfg = _qwen35_vl_module.qwen35_vl_9b_finetune_config(**overrides) + + _assert_basic_config(cfg) + + assert cfg.model.tensor_model_parallel_size == 4 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is None + assert cfg.optimizer.lr == 5e-6 + + +# --------------------------------------------------------------------------- +# 27B dense defaults +# --------------------------------------------------------------------------- + + +def test_qwen35_vl_27b_lora_defaults(monkeypatch: pytest.MonkeyPatch): + """27B LoRA should have correct default parallelism and learning rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") + overrides["peft"] = "lora" + overrides.pop("tensor_model_parallel_size", None) + overrides.pop("pipeline_model_parallel_size", None) + + cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides) + + _assert_basic_config(cfg) + + assert cfg.model.tensor_model_parallel_size == 2 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is not None + assert cfg.peft.dim == 32 + assert cfg.peft.alpha == 32 + assert cfg.optimizer.lr == 1e-4 + assert cfg.model.pipeline_dtype is None + + +def test_qwen35_vl_27b_dora_defaults(monkeypatch: pytest.MonkeyPatch): + """27B DoRA should have correct default parallelism and learning rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") + overrides["peft"] = "dora" + overrides.pop("tensor_model_parallel_size", None) + overrides.pop("pipeline_model_parallel_size", None) + + cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides) + + _assert_basic_config(cfg) + + assert cfg.model.tensor_model_parallel_size == 2 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is not None + assert cfg.peft.dim == 32 + assert cfg.peft.alpha == 64 + + +def test_qwen35_vl_27b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """27B full SFT should have correct default parallelism and learning rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") + overrides["peft"] = None + overrides.pop("tensor_model_parallel_size", None) + overrides.pop("pipeline_model_parallel_size", None) + + cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides) + + _assert_basic_config(cfg) + + assert cfg.model.tensor_model_parallel_size == 4 + assert cfg.model.pipeline_model_parallel_size == 4 + assert cfg.peft is None + assert cfg.optimizer.lr == 5e-6 + assert cfg.model.pipeline_dtype == torch.bfloat16 + + +# --------------------------------------------------------------------------- +# 35B-A3B MoE defaults +# --------------------------------------------------------------------------- + + +def test_qwen35_vl_35b_a3b_lora_defaults(monkeypatch: pytest.MonkeyPatch): + """35B-A3B LoRA should have correct default parallelism and learning 
rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_35b_a3b_finetune_config") + overrides["peft"] = "lora" + overrides.pop("tensor_model_parallel_size", None) + overrides.pop("pipeline_model_parallel_size", None) + overrides.pop("expert_model_parallel_size", None) + + cfg = _qwen35_vl_module.qwen35_vl_35b_a3b_finetune_config(**overrides) + + _assert_basic_config(cfg) + + assert cfg.model.tensor_model_parallel_size == 2 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.model.expert_model_parallel_size == 4 + assert cfg.peft is not None + assert cfg.optimizer.lr == 2e-4 + + +def test_qwen35_vl_35b_a3b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """35B-A3B full SFT should have correct default parallelism and learning rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_35b_a3b_finetune_config") + overrides["peft"] = None + overrides.pop("tensor_model_parallel_size", None) + overrides.pop("pipeline_model_parallel_size", None) + overrides.pop("expert_model_parallel_size", None) + + cfg = _qwen35_vl_module.qwen35_vl_35b_a3b_finetune_config(**overrides) + + _assert_basic_config(cfg) + + assert cfg.model.tensor_model_parallel_size == 2 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.model.expert_model_parallel_size == 16 + assert cfg.peft is None + assert cfg.optimizer.lr == 2e-5 + assert cfg.model.pipeline_dtype == torch.bfloat16 + + +# --------------------------------------------------------------------------- +# 122B-A10B MoE defaults +# --------------------------------------------------------------------------- + + +def test_qwen35_vl_122b_a10b_lora_defaults(monkeypatch: pytest.MonkeyPatch): + """122B-A10B LoRA should have correct default parallelism and learning rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_122b_a10b_finetune_config") + overrides["peft"] = "lora" + overrides.pop("tensor_model_parallel_size", None) + overrides.pop("pipeline_model_parallel_size", None) + overrides.pop("expert_model_parallel_size", None) + + cfg = _qwen35_vl_module.qwen35_vl_122b_a10b_finetune_config(**overrides) + + _assert_basic_config(cfg) + + assert cfg.model.tensor_model_parallel_size == 2 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.model.expert_model_parallel_size == 8 + assert cfg.peft is not None + assert cfg.optimizer.lr == 2e-4 + assert cfg.model.pipeline_dtype == torch.bfloat16 + + +def test_qwen35_vl_122b_a10b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """122B-A10B full SFT should have correct default parallelism and learning rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_122b_a10b_finetune_config") + overrides["peft"] = None + overrides.pop("tensor_model_parallel_size", None) + overrides.pop("pipeline_model_parallel_size", None) + overrides.pop("expert_model_parallel_size", None) + + cfg = _qwen35_vl_module.qwen35_vl_122b_a10b_finetune_config(**overrides) + + _assert_basic_config(cfg) + + assert cfg.model.tensor_model_parallel_size == 2 + assert cfg.model.pipeline_model_parallel_size == 6 + assert cfg.model.expert_model_parallel_size == 8 + assert cfg.peft is None + assert cfg.optimizer.lr == 2e-5 + assert cfg.model.pipeline_dtype == torch.bfloat16 + + +# 
--------------------------------------------------------------------------- +# 397B MoE defaults +# --------------------------------------------------------------------------- + + +def test_qwen35_vl_397b_a17b_lora_defaults(monkeypatch: pytest.MonkeyPatch): + """397B-A17B LoRA should have correct default parallelism and learning rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_397b_a17b_finetune_config") + overrides["peft"] = "lora" + overrides.pop("tensor_model_parallel_size", None) + overrides.pop("pipeline_model_parallel_size", None) + overrides.pop("expert_model_parallel_size", None) + + cfg = _qwen35_vl_module.qwen35_vl_397b_a17b_finetune_config(**overrides) + + _assert_basic_config(cfg) + + assert cfg.model.tensor_model_parallel_size == 2 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.model.expert_model_parallel_size == 32 + assert cfg.peft is not None + assert cfg.optimizer.lr == 2e-4 + + +def test_qwen35_vl_397b_a17b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """397B-A17B full SFT should have correct default parallelism and learning rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_397b_a17b_finetune_config") + overrides["peft"] = None + overrides.pop("tensor_model_parallel_size", None) + overrides.pop("pipeline_model_parallel_size", None) + overrides.pop("expert_model_parallel_size", None) + + cfg = _qwen35_vl_module.qwen35_vl_397b_a17b_finetune_config(**overrides) + + _assert_basic_config(cfg) + + assert cfg.model.tensor_model_parallel_size == 2 + assert cfg.model.pipeline_model_parallel_size == 4 + assert cfg.model.expert_model_parallel_size == 32 + assert cfg.peft is None + assert cfg.optimizer.lr == 2e-5 + assert cfg.model.pipeline_dtype == torch.bfloat16 + + +# --------------------------------------------------------------------------- +# Custom overrides +# --------------------------------------------------------------------------- + + +def test_qwen35_vl_custom_finetune_lr(monkeypatch: pytest.MonkeyPatch): + """Custom finetune_lr should override default learning rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") + overrides["peft"] = "lora" + overrides["finetune_lr"] = 2e-4 + + cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides) + + _assert_basic_config(cfg) + assert cfg.optimizer.lr == 2e-4 + + +def test_qwen35_vl_recompute_option(monkeypatch: pytest.MonkeyPatch): + """enable_recompute should set recompute fields on the model config.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") + overrides["enable_recompute"] = True + + cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides) + + _assert_basic_config(cfg) + assert cfg.model.recompute_granularity == "full" + assert cfg.model.recompute_method == "uniform" + assert cfg.model.recompute_num_layers == 1 + + +def test_qwen35_vl_invalid_dataset_type(monkeypatch: pytest.MonkeyPatch): + """An unsupported dataset_type should raise ValueError.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") + overrides["dataset_type"] = "unsupported" + + with pytest.raises(ValueError, match="Unsupported dataset_type"): + 
_qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides) From 935c40c83adf5fcf9c747cc5857b2a83739176fb Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Wed, 4 Mar 2026 22:42:46 -0800 Subject: [PATCH 3/9] recipe refactor Signed-off-by: Chen Cui --- examples/models/vlm/qwen35_vl/slurm_peft.sh | 16 +- examples/models/vlm/qwen35_vl/slurm_sft.sh | 16 +- .../bridge/recipes/qwen_vl/__init__.py | 62 +- .../bridge/recipes/qwen_vl/qwen35_vl.py | 2399 ++++++++++++----- .../test_qwen35_vl_recipes_finetune.py | 10 +- .../recipes/qwen_vl/test_qwen35_vl_recipes.py | 727 +++-- 6 files changed, 2117 insertions(+), 1113 deletions(-) diff --git a/examples/models/vlm/qwen35_vl/slurm_peft.sh b/examples/models/vlm/qwen35_vl/slurm_peft.sh index f2b9b66791..fd2abb2ec1 100755 --- a/examples/models/vlm/qwen35_vl/slurm_peft.sh +++ b/examples/models/vlm/qwen35_vl/slurm_peft.sh @@ -60,35 +60,35 @@ MODEL_SIZE="${1:?Usage: sbatch $0 (model: 0.8B|2B|4B|9B|27B|35B-A3B|122 case "$MODEL_SIZE" in 0.8B) HF_MODEL_NAME="Qwen3.5-0.8B" - RECIPE="qwen35_vl_800m_finetune_config" + RECIPE="qwen35_vl_800m_peft_config" ;; 2B) HF_MODEL_NAME="Qwen3.5-2B" - RECIPE="qwen35_vl_2b_finetune_config" + RECIPE="qwen35_vl_2b_peft_config" ;; 4B) HF_MODEL_NAME="Qwen3.5-4B" - RECIPE="qwen35_vl_4b_finetune_config" + RECIPE="qwen35_vl_4b_peft_config" ;; 9B) HF_MODEL_NAME="Qwen3.5-9B" - RECIPE="qwen35_vl_9b_finetune_config" + RECIPE="qwen35_vl_9b_peft_config" ;; 27B) HF_MODEL_NAME="Qwen3.5-27B" - RECIPE="qwen35_vl_27b_finetune_config" + RECIPE="qwen35_vl_27b_peft_config" ;; 35B-A3B) HF_MODEL_NAME="Qwen3.5-35B-A3B" - RECIPE="qwen35_vl_35b_a3b_finetune_config" + RECIPE="qwen35_vl_35b_a3b_peft_config" ;; 122B-A10B) HF_MODEL_NAME="Qwen3.5-122B-A10B" - RECIPE="qwen35_vl_122b_a10b_finetune_config" + RECIPE="qwen35_vl_122b_a10b_peft_config" ;; 397B-A17B) HF_MODEL_NAME="Qwen3.5-397B-A17B" - RECIPE="qwen35_vl_397b_a17b_finetune_config" + RECIPE="qwen35_vl_397b_a17b_peft_config" ;; *) echo "ERROR: Unknown model '$MODEL_SIZE'. Must be one of: 0.8B, 2B, 4B, 9B, 27B, 35B-A3B, 122B-A10B, 397B-A17B" diff --git a/examples/models/vlm/qwen35_vl/slurm_sft.sh b/examples/models/vlm/qwen35_vl/slurm_sft.sh index 176990efd9..60eb3176e7 100644 --- a/examples/models/vlm/qwen35_vl/slurm_sft.sh +++ b/examples/models/vlm/qwen35_vl/slurm_sft.sh @@ -61,35 +61,35 @@ MODEL_SIZE="${1:?Usage: sbatch $0 (model: 0.8B|2B|4B|9B|27B|35B-A3B|122 case "$MODEL_SIZE" in 0.8B) HF_MODEL_NAME="Qwen3.5-0.8B" - RECIPE="qwen35_vl_800m_finetune_config" + RECIPE="qwen35_vl_800m_sft_config" ;; 2B) HF_MODEL_NAME="Qwen3.5-2B" - RECIPE="qwen35_vl_2b_finetune_config" + RECIPE="qwen35_vl_2b_sft_config" ;; 4B) HF_MODEL_NAME="Qwen3.5-4B" - RECIPE="qwen35_vl_4b_finetune_config" + RECIPE="qwen35_vl_4b_sft_config" ;; 9B) HF_MODEL_NAME="Qwen3.5-9B" - RECIPE="qwen35_vl_9b_finetune_config" + RECIPE="qwen35_vl_9b_sft_config" ;; 27B) HF_MODEL_NAME="Qwen3.5-27B" - RECIPE="qwen35_vl_27b_finetune_config" + RECIPE="qwen35_vl_27b_sft_config" ;; 35B-A3B) HF_MODEL_NAME="Qwen3.5-35B-A3B" - RECIPE="qwen35_vl_35b_a3b_finetune_config" + RECIPE="qwen35_vl_35b_a3b_sft_config" ;; 122B-A10B) HF_MODEL_NAME="Qwen3.5-122B-A10B" - RECIPE="qwen35_vl_122b_a10b_finetune_config" + RECIPE="qwen35_vl_122b_a10b_sft_config" ;; 397B-A17B) HF_MODEL_NAME="Qwen3.5-397B-A17B" - RECIPE="qwen35_vl_397b_a17b_finetune_config" + RECIPE="qwen35_vl_397b_a17b_sft_config" ;; *) echo "ERROR: Unknown model '$MODEL_SIZE'. 
Must be one of: 0.8B, 2B, 4B, 9B, 27B, 35B-A3B, 122B-A10B, 397B-A17B" diff --git a/src/megatron/bridge/recipes/qwen_vl/__init__.py b/src/megatron/bridge/recipes/qwen_vl/__init__.py index e7543903c8..35ef162de1 100644 --- a/src/megatron/bridge/recipes/qwen_vl/__init__.py +++ b/src/megatron/bridge/recipes/qwen_vl/__init__.py @@ -35,36 +35,46 @@ # Qwen3.5 models from .qwen35_vl import ( - qwen35_vl_2b_finetune_config, - qwen35_vl_4b_finetune_config, - qwen35_vl_9b_finetune_config, - qwen35_vl_27b_finetune_config, - qwen35_vl_35b_a3b_finetune_config, - qwen35_vl_122b_a10b_finetune_config, - qwen35_vl_397b_a17b_finetune_config, - qwen35_vl_800m_finetune_config, + qwen35_vl_2b_peft_config, + qwen35_vl_2b_sft_config, + qwen35_vl_4b_peft_config, + qwen35_vl_4b_sft_config, + qwen35_vl_9b_peft_config, + qwen35_vl_9b_sft_config, + qwen35_vl_27b_peft_config, + qwen35_vl_27b_sft_config, + qwen35_vl_35b_a3b_peft_config, + qwen35_vl_35b_a3b_sft_config, + qwen35_vl_122b_a10b_peft_config, + qwen35_vl_122b_a10b_sft_config, + qwen35_vl_397b_a17b_peft_config, + qwen35_vl_397b_a17b_sft_config, + qwen35_vl_800m_peft_config, + qwen35_vl_800m_sft_config, ) __all__ = [ - # Qwen3-VL pretrain configs - "qwen3_vl_8b_pretrain_config", - "qwen3_vl_30b_a3b_pretrain_config", - "qwen3_vl_235b_a22b_pretrain_config", - # Qwen3-VL finetune configs (with PEFT support) - "qwen3_vl_8b_finetune_config", - "qwen3_vl_30b_a3b_finetune_config", - "qwen3_vl_235b_a22b_finetune_config", - # Qwen3.5-VL finetune configs — dense (with PEFT support) - "qwen35_vl_800m_finetune_config", - "qwen35_vl_2b_finetune_config", - "qwen35_vl_4b_finetune_config", - "qwen35_vl_9b_finetune_config", - "qwen35_vl_27b_finetune_config", - # Qwen3.5-VL finetune configs — MoE (with PEFT support) - "qwen35_vl_35b_a3b_finetune_config", - "qwen35_vl_122b_a10b_finetune_config", - "qwen35_vl_397b_a17b_finetune_config", + # Qwen3.5-VL SFT configs — dense + "qwen35_vl_800m_sft_config", + "qwen35_vl_2b_sft_config", + "qwen35_vl_4b_sft_config", + "qwen35_vl_9b_sft_config", + "qwen35_vl_27b_sft_config", + # Qwen3.5-VL SFT configs — MoE + "qwen35_vl_35b_a3b_sft_config", + "qwen35_vl_122b_a10b_sft_config", + "qwen35_vl_397b_a17b_sft_config", + # Qwen3.5-VL PEFT configs — dense + "qwen35_vl_800m_peft_config", + "qwen35_vl_2b_peft_config", + "qwen35_vl_4b_peft_config", + "qwen35_vl_9b_peft_config", + "qwen35_vl_27b_peft_config", + # Qwen3.5-VL PEFT configs — MoE + "qwen35_vl_35b_a3b_peft_config", + "qwen35_vl_122b_a10b_peft_config", + "qwen35_vl_397b_a17b_peft_config", # Qwen2.5-VL SFT configs "qwen25_vl_3b_sft_config", "qwen25_vl_7b_sft_config", diff --git a/src/megatron/bridge/recipes/qwen_vl/qwen35_vl.py b/src/megatron/bridge/recipes/qwen_vl/qwen35_vl.py index 15d60d2bf6..b7b3ef83cb 100644 --- a/src/megatron/bridge/recipes/qwen_vl/qwen35_vl.py +++ b/src/megatron/bridge/recipes/qwen_vl/qwen35_vl.py @@ -12,680 +12,1801 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -Fine-tuning recipes for Qwen3.5 Vision-Language Models. - -Qwen3.5 is a family of VLMs that combine a hybrid Gated DeltaNet (GDN) + Gated -Attention language model with a vision encoder. Two variants are supported: +"""Qwen3.5-VL finetuning recipes with parameterless API. 
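+
+Each function takes no arguments and returns a fully populated
+ConfigContainer; callers override fields on the returned object.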
-- **Dense** (e.g., Qwen3.5-27B): standard dense MLP -- **MoE** (e.g., Qwen3.5-35B-A3B, 122B-A10B, 397B-A17B): Mixture of Experts - with shared experts +This module provides SFT and PEFT configurations for Qwen3.5-VL models: -Each public function returns a ready-to-use :class:`ConfigContainer` for -fine-tuning. Pass ``peft="lora"`` for parameter-efficient -fine-tuning, or leave ``peft=None`` for full supervised fine-tuning (SFT). +- **Dense**: 800M, 2B, 4B, 9B, 27B +- **MoE**: 35B-A3B, 122B-A10B, 397B-A17B """ -import os -from typing import List, Optional, Union - import torch -from typing_extensions import TypedDict, Unpack from megatron.bridge import AutoBridge -from megatron.bridge.data.vlm_datasets import ( - HFDatasetConversationProvider, - MockVLMConversationProvider, - PreloadedVLMConversationProvider, -) from megatron.bridge.peft.base import PEFT -from megatron.bridge.recipes.utils.finetune_utils import default_peft_config as _default_peft_config +from megatron.bridge.recipes.common import _peft_common_vlm, _sft_common_vlm +from megatron.bridge.recipes.utils.finetune_utils import default_peft_config from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DatasetProvider, - DistributedDataParallelConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, - ValidationConfig, -) -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig, bf16_mixed - - -class Qwen35VLCommonKwargs(TypedDict, total=False): - """Typed options accepted by Qwen3.5 VL recipe helpers.""" - - # Core identifiers - hf_path: str - dir: Optional[str] - name: str - # Dataset configuration - train_data_path: Optional[List[str]] - valid_data_path: Optional[List[str]] - test_data_path: Optional[List[str]] - dataset_type: Optional[str] - image_folder: Optional[str] - tokenizer_model: Optional[str] - mock: bool - # Model configuration - tensor_model_parallel_size: int - pipeline_model_parallel_size: int - pipeline_dtype: Optional[torch.dtype] - virtual_pipeline_model_parallel_size: Optional[int] - context_parallel_size: int - expert_model_parallel_size: Optional[int] - expert_tensor_parallel_size: int - sequence_parallel: bool - use_megatron_fsdp: bool - enable_recompute: bool - account_for_embedding_in_pipeline_split: bool - account_for_loss_in_pipeline_split: bool - # Training hyperparameters - train_iters: int - global_batch_size: int - micro_batch_size: int - seq_length: int - lr: float - min_lr: float - lr_warmup_iters: int - lr_decay_iters: Optional[int] - eval_interval: int - save_interval: int - use_null_tokenizer: bool - # Precision / overlap configs - precision_config: Optional[Union[MixedPrecisionConfig, str]] - comm_overlap_config: Optional[CommOverlapConfig] - # Freeze options - pretrained_checkpoint: Optional[str] - freeze_language_model: bool - freeze_vision_model: bool - freeze_vision_projection: bool - # PEFT options - peft: Optional[Union[str, PEFT]] - finetune_lr: float - # W&B logging - wandb_project: Optional[str] - wandb_entity: Optional[str] - wandb_exp_name: Optional[str] - - -# --------------------------------------------------------------------------- -# Dense variant: Qwen3.5-800M -# --------------------------------------------------------------------------- - - -def 
qwen35_vl_800m_finetune_config(**user_kwargs: Unpack[Qwen35VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Qwen3.5-800M (dense). - - Default configuration: - - LoRA/DoRA: TP=1, PP=1 (1 node), LR=1e-4 - - Full SFT: TP=1, PP=1 (1 node), LR=5e-6 +from megatron.bridge.training.config import ConfigContainer - Note: num_kv_heads=2, so max TP=2. - See `_qwen35_vl_common` for the full list of parameters. +# ============================================================================= +# Qwen3.5-VL 800M SFT Configuration (Dense) +# ============================================================================= +def qwen35_vl_800m_sft_config() -> ConfigContainer: + """Return a full SFT config for Qwen3.5-VL 800M (dense). + + Default configuration: 1 node, 8 GPUs + - TP=1, PP=1 + - LR=5e-6 (full SFT) + - Sequence length: 4096 + + Note: num_kv_heads=2, so max TP=2. """ - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") - - recommended_kwargs: Qwen35VLCommonKwargs = { - "hf_path": "Qwen/Qwen3.5-0.8B", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "expert_model_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - "freeze_language_model": False, - "freeze_vision_model": False, - "freeze_vision_projection": False, - "lr_warmup_iters": 200, - "micro_batch_size": 1, - "global_batch_size": 32, - } - combined_kwargs: Qwen35VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen35_vl_common(**combined_kwargs) - - -# --------------------------------------------------------------------------- -# Dense variant: Qwen3.5-2B -# --------------------------------------------------------------------------- - - -def qwen35_vl_2b_finetune_config(**user_kwargs: Unpack[Qwen35VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Qwen3.5-2B (dense). 
- - Default configuration: - - LoRA/DoRA: TP=1, PP=1 (1 node), LR=1e-4 - - Full SFT: TP=1, PP=1 (1 node), LR=5e-6 + cfg = _sft_common_vlm() + + # Model configuration + hf_path = "Qwen/Qwen3.5-0.8B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=200, + lr_decay_iters=300000, + max_lr=5e-6, + min_lr=5e-7, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + cfg.dataset.pack_sequences_in_batch = False + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + + return cfg + + +# ============================================================================= +# Qwen3.5-VL 2B SFT Configuration (Dense) +# ============================================================================= +def qwen35_vl_2b_sft_config() -> ConfigContainer: + """Return a full SFT config for Qwen3.5-VL 2B (dense). + + Default configuration: 1 node, 8 GPUs + - TP=1, PP=1 + - LR=5e-6 (full SFT) + - Sequence length: 4096 Note: num_kv_heads=2, so max TP=2. 
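+
+    Usage sketch (values are illustrative; any field on the returned
+    container can be overridden before launch):
+
+        cfg = qwen35_vl_2b_sft_config()
+        cfg.train.train_iters = 1000
+        cfg.train.global_batch_size = 64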
+ """ + cfg = _sft_common_vlm() + + # Model configuration + hf_path = "Qwen/Qwen3.5-2B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=200, + lr_decay_iters=300000, + max_lr=5e-6, + min_lr=5e-7, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + cfg.dataset.pack_sequences_in_batch = False + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + + return cfg - See `_qwen35_vl_common` for the full list of parameters. + +# ============================================================================= +# Qwen3.5-VL 4B SFT Configuration (Dense) +# ============================================================================= +def qwen35_vl_4b_sft_config() -> ConfigContainer: + """Return a full SFT config for Qwen3.5-VL 4B (dense). + + Default configuration: 1 node, 8 GPUs + - TP=2, PP=1 + - LR=5e-6 (full SFT) + - Sequence length: 4096 + + Note: num_kv_heads=4, so max TP=4. 
""" - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") - - recommended_kwargs: Qwen35VLCommonKwargs = { - "hf_path": "Qwen/Qwen3.5-2B", - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "expert_model_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - "freeze_language_model": False, - "freeze_vision_model": False, - "freeze_vision_projection": False, - "lr_warmup_iters": 200, - "micro_batch_size": 1, - "global_batch_size": 32, - } - combined_kwargs: Qwen35VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen35_vl_common(**combined_kwargs) - - -# --------------------------------------------------------------------------- -# Dense variant: Qwen3.5-4B -# --------------------------------------------------------------------------- - - -def qwen35_vl_4b_finetune_config(**user_kwargs: Unpack[Qwen35VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Qwen3.5-4B (dense). - - Default configuration: - - LoRA/DoRA: TP=1, PP=1 (1 node), LR=1e-4 - - Full SFT: TP=2, PP=1 (1 node), LR=5e-6 + cfg = _sft_common_vlm() + + # Model configuration + hf_path = "Qwen/Qwen3.5-4B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=200, + lr_decay_iters=300000, + max_lr=5e-6, + min_lr=5e-7, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + cfg.dataset.pack_sequences_in_batch = False + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = 
False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + + return cfg + + +# ============================================================================= +# Qwen3.5-VL 9B SFT Configuration (Dense) +# ============================================================================= +def qwen35_vl_9b_sft_config() -> ConfigContainer: + """Return a full SFT config for Qwen3.5-VL 9B (dense). + + Default configuration: 1 node, 8 GPUs + - TP=4, PP=1 + - LR=5e-6 (full SFT) + - Sequence length: 4096 Note: num_kv_heads=4, so max TP=4. + """ + cfg = _sft_common_vlm() + + # Model configuration + hf_path = "Qwen/Qwen3.5-9B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=200, + lr_decay_iters=300000, + max_lr=5e-6, + min_lr=5e-7, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + cfg.dataset.pack_sequences_in_batch = False + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + + return cfg + - See `_qwen35_vl_common` for the full list of 
parameters. +# ============================================================================= +# Qwen3.5-VL 27B SFT Configuration (Dense) +# ============================================================================= +def qwen35_vl_27b_sft_config() -> ConfigContainer: + """Return a full SFT config for Qwen3.5-VL 27B (dense). + + Default configuration: 2 nodes, 16 GPUs total + - TP=4, PP=4 + - LR=5e-6 (full SFT) + - Sequence length: 4096 """ - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") - - recommended_kwargs: Qwen35VLCommonKwargs = { - "hf_path": "Qwen/Qwen3.5-4B", - "tensor_model_parallel_size": 2 if is_full_sft else 1, - "pipeline_model_parallel_size": 1, - "expert_model_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - "freeze_language_model": False, - "freeze_vision_model": False, - "freeze_vision_projection": False, - "lr_warmup_iters": 200, - "micro_batch_size": 1, - "global_batch_size": 32, - } - combined_kwargs: Qwen35VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen35_vl_common(**combined_kwargs) - - -# --------------------------------------------------------------------------- -# Dense variant: Qwen3.5-9B -# --------------------------------------------------------------------------- - - -def qwen35_vl_9b_finetune_config(**user_kwargs: Unpack[Qwen35VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Qwen3.5-9B (dense). - - Default configuration: - - LoRA/DoRA: TP=1, PP=1 (1 node), LR=1e-4 - - Full SFT: TP=4, PP=1 (1 node), LR=5e-6 + cfg = _sft_common_vlm() + + # Model configuration + hf_path = "Qwen/Qwen3.5-27B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 4 + cfg.model.pipeline_model_parallel_size = 4 + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=200, + lr_decay_iters=300000, + max_lr=5e-6, + min_lr=5e-7, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + 
cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + cfg.dataset.pack_sequences_in_batch = False + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + + return cfg - Note: num_kv_heads=4, so max TP=4. - See `_qwen35_vl_common` for the full list of parameters. +# ============================================================================= +# Qwen3.5-VL 35B-A3B SFT Configuration (MoE) +# ============================================================================= +def qwen35_vl_35b_a3b_sft_config() -> ConfigContainer: + """Return a full SFT config for Qwen3.5-VL 35B-A3B (MoE). + + Default configuration: 2 nodes, 16 GPUs + - TP=2, PP=1, EP=16 + - LR=2e-5 (full SFT) + - Sequence length: 4096 """ - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") - - recommended_kwargs: Qwen35VLCommonKwargs = { - "hf_path": "Qwen/Qwen3.5-9B", - "tensor_model_parallel_size": 4 if is_full_sft else 1, - "pipeline_model_parallel_size": 1, - "expert_model_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 5e-6 if is_full_sft else 1e-4, - "freeze_language_model": False, - "freeze_vision_model": False, - "freeze_vision_projection": False, - "lr_warmup_iters": 200, - "micro_batch_size": 1, - "global_batch_size": 32, - } - combined_kwargs: Qwen35VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen35_vl_common(**combined_kwargs) - - -# --------------------------------------------------------------------------- -# Dense variant: Qwen3.5-27B -# --------------------------------------------------------------------------- - - -def qwen35_vl_27b_finetune_config(**user_kwargs: Unpack[Qwen35VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Qwen3.5-27B (dense). - - Default configuration: - - LoRA/DoRA: TP=2, PP=1 (1 node), LR=1e-4 - - Full SFT: TP=4, PP=4 (2 nodes), LR=5e-6 - - See `_qwen35_vl_common` for the full list of parameters. 
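+    # 2 nodes x 8 GPUs: TP=2 leaves DP=8 for the dense/attention layers,
+    # while expert layers are sharded across all 16 ranks (EP=16, ETP=1).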
+ cfg = _sft_common_vlm() + + # Model configuration + hf_path = "Qwen/Qwen3.5-35B-A3B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.expert_model_parallel_size = 16 + cfg.model.expert_tensor_parallel_size = 1 + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = True + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # MoE kernel selections + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # MoE overlap + cfg.model.moe_shared_expert_overlap = False + + # MoE force balance + cfg.model.moe_router_force_load_balancing = False + + # MoE FP8 padding + cfg.model.moe_router_padding_for_fp8 = False + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=200, + lr_decay_iters=300000, + max_lr=2e-5, + min_lr=2e-6, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + cfg.dataset.pack_sequences_in_batch = False + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Comm overlap settings (MoE) + cfg.comm_overlap = None + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + + return cfg + + +# ============================================================================= +# Qwen3.5-VL 122B-A10B SFT Configuration (MoE) +# ============================================================================= +def qwen35_vl_122b_a10b_sft_config() -> ConfigContainer: + """Return a full SFT config for Qwen3.5-VL 122B-A10B (MoE). 
+
+    Default configuration: 6 nodes, 48 GPUs
+    - TP=2, PP=6, EP=8
+    - LR=2e-5 (full SFT)
+    - Sequence length: 4096
     """
-    peft_value = user_kwargs.get("peft", None)
-    is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none")
-
-    recommended_kwargs: Qwen35VLCommonKwargs = {
-        "hf_path": "Qwen/Qwen3.5-27B",
-        "tensor_model_parallel_size": 4 if is_full_sft else 2,
-        "pipeline_model_parallel_size": 4 if is_full_sft else 1,
-        "pipeline_dtype": torch.bfloat16 if is_full_sft else None,
-        "expert_model_parallel_size": 1,
-        "peft": peft_value,
-        "finetune_lr": 5e-6 if is_full_sft else 1e-4,
-        "freeze_language_model": False,
-        "freeze_vision_model": False,
-        "freeze_vision_projection": False,
-        "lr_warmup_iters": 200,
-        "micro_batch_size": 1,
-        "global_batch_size": 32,
-    }
-    combined_kwargs: Qwen35VLCommonKwargs = {**recommended_kwargs, **user_kwargs}
-    return _qwen35_vl_common(**combined_kwargs)
-
-
-# ---------------------------------------------------------------------------
-# MoE variant: Qwen3.5-35B-A3B
-# ---------------------------------------------------------------------------
-
-
-def qwen35_vl_35b_a3b_finetune_config(**user_kwargs: Unpack[Qwen35VLCommonKwargs]) -> ConfigContainer:
-    """Return a fine-tuning config for Qwen3.5-35B-A3B (MoE).
-
-    This is a small Mixture-of-Experts model. Recommended to use with expert
-    parallelism (EP) for efficient training.
-
-    Default configuration:
-    - LoRA/DoRA: TP=2, PP=1, EP=4 (1 node), LR=2e-4
-    - Full SFT: TP=2, PP=1, EP=16 (2 nodes), LR=2e-5
-
-    See `_qwen35_vl_common` for the full list of parameters.
+    cfg = _sft_common_vlm()
+
+    # Model configuration
+    hf_path = "Qwen/Qwen3.5-122B-A10B"
+    cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False)
+    cfg.model.seq_length = 4096
+
+    # Parallel settings
+    cfg.model.tensor_model_parallel_size = 2
+    cfg.model.pipeline_model_parallel_size = 6
+    cfg.model.pipeline_dtype = torch.bfloat16
+    cfg.model.virtual_pipeline_model_parallel_size = None
+    cfg.model.expert_model_parallel_size = 8
+    cfg.model.expert_tensor_parallel_size = 1
+    cfg.model.context_parallel_size = 1
+    cfg.model.sequence_parallel = True
+
+    # VLM-specific settings
+    cfg.model.freeze_language_model = False
+    cfg.model.freeze_vision_model = False
+    cfg.model.freeze_vision_projection = False
+
+    # TE / Transformer implementation
+    cfg.model.transformer_impl = "transformer_engine"
+
+    # CUDA Graph settings
+    cfg.model.cuda_graph_impl = "none"
+    cfg.model.cuda_graph_scope = "full"
+    cfg.model.cuda_graph_warmup_steps = 3
+
+    # Kernel selections
+    cfg.model.attention_backend = "auto"
+    cfg.model.cross_entropy_loss_fusion = True
+    cfg.model.cross_entropy_fusion_impl = "native"
+
+    # MoE kernel selections
+    cfg.model.moe_router_fusion = False
+    cfg.model.moe_permute_fusion = True
+    cfg.model.moe_grouped_gemm = True
+
+    # Memory saving — activation recomputation enabled for this large model
+    cfg.model.recompute_granularity = "full"
+    cfg.model.recompute_method = "uniform"
+    cfg.model.recompute_num_layers = 1
+    cfg.model.recompute_modules = None
+    cfg.model.fine_grained_activation_offloading = False
+    cfg.model.offload_modules = None
+
+    # MoE overlap
+    cfg.model.moe_shared_expert_overlap = False
+
+    # MoE force balance
+    cfg.model.moe_router_force_load_balancing = False
+
+    # MoE FP8 padding
+    cfg.model.moe_router_padding_for_fp8 = False
+
+    # Training config
+    cfg.train.train_iters = 300000
+    cfg.train.global_batch_size = 36
+    cfg.train.micro_batch_size = 1
+
cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=200, + lr_decay_iters=300000, + max_lr=2e-5, + min_lr=2e-6, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + cfg.dataset.pack_sequences_in_batch = False + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Comm overlap settings (MoE) + cfg.comm_overlap = None + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + + return cfg + + +# ============================================================================= +# Qwen3.5-VL 397B-A17B SFT Configuration (MoE) +# ============================================================================= +def qwen35_vl_397b_a17b_sft_config() -> ConfigContainer: + """Return a full SFT config for Qwen3.5-VL 397B-A17B (MoE). + + Default configuration: 16 nodes, 128 GPUs + - TP=2, PP=4, EP=32 + - LR=2e-5 (full SFT) + - Sequence length: 4096 """ - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") - - recommended_kwargs: Qwen35VLCommonKwargs = { - "hf_path": "Qwen/Qwen3.5-35B-A3B", - "tensor_model_parallel_size": 2, - "pipeline_model_parallel_size": 1, - "pipeline_dtype": torch.bfloat16, - "expert_model_parallel_size": 16 if is_full_sft else 4, - "expert_tensor_parallel_size": 1, - "peft": peft_value, - "finetune_lr": 2e-5 if is_full_sft else 2e-4, - "freeze_language_model": False, - "freeze_vision_model": False, - "freeze_vision_projection": False, - "min_lr": 2e-6 if is_full_sft else 1e-4, - "lr_warmup_iters": 200, - "micro_batch_size": 1, - "global_batch_size": 32, - } - combined_kwargs: Qwen35VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen35_vl_common(**combined_kwargs) - - -# --------------------------------------------------------------------------- -# MoE variant: Qwen3.5-122B-A10B -# --------------------------------------------------------------------------- - - -def qwen35_vl_122b_a10b_finetune_config(**user_kwargs: Unpack[Qwen35VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Qwen3.5-122B-A10B (MoE). - - This is a medium-sized Mixture-of-Experts model. Recommended to use with - expert parallelism (EP) for efficient training. - - Default configuration: - - LoRA/DoRA: TP=2, PP=1, EP=8 (1 node), LR=2e-4 - - Full SFT: TP=2, PP=4, EP=8 (4 nodes), LR=2e-5 - - See `_qwen35_vl_common` for the full list of parameters. 
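+    # Rank-mapping sketch for the defaults above (128 GPUs assumed): TP=2 and
+    # PP=4 form a model-parallel group of 8 ranks, so DP = 128 / 8 = 16 and
+    # EP=32 equals DP * TP; with 512 experts total that is 16 experts resident
+    # per expert-parallel rank.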
+ cfg = _sft_common_vlm() + + # Model configuration + hf_path = "Qwen/Qwen3.5-397B-A17B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 4 + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.expert_model_parallel_size = 32 + cfg.model.expert_tensor_parallel_size = 1 + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = True + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # MoE kernel selections + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + + # Memory saving — activation recomputation enabled for this large model + cfg.model.recompute_granularity = "full" + cfg.model.recompute_method = "uniform" + cfg.model.recompute_num_layers = 1 + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # MoE overlap + cfg.model.moe_shared_expert_overlap = False + + # MoE force balance + cfg.model.moe_router_force_load_balancing = False + + # MoE FP8 padding + cfg.model.moe_router_padding_for_fp8 = False + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - lower LR for full SFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=200, + lr_decay_iters=300000, + max_lr=2e-5, + min_lr=2e-6, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + cfg.dataset.pack_sequences_in_batch = False + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Comm overlap settings (MoE) + cfg.comm_overlap = None + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + + return cfg + + +# ============================================================================= +# Qwen3.5-VL 800M PEFT Configuration (Dense) +# ============================================================================= +def 
qwen35_vl_800m_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Qwen3.5-VL 800M (dense). + + Default configuration: 1 node, 8 GPUs + - TP=1, PP=1 + - LR=1e-4 (PEFT) + - Sequence length: 4096 + + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. """ - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") - - recommended_kwargs: Qwen35VLCommonKwargs = { - "hf_path": "Qwen/Qwen3.5-122B-A10B", - "tensor_model_parallel_size": 2, - "pipeline_model_parallel_size": 6 if is_full_sft else 1, - "pipeline_dtype": torch.bfloat16, - "expert_model_parallel_size": 8, - "expert_tensor_parallel_size": 1, - "peft": peft_value, - "enable_recompute": is_full_sft, - "finetune_lr": 2e-5 if is_full_sft else 2e-4, - "freeze_language_model": False, - "freeze_vision_model": False, - "freeze_vision_projection": False, - "lr_warmup_iters": 200, - "micro_batch_size": 1, - "global_batch_size": 36, - } - combined_kwargs: Qwen35VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen35_vl_common(**combined_kwargs) - - -# --------------------------------------------------------------------------- -# MoE variant: Qwen3.5-397B-A17B -# --------------------------------------------------------------------------- -# TODO note this down somewhere -# For multinode training, if you encounter a file lock issue, you can replace hf_path with the local -# path to the model, e.g hf_home/hub/models--Qwen--Qwen3.5-397B-A17B/snapshots/... directory - - -def qwen35_vl_397b_a17b_finetune_config(**user_kwargs: Unpack[Qwen35VLCommonKwargs]) -> ConfigContainer: - """Return a fine-tuning config for Qwen3.5-397B-A17B (MoE). - - This is a Mixture-of-Experts model with 512 experts and top-10 routing. - Recommended to use with expert parallelism (EP) for efficient training. - - Default configuration: - - LoRA/DoRA: TP=2, PP=1, EP=32 (4 nodes), LR=2e-4 - - Full SFT: TP=2, PP=4, EP=32 (16 nodes), LR=2e-5 - - See `_qwen35_vl_common` for the full list of parameters. 
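+    # Usage sketch (call sites hypothetical): "lora"/"dora" strings are routed
+    # through default_peft_config below; anything else is assigned as-is, e.g.
+    #   cfg = qwen35_vl_800m_peft_config()                     # LoRA defaults
+    #   cfg = qwen35_vl_800m_peft_config(peft_scheme="dora")   # DoRA
+    #   cfg = qwen35_vl_800m_peft_config(peft_scheme=my_peft)  # custom PEFT object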
+ cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + + # Model configuration + hf_path = "Qwen/Qwen3.5-0.8B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=200, + lr_decay_iters=300000, + max_lr=1e-4, + min_lr=3e-5, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + cfg.dataset.pack_sequences_in_batch = False + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + + return cfg + + +# ============================================================================= +# Qwen3.5-VL 2B PEFT Configuration (Dense) +# ============================================================================= +def qwen35_vl_2b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Qwen3.5-VL 2B (dense). + + Default configuration: 1 node, 8 GPUs + - TP=1, PP=1 + - LR=1e-4 (PEFT) + - Sequence length: 4096 + + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. 
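+
+    Example (usage sketch; overrides are assumed to be applied before launch):
+        >>> cfg = qwen35_vl_2b_peft_config(peft_scheme="dora")
+        >>> cfg.train.train_iters = 1000  # any default can be overridden in place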
+ """ + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + + # Model configuration + hf_path = "Qwen/Qwen3.5-2B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=200, + lr_decay_iters=300000, + max_lr=1e-4, + min_lr=3e-5, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + cfg.dataset.pack_sequences_in_batch = False + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + + return cfg + + +# ============================================================================= +# Qwen3.5-VL 4B PEFT Configuration (Dense) +# ============================================================================= +def qwen35_vl_4b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Qwen3.5-VL 4B (dense). + + Default configuration: 1 node, 8 GPUs + - TP=1, PP=1 + - LR=1e-4 (PEFT) + - Sequence length: 4096 + + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. 
+ """ + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + + # Model configuration + hf_path = "Qwen/Qwen3.5-4B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings - lower TP for PEFT + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=200, + lr_decay_iters=300000, + max_lr=1e-4, + min_lr=3e-5, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + cfg.dataset.pack_sequences_in_batch = False + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + + return cfg + + +# ============================================================================= +# Qwen3.5-VL 9B PEFT Configuration (Dense) +# ============================================================================= +def qwen35_vl_9b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Qwen3.5-VL 9B (dense). + + Default configuration: 1 node, 8 GPUs + - TP=1, PP=1 + - LR=1e-4 (PEFT) + - Sequence length: 4096 + + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. 
+ """ + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + + # Model configuration + hf_path = "Qwen/Qwen3.5-9B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings - lower TP for PEFT + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=200, + lr_decay_iters=300000, + max_lr=1e-4, + min_lr=3e-5, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + cfg.dataset.pack_sequences_in_batch = False + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + + return cfg + + +# ============================================================================= +# Qwen3.5-VL 27B PEFT Configuration (Dense) +# ============================================================================= +def qwen35_vl_27b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Qwen3.5-VL 27B (dense). + + Default configuration: 1 node, 8 GPUs + - TP=2, PP=1 + - LR=1e-4 (PEFT) + - Sequence length: 4096 + + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. 
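+
+    Example (usage sketch; attribute names as set in the body below):
+        >>> cfg = qwen35_vl_27b_peft_config()
+        >>> cfg.model.tensor_model_parallel_size = 4  # e.g. widen TP after build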
""" - peft_value = user_kwargs.get("peft", None) - is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none") - - recommended_kwargs: Qwen35VLCommonKwargs = { - "hf_path": "Qwen/Qwen3.5-397B-A17B", - "tensor_model_parallel_size": 2, - "pipeline_model_parallel_size": 4 if is_full_sft else 1, - "pipeline_dtype": torch.bfloat16, - "expert_model_parallel_size": 32, - "expert_tensor_parallel_size": 1, - "peft": peft_value, - "enable_recompute": is_full_sft, - "finetune_lr": 2e-5 if is_full_sft else 2e-4, - "freeze_language_model": False, - "freeze_vision_model": False, - "freeze_vision_projection": False, - "lr_warmup_iters": 200, - "micro_batch_size": 1, - "global_batch_size": 32, - } - combined_kwargs: Qwen35VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen35_vl_common(**combined_kwargs) - - -# --------------------------------------------------------------------------- -# Shared implementation -# --------------------------------------------------------------------------- - - -def _qwen35_vl_common( - hf_path: str, - dir: Optional[str] = None, - name: str = "qwen35_vl_finetune", + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + + # Model configuration + hf_path = "Qwen/Qwen3.5-27B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings - lower TP/PP for PEFT + cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = None + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = False + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=200, + lr_decay_iters=300000, + max_lr=1e-4, + min_lr=3e-5, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + # Dataset configuration - train_data_path: Optional[List[str]] = None, - 
valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - dataset_type: Optional[str] = None, - image_folder: Optional[str] = None, - tokenizer_model: Optional[str] = None, - mock: bool = False, + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + cfg.dataset.pack_sequences_in_batch = False + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + + return cfg + + +# ============================================================================= +# Qwen3.5-VL 35B-A3B PEFT Configuration (MoE) +# ============================================================================= +def qwen35_vl_35b_a3b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Qwen3.5-VL 35B-A3B (MoE). + + Default configuration: 1 node, 8 GPUs + - TP=2, PP=1, EP=4 + - LR=2e-4 (PEFT) + - Sequence length: 4096 + + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. + """ + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + # Model configuration - tensor_model_parallel_size: int = 4, - pipeline_model_parallel_size: int = 1, - pipeline_dtype: Optional[torch.dtype] = torch.bfloat16, - virtual_pipeline_model_parallel_size: Optional[int] = None, - context_parallel_size: int = 1, - expert_model_parallel_size: Optional[int] = 1, - expert_tensor_parallel_size: int = 1, - sequence_parallel: bool = False, - use_megatron_fsdp: bool = False, - enable_recompute: bool = False, - account_for_embedding_in_pipeline_split: bool = False, - account_for_loss_in_pipeline_split: bool = False, - # Training hyperparameters - train_iters: int = 300000, - global_batch_size: int = 32, - micro_batch_size: int = 1, - seq_length: int = 4096, - lr: float = 3e-4, - min_lr: float = 3e-5, - lr_warmup_iters: int = 500, - lr_decay_iters: Optional[int] = None, - eval_interval: int = 500, - save_interval: int = 500, - use_null_tokenizer: bool = False, - # Precision recipe - precision_config: Optional[Union[MixedPrecisionConfig, str]] = None, - comm_overlap_config: Optional[CommOverlapConfig] = None, - # Freeze options - pretrained_checkpoint: Optional[str] = None, - freeze_language_model: bool = True, - freeze_vision_model: bool = True, - freeze_vision_projection: bool = False, - # PEFT options - peft: Optional[Union[str, PEFT]] = None, - finetune_lr: Optional[float] = None, - # W&B logging - wandb_project: Optional[str] = None, - wandb_entity: Optional[str] = None, - wandb_exp_name: Optional[str] = None, -) -> ConfigContainer: - """Create a fine-tuning configuration for Qwen3.5 VL models. - - Supports the dense (Qwen3.5-27B) and MoE (Qwen3.5-35B-A3B, - Qwen3.5-122B-A10B, Qwen3.5-397B-A17B) variants. The model - architecture is automatically determined from ``hf_path`` via - :class:`AutoBridge`. 
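+    # The bridge call below constructs the Megatron provider from the HF
+    # config only; load_weights=False skips loading HF weights here, so the
+    # actual parameters are expected to come from a converted checkpoint at
+    # train time (behavior inferred from the flag name).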
+ hf_path = "Qwen/Qwen3.5-35B-A3B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings - lower EP for PEFT + cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.expert_model_parallel_size = 4 + cfg.model.expert_tensor_parallel_size = 1 + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = True + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # MoE kernel selections + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + + # Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # MoE overlap + cfg.model.moe_shared_expert_overlap = False + + # MoE force balance + cfg.model.moe_router_force_load_balancing = False + + # MoE FP8 padding + cfg.model.moe_router_padding_for_fp8 = False + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=200, + lr_decay_iters=300000, + max_lr=2e-4, + min_lr=1e-4, + ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + cfg.dataset.pack_sequences_in_batch = False + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Comm overlap settings (MoE) + cfg.comm_overlap = None + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + + return cfg + + +# ============================================================================= +# Qwen3.5-VL 122B-A10B PEFT Configuration (MoE) +# ============================================================================= +def qwen35_vl_122b_a10b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Qwen3.5-VL 122B-A10B (MoE). 
+
+    Default configuration: 1 node, 8 GPUs
+    - TP=2, PP=1, EP=8
+    - LR=2e-4 (PEFT)
+    - Sequence length: 4096
 
     Args:
-        hf_path: HuggingFace model path.
-        dir: Base directory for logs and checkpoints.
-        name: Name of the training run.
-        train_data_path: Training data paths.
-        valid_data_path: Validation data paths.
-        test_data_path: Test data paths.
-        dataset_type: One of ``"mock"``, ``"hf"``, ``"preloaded"``.
-        image_folder: Path to image folder (for preloaded datasets).
-        tokenizer_model: Path or HF name for the tokenizer/processor.
-        mock: If *True*, equivalent to ``dataset_type="mock"``.
-        tensor_model_parallel_size: Tensor parallelism degree.
-        pipeline_model_parallel_size: Pipeline parallelism degree.
-        pipeline_dtype: Data type for pipeline parallelism.
-        virtual_pipeline_model_parallel_size: Virtual pipeline parallelism.
-        context_parallel_size: Context parallelism degree.
-        expert_model_parallel_size: Expert parallelism degree (MoE).
-        expert_tensor_parallel_size: Expert tensor parallelism (MoE).
-        sequence_parallel: Whether to use sequence parallelism.
-        use_megatron_fsdp: Whether to use Megatron FSDP.
-        enable_recompute: Whether to enable activation recomputation.
-        account_for_embedding_in_pipeline_split: Account for embedding in PP split.
-        account_for_loss_in_pipeline_split: Account for loss in PP split.
-        train_iters: Total training iterations.
-        global_batch_size: Global batch size.
-        micro_batch_size: Micro batch size.
-        seq_length: Sequence length.
-        lr: Learning rate.
-        min_lr: Minimum learning rate for cosine decay.
-        lr_warmup_iters: Warmup iterations.
-        lr_decay_iters: LR decay iterations (defaults to *train_iters*).
-        eval_interval: Evaluation interval.
-        save_interval: Checkpoint save interval.
-        use_null_tokenizer: Use NullTokenizer instead of HuggingFace tokenizer.
-        precision_config: Precision configuration (default: bf16 mixed).
-        comm_overlap_config: Communication overlap configuration.
-        pretrained_checkpoint: Path to a pretrained checkpoint.
-        freeze_language_model: Freeze the language model weights.
-        freeze_vision_model: Freeze the vision encoder weights.
-        freeze_vision_projection: Freeze the vision projection weights.
-        peft: PEFT configuration (``"lora"``, ``"dora"``, or a PEFT object).
-        finetune_lr: Learning rate override for fine-tuning.
-        wandb_project: W&B project name.
-        wandb_entity: W&B entity name.
-        wandb_exp_name: W&B experiment name.
-
-    Returns:
-        ConfigContainer ready for training.
+        peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance.
""" - base_output_dir = dir if dir is not None else os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, name) - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - bridge = AutoBridge.from_hf_pretrained(hf_path) - model_cfg = bridge.to_megatron_provider(load_weights=False) - model_cfg.tensor_model_parallel_size = tensor_model_parallel_size - model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size - model_cfg.pipeline_dtype = pipeline_dtype - model_cfg.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size - model_cfg.context_parallel_size = context_parallel_size - model_cfg.expert_model_parallel_size = expert_model_parallel_size - model_cfg.expert_tensor_parallel_size = expert_tensor_parallel_size - if not sequence_parallel and tensor_model_parallel_size > 1 and (expert_model_parallel_size or 1) > 1: - sequence_parallel = True - model_cfg.sequence_parallel = sequence_parallel - model_cfg.freeze_language_model = freeze_language_model - model_cfg.freeze_vision_model = freeze_vision_model - model_cfg.freeze_vision_projection = freeze_vision_projection - model_cfg.seq_length = seq_length - - if precision_config is None: - precision_config = bf16_mixed() - - if account_for_embedding_in_pipeline_split: - model_cfg.account_for_embedding_in_pipeline_split = True - if account_for_loss_in_pipeline_split: - model_cfg.account_for_loss_in_pipeline_split = True - - if enable_recompute: - model_cfg.recompute_granularity = "full" - model_cfg.recompute_method = "uniform" - model_cfg.recompute_num_layers = 1 - - model_cfg.validate_parallelism() - - # Optimizer and scheduler - effective_lr = finetune_lr if finetune_lr is not None else lr - if min_lr > effective_lr: - min_lr = effective_lr * 0.1 - opt_config, scheduler = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=lr_warmup_iters, - lr_decay_iters=lr_decay_iters if lr_decay_iters is not None else train_iters, - max_lr=effective_lr, - min_lr=min_lr, + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) + else: + cfg.peft = peft_scheme + + # Model configuration + hf_path = "Qwen/Qwen3.5-122B-A10B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings - lower PP for PEFT + cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.expert_model_parallel_size = 8 + cfg.model.expert_tensor_parallel_size = 1 + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = True + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # MoE kernel selections + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + + 
# Memory saving (disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # MoE overlap + cfg.model.moe_shared_expert_overlap = False + + # MoE force balance + cfg.model.moe_router_force_load_balancing = False + + # MoE FP8 padding + cfg.model.moe_router_padding_for_fp8 = False + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 36 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=200, + lr_decay_iters=300000, + max_lr=2e-4, + min_lr=3e-5, ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + cfg.dataset.pack_sequences_in_batch = False + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Comm overlap settings (MoE) + cfg.comm_overlap = None - peft_config = _default_peft_config(peft) - - # Dataset selection - _processor_model = tokenizer_model or hf_path - _dataset_choice = dataset_type or ("mock" if mock else "hf") - - if _dataset_choice == "mock": - dataset_cfg: DatasetProvider = MockVLMConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - prompt="Describe this image.", - num_workers=1, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - create_attention_mask=True, - pad_to_max_length=True, - ) - elif _dataset_choice == "preloaded": - dataset_cfg = PreloadedVLMConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - train_data_path=train_data_path[0] if isinstance(train_data_path, list) else train_data_path, - valid_data_path=valid_data_path[0] if isinstance(valid_data_path, list) else valid_data_path, - test_data_path=test_data_path[0] if isinstance(test_data_path, list) else test_data_path, - image_folder=image_folder, - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - ) - elif _dataset_choice == "hf": - dataset_cfg = HFDatasetConversationProvider( - seq_length=seq_length, - hf_processor_path=_processor_model, - maker_name="make_cord_v2_dataset", - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - ) + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" + + return cfg + + +# ============================================================================= +# Qwen3.5-VL 397B-A17B PEFT Configuration (MoE) +# 
============================================================================= +def qwen35_vl_397b_a17b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: + """Return a PEFT config for Qwen3.5-VL 397B-A17B (MoE). + + Default configuration: 4 nodes, 32 GPUs + - TP=2, PP=1, EP=32 + - LR=2e-4 (PEFT) + - Sequence length: 4096 + + Args: + peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. + """ + cfg = _peft_common_vlm() + + # PEFT scheme + if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: + cfg.peft = default_peft_config(peft_scheme) else: - raise ValueError(f"Unsupported dataset_type '{_dataset_choice}'. Expected one of ['mock', 'preloaded', 'hf'].") - - cfg = ConfigContainer( - model=model_cfg, - train=TrainingConfig( - train_iters=train_iters, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - validation=ValidationConfig( - eval_interval=eval_interval, - eval_iters=32, - ), - optimizer=opt_config, - scheduler=scheduler, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=False, - overlap_param_gather=False, - average_in_collective=True, - data_parallel_sharding_strategy="optim_grads_params", - use_distributed_optimizer=True, - use_megatron_fsdp=use_megatron_fsdp, - ), - dataset=dataset_cfg, - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - wandb_project=wandb_project, - wandb_entity=wandb_entity, - wandb_exp_name=wandb_exp_name, - ), - tokenizer=TokenizerConfig( - tokenizer_type="NullTokenizer" if use_null_tokenizer else "HuggingFaceTokenizer", - tokenizer_model=hf_path if not use_null_tokenizer else None, - vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE if use_null_tokenizer else None, - ), - checkpoint=CheckpointConfig( - pretrained_checkpoint=pretrained_checkpoint, - save_interval=save_interval, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - rng=RNGConfig(seed=1234), - peft=peft_config, - comm_overlap=comm_overlap_config, - mixed_precision=precision_config, + cfg.peft = peft_scheme + + # Model configuration + hf_path = "Qwen/Qwen3.5-397B-A17B" + cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) + cfg.model.seq_length = 4096 + + # Parallel settings - lower PP for PEFT + cfg.model.tensor_model_parallel_size = 2 + cfg.model.pipeline_model_parallel_size = 1 + cfg.model.pipeline_dtype = torch.bfloat16 + cfg.model.virtual_pipeline_model_parallel_size = None + cfg.model.expert_model_parallel_size = 32 + cfg.model.expert_tensor_parallel_size = 1 + cfg.model.context_parallel_size = 1 + cfg.model.sequence_parallel = True + + # VLM-specific settings + cfg.model.freeze_language_model = False + cfg.model.freeze_vision_model = False + cfg.model.freeze_vision_projection = False + + # TE / Transformer implementation + cfg.model.transformer_impl = "transformer_engine" + + # CUDA Graph settings + cfg.model.cuda_graph_impl = "none" + cfg.model.cuda_graph_scope = "full" + cfg.model.cuda_graph_warmup_steps = 3 + + # Kernel selections + cfg.model.attention_backend = "auto" + cfg.model.cross_entropy_loss_fusion = True + cfg.model.cross_entropy_fusion_impl = "native" + + # MoE kernel selections + cfg.model.moe_router_fusion = False + cfg.model.moe_permute_fusion = True + cfg.model.moe_grouped_gemm = True + + # Memory saving 
(disabled by default) + cfg.model.recompute_granularity = None + cfg.model.recompute_modules = None + cfg.model.fine_grained_activation_offloading = False + cfg.model.offload_modules = None + + # MoE overlap + cfg.model.moe_shared_expert_overlap = False + + # MoE force balance + cfg.model.moe_router_force_load_balancing = False + + # MoE FP8 padding + cfg.model.moe_router_padding_for_fp8 = False + + # Training config + cfg.train.train_iters = 300000 + cfg.train.global_batch_size = 32 + cfg.train.micro_batch_size = 1 + cfg.train.manual_gc = True + cfg.train.manual_gc_interval = 100 + cfg.train.manual_gc_eval = 100 + + # Validation config + cfg.validation.eval_interval = 500 + cfg.validation.eval_iters = 32 + + # Optimizer - higher LR for PEFT + opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( + lr_warmup_iters=200, + lr_decay_iters=300000, + max_lr=2e-4, + min_lr=3e-5, ) + cfg.optimizer = opt_cfg + cfg.scheduler = scheduler_cfg + + # Optimizer precision settings (disabled by default for full precision) + cfg.optimizer.use_precision_aware_optimizer = False + cfg.optimizer.main_grads_dtype = torch.float32 + cfg.optimizer.main_params_dtype = torch.float32 + cfg.optimizer.exp_avg_dtype = torch.float32 + cfg.optimizer.exp_avg_sq_dtype = torch.float32 + + # Dataset configuration + cfg.dataset.seq_length = 4096 + cfg.dataset.hf_processor_path = hf_path + cfg.dataset.pack_sequences_in_batch = False + + # DDP settings + cfg.ddp.overlap_grad_reduce = False + cfg.ddp.overlap_param_gather = False + cfg.ddp.check_for_nan_in_grad = True + cfg.ddp.use_distributed_optimizer = True + cfg.ddp.grad_reduce_in_fp32 = True + cfg.ddp.average_in_collective = True + cfg.ddp.data_parallel_sharding_strategy = "optim_grads_params" + + # Comm overlap settings (MoE) + cfg.comm_overlap = None + + # FP8 and MXFP8 settings (disabled by default) + cfg.mixed_precision = "bf16_mixed" return cfg diff --git a/tests/functional_tests/recipes/test_qwen35_vl_recipes_finetune.py b/tests/functional_tests/recipes/test_qwen35_vl_recipes_finetune.py index 5d78889e77..79a283930f 100644 --- a/tests/functional_tests/recipes/test_qwen35_vl_recipes_finetune.py +++ b/tests/functional_tests/recipes/test_qwen35_vl_recipes_finetune.py @@ -26,7 +26,7 @@ import pytest -from megatron.bridge.recipes.qwen_vl.qwen35_vl import qwen35_vl_27b_finetune_config +from megatron.bridge.recipes.qwen_vl.qwen35_vl import qwen35_vl_27b_sft_config from tests.functional_tests.recipes.utils import run_pretrain_vl_recipe_test @@ -40,7 +40,7 @@ QWEN35_VL_SFT_NONE_FROZEN = [ ( - qwen35_vl_27b_finetune_config, + qwen35_vl_27b_sft_config, "qwen35_vl_27b_sft_none_frozen", _TP2_PP1, { @@ -58,7 +58,7 @@ QWEN35_VL_SFT_LM_FROZEN = [ ( - qwen35_vl_27b_finetune_config, + qwen35_vl_27b_sft_config, "qwen35_vl_27b_sft_lm_frozen", _TP2_PP1, { @@ -76,7 +76,7 @@ QWEN35_VL_SFT_PROJ_ONLY = [ ( - qwen35_vl_27b_finetune_config, + qwen35_vl_27b_sft_config, "qwen35_vl_27b_sft_projection_only", _TP2_PP1, { @@ -94,7 +94,7 @@ QWEN35_VL_SFT_RECOMPUTE = [ ( - qwen35_vl_27b_finetune_config, + qwen35_vl_27b_sft_config, "qwen35_vl_27b_sft_recompute", _TP2_PP1, { diff --git a/tests/unit_tests/recipes/qwen_vl/test_qwen35_vl_recipes.py b/tests/unit_tests/recipes/qwen_vl/test_qwen35_vl_recipes.py index 8a9effa514..1b14d24de0 100644 --- a/tests/unit_tests/recipes/qwen_vl/test_qwen35_vl_recipes.py +++ b/tests/unit_tests/recipes/qwen_vl/test_qwen35_vl_recipes.py @@ -16,7 +16,7 @@ # Test purpose: # - Parametrize over all exported Qwen3.5-VL recipe functions. 
# - Monkeypatch AutoBridge and the provider to avoid I/O and heavy model init. -# - Build a config with small, safe overrides and assert it forms a valid ConfigContainer. +# - Build a config and assert it forms a valid ConfigContainer. # - Verify dataset provider selection, parallelism fields, freeze options, and PEFT defaults. # @@ -28,38 +28,30 @@ _qwen35_vl_module = importlib.import_module("megatron.bridge.recipes.qwen_vl.qwen35_vl") -_QWEN35_VL_RECIPE_FUNCS = [ - _qwen35_vl_module.qwen35_vl_800m_finetune_config, - _qwen35_vl_module.qwen35_vl_2b_finetune_config, - _qwen35_vl_module.qwen35_vl_4b_finetune_config, - _qwen35_vl_module.qwen35_vl_9b_finetune_config, - _qwen35_vl_module.qwen35_vl_27b_finetune_config, - _qwen35_vl_module.qwen35_vl_35b_a3b_finetune_config, - _qwen35_vl_module.qwen35_vl_122b_a10b_finetune_config, - _qwen35_vl_module.qwen35_vl_397b_a17b_finetune_config, -] +# SFT configs (parameterless) +_QWEN35_VL_SFT_FUNCS = [ + _qwen35_vl_module.qwen35_vl_800m_sft_config, + _qwen35_vl_module.qwen35_vl_2b_sft_config, + _qwen35_vl_module.qwen35_vl_4b_sft_config, + _qwen35_vl_module.qwen35_vl_9b_sft_config, + _qwen35_vl_module.qwen35_vl_27b_sft_config, + _qwen35_vl_module.qwen35_vl_35b_a3b_sft_config, + _qwen35_vl_module.qwen35_vl_122b_a10b_sft_config, + _qwen35_vl_module.qwen35_vl_397b_a17b_sft_config, +] -def _safe_overrides_for(name: str) -> dict: - """Create safe test overrides for a given recipe function name.""" - overrides = { - "name": f"unit_{name}", - "dir": ".", - "dataset_type": "mock", - "train_iters": 10, - "global_batch_size": 2, - "micro_batch_size": 1, - "seq_length": 64, - "lr": 1e-4, - "min_lr": 1e-5, - "lr_warmup_iters": 2, - "tensor_model_parallel_size": 1, - "pipeline_model_parallel_size": 1, - "context_parallel_size": 1, - "expert_model_parallel_size": 1, - "use_null_tokenizer": True, - } - return overrides +# PEFT configs (take peft_scheme parameter) +_QWEN35_VL_PEFT_FUNCS = [ + _qwen35_vl_module.qwen35_vl_800m_peft_config, + _qwen35_vl_module.qwen35_vl_2b_peft_config, + _qwen35_vl_module.qwen35_vl_4b_peft_config, + _qwen35_vl_module.qwen35_vl_9b_peft_config, + _qwen35_vl_module.qwen35_vl_27b_peft_config, + _qwen35_vl_module.qwen35_vl_35b_a3b_peft_config, + _qwen35_vl_module.qwen35_vl_122b_a10b_peft_config, + _qwen35_vl_module.qwen35_vl_397b_a17b_peft_config, +] class _FakeModelCfg: @@ -78,14 +70,6 @@ def __init__(self): self.freeze_language_model = False self.freeze_vision_model = False self.freeze_vision_projection = False - self.account_for_embedding_in_pipeline_split = False - self.account_for_loss_in_pipeline_split = False - self.recompute_granularity = None - self.recompute_method = None - self.recompute_num_layers = None - - def validate_parallelism(self): - return None def finalize(self): return None @@ -95,7 +79,7 @@ class _FakeAutoBridge: """Fake AutoBridge for testing.""" @staticmethod - def from_hf_pretrained(hf_path: str, **kwargs): + def from_hf_pretrained(hf_path: str): return _FakeAutoBridge() def to_megatron_provider(self, load_weights: bool = False): @@ -119,638 +103,527 @@ def _assert_basic_config(cfg): assert cfg.train.global_batch_size >= 1 assert cfg.train.micro_batch_size >= 1 - - if hasattr(cfg.dataset, "seq_length"): - assert cfg.dataset.seq_length >= 1 + assert cfg.dataset.seq_length >= 1 # --------------------------------------------------------------------------- -# Basic recipe building tests +# Basic SFT recipe building tests # --------------------------------------------------------------------------- 
-@pytest.mark.parametrize("recipe_func", _QWEN35_VL_RECIPE_FUNCS) -def test_each_qwen35_vl_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): - """Each Qwen3.5-VL recipe function should build a valid ConfigContainer.""" +@pytest.mark.parametrize("recipe_func", _QWEN35_VL_SFT_FUNCS) +def test_each_qwen35_vl_sft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each Qwen3.5-VL SFT recipe function builds a valid configuration.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for(recipe_func.__name__) - cfg = recipe_func(**overrides) + cfg = recipe_func() _assert_basic_config(cfg) + if hasattr(cfg, "tokenizer") and hasattr(cfg.tokenizer, "tokenizer_type"): + assert cfg.tokenizer.tokenizer_type == "NullTokenizer" + assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 + assert hasattr(cfg.model, "freeze_language_model") assert hasattr(cfg.model, "freeze_vision_model") assert hasattr(cfg.model, "freeze_vision_projection") + assert cfg.peft is None + # --------------------------------------------------------------------------- -# Dataset type selection +# Basic PEFT recipe building tests # --------------------------------------------------------------------------- -@pytest.mark.parametrize("dataset_type", ["mock", "hf", "preloaded"]) -def test_qwen35_vl_dataset_type_selection(dataset_type: str, monkeypatch: pytest.MonkeyPatch): - """Different dataset_type values should produce the correct dataset provider.""" +@pytest.mark.parametrize("recipe_func", _QWEN35_VL_PEFT_FUNCS) +def test_each_qwen35_vl_peft_recipe_builds_config(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch): + """Test that each Qwen3.5-VL PEFT recipe function builds a valid configuration.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") - overrides["dataset_type"] = dataset_type + cfg = recipe_func() + + _assert_basic_config(cfg) - if dataset_type == "preloaded": - overrides["train_data_path"] = ["/fake/train.json"] - overrides["valid_data_path"] = ["/fake/valid.json"] - overrides["test_data_path"] = ["/fake/test.json"] - overrides["image_folder"] = "/fake/images" + if hasattr(cfg, "tokenizer") and hasattr(cfg.tokenizer, "tokenizer_type"): + assert cfg.tokenizer.tokenizer_type == "NullTokenizer" - cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides) + assert getattr(cfg.model, "tensor_model_parallel_size", 1) >= 1 + assert getattr(cfg.model, "pipeline_model_parallel_size", 1) >= 1 - from megatron.bridge.data.vlm_datasets import ( - HFDatasetConversationProvider, - MockVLMConversationProvider, - PreloadedVLMConversationProvider, - ) + assert hasattr(cfg.model, "freeze_language_model") + assert hasattr(cfg.model, "freeze_vision_model") + assert hasattr(cfg.model, "freeze_vision_projection") - if dataset_type == "mock": - assert isinstance(cfg.dataset, MockVLMConversationProvider) - elif dataset_type == "hf": - assert isinstance(cfg.dataset, HFDatasetConversationProvider) - elif dataset_type == "preloaded": - assert isinstance(cfg.dataset, PreloadedVLMConversationProvider) + assert cfg.peft is not None + assert hasattr(cfg.peft, "dim") + assert hasattr(cfg.peft, "alpha") # --------------------------------------------------------------------------- -# Training scenarios: SFT freeze combinations +# PEFT schemes # 
--------------------------------------------------------------------------- -def test_sft_nothing_frozen(monkeypatch: pytest.MonkeyPatch): - """Scenario 1: Full SFT with nothing frozen — all modules trainable.""" +@pytest.mark.parametrize("recipe_func", _QWEN35_VL_PEFT_FUNCS) +@pytest.mark.parametrize("peft_scheme", ["lora", "dora"]) +def test_qwen35_vl_peft_schemes(recipe_func: Callable, peft_scheme: str, monkeypatch: pytest.MonkeyPatch): + """Test that different PEFT schemes are correctly applied for Qwen3.5-VL models.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") - overrides["peft"] = None - overrides["freeze_language_model"] = False - overrides["freeze_vision_model"] = False - overrides["freeze_vision_projection"] = False - - cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides) + cfg = recipe_func(peft_scheme=peft_scheme) _assert_basic_config(cfg) - assert cfg.peft is None - assert cfg.model.freeze_language_model is False - assert cfg.model.freeze_vision_model is False - assert cfg.model.freeze_vision_projection is False + assert cfg.peft is not None + assert hasattr(cfg.peft, "dim") + assert hasattr(cfg.peft, "alpha") -def test_sft_language_model_frozen(monkeypatch: pytest.MonkeyPatch): - """Scenario 2: SFT with language model frozen — train vision + projection.""" - monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") - overrides["peft"] = None - overrides["freeze_language_model"] = True - overrides["freeze_vision_model"] = False - overrides["freeze_vision_projection"] = False +# --------------------------------------------------------------------------- +# 800M dense defaults +# --------------------------------------------------------------------------- + + +def test_qwen35_vl_800m_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """800M SFT should have correct default parallelism and learning rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides) + cfg = _qwen35_vl_module.qwen35_vl_800m_sft_config() _assert_basic_config(cfg) + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 assert cfg.peft is None - assert cfg.model.freeze_language_model is True - assert cfg.model.freeze_vision_model is False - assert cfg.model.freeze_vision_projection is False + assert cfg.optimizer.lr == 5e-6 -def test_sft_vision_and_language_frozen(monkeypatch: pytest.MonkeyPatch): - """Scenario 3: SFT with vision + language frozen — train projection only.""" +def test_qwen35_vl_800m_peft_defaults(monkeypatch: pytest.MonkeyPatch): + """800M PEFT should have correct default parallelism and learning rate.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") - overrides["peft"] = None - overrides["freeze_language_model"] = True - overrides["freeze_vision_model"] = True - overrides["freeze_vision_projection"] = False - - cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides) + cfg = _qwen35_vl_module.qwen35_vl_800m_peft_config() _assert_basic_config(cfg) - assert cfg.peft is None - assert cfg.model.freeze_language_model is True - assert cfg.model.freeze_vision_model is True - assert cfg.model.freeze_vision_projection is False + assert cfg.model.tensor_model_parallel_size == 1 + assert 
cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is not None + assert cfg.optimizer.lr == 1e-4 # --------------------------------------------------------------------------- -# Training scenarios: PEFT + freeze combinations +# 2B dense defaults # --------------------------------------------------------------------------- -def test_peft_lora_language_only(monkeypatch: pytest.MonkeyPatch): - """Scenario 4: LoRA adapters on all modules, vision base weights frozen. - - Default LoRA targets linear_qkv/proj/fc1/fc2 in both vision and language. - Freezing vision base weights means only LoRA adapter deltas are trainable - on the vision side, while the language model base weights remain trainable - as well (unless also frozen via freeze_language_model). The typical - "language-only PEFT" pattern freezes vision + projection and adds LoRA - adapters; the language base weights are also frozen by the recipe default, - but LoRA adapter weights on the language side are always trainable. - """ +def test_qwen35_vl_2b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """2B SFT should have correct default parallelism and learning rate.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") - overrides["peft"] = "lora" - overrides["freeze_language_model"] = True - overrides["freeze_vision_model"] = True - overrides["freeze_vision_projection"] = True - - cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides) + cfg = _qwen35_vl_module.qwen35_vl_2b_sft_config() _assert_basic_config(cfg) - assert cfg.peft is not None - assert cfg.peft.dim == 32 - assert cfg.model.freeze_language_model is True - assert cfg.model.freeze_vision_model is True - assert cfg.model.freeze_vision_projection is True - + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is None + assert cfg.optimizer.lr == 5e-6 -def test_peft_lora_vision_and_language(monkeypatch: pytest.MonkeyPatch): - """Scenario 5: LoRA adapters with nothing frozen — adapters on all modules. - LoRA targets linear_qkv/proj/fc1/fc2 in both vision and language. - With nothing frozen, all base weights and all adapter weights are trainable. 
- """ +def test_qwen35_vl_2b_peft_defaults(monkeypatch: pytest.MonkeyPatch): + """2B PEFT should have correct default parallelism and learning rate.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") - overrides["peft"] = "lora" - overrides["freeze_language_model"] = False - overrides["freeze_vision_model"] = False - overrides["freeze_vision_projection"] = False - - cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides) + cfg = _qwen35_vl_module.qwen35_vl_2b_peft_config() _assert_basic_config(cfg) + assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.pipeline_model_parallel_size == 1 assert cfg.peft is not None - assert cfg.peft.dim == 32 - assert cfg.model.freeze_language_model is False - assert cfg.model.freeze_vision_model is False - assert cfg.model.freeze_vision_projection is False + assert cfg.optimizer.lr == 1e-4 # --------------------------------------------------------------------------- -# PEFT vs full SFT (parametrized across all recipes) +# 4B dense defaults # --------------------------------------------------------------------------- -@pytest.mark.parametrize("recipe_func", _QWEN35_VL_RECIPE_FUNCS) -@pytest.mark.parametrize("peft", ["lora", "dora", None]) -def test_qwen35_vl_finetune_peft_vs_full_sft(recipe_func, peft, monkeypatch: pytest.MonkeyPatch): - """PEFT and full SFT configurations should be correctly applied.""" +def test_qwen35_vl_4b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """4B SFT should have correct default parallelism and learning rate.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for(recipe_func.__name__) - overrides["peft"] = peft - - cfg = recipe_func(**overrides) + cfg = _qwen35_vl_module.qwen35_vl_4b_sft_config() _assert_basic_config(cfg) - - if peft in ["lora", "dora"]: - assert cfg.peft is not None - assert hasattr(cfg.peft, "dim") - assert hasattr(cfg.peft, "alpha") - elif peft is None: - assert cfg.peft is None - - -# --------------------------------------------------------------------------- -# 800M dense defaults -# --------------------------------------------------------------------------- + assert cfg.model.tensor_model_parallel_size == 2 + assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.peft is None + assert cfg.optimizer.lr == 5e-6 -def test_qwen35_vl_800m_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """800M LoRA should have correct default parallelism and learning rate.""" +def test_qwen35_vl_4b_peft_defaults(monkeypatch: pytest.MonkeyPatch): + """4B PEFT should have correct default parallelism and learning rate.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_800m_finetune_config") - overrides["peft"] = "lora" - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _qwen35_vl_module.qwen35_vl_800m_finetune_config(**overrides) + cfg = _qwen35_vl_module.qwen35_vl_4b_peft_config() _assert_basic_config(cfg) - assert cfg.model.tensor_model_parallel_size == 1 assert cfg.model.pipeline_model_parallel_size == 1 assert cfg.peft is not None assert cfg.optimizer.lr == 1e-4 -def test_qwen35_vl_800m_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """800M full SFT should have correct default parallelism and learning rate.""" - monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) +# 
--------------------------------------------------------------------------- +# 9B dense defaults +# --------------------------------------------------------------------------- + - overrides = _safe_overrides_for("qwen35_vl_800m_finetune_config") - overrides["peft"] = None - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) +def test_qwen35_vl_9b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """9B SFT should have correct default parallelism and learning rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - cfg = _qwen35_vl_module.qwen35_vl_800m_finetune_config(**overrides) + cfg = _qwen35_vl_module.qwen35_vl_9b_sft_config() _assert_basic_config(cfg) - - assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.tensor_model_parallel_size == 4 assert cfg.model.pipeline_model_parallel_size == 1 assert cfg.peft is None assert cfg.optimizer.lr == 5e-6 -# --------------------------------------------------------------------------- -# 2B dense defaults -# --------------------------------------------------------------------------- - - -def test_qwen35_vl_2b_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """2B LoRA should have correct default parallelism and learning rate.""" +def test_qwen35_vl_9b_peft_defaults(monkeypatch: pytest.MonkeyPatch): + """9B PEFT should have correct default parallelism and learning rate.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_2b_finetune_config") - overrides["peft"] = "lora" - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _qwen35_vl_module.qwen35_vl_2b_finetune_config(**overrides) + cfg = _qwen35_vl_module.qwen35_vl_9b_peft_config() _assert_basic_config(cfg) - assert cfg.model.tensor_model_parallel_size == 1 assert cfg.model.pipeline_model_parallel_size == 1 assert cfg.peft is not None assert cfg.optimizer.lr == 1e-4 -def test_qwen35_vl_2b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """2B full SFT should have correct default parallelism and learning rate.""" - monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) +# --------------------------------------------------------------------------- +# 27B dense defaults +# --------------------------------------------------------------------------- - overrides = _safe_overrides_for("qwen35_vl_2b_finetune_config") - overrides["peft"] = None - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - cfg = _qwen35_vl_module.qwen35_vl_2b_finetune_config(**overrides) +def test_qwen35_vl_27b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """27B SFT should have correct default parallelism and learning rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - _assert_basic_config(cfg) + cfg = _qwen35_vl_module.qwen35_vl_27b_sft_config() - assert cfg.model.tensor_model_parallel_size == 1 - assert cfg.model.pipeline_model_parallel_size == 1 + _assert_basic_config(cfg) + assert cfg.model.tensor_model_parallel_size == 4 + assert cfg.model.pipeline_model_parallel_size == 4 + assert cfg.model.pipeline_dtype == torch.bfloat16 assert cfg.peft is None assert cfg.optimizer.lr == 5e-6 -# --------------------------------------------------------------------------- -# 4B dense defaults -# --------------------------------------------------------------------------- - - -def 
test_qwen35_vl_4b_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """4B LoRA should have correct default parallelism and learning rate.""" +def test_qwen35_vl_27b_peft_lora_defaults(monkeypatch: pytest.MonkeyPatch): + """27B LoRA should have correct default parallelism and learning rate.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_4b_finetune_config") - overrides["peft"] = "lora" - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _qwen35_vl_module.qwen35_vl_4b_finetune_config(**overrides) + cfg = _qwen35_vl_module.qwen35_vl_27b_peft_config(peft_scheme="lora") _assert_basic_config(cfg) - - assert cfg.model.tensor_model_parallel_size == 1 + assert cfg.model.tensor_model_parallel_size == 2 assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.model.pipeline_dtype is None assert cfg.peft is not None + assert cfg.peft.dim == 32 + assert cfg.peft.alpha == 32 assert cfg.optimizer.lr == 1e-4 -def test_qwen35_vl_4b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """4B full SFT should have correct default parallelism and learning rate.""" +def test_qwen35_vl_27b_peft_dora_defaults(monkeypatch: pytest.MonkeyPatch): + """27B DoRA should have correct default parallelism and learning rate.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_4b_finetune_config") - overrides["peft"] = None - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _qwen35_vl_module.qwen35_vl_4b_finetune_config(**overrides) + cfg = _qwen35_vl_module.qwen35_vl_27b_peft_config(peft_scheme="dora") _assert_basic_config(cfg) - assert cfg.model.tensor_model_parallel_size == 2 assert cfg.model.pipeline_model_parallel_size == 1 - assert cfg.peft is None - assert cfg.optimizer.lr == 5e-6 + assert cfg.peft is not None + assert cfg.peft.dim == 32 + assert cfg.peft.alpha == 64 # --------------------------------------------------------------------------- -# 9B dense defaults +# 35B-A3B MoE defaults # --------------------------------------------------------------------------- -def test_qwen35_vl_9b_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """9B LoRA should have correct default parallelism and learning rate.""" +def test_qwen35_vl_35b_a3b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """35B-A3B SFT should have correct default parallelism and learning rate.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_9b_finetune_config") - overrides["peft"] = "lora" - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _qwen35_vl_module.qwen35_vl_9b_finetune_config(**overrides) + cfg = _qwen35_vl_module.qwen35_vl_35b_a3b_sft_config() _assert_basic_config(cfg) - assert cfg.model.tensor_model_parallel_size == 2 assert cfg.model.pipeline_model_parallel_size == 1 - assert cfg.peft is not None - assert cfg.optimizer.lr == 1e-4 + assert cfg.model.expert_model_parallel_size == 16 + assert cfg.model.pipeline_dtype == torch.bfloat16 + assert cfg.peft is None + assert cfg.optimizer.lr == 2e-5 -def test_qwen35_vl_9b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """9B full SFT should have correct default parallelism and learning rate.""" +def test_qwen35_vl_35b_a3b_peft_defaults(monkeypatch: pytest.MonkeyPatch): + """35B-A3B PEFT should 
have correct default parallelism and learning rate.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_9b_finetune_config") - overrides["peft"] = None - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _qwen35_vl_module.qwen35_vl_9b_finetune_config(**overrides) + cfg = _qwen35_vl_module.qwen35_vl_35b_a3b_peft_config() _assert_basic_config(cfg) - - assert cfg.model.tensor_model_parallel_size == 4 + assert cfg.model.tensor_model_parallel_size == 2 assert cfg.model.pipeline_model_parallel_size == 1 - assert cfg.peft is None - assert cfg.optimizer.lr == 5e-6 + assert cfg.model.expert_model_parallel_size == 4 + assert cfg.peft is not None + assert cfg.optimizer.lr == 2e-4 # --------------------------------------------------------------------------- -# 27B dense defaults +# 122B-A10B MoE defaults # --------------------------------------------------------------------------- -def test_qwen35_vl_27b_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """27B LoRA should have correct default parallelism and learning rate.""" +def test_qwen35_vl_122b_a10b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """122B-A10B SFT should have correct default parallelism and learning rate.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") - overrides["peft"] = "lora" - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides) + cfg = _qwen35_vl_module.qwen35_vl_122b_a10b_sft_config() _assert_basic_config(cfg) - assert cfg.model.tensor_model_parallel_size == 2 - assert cfg.model.pipeline_model_parallel_size == 1 - assert cfg.peft is not None - assert cfg.peft.dim == 32 - assert cfg.peft.alpha == 32 - assert cfg.optimizer.lr == 1e-4 - assert cfg.model.pipeline_dtype is None + assert cfg.model.pipeline_model_parallel_size == 6 + assert cfg.model.expert_model_parallel_size == 8 + assert cfg.model.pipeline_dtype == torch.bfloat16 + assert cfg.peft is None + assert cfg.optimizer.lr == 2e-5 + assert cfg.model.recompute_granularity == "full" -def test_qwen35_vl_27b_dora_defaults(monkeypatch: pytest.MonkeyPatch): - """27B DoRA should have correct default parallelism and learning rate.""" +def test_qwen35_vl_122b_a10b_peft_defaults(monkeypatch: pytest.MonkeyPatch): + """122B-A10B PEFT should have correct default parallelism and learning rate.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") - overrides["peft"] = "dora" - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - - cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides) + cfg = _qwen35_vl_module.qwen35_vl_122b_a10b_peft_config() _assert_basic_config(cfg) - assert cfg.model.tensor_model_parallel_size == 2 assert cfg.model.pipeline_model_parallel_size == 1 + assert cfg.model.expert_model_parallel_size == 8 + assert cfg.model.pipeline_dtype == torch.bfloat16 assert cfg.peft is not None - assert cfg.peft.dim == 32 - assert cfg.peft.alpha == 64 + assert cfg.optimizer.lr == 2e-4 -def test_qwen35_vl_27b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """27B full SFT should have correct default parallelism and learning rate.""" - monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", 
_FakeAutoBridge) +# --------------------------------------------------------------------------- +# 397B-A17B MoE defaults +# --------------------------------------------------------------------------- - overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") - overrides["peft"] = None - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides) +def test_qwen35_vl_397b_a17b_sft_defaults(monkeypatch: pytest.MonkeyPatch): + """397B-A17B SFT should have correct default parallelism and learning rate.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - _assert_basic_config(cfg) + cfg = _qwen35_vl_module.qwen35_vl_397b_a17b_sft_config() - assert cfg.model.tensor_model_parallel_size == 4 + _assert_basic_config(cfg) + assert cfg.model.tensor_model_parallel_size == 2 assert cfg.model.pipeline_model_parallel_size == 4 - assert cfg.peft is None - assert cfg.optimizer.lr == 5e-6 + assert cfg.model.expert_model_parallel_size == 32 assert cfg.model.pipeline_dtype == torch.bfloat16 + assert cfg.peft is None + assert cfg.optimizer.lr == 2e-5 + assert cfg.model.recompute_granularity == "full" -# --------------------------------------------------------------------------- -# 35B-A3B MoE defaults -# --------------------------------------------------------------------------- - - -def test_qwen35_vl_35b_a3b_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """35B-A3B LoRA should have correct default parallelism and learning rate.""" +def test_qwen35_vl_397b_a17b_peft_defaults(monkeypatch: pytest.MonkeyPatch): + """397B-A17B PEFT should have correct default parallelism and learning rate.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_35b_a3b_finetune_config") - overrides["peft"] = "lora" - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - overrides.pop("expert_model_parallel_size", None) - - cfg = _qwen35_vl_module.qwen35_vl_35b_a3b_finetune_config(**overrides) + cfg = _qwen35_vl_module.qwen35_vl_397b_a17b_peft_config() _assert_basic_config(cfg) - assert cfg.model.tensor_model_parallel_size == 2 assert cfg.model.pipeline_model_parallel_size == 1 - assert cfg.model.expert_model_parallel_size == 4 + assert cfg.model.expert_model_parallel_size == 32 assert cfg.peft is not None assert cfg.optimizer.lr == 2e-4 + assert cfg.model.pipeline_dtype == torch.bfloat16 + +# --------------------------------------------------------------------------- +# Common config properties +# --------------------------------------------------------------------------- -def test_qwen35_vl_35b_a3b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """35B-A3B full SFT should have correct default parallelism and learning rate.""" + +def test_qwen35_vl_sft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs use HFDatasetConversationProvider by default.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_35b_a3b_finetune_config") - overrides["peft"] = None - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - overrides.pop("expert_model_parallel_size", None) + cfg = _qwen35_vl_module.qwen35_vl_800m_sft_config() - cfg = _qwen35_vl_module.qwen35_vl_35b_a3b_finetune_config(**overrides) + from 
megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider - _assert_basic_config(cfg) + assert isinstance(cfg.dataset, HFDatasetConversationProvider) - assert cfg.model.tensor_model_parallel_size == 2 - assert cfg.model.pipeline_model_parallel_size == 1 - assert cfg.model.expert_model_parallel_size == 16 - assert cfg.peft is None - assert cfg.optimizer.lr == 2e-5 - assert cfg.model.pipeline_dtype == torch.bfloat16 +def test_qwen35_vl_peft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch): + """Test that PEFT configs use HFDatasetConversationProvider by default.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) -# --------------------------------------------------------------------------- -# 122B-A10B MoE defaults -# --------------------------------------------------------------------------- + cfg = _qwen35_vl_module.qwen35_vl_800m_peft_config() + from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider -def test_qwen35_vl_122b_a10b_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """122B-A10B LoRA should have correct default parallelism and learning rate.""" - monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + assert isinstance(cfg.dataset, HFDatasetConversationProvider) - overrides = _safe_overrides_for("qwen35_vl_122b_a10b_finetune_config") - overrides["peft"] = "lora" - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - overrides.pop("expert_model_parallel_size", None) - cfg = _qwen35_vl_module.qwen35_vl_122b_a10b_finetune_config(**overrides) +def test_qwen35_vl_sft_freeze_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that SFT configs have freeze options set to False by default.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - _assert_basic_config(cfg) + cfg = _qwen35_vl_module.qwen35_vl_800m_sft_config() - assert cfg.model.tensor_model_parallel_size == 2 - assert cfg.model.pipeline_model_parallel_size == 1 - assert cfg.model.expert_model_parallel_size == 8 - assert cfg.peft is not None - assert cfg.optimizer.lr == 2e-4 - assert cfg.model.pipeline_dtype == torch.bfloat16 + assert cfg.model.freeze_language_model is False + assert cfg.model.freeze_vision_model is False + assert cfg.model.freeze_vision_projection is False -def test_qwen35_vl_122b_a10b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """122B-A10B full SFT should have correct default parallelism and learning rate.""" +def test_qwen35_vl_peft_freeze_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that PEFT configs have freeze options set to False by default.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_122b_a10b_finetune_config") - overrides["peft"] = None - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - overrides.pop("expert_model_parallel_size", None) + cfg = _qwen35_vl_module.qwen35_vl_800m_peft_config() - cfg = _qwen35_vl_module.qwen35_vl_122b_a10b_finetune_config(**overrides) + assert cfg.model.freeze_language_model is False + assert cfg.model.freeze_vision_model is False + assert cfg.model.freeze_vision_projection is False - _assert_basic_config(cfg) - assert cfg.model.tensor_model_parallel_size == 2 - assert cfg.model.pipeline_model_parallel_size == 6 - assert cfg.model.expert_model_parallel_size == 8 - assert cfg.peft is None - assert cfg.optimizer.lr == 2e-5 - assert 
cfg.model.pipeline_dtype == torch.bfloat16 +def test_qwen35_vl_precision_config(monkeypatch: pytest.MonkeyPatch): + """Test that precision config is correctly set.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + cfg = _qwen35_vl_module.qwen35_vl_800m_sft_config() -# --------------------------------------------------------------------------- -# 397B MoE defaults -# --------------------------------------------------------------------------- + _assert_basic_config(cfg) + assert cfg.mixed_precision == "bf16_mixed" -def test_qwen35_vl_397b_a17b_lora_defaults(monkeypatch: pytest.MonkeyPatch): - """397B-A17B LoRA should have correct default parallelism and learning rate.""" +def test_qwen35_vl_ddp_config(monkeypatch: pytest.MonkeyPatch): + """Test that DDP config is correctly set for VLMs.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_397b_a17b_finetune_config") - overrides["peft"] = "lora" - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - overrides.pop("expert_model_parallel_size", None) - - cfg = _qwen35_vl_module.qwen35_vl_397b_a17b_finetune_config(**overrides) + cfg = _qwen35_vl_module.qwen35_vl_800m_sft_config() _assert_basic_config(cfg) - - assert cfg.model.tensor_model_parallel_size == 2 - assert cfg.model.pipeline_model_parallel_size == 1 - assert cfg.model.expert_model_parallel_size == 32 - assert cfg.peft is not None - assert cfg.optimizer.lr == 2e-4 + assert cfg.ddp.overlap_grad_reduce is False + assert cfg.ddp.overlap_param_gather is False + assert cfg.ddp.check_for_nan_in_grad is True + assert cfg.ddp.use_distributed_optimizer is True -def test_qwen35_vl_397b_a17b_full_sft_defaults(monkeypatch: pytest.MonkeyPatch): - """397B-A17B full SFT should have correct default parallelism and learning rate.""" +def test_qwen35_vl_optimizer_precision_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that optimizer precision settings are correctly configured.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_397b_a17b_finetune_config") - overrides["peft"] = None - overrides.pop("tensor_model_parallel_size", None) - overrides.pop("pipeline_model_parallel_size", None) - overrides.pop("expert_model_parallel_size", None) + cfg = _qwen35_vl_module.qwen35_vl_800m_sft_config() - cfg = _qwen35_vl_module.qwen35_vl_397b_a17b_finetune_config(**overrides) + _assert_basic_config(cfg) + assert cfg.optimizer.use_precision_aware_optimizer is False + assert cfg.optimizer.main_grads_dtype == torch.float32 + assert cfg.optimizer.main_params_dtype == torch.float32 + assert cfg.optimizer.exp_avg_dtype == torch.float32 + assert cfg.optimizer.exp_avg_sq_dtype == torch.float32 + + +def test_qwen35_vl_training_config(monkeypatch: pytest.MonkeyPatch): + """Test that training configuration is correctly set.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen35_vl_module.qwen35_vl_800m_sft_config() _assert_basic_config(cfg) + assert cfg.train.train_iters == 300000 + assert cfg.train.global_batch_size == 32 + assert cfg.train.micro_batch_size == 1 + assert cfg.train.manual_gc is True + assert cfg.train.manual_gc_interval == 100 - assert cfg.model.tensor_model_parallel_size == 2 - assert cfg.model.pipeline_model_parallel_size == 4 - assert cfg.model.expert_model_parallel_size == 32 - assert cfg.peft is None - assert cfg.optimizer.lr == 2e-5 - assert 
cfg.model.pipeline_dtype == torch.bfloat16 +def test_qwen35_vl_validation_config(monkeypatch: pytest.MonkeyPatch): + """Test that validation configuration is correctly set.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) -# --------------------------------------------------------------------------- -# Custom overrides -# --------------------------------------------------------------------------- + cfg = _qwen35_vl_module.qwen35_vl_800m_sft_config() + + _assert_basic_config(cfg) + assert cfg.validation.eval_interval == 500 + assert cfg.validation.eval_iters == 32 -def test_qwen35_vl_custom_finetune_lr(monkeypatch: pytest.MonkeyPatch): - """Custom finetune_lr should override default learning rate.""" +def test_qwen35_vl_sft_learning_rate(monkeypatch: pytest.MonkeyPatch): + """Test that SFT has lower learning rate than PEFT.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") - overrides["peft"] = "lora" - overrides["finetune_lr"] = 2e-4 + sft_cfg = _qwen35_vl_module.qwen35_vl_800m_sft_config() + peft_cfg = _qwen35_vl_module.qwen35_vl_800m_peft_config() + + assert sft_cfg.optimizer.lr < peft_cfg.optimizer.lr + + +def test_qwen35_vl_kernel_settings(monkeypatch: pytest.MonkeyPatch): + """Test that kernel settings are correctly configured.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides) + cfg = _qwen35_vl_module.qwen35_vl_800m_sft_config() _assert_basic_config(cfg) - assert cfg.optimizer.lr == 2e-4 + assert cfg.model.attention_backend == "auto" + assert cfg.model.cross_entropy_loss_fusion is True + assert cfg.model.cross_entropy_fusion_impl == "native" -def test_qwen35_vl_recompute_option(monkeypatch: pytest.MonkeyPatch): - """enable_recompute should set recompute fields on the model config.""" +def test_qwen35_vl_cuda_graph_settings(monkeypatch: pytest.MonkeyPatch): + """Test that CUDA graph settings are correctly configured.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") - overrides["enable_recompute"] = True + cfg = _qwen35_vl_module.qwen35_vl_800m_sft_config() + + _assert_basic_config(cfg) + assert cfg.model.cuda_graph_impl == "none" + assert cfg.model.cuda_graph_scope == "full" + assert cfg.model.cuda_graph_warmup_steps == 3 + - cfg = _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides) +def test_qwen35_vl_transformer_impl(monkeypatch: pytest.MonkeyPatch): + """Test that transformer implementation is set correctly.""" + monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) + + cfg = _qwen35_vl_module.qwen35_vl_800m_sft_config() _assert_basic_config(cfg) - assert cfg.model.recompute_granularity == "full" - assert cfg.model.recompute_method == "uniform" - assert cfg.model.recompute_num_layers == 1 + assert cfg.model.transformer_impl == "transformer_engine" -def test_qwen35_vl_invalid_dataset_type(monkeypatch: pytest.MonkeyPatch): - """An unsupported dataset_type should raise ValueError.""" +def test_qwen35_vl_memory_saving_defaults(monkeypatch: pytest.MonkeyPatch): + """Test that memory saving settings are disabled by default.""" monkeypatch.setattr(_qwen35_vl_module, "AutoBridge", _FakeAutoBridge) - overrides = _safe_overrides_for("qwen35_vl_27b_finetune_config") - overrides["dataset_type"] = "unsupported" + cfg = _qwen35_vl_module.qwen35_vl_800m_sft_config() - with 
pytest.raises(ValueError, match="Unsupported dataset_type"):
-        _qwen35_vl_module.qwen35_vl_27b_finetune_config(**overrides)
+    _assert_basic_config(cfg)
+    assert cfg.model.recompute_granularity is None
+    assert cfg.model.recompute_modules is None
+    assert cfg.model.fine_grained_activation_offloading is False
+    assert cfg.model.offload_modules is None

From ad628e26f782c27d3165de14e1788508e3d6e9c4 Mon Sep 17 00:00:00 2001
From: Chen Cui
Date: Wed, 4 Mar 2026 22:51:01 -0800
Subject: [PATCH 4/9] Address CodeRabbit review feedback on SLURM scripts and tests

- Add `set -euo pipefail` to slurm_sft.sh and slurm_peft.sh for fail-fast behavior, placed after the #SBATCH block so sbatch still parses the directives
- Remove `logs/` prefix from SBATCH output/error paths (directory doesn't exist at job start)
- Remove now-unnecessary `mkdir -p logs` calls
- Fix 122B-A10B parallelism comment in slurm_sft.sh to match recipe (TP=2, PP=6, EP=8)
- Add `pytestmark = pytest.mark.integration` to functional test module
- Reset microbatch calculator both before and after each test in fixture, re-reading the module-level calculator after the test so the check is not stale

Signed-off-by: Chen Cui
Made-with: Cursor
---
 examples/models/vlm/qwen35_vl/slurm_peft.sh        |  7 +++----
 examples/models/vlm/qwen35_vl/slurm_sft.sh         |  9 ++++-----
 .../recipes/test_qwen35_vl_recipes_finetune.py     | 13 ++++++++++++-
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/examples/models/vlm/qwen35_vl/slurm_peft.sh b/examples/models/vlm/qwen35_vl/slurm_peft.sh
index fd2abb2ec1..1ce905045d 100755
--- a/examples/models/vlm/qwen35_vl/slurm_peft.sh
+++ b/examples/models/vlm/qwen35_vl/slurm_peft.sh
@@ -46,8 +46,9 @@
 #SBATCH --time=08:00:00
 #SBATCH --partition=gpu
 #SBATCH --account=my_account
-#SBATCH --output=logs/qwen35vl_lora_%j.out
-#SBATCH --error=logs/qwen35vl_lora_%j.err
+#SBATCH --output=qwen35vl_lora_%j.out
+#SBATCH --error=qwen35vl_lora_%j.err
 #SBATCH --exclusive
+set -euo pipefail

 # ==============================================================================
@@ -149,8 +150,6 @@
 echo "PEFT: LoRA"
 echo "Checkpoint: $PRETRAINED_CHECKPOINT"
 echo "======================================"
-mkdir -p logs
-
 CLI_OVERRIDES="\
 checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \
 model.seq_length=$SEQ_LENGTH \
diff --git a/examples/models/vlm/qwen35_vl/slurm_sft.sh b/examples/models/vlm/qwen35_vl/slurm_sft.sh
index 60eb3176e7..cc03d51211 100644
--- a/examples/models/vlm/qwen35_vl/slurm_sft.sh
+++ b/examples/models/vlm/qwen35_vl/slurm_sft.sh
@@ -31,7 +31,7 @@
 #   9B (dense):     TP=4, PP=1 (1 node)
 #   27B (dense):    TP=4, PP=4 (2 nodes)
 #   35B-A3B (MoE):  TP=2, PP=1, EP=16 (2 nodes)
-#   122B-A10B (MoE): TP=2, PP=1, EP=32 (4 nodes)
+#   122B-A10B (MoE): TP=2, PP=6, EP=8 (4 nodes)
 #   397B-A17B (MoE): TP=2, PP=4, EP=32 (16 nodes)
 #
 # Examples:
@@ -47,8 +47,9 @@
 #SBATCH --time=24:00:00
 #SBATCH --partition=gpu
 #SBATCH --account=my_account
-#SBATCH --output=logs/qwen35vl_sft_%j.out
-#SBATCH --error=logs/qwen35vl_sft_%j.err
+#SBATCH --output=qwen35vl_sft_%j.out
+#SBATCH --error=qwen35vl_sft_%j.err
 #SBATCH --exclusive
+set -euo pipefail

 # ==============================================================================
@@ -150,8 +151,6 @@
 echo "Recipe: $RECIPE"
 echo "Checkpoint: $PRETRAINED_CHECKPOINT"
 echo "======================================"
-mkdir -p logs
-
 CLI_OVERRIDES="\
 checkpoint.pretrained_checkpoint=$PRETRAINED_CHECKPOINT \
 model.seq_length=$SEQ_LENGTH \
diff --git a/tests/functional_tests/recipes/test_qwen35_vl_recipes_finetune.py b/tests/functional_tests/recipes/test_qwen35_vl_recipes_finetune.py
index 79a283930f..b320c3966d 100644
--- a/tests/functional_tests/recipes/test_qwen35_vl_recipes_finetune.py
+++ b/tests/functional_tests/recipes/test_qwen35_vl_recipes_finetune.py
@@ -30,6 +30,9 @@

 from tests.functional_tests.recipes.utils import run_pretrain_vl_recipe_test

+pytestmark = pytest.mark.integration
+
+
 _TP2_PP1 = {"tensor_model_parallel_size": 2, "pipeline_model_parallel_size": 1}

 _TINY_MODEL = {"num_layers": 4}
@@ -117,7 +120,6 @@ def _reset_microbatch_calculator(self):
         If a previous test fails mid-pretrain, destroy_global_state() never
         runs and the calculator leaks into the next test.
         """
-        yield
         from megatron.core.num_microbatches_calculator import (
             _GLOBAL_NUM_MICROBATCHES_CALCULATOR,
             destroy_num_microbatches_calculator,
@@ -126,6 +128,15 @@
         if _GLOBAL_NUM_MICROBATCHES_CALCULATOR is not None:
             destroy_num_microbatches_calculator()

+        yield
+
+        # Re-read the module attribute after the test: the name imported above
+        # is a stale binding once the test has created a new calculator.
+        import megatron.core.num_microbatches_calculator as _mb_calc
+
+        if _mb_calc._GLOBAL_NUM_MICROBATCHES_CALCULATOR is not None:
+            destroy_num_microbatches_calculator()
+

 # -----------------------------------------------------------------------
 # SFT scenarios
 # -----------------------------------------------------------------------

From c4448fc60f989bcfaa630faf5c034d9c8518a98a Mon Sep 17 00:00:00 2001
From: Chen Cui
Date: Thu, 5 Mar 2026 09:45:13 -0800
Subject: [PATCH 5/9] fix test

Signed-off-by: Chen Cui
---
 .../bridge/recipes/qwen_vl/qwen35_vl.py            | 65 ------------------
 .../models/qwen_vl/test_qwen35_vl_bridge.py        |  2 +-
 2 files changed, 1 insertion(+), 66 deletions(-)

diff --git a/src/megatron/bridge/recipes/qwen_vl/qwen35_vl.py b/src/megatron/bridge/recipes/qwen_vl/qwen35_vl.py
index b7b3ef83cb..ee65d4ba1f 100644
--- a/src/megatron/bridge/recipes/qwen_vl/qwen35_vl.py
+++ b/src/megatron/bridge/recipes/qwen_vl/qwen35_vl.py
@@ -90,10 +90,6 @@ def qwen35_vl_800m_sft_config() -> ConfigContainer:
     cfg.train.manual_gc_interval = 100
     cfg.train.manual_gc_eval = 100

-    # Validation config
-    cfg.validation.eval_interval = 500
-    cfg.validation.eval_iters = 32
-
     # Optimizer - lower LR for full SFT
     opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing(
         lr_warmup_iters=200,
@@ -191,10 +187,6 @@ def qwen35_vl_2b_sft_config() -> ConfigContainer:
     cfg.train.manual_gc_interval = 100
     cfg.train.manual_gc_eval = 100

-    # Validation config
-    cfg.validation.eval_interval = 500
-    cfg.validation.eval_iters = 32
-
     # Optimizer - lower LR for full SFT
     opt_cfg, scheduler_cfg = 
distributed_fused_adam_with_cosine_annealing( lr_warmup_iters=200, @@ -292,10 +284,6 @@ def qwen35_vl_4b_sft_config() -> ConfigContainer: cfg.train.manual_gc_interval = 100 cfg.train.manual_gc_eval = 100 - # Validation config - cfg.validation.eval_interval = 500 - cfg.validation.eval_iters = 32 - # Optimizer - lower LR for full SFT opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( lr_warmup_iters=200, @@ -393,10 +381,6 @@ def qwen35_vl_9b_sft_config() -> ConfigContainer: cfg.train.manual_gc_interval = 100 cfg.train.manual_gc_eval = 100 - # Validation config - cfg.validation.eval_interval = 500 - cfg.validation.eval_iters = 32 - # Optimizer - lower LR for full SFT opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( lr_warmup_iters=200, @@ -492,10 +476,6 @@ def qwen35_vl_27b_sft_config() -> ConfigContainer: cfg.train.manual_gc_interval = 100 cfg.train.manual_gc_eval = 100 - # Validation config - cfg.validation.eval_interval = 500 - cfg.validation.eval_iters = 32 - # Optimizer - lower LR for full SFT opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( lr_warmup_iters=200, @@ -607,10 +587,6 @@ def qwen35_vl_35b_a3b_sft_config() -> ConfigContainer: cfg.train.manual_gc_interval = 100 cfg.train.manual_gc_eval = 100 - # Validation config - cfg.validation.eval_interval = 500 - cfg.validation.eval_iters = 32 - # Optimizer - lower LR for full SFT opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( lr_warmup_iters=200, @@ -727,10 +703,6 @@ def qwen35_vl_122b_a10b_sft_config() -> ConfigContainer: cfg.train.manual_gc_interval = 100 cfg.train.manual_gc_eval = 100 - # Validation config - cfg.validation.eval_interval = 500 - cfg.validation.eval_iters = 32 - # Optimizer - lower LR for full SFT opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( lr_warmup_iters=200, @@ -847,10 +819,6 @@ def qwen35_vl_397b_a17b_sft_config() -> ConfigContainer: cfg.train.manual_gc_interval = 100 cfg.train.manual_gc_eval = 100 - # Validation config - cfg.validation.eval_interval = 500 - cfg.validation.eval_iters = 32 - # Optimizer - lower LR for full SFT opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( lr_warmup_iters=200, @@ -958,10 +926,6 @@ def qwen35_vl_800m_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContai cfg.train.manual_gc_interval = 100 cfg.train.manual_gc_eval = 100 - # Validation config - cfg.validation.eval_interval = 500 - cfg.validation.eval_iters = 32 - # Optimizer - higher LR for PEFT opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( lr_warmup_iters=200, @@ -1066,10 +1030,6 @@ def qwen35_vl_2b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContaine cfg.train.manual_gc_interval = 100 cfg.train.manual_gc_eval = 100 - # Validation config - cfg.validation.eval_interval = 500 - cfg.validation.eval_iters = 32 - # Optimizer - higher LR for PEFT opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( lr_warmup_iters=200, @@ -1174,10 +1134,6 @@ def qwen35_vl_4b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContaine cfg.train.manual_gc_interval = 100 cfg.train.manual_gc_eval = 100 - # Validation config - cfg.validation.eval_interval = 500 - cfg.validation.eval_iters = 32 - # Optimizer - higher LR for PEFT opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( lr_warmup_iters=200, @@ -1282,10 +1238,6 @@ def qwen35_vl_9b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContaine cfg.train.manual_gc_interval = 100 
cfg.train.manual_gc_eval = 100 - # Validation config - cfg.validation.eval_interval = 500 - cfg.validation.eval_iters = 32 - # Optimizer - higher LR for PEFT opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( lr_warmup_iters=200, @@ -1389,11 +1341,6 @@ def qwen35_vl_27b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContain cfg.train.manual_gc = True cfg.train.manual_gc_interval = 100 cfg.train.manual_gc_eval = 100 - - # Validation config - cfg.validation.eval_interval = 500 - cfg.validation.eval_iters = 32 - # Optimizer - higher LR for PEFT opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( lr_warmup_iters=200, @@ -1514,10 +1461,6 @@ def qwen35_vl_35b_a3b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigCon cfg.train.manual_gc_interval = 100 cfg.train.manual_gc_eval = 100 - # Validation config - cfg.validation.eval_interval = 500 - cfg.validation.eval_iters = 32 - # Optimizer - higher LR for PEFT opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( lr_warmup_iters=200, @@ -1641,10 +1584,6 @@ def qwen35_vl_122b_a10b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigC cfg.train.manual_gc_interval = 100 cfg.train.manual_gc_eval = 100 - # Validation config - cfg.validation.eval_interval = 500 - cfg.validation.eval_iters = 32 - # Optimizer - higher LR for PEFT opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( lr_warmup_iters=200, @@ -1768,10 +1707,6 @@ def qwen35_vl_397b_a17b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigC cfg.train.manual_gc_interval = 100 cfg.train.manual_gc_eval = 100 - # Validation config - cfg.validation.eval_interval = 500 - cfg.validation.eval_iters = 32 - # Optimizer - higher LR for PEFT opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( lr_warmup_iters=200, diff --git a/tests/unit_tests/models/qwen_vl/test_qwen35_vl_bridge.py b/tests/unit_tests/models/qwen_vl/test_qwen35_vl_bridge.py index 605e6627e1..c7aa09207c 100644 --- a/tests/unit_tests/models/qwen_vl/test_qwen35_vl_bridge.py +++ b/tests/unit_tests/models/qwen_vl/test_qwen35_vl_bridge.py @@ -209,7 +209,7 @@ def test_provider_bridge_dtype_handling(self, mock_dtype, bridge, mock_pretraine def test_provider_bridge_tied_embeddings(self, bridge): text_config = _make_dense_text_config() text_config.tie_word_embeddings = True - pretrained = _make_mock_pretrained(text_config, _make_vision_config()) + pretrained = _make_mock_pretrained(text_config, _make_vision_config(), tie_word_embeddings=True) provider = bridge.provider_bridge(pretrained) assert provider.share_embeddings_and_output_weights is True From 3996cd503845384d4499861cc72753a85fa9d9cd Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Thu, 5 Mar 2026 15:12:01 -0800 Subject: [PATCH 6/9] update docs and readmes Signed-off-by: Chen Cui --- docs/models/vlm/index.md | 1 + docs/models/vlm/qwen35-vl.md | 62 +++++++++++ examples/models/vlm/qwen35_vl/README.md | 100 ++++++++++++++++++ examples/models/vlm/qwen35_vl/slurm_peft.sh | 4 +- examples/models/vlm/qwen35_vl/slurm_sft.sh | 4 +- scripts/training/run_recipe.py | 14 +++ .../bridge/recipes/qwen_vl/qwen35_vl.py | 90 ++++++++++------ 7 files changed, 240 insertions(+), 35 deletions(-) create mode 100644 docs/models/vlm/qwen35-vl.md create mode 100644 examples/models/vlm/qwen35_vl/README.md diff --git a/docs/models/vlm/index.md b/docs/models/vlm/index.md index 8c030258f2..b7407793b1 100644 --- a/docs/models/vlm/index.md +++ b/docs/models/vlm/index.md @@ -11,4 +11,5 @@ ministral3.md 
nemotron-nano-v2-vl.md qwen2.5-vl.md qwen3-vl.md +qwen35-vl.md ``` diff --git a/docs/models/vlm/qwen35-vl.md b/docs/models/vlm/qwen35-vl.md new file mode 100644 index 0000000000..228d4ebac0 --- /dev/null +++ b/docs/models/vlm/qwen35-vl.md @@ -0,0 +1,62 @@ +# Qwen3.5-VL + +[Alibaba Cloud's Qwen3.5-VL](https://huggingface.co/collections/Qwen/qwen35-vl) is a family of vision-language models supporting multimodal understanding across text, images, and videos. Qwen3.5-VL includes both dense models and Mixture-of-Experts (MoE) variants for improved efficiency at scale. + +Qwen3.5-VL models feature a hybrid architecture combining GDN (Gated DeltaNet) layers with standard attention layers, SwiGLU activations, and RMSNorm. MoE variants use top-k routing with shared experts for better quality. + +Qwen3.5-VL models are supported via Megatron Bridge with auto-detected configuration and weight mapping. + +```{important} +Please upgrade to `transformers` >= 5.2.0 in order to use the Qwen3.5-VL models. +``` + +## Available Models + +### Dense Models +- **Qwen3.5 0.8B** (`Qwen/Qwen3.5-0.8B`): 0.8B parameter vision-language model + - Recommended: 1 node, 8 GPUs + +- **Qwen3.5 2B** (`Qwen/Qwen3.5-2B`): 2B parameter vision-language model + - Recommended: 1 node, 8 GPUs + +- **Qwen3.5 4B** (`Qwen/Qwen3.5-4B`): 4B parameter vision-language model + - Recommended: 1 node, 8 GPUs + +- **Qwen3.5 9B** (`Qwen/Qwen3.5-9B`): 9B parameter vision-language model + - Recommended: 1 node, 8 GPUs + +- **Qwen3.5 27B** (`Qwen/Qwen3.5-27B`): 27B parameter vision-language model + - Recommended: 2 nodes, 16 GPUs + +### Mixture-of-Experts (MoE) Models +- **Qwen3.5 35B-A3B** (`Qwen/Qwen3.5-35B-A3B`): 35B total parameters, 3B activated per token + - Recommended: 2 nodes, 16 GPUs + +- **Qwen3.5 122B-A10B** (`Qwen/Qwen3.5-122B-A10B`): 122B total parameters, 10B activated per token + - Recommended: 4 nodes, 32 GPUs + +- **Qwen3.5 397B-A17B** (`Qwen/Qwen3.5-397B-A17B`): 397B total parameters, 17B activated per token + - 512 experts with top-10 routing and shared experts + - Recommended: 16 nodes, 128 GPUs + +## Examples + +For checkpoint conversion, inference, finetuning recipes, and step-by-step training guides, see the [Qwen3.5-VL Examples](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/vlm/qwen35_vl/README.md). + +## Hugging Face Model Cards + +- Qwen3.5 0.8B: https://huggingface.co/Qwen/Qwen3.5-0.8B +- Qwen3.5 2B: https://huggingface.co/Qwen/Qwen3.5-2B +- Qwen3.5 4B: https://huggingface.co/Qwen/Qwen3.5-4B +- Qwen3.5 9B: https://huggingface.co/Qwen/Qwen3.5-9B +- Qwen3.5 27B: https://huggingface.co/Qwen/Qwen3.5-27B +- Qwen3.5 35B-A3B (MoE): https://huggingface.co/Qwen/Qwen3.5-35B-A3B +- Qwen3.5 122B-A10B (MoE): https://huggingface.co/Qwen/Qwen3.5-122B-A10B +- Qwen3.5 397B-A17B (MoE): https://huggingface.co/Qwen/Qwen3.5-397B-A17B + +## Related Docs +- Related VLM: [Qwen3-VL](qwen3-vl.md) +- Related LLM: [Qwen](../llm/qwen.md) +- Recipe usage: [Recipe usage](../../recipe-usage.md) +- Customizing the training recipe configuration: [Configuration overview](../../training/config-container-overview.md) +- Training entry points: [Entry points](../../training/entry-points.md) diff --git a/examples/models/vlm/qwen35_vl/README.md b/examples/models/vlm/qwen35_vl/README.md new file mode 100644 index 0000000000..b15c08727b --- /dev/null +++ b/examples/models/vlm/qwen35_vl/README.md @@ -0,0 +1,100 @@ +# Qwen3.5-VL Examples + +This directory contains example scripts for Qwen3.5-VL vision-language models. 
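+
+All of the recipes in this directory build their model section by pulling the Hugging Face config through `AutoBridge` and converting it to a Megatron model provider. A minimal sketch of that pattern (assumptions: the package-root import path, network access to the HF Hub, and `transformers >= 5.2.0`; the 2B variant is used only to keep the example small):
+
+```python
+from megatron.bridge import AutoBridge  # import path assumed
+
+# Load the HF config and build a Megatron model provider (configuration only,
+# no weights yet).
+bridge = AutoBridge.from_hf_pretrained("Qwen/Qwen3.5-2B")
+provider = bridge.to_megatron_provider(load_weights=False)
+
+# The recipes then set parallelism and sequence length on the provider:
+provider.tensor_model_parallel_size = 1
+provider.pipeline_model_parallel_size = 1
+provider.seq_length = 4096
+```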
+ +For model introduction and architecture details, see the [Qwen3.5-VL documentation](../../../../docs/models/vlm/qwen35-vl.md). + +## Workspace Configuration + +All scripts use a `WORKSPACE` environment variable to define the base directory for checkpoints and results. By default, this is set to `/workspace`. You can override it: + +```bash +export WORKSPACE=/your/custom/path +``` + +Directory structure: +- `${WORKSPACE}/models/` - Converted checkpoints +- `${WORKSPACE}/results/` - Training outputs and experiment results + +## Checkpoint Conversion + +### Import HF → Megatron +To import the HF VL model to your desired Megatron path: +```bash +python examples/conversion/convert_checkpoints.py import \ + --hf-model Qwen/Qwen3.5-35B-A3B \ + --megatron-path ${WORKSPACE}/models/Qwen/Qwen3.5-35B-A3B +``` + +### Export Megatron → HF +```bash +python examples/conversion/convert_checkpoints.py export \ + --hf-model Qwen/Qwen3.5-35B-A3B \ + --megatron-path ${WORKSPACE}/models/Qwen/Qwen3.5-35B-A3B/iter_0000000 \ + --hf-path ${WORKSPACE}/models/Qwen/Qwen3.5-35B-A3B-hf-export +``` + +See the [conversion.sh](conversion.sh) script for more examples including multi-GPU round-trip validation. + +## Inference + +### Run Inference on Converted Checkpoint + +```bash +python -m torch.distributed.run --nproc_per_node=8 examples/conversion/hf_to_megatron_generate_vlm.py \ + --hf_model_path Qwen/Qwen3.5-35B-A3B \ + --megatron_model_path ${WORKSPACE}/models/Qwen/Qwen3.5-35B-A3B/iter_0000000 \ + --image_path "https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16/resolve/main/images/table.png" \ + --prompt "Describe this image." \ + --max_new_tokens 100 \ + --tp 2 --pp 2 --ep 4 +``` + +Note: +- `--megatron_model_path` is optional. If not specified, the script will convert the model and then run forward. +- You can also use image URLs: `--image_path="https://example.com/image.jpg"` +- For MoE models, set `--ep` to the desired expert parallelism degree. + +See the [inference.sh](inference.sh) script for commands to: +- Run inference with Hugging Face checkpoints +- Run inference with imported Megatron checkpoints +- Run inference with exported Hugging Face checkpoints + +For multi-node distributed inference—required for the largest 397B model—see the [slurm_inference.sh](slurm_inference.sh) script. + +## Finetune Recipes + +- Available recipes: + - `qwen35_vl_800m_sft_config` / `qwen35_vl_800m_peft_config`: 0.8B dense model + - `qwen35_vl_2b_sft_config` / `qwen35_vl_2b_peft_config`: 2B dense model + - `qwen35_vl_4b_sft_config` / `qwen35_vl_4b_peft_config`: 4B dense model + - `qwen35_vl_9b_sft_config` / `qwen35_vl_9b_peft_config`: 9B dense model + - `qwen35_vl_27b_sft_config` / `qwen35_vl_27b_peft_config`: 27B dense model + - `qwen35_vl_35b_a3b_sft_config` / `qwen35_vl_35b_a3b_peft_config`: 35B-A3B MoE model + - `qwen35_vl_122b_a10b_sft_config` / `qwen35_vl_122b_a10b_peft_config`: 122B-A10B MoE model + - `qwen35_vl_397b_a17b_sft_config` / `qwen35_vl_397b_a17b_peft_config`: 397B-A17B MoE model + +Before training, ensure the following environment variables are set: +1. `SAVE_DIR`: checkpoint and log saving directory +2. `HF_TOKEN`: to download models from HF Hub (if required) +3. `HF_HOME`: (optional) to avoid re-downloading models and datasets +4. `WANDB_API_KEY`: (optional) to enable WandB logging + +### Pretrain + +Pretraining is not verified for this model. + +### Supervised Fine-Tuning (SFT) + +See the [slurm_sft.sh](slurm_sft.sh) script for full parameter fine-tuning with configurable model sizes. 
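+
+As a minimal single-node sketch, the smallest SFT recipe can be launched with the same `torch.distributed` launcher used in the conversion examples (the flag and override names mirror [slurm_sft.sh](slurm_sft.sh); the iteration count and save path below are illustrative):
+
+```bash
+uv run python -m torch.distributed.run --nproc_per_node=8 scripts/training/run_recipe.py \
+    --recipe qwen35_vl_800m_sft_config \
+    --step_func vlm_step \
+    train.train_iters=100 \
+    train.micro_batch_size=1 \
+    checkpoint.save=${WORKSPACE}/results/qwen35_vl_800m_sft
+```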
+ +### Parameter-Efficient Fine-Tuning (PEFT) with LoRA + +See the [slurm_peft.sh](slurm_peft.sh) script for LoRA fine-tuning with configurable model sizes. + +### Expected Training Dynamics +We provide a [Weights & Biases report](https://api.wandb.ai/links/nvidia-nemo-fw-public/rt6uzrvf) for the expected loss curves and grad norms. + +## Evaluation + +Coming soon. diff --git a/examples/models/vlm/qwen35_vl/slurm_peft.sh b/examples/models/vlm/qwen35_vl/slurm_peft.sh index 1ce905045d..9de0b3641b 100755 --- a/examples/models/vlm/qwen35_vl/slurm_peft.sh +++ b/examples/models/vlm/qwen35_vl/slurm_peft.sh @@ -156,7 +156,6 @@ CLI_OVERRIDES="\ train.train_iters=$TRAIN_ITERS \ train.global_batch_size=$GLOBAL_BATCH_SIZE \ train.micro_batch_size=$MICRO_BATCH_SIZE \ - train.eval_iters=$EVAL_ITERS \ checkpoint.save=${WORKSPACE}/results/${RECIPE}_lora \ logger.log_interval=$LOG_INTERVAL \ logger.wandb_project=$WANDB_PROJECT \ @@ -164,6 +163,9 @@ CLI_OVERRIDES="\ dataset.maker_name=make_${DATASET_NAME}_dataset \ dataset.seq_length=$SEQ_LENGTH" +# For multinode runs, the recipe's online HF path can be unstable. Pass --hf_path +# with a local model directory for more reliable config loading, e.g.: +# --hf_path ${WORKSPACE}/models/Qwen/${HF_MODEL_NAME} CMD="uv run --no-sync python scripts/training/run_recipe.py \ --recipe $RECIPE \ --step_func vlm_step \ diff --git a/examples/models/vlm/qwen35_vl/slurm_sft.sh b/examples/models/vlm/qwen35_vl/slurm_sft.sh index cc03d51211..0ea7a67609 100644 --- a/examples/models/vlm/qwen35_vl/slurm_sft.sh +++ b/examples/models/vlm/qwen35_vl/slurm_sft.sh @@ -157,7 +157,6 @@ CLI_OVERRIDES="\ train.train_iters=$TRAIN_ITERS \ train.global_batch_size=$GLOBAL_BATCH_SIZE \ train.micro_batch_size=$MICRO_BATCH_SIZE \ - train.eval_iters=$EVAL_ITERS \ checkpoint.save=${WORKSPACE}/results/${RECIPE}_sft \ logger.log_interval=$LOG_INTERVAL \ logger.wandb_project=$WANDB_PROJECT \ @@ -165,6 +164,9 @@ CLI_OVERRIDES="\ dataset.maker_name=make_${DATASET_NAME}_dataset \ dataset.seq_length=$SEQ_LENGTH" +# For multinode runs, the recipe's online HF path can be unstable. Pass --hf_path +# with a local model directory for more reliable config loading, e.g.: +# --hf_path ${WORKSPACE}/models/Qwen/${HF_MODEL_NAME} CMD="uv run --no-sync python scripts/training/run_recipe.py \ --recipe $RECIPE \ --step_func vlm_step \ diff --git a/scripts/training/run_recipe.py b/scripts/training/run_recipe.py index 4b923f5907..e927143f35 100755 --- a/scripts/training/run_recipe.py +++ b/scripts/training/run_recipe.py @@ -139,6 +139,13 @@ def parse_args() -> tuple[argparse.Namespace, list[str]]: default=None, help="Dataset type for VLM recipes (e.g., 'energon', 'mock', 'hf', 'preloaded').", ) + parser.add_argument( + "--hf_path", + type=str, + default=None, + help="HuggingFace model ID or local path to model directory. " + "Use a local path for more stable multinode training.", + ) args, cli_overrides = parser.parse_known_args() return args, cli_overrides @@ -149,6 +156,7 @@ def load_recipe( packed_sequence: bool = False, seq_length: int | None = None, dataset_type: str | None = None, + hf_path: str | None = None, ) -> ConfigContainer: """ Load recipe by name from megatron.bridge.recipes. 
@@ -159,6 +167,7 @@ def load_recipe( packed_sequence: Enable packed sequence training (default: False) seq_length: Sequence length for training (optional) dataset_type: Dataset type for VLM recipes (e.g., 'energon', 'mock', 'hf', 'preloaded') + hf_path: HuggingFace model ID or local path to model directory (optional) Returns: ConfigContainer from calling the recipe @@ -185,12 +194,14 @@ def load_recipe( accepts_packed_sequence = "packed_sequence" in params or has_var_keyword accepts_seq_length = "seq_length" in params or has_var_keyword accepts_dataset_type = "dataset_type" in params or has_var_keyword + accepts_hf_path = "hf_path" in params or has_var_keyword except (ValueError, TypeError): # If signature inspection fails, fallback conservatively accepts_peft = True # peft is widely supported, try passing it accepts_packed_sequence = False # new parameter, don't pass if unsure accepts_seq_length = False # new parameter, don't pass if unsure accepts_dataset_type = False # VLM-specific, don't pass if unsure + accepts_hf_path = False # model-specific, don't pass if unsure # Build kwargs dynamically based on what the recipe accepts kwargs = {} @@ -202,6 +213,8 @@ def load_recipe( kwargs["seq_length"] = seq_length if accepts_dataset_type and dataset_type is not None: kwargs["dataset_type"] = dataset_type + if accepts_hf_path and hf_path is not None: + kwargs["hf_path"] = hf_path try: return config_builder(**kwargs) @@ -238,6 +251,7 @@ def main() -> None: args.packed_sequence, args.seq_length, args.dataset_type, + args.hf_path, ) config = process_config_with_overrides( diff --git a/src/megatron/bridge/recipes/qwen_vl/qwen35_vl.py b/src/megatron/bridge/recipes/qwen_vl/qwen35_vl.py index ee65d4ba1f..c0e9b2b89e 100644 --- a/src/megatron/bridge/recipes/qwen_vl/qwen35_vl.py +++ b/src/megatron/bridge/recipes/qwen_vl/qwen35_vl.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Qwen3.5-VL finetuning recipes with parameterless API. +"""Qwen3.5-VL finetuning recipes. This module provides SFT and PEFT configurations for Qwen3.5-VL models: @@ -33,7 +33,7 @@ # ============================================================================= # Qwen3.5-VL 800M SFT Configuration (Dense) # ============================================================================= -def qwen35_vl_800m_sft_config() -> ConfigContainer: +def qwen35_vl_800m_sft_config(hf_path: str = "Qwen/Qwen3.5-0.8B") -> ConfigContainer: """Return a full SFT config for Qwen3.5-VL 800M (dense). Default configuration: 1 node, 8 GPUs @@ -42,11 +42,13 @@ def qwen35_vl_800m_sft_config() -> ConfigContainer: - Sequence length: 4096 Note: num_kv_heads=2, so max TP=2. + + Args: + hf_path: HuggingFace model ID or local path to model directory. """ cfg = _sft_common_vlm() # Model configuration - hf_path = "Qwen/Qwen3.5-0.8B" cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) cfg.model.seq_length = 4096 @@ -130,7 +132,7 @@ def qwen35_vl_800m_sft_config() -> ConfigContainer: # ============================================================================= # Qwen3.5-VL 2B SFT Configuration (Dense) # ============================================================================= -def qwen35_vl_2b_sft_config() -> ConfigContainer: +def qwen35_vl_2b_sft_config(hf_path: str = "Qwen/Qwen3.5-2B") -> ConfigContainer: """Return a full SFT config for Qwen3.5-VL 2B (dense). 
Default configuration: 1 node, 8 GPUs @@ -139,11 +141,13 @@ def qwen35_vl_2b_sft_config() -> ConfigContainer: - Sequence length: 4096 Note: num_kv_heads=2, so max TP=2. + + Args: + hf_path: HuggingFace model ID or local path to model directory. """ cfg = _sft_common_vlm() # Model configuration - hf_path = "Qwen/Qwen3.5-2B" cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) cfg.model.seq_length = 4096 @@ -227,7 +231,7 @@ def qwen35_vl_2b_sft_config() -> ConfigContainer: # ============================================================================= # Qwen3.5-VL 4B SFT Configuration (Dense) # ============================================================================= -def qwen35_vl_4b_sft_config() -> ConfigContainer: +def qwen35_vl_4b_sft_config(hf_path: str = "Qwen/Qwen3.5-4B") -> ConfigContainer: """Return a full SFT config for Qwen3.5-VL 4B (dense). Default configuration: 1 node, 8 GPUs @@ -236,11 +240,13 @@ def qwen35_vl_4b_sft_config() -> ConfigContainer: - Sequence length: 4096 Note: num_kv_heads=4, so max TP=4. + + Args: + hf_path: HuggingFace model ID or local path to model directory. """ cfg = _sft_common_vlm() # Model configuration - hf_path = "Qwen/Qwen3.5-4B" cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) cfg.model.seq_length = 4096 @@ -324,7 +330,7 @@ def qwen35_vl_4b_sft_config() -> ConfigContainer: # ============================================================================= # Qwen3.5-VL 9B SFT Configuration (Dense) # ============================================================================= -def qwen35_vl_9b_sft_config() -> ConfigContainer: +def qwen35_vl_9b_sft_config(hf_path: str = "Qwen/Qwen3.5-9B") -> ConfigContainer: """Return a full SFT config for Qwen3.5-VL 9B (dense). Default configuration: 1 node, 8 GPUs @@ -333,11 +339,13 @@ def qwen35_vl_9b_sft_config() -> ConfigContainer: - Sequence length: 4096 Note: num_kv_heads=4, so max TP=4. + + Args: + hf_path: HuggingFace model ID or local path to model directory. """ cfg = _sft_common_vlm() # Model configuration - hf_path = "Qwen/Qwen3.5-9B" cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) cfg.model.seq_length = 4096 @@ -421,18 +429,20 @@ def qwen35_vl_9b_sft_config() -> ConfigContainer: # ============================================================================= # Qwen3.5-VL 27B SFT Configuration (Dense) # ============================================================================= -def qwen35_vl_27b_sft_config() -> ConfigContainer: +def qwen35_vl_27b_sft_config(hf_path: str = "Qwen/Qwen3.5-27B") -> ConfigContainer: """Return a full SFT config for Qwen3.5-VL 27B (dense). Default configuration: 2 nodes, 16 GPUs total - TP=4, PP=4 - LR=5e-6 (full SFT) - Sequence length: 4096 + + Args: + hf_path: HuggingFace model ID or local path to model directory. 
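+
+    Example:
+        >>> # Illustrative: point hf_path at a locally downloaded copy of the model.
+        >>> cfg = qwen35_vl_27b_sft_config(hf_path="/workspace/models/Qwen/Qwen3.5-27B")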
""" cfg = _sft_common_vlm() # Model configuration - hf_path = "Qwen/Qwen3.5-27B" cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) cfg.model.seq_length = 4096 @@ -516,18 +526,20 @@ def qwen35_vl_27b_sft_config() -> ConfigContainer: # ============================================================================= # Qwen3.5-VL 35B-A3B SFT Configuration (MoE) # ============================================================================= -def qwen35_vl_35b_a3b_sft_config() -> ConfigContainer: +def qwen35_vl_35b_a3b_sft_config(hf_path: str = "Qwen/Qwen3.5-35B-A3B") -> ConfigContainer: """Return a full SFT config for Qwen3.5-VL 35B-A3B (MoE). Default configuration: 2 nodes, 16 GPUs - TP=2, PP=1, EP=16 - LR=2e-5 (full SFT) - Sequence length: 4096 + + Args: + hf_path: HuggingFace model ID or local path to model directory. """ cfg = _sft_common_vlm() # Model configuration - hf_path = "Qwen/Qwen3.5-35B-A3B" cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) cfg.model.seq_length = 4096 @@ -630,18 +642,20 @@ def qwen35_vl_35b_a3b_sft_config() -> ConfigContainer: # ============================================================================= # Qwen3.5-VL 122B-A10B SFT Configuration (MoE) # ============================================================================= -def qwen35_vl_122b_a10b_sft_config() -> ConfigContainer: +def qwen35_vl_122b_a10b_sft_config(hf_path: str = "Qwen/Qwen3.5-122B-A10B") -> ConfigContainer: """Return a full SFT config for Qwen3.5-VL 122B-A10B (MoE). Default configuration: 4 nodes, 32 GPUs - TP=2, PP=6, EP=8 - LR=2e-5 (full SFT) - Sequence length: 4096 + + Args: + hf_path: HuggingFace model ID or local path to model directory. """ cfg = _sft_common_vlm() # Model configuration - hf_path = "Qwen/Qwen3.5-122B-A10B" cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) cfg.model.seq_length = 4096 @@ -746,18 +760,20 @@ def qwen35_vl_122b_a10b_sft_config() -> ConfigContainer: # ============================================================================= # Qwen3.5-VL 397B-A17B SFT Configuration (MoE) # ============================================================================= -def qwen35_vl_397b_a17b_sft_config() -> ConfigContainer: +def qwen35_vl_397b_a17b_sft_config(hf_path: str = "Qwen/Qwen3.5-397B-A17B") -> ConfigContainer: """Return a full SFT config for Qwen3.5-VL 397B-A17B (MoE). Default configuration: 16 nodes, 128 GPUs - TP=2, PP=4, EP=32 - LR=2e-5 (full SFT) - Sequence length: 4096 + + Args: + hf_path: HuggingFace model ID or local path to model directory. """ cfg = _sft_common_vlm() # Model configuration - hf_path = "Qwen/Qwen3.5-397B-A17B" cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) cfg.model.seq_length = 4096 @@ -862,7 +878,9 @@ def qwen35_vl_397b_a17b_sft_config() -> ConfigContainer: # ============================================================================= # Qwen3.5-VL 800M PEFT Configuration (Dense) # ============================================================================= -def qwen35_vl_800m_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: +def qwen35_vl_800m_peft_config( + peft_scheme: str | PEFT = "lora", hf_path: str = "Qwen/Qwen3.5-0.8B" +) -> ConfigContainer: """Return a PEFT config for Qwen3.5-VL 800M (dense). 
Default configuration: 1 node, 8 GPUs @@ -872,6 +890,7 @@ def qwen35_vl_800m_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContai Args: peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. + hf_path: HuggingFace model ID or local path to model directory. """ cfg = _peft_common_vlm() @@ -882,7 +901,6 @@ def qwen35_vl_800m_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContai cfg.peft = peft_scheme # Model configuration - hf_path = "Qwen/Qwen3.5-0.8B" cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) cfg.model.seq_length = 4096 @@ -966,7 +984,7 @@ def qwen35_vl_800m_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContai # ============================================================================= # Qwen3.5-VL 2B PEFT Configuration (Dense) # ============================================================================= -def qwen35_vl_2b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: +def qwen35_vl_2b_peft_config(peft_scheme: str | PEFT = "lora", hf_path: str = "Qwen/Qwen3.5-2B") -> ConfigContainer: """Return a PEFT config for Qwen3.5-VL 2B (dense). Default configuration: 1 node, 8 GPUs @@ -976,6 +994,7 @@ def qwen35_vl_2b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContaine Args: peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. + hf_path: HuggingFace model ID or local path to model directory. """ cfg = _peft_common_vlm() @@ -986,7 +1005,6 @@ def qwen35_vl_2b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContaine cfg.peft = peft_scheme # Model configuration - hf_path = "Qwen/Qwen3.5-2B" cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) cfg.model.seq_length = 4096 @@ -1070,7 +1088,7 @@ def qwen35_vl_2b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContaine # ============================================================================= # Qwen3.5-VL 4B PEFT Configuration (Dense) # ============================================================================= -def qwen35_vl_4b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: +def qwen35_vl_4b_peft_config(peft_scheme: str | PEFT = "lora", hf_path: str = "Qwen/Qwen3.5-4B") -> ConfigContainer: """Return a PEFT config for Qwen3.5-VL 4B (dense). Default configuration: 1 node, 8 GPUs @@ -1080,6 +1098,7 @@ def qwen35_vl_4b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContaine Args: peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. + hf_path: HuggingFace model ID or local path to model directory. """ cfg = _peft_common_vlm() @@ -1090,7 +1109,6 @@ def qwen35_vl_4b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContaine cfg.peft = peft_scheme # Model configuration - hf_path = "Qwen/Qwen3.5-4B" cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) cfg.model.seq_length = 4096 @@ -1174,7 +1192,7 @@ def qwen35_vl_4b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContaine # ============================================================================= # Qwen3.5-VL 9B PEFT Configuration (Dense) # ============================================================================= -def qwen35_vl_9b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: +def qwen35_vl_9b_peft_config(peft_scheme: str | PEFT = "lora", hf_path: str = "Qwen/Qwen3.5-9B") -> ConfigContainer: """Return a PEFT config for Qwen3.5-VL 9B (dense). 
Default configuration: 1 node, 8 GPUs @@ -1184,6 +1202,7 @@ def qwen35_vl_9b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContaine Args: peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. + hf_path: HuggingFace model ID or local path to model directory. """ cfg = _peft_common_vlm() @@ -1194,7 +1213,6 @@ def qwen35_vl_9b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContaine cfg.peft = peft_scheme # Model configuration - hf_path = "Qwen/Qwen3.5-9B" cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) cfg.model.seq_length = 4096 @@ -1278,7 +1296,7 @@ def qwen35_vl_9b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContaine # ============================================================================= # Qwen3.5-VL 27B PEFT Configuration (Dense) # ============================================================================= -def qwen35_vl_27b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: +def qwen35_vl_27b_peft_config(peft_scheme: str | PEFT = "lora", hf_path: str = "Qwen/Qwen3.5-27B") -> ConfigContainer: """Return a PEFT config for Qwen3.5-VL 27B (dense). Default configuration: 1 node, 8 GPUs @@ -1288,6 +1306,7 @@ def qwen35_vl_27b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContain Args: peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. + hf_path: HuggingFace model ID or local path to model directory. """ cfg = _peft_common_vlm() @@ -1298,7 +1317,6 @@ def qwen35_vl_27b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContain cfg.peft = peft_scheme # Model configuration - hf_path = "Qwen/Qwen3.5-27B" cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) cfg.model.seq_length = 4096 @@ -1381,7 +1399,9 @@ def qwen35_vl_27b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContain # ============================================================================= # Qwen3.5-VL 35B-A3B PEFT Configuration (MoE) # ============================================================================= -def qwen35_vl_35b_a3b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: +def qwen35_vl_35b_a3b_peft_config( + peft_scheme: str | PEFT = "lora", hf_path: str = "Qwen/Qwen3.5-35B-A3B" +) -> ConfigContainer: """Return a PEFT config for Qwen3.5-VL 35B-A3B (MoE). Default configuration: 1 node, 8 GPUs @@ -1391,6 +1411,7 @@ def qwen35_vl_35b_a3b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigCon Args: peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. + hf_path: HuggingFace model ID or local path to model directory. 
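+
+    Example:
+        >>> # Illustrative: select DoRA instead of the default LoRA scheme.
+        >>> cfg = qwen35_vl_35b_a3b_peft_config(peft_scheme="dora")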
""" cfg = _peft_common_vlm() @@ -1401,7 +1422,6 @@ def qwen35_vl_35b_a3b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigCon cfg.peft = peft_scheme # Model configuration - hf_path = "Qwen/Qwen3.5-35B-A3B" cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) cfg.model.seq_length = 4096 @@ -1504,7 +1524,9 @@ def qwen35_vl_35b_a3b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigCon # ============================================================================= # Qwen3.5-VL 122B-A10B PEFT Configuration (MoE) # ============================================================================= -def qwen35_vl_122b_a10b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: +def qwen35_vl_122b_a10b_peft_config( + peft_scheme: str | PEFT = "lora", hf_path: str = "Qwen/Qwen3.5-122B-A10B" +) -> ConfigContainer: """Return a PEFT config for Qwen3.5-VL 122B-A10B (MoE). Default configuration: 2 nodes, 16 GPUs @@ -1514,6 +1536,7 @@ def qwen35_vl_122b_a10b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigC Args: peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. + hf_path: HuggingFace model ID or local path to model directory. """ cfg = _peft_common_vlm() @@ -1524,7 +1547,6 @@ def qwen35_vl_122b_a10b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigC cfg.peft = peft_scheme # Model configuration - hf_path = "Qwen/Qwen3.5-122B-A10B" cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) cfg.model.seq_length = 4096 @@ -1627,7 +1649,9 @@ def qwen35_vl_122b_a10b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigC # ============================================================================= # Qwen3.5-VL 397B-A17B PEFT Configuration (MoE) # ============================================================================= -def qwen35_vl_397b_a17b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigContainer: +def qwen35_vl_397b_a17b_peft_config( + peft_scheme: str | PEFT = "lora", hf_path: str = "Qwen/Qwen3.5-397B-A17B" +) -> ConfigContainer: """Return a PEFT config for Qwen3.5-VL 397B-A17B (MoE). Default configuration: 4 nodes, 32 GPUs @@ -1637,6 +1661,7 @@ def qwen35_vl_397b_a17b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigC Args: peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. + hf_path: HuggingFace model ID or local path to model directory. """ cfg = _peft_common_vlm() @@ -1647,7 +1672,6 @@ def qwen35_vl_397b_a17b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigC cfg.peft = peft_scheme # Model configuration - hf_path = "Qwen/Qwen3.5-397B-A17B" cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) cfg.model.seq_length = 4096 From 0c14274accabfa26a358285f3585705b72c71765 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Thu, 5 Mar 2026 16:25:02 -0800 Subject: [PATCH 7/9] doc Signed-off-by: Chen Cui --- examples/models/vlm/qwen35_vl/README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/examples/models/vlm/qwen35_vl/README.md b/examples/models/vlm/qwen35_vl/README.md index b15c08727b..23f5929aab 100644 --- a/examples/models/vlm/qwen35_vl/README.md +++ b/examples/models/vlm/qwen35_vl/README.md @@ -92,6 +92,25 @@ See the [slurm_sft.sh](slurm_sft.sh) script for full parameter fine-tuning with See the [slurm_peft.sh](slurm_peft.sh) script for LoRA fine-tuning with configurable model sizes. 
+
+### Multi-Token Prediction (MTP)
+
+All Qwen3.5 models are trained with Multi-Token Prediction (`mtp_num_hidden_layers=1` in the HuggingFace config). MTP adds an auxiliary loss that predicts the next-next token alongside the standard next-token prediction, improving training quality.
+
+MTP is **enabled by default** in all recipes. The MTP layer uses standard attention (not GDN) and the same MLP architecture as the main decoder (dense MLP for dense models, MoE for MoE models). The MTP loss is scaled by `mtp_loss_scaling_factor=0.1` relative to the main LM loss.
+
+**Finetune with MTP** (default):
+```python
+cfg.model.mtp_num_layers = 1
+cfg.model.mtp_loss_scaling_factor = 0.1
+```
+
+**Finetune without MTP** (discard MTP weights, standard LM loss only):
+```python
+cfg.model.mtp_num_layers = None
+```
+
+When converting checkpoints, MTP weights are included by default. Setting `mtp_num_layers = None` skips MTP weight conversion and removes the MTP auxiliary loss during training.
+
 ### Expected Training Dynamics
 We provide a [Weights & Biases report](https://api.wandb.ai/links/nvidia-nemo-fw-public/rt6uzrvf) for the expected loss curves and grad norms.

From 8407255f46c06eb15efa22e9314ae666db306263 Mon Sep 17 00:00:00 2001
From: Chen Cui
Date: Thu, 5 Mar 2026 16:53:25 -0800
Subject: [PATCH 8/9] fix doc link

Signed-off-by: Chen Cui
---
 docs/models/vlm/qwen35-vl.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/models/vlm/qwen35-vl.md b/docs/models/vlm/qwen35-vl.md
index 228d4ebac0..f3cf784985 100644
--- a/docs/models/vlm/qwen35-vl.md
+++ b/docs/models/vlm/qwen35-vl.md
@@ -1,6 +1,6 @@
 # Qwen3.5-VL
 
-[Alibaba Cloud's Qwen3.5-VL](https://huggingface.co/collections/Qwen/qwen35-vl) is a family of vision-language models supporting multimodal understanding across text, images, and videos. Qwen3.5-VL includes both dense models and Mixture-of-Experts (MoE) variants for improved efficiency at scale.
+[Alibaba Cloud's Qwen3.5-VL](https://huggingface.co/collections/Qwen/qwen35) is a family of vision-language models supporting multimodal understanding across text, images, and videos. Qwen3.5-VL includes both dense models and Mixture-of-Experts (MoE) variants for improved efficiency at scale.
 
 Qwen3.5-VL models feature a hybrid architecture combining GDN (Gated DeltaNet) layers with standard attention layers, SwiGLU activations, and RMSNorm. MoE variants use top-k routing with shared experts for better quality.

From c617fee7869e1cd5d2a7ffd2a0ce64e1d9e2fd50 Mon Sep 17 00:00:00 2001
From: Chen Cui
Date: Thu, 5 Mar 2026 16:55:29 -0800
Subject: [PATCH 9/9] doc

Signed-off-by: Chen Cui
---
 docs/models/vlm/qwen35-vl.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/models/vlm/qwen35-vl.md b/docs/models/vlm/qwen35-vl.md
index f3cf784985..9608cfdf14 100644
--- a/docs/models/vlm/qwen35-vl.md
+++ b/docs/models/vlm/qwen35-vl.md
@@ -1,13 +1,13 @@
-# Qwen3.5-VL
+# Qwen 3.5
 
-[Alibaba Cloud's Qwen3.5-VL](https://huggingface.co/collections/Qwen/qwen35) is a family of vision-language models supporting multimodal understanding across text, images, and videos. Qwen3.5-VL includes both dense models and Mixture-of-Experts (MoE) variants for improved efficiency at scale.
+[Alibaba Cloud's Qwen 3.5](https://huggingface.co/collections/Qwen/qwen35) is a family of vision-language models supporting multimodal understanding across text, images, and videos. Qwen 3.5 includes both dense models and Mixture-of-Experts (MoE) variants for improved efficiency at scale.
 
-Qwen3.5-VL models feature a hybrid architecture combining GDN (Gated DeltaNet) layers with standard attention layers, SwiGLU activations, and RMSNorm. MoE variants use top-k routing with shared experts for better quality. +Qwen 3.5 models feature a hybrid architecture combining GDN (Gated DeltaNet) layers with standard attention layers, SwiGLU activations, and RMSNorm. MoE variants use top-k routing with shared experts for better quality. -Qwen3.5-VL models are supported via Megatron Bridge with auto-detected configuration and weight mapping. +Qwen 3.5 models are supported via Megatron Bridge with auto-detected configuration and weight mapping. ```{important} -Please upgrade to `transformers` >= 5.2.0 in order to use the Qwen3.5-VL models. +Please upgrade to `transformers` >= 5.2.0 in order to use the Qwen 3.5 models. ``` ## Available Models @@ -41,7 +41,7 @@ Please upgrade to `transformers` >= 5.2.0 in order to use the Qwen3.5-VL models. ## Examples -For checkpoint conversion, inference, finetuning recipes, and step-by-step training guides, see the [Qwen3.5-VL Examples](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/vlm/qwen35_vl/README.md). +For checkpoint conversion, inference, finetuning recipes, and step-by-step training guides, see the [Qwen 3.5 Examples](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/vlm/qwen35_vl/README.md). ## Hugging Face Model Cards