#!/bin/bash
# examples/reproducibility/run-qwen2.5-0.5B-gsm8k-lora.sh

# TODO: debug this option before enabling it:
# "--offload-rollout-level kv_cache weight "

export FLASHINFER_DISABLE_VERSION_CHECK=1

# clean up any processes left over from a previous run
pkill -9 sglang
sleep 3
ray stop --force
pkill -9 ray
pkill -9 python
sleep 3
pkill -9 ray
pkill -9 python

set -ex

# prevent Python from buffering stdout/stderr so Ray worker logs stream immediately
export PYTHONUNBUFFERED=1

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
source "${SCRIPT_DIR}/../../scripts/models/qwen2.5-0.5B.sh"

CKPT_ARGS=(
--hf-checkpoint /root/Qwen2.5-0.5B-Instruct/
# --ref-load /root/Qwen2.5-0.5B-Instruct_torch_dist/
#### train
# Uncomment to save checkpoints (required for LoRA)
# --save /root/checkpoints/qwen2.5-0.5B-lora-megatron/
# --save-interval 100
###
)

# --target-modules currently only supports plain linear layers (per-projection targets
# still need support in Megatron-Bridge/src/megatron/bridge/models/conversion/peft_bridge.py,
# build_adapter_conversion_tasks).
# Example: one module can carry two LoRA adapters, e.g. (linear_proj): LoRALinear(), (linear_fc2): LoRALinear()

# Passing an explicit list is currently broken:
# LORA_TARGET_MODULES=${LORA_TARGET_MODULES:-"['linear_qkv','linear_proj','linear_fc1','linear_fc2']"}

##############################
###########lora###############
##############################
LORA_ARGS=(
--lora-rank 32 # LoRA rank (typical values: 8, 16, 32, 64)
--lora-alpha 32 # LoRA alpha (usually 2x rank)
--lora-dropout 0.0 # LoRA dropout (0.0 for RL training)
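# effective LoRA scaling = alpha / rank = 32 / 32 = 1.0 (the standard LoRA scaling factor)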
# Target modules: use either Megatron naming or HF naming
# Megatron: linear_qkv, linear_proj, linear_fc1, linear_fc2
# Requires PR "Update LoRA Weights via Tensor" (sgl-project/sglang#16226):
# --lora-sync-from-tensor # use tensor-based weight sync (more efficient)
# Uncomment to share the base model between actor and ref (saves memory):
# --share-ref-base-model

--target-modules "all-linear"
# --target-modules "o_proj,down_proj,k_proj,gate_proj,q_proj,v_proj,up_proj"
# --target-modules "q_proj,k_proj,v_proj,o_proj"
##############################
##############################
# Debug options
#### inference
# --debug-rollout-only
# --lora-adapter-path /root/checkpoints/qwen2.5-0.5B-lora-megatron/lora_adapter.pt
# --lora-adapter-path lewtun/Qwen2.5-0.5B-SFT-LoRA
# --lora-adapter-path /root/checkpoints/qwen2.5-0.5B-lora-megatron/
###

#### train
# --debug-train-only
# --load-debug-rollout-data /root/debug_data/rollout_data.pt
# --save /root/checkpoints/qwen2.5-0.5B-lora-megatron/
# --save-debug-rollout-data /root/debug_data/rollout_data.pt
###
##############################
##############################
# --no-use-distributed-optimizer # enabling this flag raises an error in
#   miles/utils/arguments.py (set_default_megatron_args): the optimizer state can no
#   longer be distributed across GPUs, so keep the distributed optimizer enabled

--megatron-to-hf-mode bridge
# Disable gradient accumulation fusion for LoRA training:
# --no-gradient-accumulation-fusion
# Root cause: with LoRA, the base model's parameters are frozen (requires_grad=False),
# but Megatron-LM's tensor-parallel layers use gradient-accumulation fusion in the
# backward pass, and that fusion path checks weight.main_grad.dtype. For frozen
# parameters main_grad is never allocated (it stays None), which triggers the error.

#### debug
--no-offload-train
--no-offload-rollout
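# keep train and rollout state resident on GPU; offloading is still being debugged
# (see the --offload-rollout-level note at the top of this file)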
)
##############################
##############################
##############################

ROLLOUT_ARGS=(
--prompt-data /root/gsm8k/train.parquet
--input-key messages
--label-key label
--apply-chat-template
--rollout-shuffle
--rm-type math
--num-rollout 100
# --num-rollout 10 # only train 10 steps
--rollout-batch-size 32
# --rollout-batch-size 16 # for testing
--n-samples-per-prompt 8
--rollout-max-response-len 1024
--rollout-temperature 1

--global-batch-size 256
# --global-batch-size 32 # for testing
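# note: 32 prompts/rollout x 8 samples/prompt = 256 trajectories, which exactly
# fills one global batch of 256 (one training step per rollout)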
)

EVAL_ARGS=(
# --eval-interval 20
--eval-interval 10
--eval-prompt-data gsm8k /root/gsm8k/test.parquet
--n-samples-per-eval-prompt 1
--eval-max-response-len 1024
--eval-top-k 1
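# top-k 1 makes eval decoding effectively greedy, so eval scores are reproducible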
)

PERF_ARGS=(
--tensor-model-parallel-size 1
--sequence-parallel
--pipeline-model-parallel-size 1
--context-parallel-size 1
--expert-model-parallel-size 1
--expert-tensor-parallel-size 1

--use-dynamic-batch-size
--max-tokens-per-gpu 9216
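# with dynamic batching, variable-length sequences are packed into micro-batches
# of up to 9216 tokens per GPU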
)

GRPO_ARGS=(
--advantage-estimator grpo
# --use-kl-loss # if KL loss is used, --ref-load must also be set
--kl-loss-coef 0.00
--kl-loss-type low_var_kl
--kl-coef 0.00
--entropy-coef 0.00
--eps-clip 0.2
--eps-clip-high 0.28
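# the upper clip bound (0.28) is above the lower one (0.2), i.e. DAPO-style "clip-higher"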
)

OPTIMIZER_ARGS=(
--optimizer adam
# --lr 1e-6
--lr 1e-5 # Higher LR often works better for LoRA
--lr-decay-style constant
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.98
)

# WANDB_ARGS=(
# --use-wandb
# --wandb-host https://wandb.ai/
# --wandb-team glm-zero
# --wandb-project miles-dev
# --wandb-group qwen2.5-0.5B-gsm8k-deterministic
# )


WANDB_ARGS=(
--use-wandb
--wandb-host https://wandb.ai/
--wandb-team miles-lora
--wandb-project miles-lora-megatron
--wandb-group qwen2.5-0.5B-gsm8k-test
)


SGLANG_ARGS=(
--rollout-num-gpus-per-engine 1
# --sglang-mem-fraction-static 0.7
--sglang-mem-fraction-static 0.4
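# kept low (0.4) to leave GPU memory headroom for the colocated Megatron training process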

--sglang-enable-deterministic-inference
--sglang-attention-backend flashinfer

--deterministic-mode
)

MISC_ARGS=(
# default dropout in megatron is 0.1
--attention-dropout 0.0
--hidden-dropout 0.0
# should be good for model performance
--accumulate-allreduce-grads-in-fp32
--attention-softmax-in-fp32
# need to comment this when using model with MLA
--attention-backend flash
)


##############################
###########lora###############
##############################
######## Note: CUDA_VISIBLE_DEVICES must be exported to match GPUS_PER_NODE, or the run fails with a CUDA error
# export GPUS_PER_NODE=1
# export GPUS_PER_NODE=2
export GPUS_PER_NODE=4
# export GPUS_PER_NODE=8
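# e.g. for GPUS_PER_NODE=4 (example device ids, adjust to your machine):
# export CUDA_VISIBLE_DEVICES=0,1,2,3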
##############################
##############################
##############################

# launch the Ray head node inside the container
ray start --head --node-ip-address 127.0.0.1 --num-gpus $GPUS_PER_NODE --disable-usage-stats
# ray start --head --node-ip-address 127.0.0.1 --num-gpus 1 --disable-usage-stats
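# env vars below are forwarded to every Ray worker:
#   CUDA_DEVICE_MAX_CONNECTIONS=1      - required by Megatron-LM when sequence parallelism is on
#   NCCL_ALGO=Ring                     - pin NCCL to one algorithm so reductions are reproducible
#   NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 - force deterministic TransformerEngine kernels
#   CUBLAS_WORKSPACE_CONFIG=:4096:8    - make cuBLAS GEMMs deterministic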

ray job submit --address="http://127.0.0.1:8265" \
--runtime-env-json='{
"env_vars": {
"PYTHONPATH": "/root/Megatron-LM",
"CUDA_DEVICE_MAX_CONNECTIONS": "1",
"NCCL_ALGO": "Ring",
"NVTE_ALLOW_NONDETERMINISTIC_ALGO": "0",
"CUBLAS_WORKSPACE_CONFIG": ":4096:8"
}
}' \
-- python3 train.py \
--actor-num-nodes 1 \
--actor-num-gpus-per-node $GPUS_PER_NODE \
--colocate \
--calculate-per-token-loss \
--use-miles-router \
${MODEL_ARGS[@]} \
${CKPT_ARGS[@]} \
${LORA_ARGS[@]} \
${OPTIMIZER_ARGS[@]} \
${GRPO_ARGS[@]} \
${WANDB_ARGS[@]} \
${PERF_ARGS[@]} \
${EVAL_ARGS[@]} \
${SGLANG_ARGS[@]} \
${MISC_ARGS[@]} \
${ROLLOUT_ARGS[@]}


# colocate    : weight updates are synced from tensors
# disaggregate: weight updates are synced via distributed communication