Fix MoE sequence parallel (SP): propagate the auto-enabled flag to the device mesh when TP > 1

hjh0119 · hjh0119 · commit 4316583fc47e · 2026-02-11T01:54:05.000+08:00
diff --git a/cookbook/legacy/grpo/dapo_math.py b/cookbook/legacy/grpo/dapo_math.py
@@ -54,14 +54,14 @@
 SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 4))
 SAMPLER_TP = int(os.environ.get('SAMPLER_TP', 1))
 NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS
-PP_SIZE = 2
-NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8))
+PP_SIZE = 4
+NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 4))
 MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096))
 LEARNING_RATE = float(os.environ.get('LR', 1e-5))
 GRPO_EPSILON = float(os.environ.get('GRPO_EPSILON', 0.2))
 GRPO_BETA = float(os.environ.get('GRPO_BETA', 0.0))
 MAX_STEPS = int(os.environ.get('MAX_STEPS', 200))
-BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 2))
+BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 1))
 GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRADIENT_ACCUMULATION_STEPS', 1))
 TEMPERATURE = float(os.environ.get('TEMPERATURE', 1.0))
 WEIGHT_SYNC_INTERVAL = int(os.environ.get('WEIGHT_SYNC_INTERVAL', 1))
@@ -334,6 +334,9 @@ def compute_rewards(
 
 # ========== Main ==========
 def main():
+    from twinkle.utils.import_utils import requires
+    requires("vllm>=0.15.0")
+
     device_groups = [
         DeviceGroup(
             name='model',
@@ -350,8 +353,10 @@ def main():
     ]
     if USE_MEGATRON:
         model_mesh = DeviceMesh.from_sizes(
-            dp_size=MODEL_GPUS // PP_SIZE, pp_size=PP_SIZE,
-            ep_size=MODEL_GPUS // PP_SIZE,
+            dp_size=1, 
+            tp_size=2,
+            pp_size=2,
+            ep_size=2,
         )
     else:
         model_mesh = DeviceMesh.from_sizes(
@@ -370,7 +375,7 @@ def main():
     )
 
     lora_config = LoraConfig(
-        target_modules='all-linear',
+        target_modules=['linear_qkv', 'linear_proj'],
         r=8,
         lora_alpha=32,
         lora_dropout=0.05,
diff --git a/src/twinkle/model/megatron/args.py b/src/twinkle/model/megatron/args.py
@@ -557,9 +557,11 @@ def _get_base_model(m):
         use_sequence_parallel = self.sequence_parallel and self.tp_size > 1
         if num_experts > 0 and self.tp_size > 1 and not use_sequence_parallel:
             use_sequence_parallel = True
-            print(
-                f'Auto-enabling sequence_parallel for MoE with TP={self.tp_size}'
-            )
+            # Sync the flag back so that callers (e.g. padding logic in
+            # megatron.py) see the auto-enabled value.
+            self.sequence_parallel = True
+            if self.device_mesh is not None:
+                self.device_mesh.sequence_parallel = True
 
         # For MoE models, ffn_hidden_size should be moe_ffn_hidden_size if not specified
         ffn_hidden_size = mg_config_dict.get('ffn_hidden_size')
diff --git a/src/twinkle/model/megatron/strategy/megatron.py b/src/twinkle/model/megatron/strategy/megatron.py
@@ -18,11 +18,15 @@ def __init__(
         **kwargs,
     ):
         self.device_mesh = device_mesh
-        self.sequence_parallel = self.device_mesh.sequence_parallel
         self.use_distributed_optimizer = use_distributed_optimizer
         self.mixed_precision = mixed_precision
         self._params_dtype = params_dtype
     
+    @property
+    def sequence_parallel(self) -> bool:
+        """Read from device_mesh so auto-enable in args.py is visible."""
+        return getattr(self.device_mesh, 'sequence_parallel', False)
+
     def _check_device_mesh(self):
         from megatron.core import parallel_state as mpu