Skip to content

Commit af9ee69

Browse files
committed
Merge commit '9871f3cde444687ba53d8bcda41d82398fba0740' into dev
2 parents 905cc57 + 9871f3c commit af9ee69

File tree

5 files changed

+41
-32
lines changed

5 files changed

+41
-32
lines changed

cookbook/legacy/grpo/dapo_math.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,12 +55,12 @@
5555
SAMPLER_TP = int(os.environ.get('SAMPLER_TP', 1))
5656
NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS
5757
PP_SIZE = 4
58-
NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 4))
59-
MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 2048))
58+
NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8))
59+
MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096))
6060
LEARNING_RATE = float(os.environ.get('LR', 1e-5))
6161
GRPO_EPSILON = float(os.environ.get('GRPO_EPSILON', 0.2))
6262
GRPO_BETA = float(os.environ.get('GRPO_BETA', 0.0))
63-
MAX_STEPS = int(os.environ.get('MAX_STEPS', 200))
63+
MAX_STEPS = int(os.environ.get('MAX_STEPS', 2000))
6464
BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 1))
6565
GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRADIENT_ACCUMULATION_STEPS', 1))
6666
TEMPERATURE = float(os.environ.get('TEMPERATURE', 1.0))
@@ -387,7 +387,7 @@ def main():
387387
model_id=MODEL_ID,
388388
engine_args={
389389
'gpu_memory_utilization': 0.8,
390-
'max_model_len': 8192,
390+
'max_model_len': 6000,
391391
'max_loras': 1,
392392
'max_lora_rank': 32,
393393
'enable_sleep_mode': False,
@@ -408,7 +408,7 @@ def main():
408408
remote_group='model',
409409
mixed_precision='bf16',
410410
recompute_granularity='full',
411-
recompute_num_layers=None,
411+
recompute_num_layers=1,
412412
)
413413
else:
414414
model = TransformersModel(

cookbook/legacy/grpo/gsm8k_dense.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -261,8 +261,8 @@ def main():
261261

262262
lora_config = LoraConfig(
263263
target_modules="all-linear",
264-
r=8,
265-
lora_alpha=32,
264+
r=32,
265+
lora_alpha=64,
266266
lora_dropout=0.05,
267267
)
268268

@@ -274,7 +274,7 @@ def main():
274274
remote_group='model',
275275
mixed_precision='bf16',
276276
recompute_granularity='full',
277-
recompute_num_layers=None,
277+
recompute_num_layers=1,
278278
)
279279
else:
280280
model = TransformersModel(

src/twinkle/model/megatron/megatron.py

Lines changed: 32 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1240,28 +1240,41 @@ def weight_generator():
12401240
else:
12411241
yield from _raw_weights()
12421242

1243-
async def _send():
1244-
await engine.send_weights(weight_generator())
1243+
is_sender = (engine.rank is not None and engine.rank == 0)
12451244

1246-
result_container = {'error': None}
1245+
if not is_sender:
1246+
for _name, _tensor in weight_generator():
1247+
pass
1248+
return
1249+
1250+
import queue
1251+
buf: queue.Queue = queue.Queue(maxsize=4)
1252+
error: list = []
12471253

1248-
def _run():
1254+
def _send():
1255+
def _iter():
1256+
while (item := buf.get()) is not None:
1257+
yield item
1258+
loop = asyncio.new_event_loop()
12491259
try:
1250-
loop = asyncio.new_event_loop()
1251-
asyncio.set_event_loop(loop)
1252-
try:
1253-
loop.run_until_complete(_send())
1254-
finally:
1255-
loop.close()
1256-
except Exception as e:
1257-
result_container['error'] = e
1258-
1259-
thread = threading.Thread(target=_run)
1260-
thread.start()
1261-
thread.join()
1262-
1263-
if result_container['error'] is not None:
1264-
raise result_container['error']
1260+
loop.run_until_complete(engine.send_weights(_iter()))
1261+
except Exception as exc:
1262+
error.append(exc)
1263+
finally:
1264+
loop.close()
1265+
1266+
sender = threading.Thread(target=_send, name="ce-broadcast", daemon=True)
1267+
sender.start()
1268+
try:
1269+
for name, tensor in weight_generator():
1270+
buf.put((name, tensor.clone()))
1271+
if error:
1272+
break
1273+
finally:
1274+
buf.put(None) # sentinel
1275+
sender.join()
1276+
if error:
1277+
raise error[0]
12651278

12661279
@remote_function(collect='first')
12671280
def get_peft_config_dict(self, adapter_name: str = None) -> dict:

src/twinkle/utils/torch_utils.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,6 @@ def selective_log_softmax(logits, index) -> 'torch.Tensor':
131131
print(traceback.format_exc())
132132
except Exception:
133133
pass
134-
135134
if logits.dtype in [torch.float32, torch.float64]:
136135
selected_logits = torch.gather(logits, dim=-1, index=index.unsqueeze(-1)).squeeze(-1)
137136
# loop to reduce peak mem consumption

tests/sampler/test_weight_sync.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -140,14 +140,11 @@ def test_standalone_weight_sync(model_gpus: int = 1, sampler_gpus: int = 1):
140140
)
141141
from peft import LoraConfig
142142
model.add_adapter_to_model('default', LoraConfig(r=8, lora_alpha=32, lora_dropout=0.05, target_modules="all-linear"), gradient_accumulation_steps=1)
143-
lora_path = '/mnt/nas2/hujinghan.hjh/swift/output/v1168-20260209-194533/checkpoint-32/default/output/v0-20260209-212154/checkpoint-32'
144-
145-
model.load('default', output_dir=lora_path, adapter_name='default')
146143
# ── Create Sampler (dummy weights) ────────────────────────────────
147144
sampler = vLLMSampler(
148145
model_id=model_path,
149146
engine_args={
150-
# 'load_format': 'dummy', # start with random weights
147+
'load_format': 'dummy', # start with random weights
151148
'gpu_memory_utilization': 0.3,
152149
'max_model_len': 256,
153150
'enforce_eager': True,

0 commit comments

Comments (0)