|
1 | | -# WIP, not working yet |
2 | 1 | import os |
3 | 2 | from typing import List, Tuple, Dict, Any |
4 | 3 |
|
|
32 | 31 | MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096)) |
33 | 32 | LEARNING_RATE = float(os.environ.get('LR', 1e-5)) |
34 | 33 | MAX_STEPS = int(os.environ.get('MAX_STEPS', 200)) |
35 | | -BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 4)) |
| 34 | +BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 16))  # global prompt-level batch size; the global completion-level batch size = BATCH_SIZE * num_generations * dp_size
| 35 | +MINI_BATCH_SIZE = int(os.environ.get('MINI_BATCH_SIZE', 16))  # global completion-level mini-batch size (completions consumed per optimizer step)
| 36 | +MICRO_BATCH_SIZE = int(os.environ.get('MICRO_BATCH_SIZE', 2))  # per-device completion-level micro-batch size; passed as micro_batch_size to forward_backward
36 | 37 | GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRADIENT_ACCUMULATION_STEPS', 1)) |
37 | 38 | ADAPTER_NAME = 'default' |
38 | 39 |
|
@@ -150,19 +151,31 @@ def main(): |
150 | 151 | }, |
151 | 152 | ) |
152 | 153 |
|
153 | | - advantages = advantage_fn( |
154 | | - total_rewards, |
155 | | - num_generations=NUM_GENERATIONS, |
156 | | - scale='group', |
157 | | - ) |
158 | | - advantages = advantages.tolist() |
159 | | - |
160 | | - model.forward_backward(inputs=all_input_data, old_logps=all_old_logps, advantages=advantages, micro_batch_size=2) |
161 | | - model.clip_grad_and_step() |
162 | | - optim_step += 1 |
163 | | - log_dict = metrics.calculate() |
164 | | - log_dict.update(model.calculate_metric(is_training=True)) |
165 | | - logger.info(f'[Step {optim_step}/{MAX_STEPS}] {log_dict}') |
| 154 | + advantages = advantage_fn(total_rewards, num_generations=NUM_GENERATIONS, scale='group').tolist() |
| 155 | + |
| 156 | + # Split completions into mini-batches and run one optim step per mini-batch. |
| 157 | + total_completions = len(all_input_data) |
| 158 | + for mb_start in range(0, total_completions, MINI_BATCH_SIZE): |
| 159 | + mb_end = min(mb_start + MINI_BATCH_SIZE, total_completions) |
| 160 | + mb_inputs = all_input_data[mb_start:mb_end] |
| 161 | + mb_old_logps = all_old_logps[mb_start:mb_end] |
| 162 | + mb_advantages = advantages[mb_start:mb_end] |
| 163 | + |
| 164 | + model.forward_backward( |
| 165 | + inputs=mb_inputs, |
| 166 | + old_logps=mb_old_logps, |
| 167 | + advantages=mb_advantages, |
| 168 | + micro_batch_size=MICRO_BATCH_SIZE, |
| 169 | + ) |
| 170 | + model.clip_grad_and_step() |
| 171 | + optim_step += 1 |
| 172 | + |
| 173 | + if optim_step >= MAX_STEPS: |
| 174 | + break |
| 175 | + log_dict = metrics.calculate() |
| 176 | + log_dict.update(model.calculate_metric(is_training=True)) |
| 177 | + metrics.reset() |
| 178 | + logger.info(f'[Step {optim_step}/{MAX_STEPS}] {log_dict}') |
166 | 179 |
|
167 | 180 | logger.info(f'Training completed. optim_steps={optim_step}') |
168 | 181 | model.save('grpo-gsm8k-checkpoint') |
|
0 commit comments