Skip to content

Commit 1d8094d

Browse files
committed
Merge remote-tracking branch 'origin/dev' into kernels_unittest_fix_ljl
2 parents 2d4a19f + 9b875d7 commit 1d8094d

File tree

15 files changed

+64
-149
lines changed

15 files changed

+64
-149
lines changed

cookbook/client/twinkle/grpo.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def __call__(self, trajectories: List[Trajectory], ground_truths: List[Trajector
107107

108108
def compute_rewards(trajectories: List[dict], ) -> Tuple[List[float], List[float], List[float]]:
109109
"""Compute format and accuracy rewards for Countdown game."""
110-
from twinkle.reward import CountDownAccuracy, FormatReward
110+
from twinkle.reward import FormatReward
111111
format_rewards = FormatReward()(trajectories, [])
112112
accuracy_rewards = CountDownAccuracy()(trajectories, [])
113113
total_rewards = [a + b for a, b in zip(accuracy_rewards, format_rewards)]

cookbook/rl/grpo.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -142,8 +142,6 @@ def main():
142142
all_input_data
143143
)
144144
metrics.accumulate(
145-
None,
146-
None,
147145
completion_lengths=all_completion_lengths,
148146
rewards={
149147
'total': total_rewards,
@@ -159,7 +157,7 @@ def main():
159157
)
160158
advantages = advantages.tolist()
161159

162-
model.forward_backward(inputs=all_input_data, old_logps=all_old_logps, advantages=advantages)
160+
model.forward_backward(inputs=all_input_data, old_logps=all_old_logps, advantages=advantages, micro_batch_size=2)
163161
model.clip_grad_and_step()
164162
optim_step += 1
165163
log_dict = metrics.calculate()

docs/source_en/Components/Reward/Reward.md

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -44,17 +44,6 @@ reward_fn = FormatReward()
4444
rewards = reward_fn(trajectories, ground_truths)
4545
```
4646

47-
## CountDownAccuracyReward
48-
49-
The countdown accuracy reward function provides partial rewards when answers are close to correct.
50-
51-
```python
52-
from twinkle.reward import CountDownAccuracyReward
53-
54-
reward_fn = CountDownAccuracyReward()
55-
rewards = reward_fn(trajectories, ground_truths)
56-
```
57-
5847
## Custom Reward Functions
5948

6049
You can create custom rewards by inheriting from the Reward base class or using functions:

docs/source_en/Usage Guide/Train-as-a-Service.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@ API endpoint: `base_url="https://www.modelscope.cn/twinkle"`
1616

1717
## Step 2. Review the Cookbook and Customize Development
1818

19-
We strongly recommend that developers review our [cookbook](https://github.com/modelscope/twinkle/tree/main/cookbook/client/) and build upon the training code provided there.
19+
We strongly recommend that developers review our [cookbook](https://github.com/modelscope/twinkle/tree/main/cookbook/client/tinker) and build upon the training code provided there.
20+
21+
> The ModelScope server is tinker-compatible, so use the tinker cookbooks. In a future version, we will support a server that works with both twinkle and tinker clients.
2022
2123
Developers can customize datasets, advantage functions, rewards, templates, and more. However, the Loss component is not currently customizable since it needs to be executed on the server side (for security reasons). If you need support for additional Loss functions, you can upload your Loss implementation to ModelHub and contact us via the Q&A group or through an issue to have the corresponding component added to the whitelist.
2224

docs/source_zh/使用指引/训练服务.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919

2020
## Step 2. 查看 Cookbook 并二次定制开发
2121

22-
我们强烈推荐开发者查看我们的 [cookbook](https://github.com/modelscope/twinkle/tree/main/cookbook/client/),并根据其中的训练代码进行二次开发。
22+
我们强烈推荐开发者查看我们的 [cookbook](https://github.com/modelscope/twinkle/tree/main/cookbook/client/tinker),并根据其中的训练代码进行二次开发。
23+
24+
> 目前的服务兼容tinker client,因此请使用tinker的cookbook进行训练。后续我们会支持单服务器支持twinkle/tinker双client。
2325
2426
开发者可以定制数据集/优势函数/奖励/模板等,其中 Loss 部分由于需要在服务端执行,因此当前暂不支持(安全性原因)。
2527
如果需要支持您的额外 Loss,可以将该 Loss 实现上传到 ModelHub 中,并在答疑群中或者 issue 中联系我们,将对应组件开放白名单即可使用。

docs/source_zh/组件/奖励/Reward.md

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -44,17 +44,6 @@ reward_fn = FormatReward()
4444
rewards = reward_fn(trajectories, ground_truths)
4545
```
4646

47-
## CountDownAccuracyReward
48-
49-
倒计时准确率奖励函数,在答案接近正确时给予部分奖励。
50-
51-
```python
52-
from twinkle.reward import CountDownAccuracyReward
53-
54-
reward_fn = CountDownAccuracyReward()
55-
rewards = reward_fn(trajectories, ground_truths)
56-
```
57-
5847
## 自定义奖励函数
5948

6049
你可以通过继承 Reward 基类或使用函数来创建自定义奖励:

src/twinkle/loss/grpo.py

Lines changed: 2 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -34,20 +34,6 @@ def __init__(
3434
self.beta = beta
3535
self.ignore_index = ignore_index
3636

37-
def _extract_advantages_from_trajectories(self, trajectories: List[Trajectory],
38-
device: 'torch.device') -> 'torch.Tensor':
39-
"""Extract advantages from trajectory objects."""
40-
import torch
41-
advantages_list = []
42-
for traj in trajectories:
43-
if isinstance(traj, dict):
44-
adv = traj.get('advantages', None)
45-
else:
46-
adv = getattr(traj, 'advantages', None)
47-
assert adv is not None, "trajectories must contain 'advantages'"
48-
advantages_list.append(float(adv))
49-
return torch.tensor(advantages_list, dtype=torch.float32, device=device)
50-
5137
def _compute_loss_mask(self, labels: 'torch.Tensor') -> 'torch.Tensor':
5238
"""
5339
Compute loss mask from labels.
@@ -275,7 +261,6 @@ def __call__(
275261
*,
276262
old_logps: Optional[Union['torch.Tensor', List[List[float]]]] = None,
277263
ref_logps: Optional['torch.Tensor'] = None,
278-
trajectories: Optional[List[Trajectory]] = None, # TODO: remove this argument
279264
advantages: Optional[Union['torch.Tensor', List[float], np.ndarray]] = None,
280265
**kwargs,
281266
) -> 'torch.Tensor':
@@ -326,13 +311,8 @@ def __call__(
326311
# In padding_free / packing mode the processor concatenates all
327312
# sequences into a single row [1, total_tokens]. We detect this
328313
# by checking: batch_size == 1 but the actual number of sequences
329-
# (known from trajectories or advantages) is greater than 1.
330-
if trajectories is not None:
331-
num_sequences = len(trajectories)
332-
elif advantages is not None:
333-
num_sequences = len(advantages) if isinstance(advantages, (list, tuple)) else advantages.shape[0]
334-
else:
335-
num_sequences = logps.shape[0]
314+
# is greater than 1.
315+
num_sequences = len(advantages) if isinstance(advantages, (list, tuple)) else advantages.shape[0]
336316
is_packed = (logps.shape[0] == 1 and num_sequences > 1)
337317
if is_packed:
338318
position_ids = inputs.get('position_ids')

src/twinkle/metric/completion_and_reward.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ def reset(self):
2121
self.completion_lengths = []
2222

2323
def accumulate(self,
24-
_,
25-
__,
24+
inputs=None, # ignore
25+
outputs=None,# ignore
2626
*,
2727
rewards=None,
2828
completion_lengths=None,
@@ -55,11 +55,11 @@ def _std(statistic_list: List[float]) -> float:
5555
return 0.0
5656

5757
def calculate(self) -> Dict[str, Any]:
58-
metric_dict = {
59-
'profiling/Time taken: move_model_to_sampler': self._mean(self.weight_sync_time),
60-
'profiling/Time taken: generate': self._mean(self.generate_time),
61-
}
62-
58+
metric_dict = {}
59+
if self.weight_sync_time is not None:
60+
metric_dict['profiling/Time taken: move_model_to_sampler'] = self._mean(self.weight_sync_time)
61+
if self.generate_time is not None:
62+
metric_dict['profiling/Time taken: generate'] = self._mean(self.generate_time)
6363
for key, values in self.rewards.items():
6464
metric_dict[f'train/{key}_reward'] = self._mean(values)
6565
metric_dict[f'train/{key}_reward_std'] = self._std(values)

src/twinkle/model/megatron/megatron.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -393,11 +393,35 @@ def forward_backward(self,
393393
else:
394394
seq_length = original_seq_length
395395

396-
loss_extra_kwargs = kwargs
396+
num_microbatches = len(inputs)
397+
loss_extra_kwargs_per_mb = []
398+
if num_microbatches <= 1:
399+
loss_extra_kwargs_per_mb = [kwargs]
400+
else:
401+
for mb_idx in range(num_microbatches):
402+
mb_start = mb_idx * micro_batch_size
403+
mb_end = mb_start + micro_batch_size
404+
mb_kwargs = {}
405+
for key, value in kwargs.items():
406+
if isinstance(value, torch.Tensor) and value.dim() >= 1 and value.shape[0] > micro_batch_size:
407+
mb_kwargs[key] = value[mb_start:mb_end]
408+
elif isinstance(value, np.ndarray) and value.ndim >= 1 and value.shape[0] > micro_batch_size:
409+
mb_kwargs[key] = value[mb_start:mb_end]
410+
elif isinstance(value, (list, tuple)) and len(value) > micro_batch_size:
411+
mb_kwargs[key] = value[mb_start:mb_end]
412+
else:
413+
# Scalars, small tensors, or non-sliceable values pass through as-is
414+
mb_kwargs[key] = value
415+
loss_extra_kwargs_per_mb.append(mb_kwargs)
416+
417+
_mb_counter = [0] # mutable counter for closure
397418

398419
def post_loss_function(output_tensor, inputs):
420+
mb_idx = _mb_counter[0]
421+
_mb_counter[0] += 1
422+
current_kwargs = loss_extra_kwargs_per_mb[mb_idx % len(loss_extra_kwargs_per_mb)]
399423
outputs = ModelOutput(logits=output_tensor)
400-
result = loss_instance(inputs, outputs, **loss_extra_kwargs)
424+
result = loss_instance(inputs, outputs, **current_kwargs)
401425
if isinstance(result, tuple):
402426
losses, counts = result
403427
else:
@@ -789,7 +813,7 @@ def clip_grad_and_step(self, max_grad_norm: float = 1.0, norm_type=2, **kwargs):
789813
self.zero_grad(**kwargs)
790814
self.lr_step(**kwargs)
791815

792-
@remote_function(dispatch='all', sync=True)
816+
@remote_function(dispatch='all', collect='first', sync=True)
793817
def save(self,
794818
name: Optional[str] = None,
795819
output_dir: Optional[str] = None,

src/twinkle/preprocessor/llm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ class GSM8KProcessor(Preprocessor):
9696
'numerical answer after ####.\n'
9797
'For example:\n<think> ... reasoning ... </think>\n#### 42')
9898

99-
def extract_ground_truth(answer_str: str) -> str:
99+
def extract_ground_truth(self, answer_str: str) -> str:
100100
"""Extract the number after '####' from GSM8K answer."""
101101
match = re.search(r'####\s*([\-\d,\.]+)', answer_str)
102102
if match:

0 commit comments

Comments
 (0)