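Enable sampler service in 7B Megatron config; drop AutoTokenizer imports; skip GRPO training step when all advantages are zero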

Yunnglin · Yunnglin · commit 987d89e7551c · 2026-02-13T12:01:09.000+08:00
diff --git a/cookbook/client/tinker/megatron/server_config_7b.yaml b/cookbook/client/tinker/megatron/server_config_7b.yaml
@@ -71,37 +71,37 @@ applications:
 
   # 3. Sampler Service - Runs inference / sampling using vLLM engine
   #    Used for generating text from the model (e.g., evaluating LoRA results).
-  # - name: sampler-Qwen2.5-7B-Instruct
-  #   route_prefix: /api/v1/sampler/Qwen/Qwen2.5-7B-Instruct
-  #   import_path: sampler
-  #   args:
-  #     model_id: "ms://Qwen/Qwen2.5-7B-Instruct"   # ModelScope model identifier
-  #     nproc_per_node: 2               # Number of GPU processes per node
-  #     sampler_type: vllm              # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
-  #     engine_args:                    # vLLM engine-specific settings
-  #       max_model_len: 4096           # Maximum sequence length the engine supports
-  #       gpu_memory_utilization: 0.5   # Fraction of GPU memory to use (0.0-1.0)
-  #       enable_lora: true             # Allow loading LoRA adapters during inference
-  #       logprobs_mode: processed_logprobs # Logprobs mode for sampling results
-  #     device_group:                   # Logical device group for the sampler
-  #       name: sampler
-  #       ranks: [2]                    # GPU rank indices to use
-  #       device_type: cuda
-  #     device_mesh:
-  #       device_type: cuda
-  #       dp_size: 1
-  #     queue_config:
-  #       rps_limit: 100                             # Max requests per second
-  #       tps_limit: 100000                           # Max tokens per second
-  #   deployments:
-  #     - name: SamplerManagement
-  #       autoscaling_config:
-  #         min_replicas: 1
-  #         max_replicas: 1
-  #         target_ongoing_requests: 16
-  #       ray_actor_options:
-  #         num_cpus: 0.1
-  #         runtime_env:
-  #           env_vars:
-  #             TWINKLE_TRUST_REMOTE_CODE: "0"
-  #             DEVICE_COUNT_PER_PHYSICAL_NODE: "8"
+  - name: sampler-Qwen2.5-7B-Instruct
+    route_prefix: /api/v1/sampler/Qwen/Qwen2.5-7B-Instruct
+    import_path: sampler
+    args:
+      model_id: "ms://Qwen/Qwen2.5-7B-Instruct"   # ModelScope model identifier
+      nproc_per_node: 2               # Number of GPU processes per node
+      sampler_type: vllm              # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
+      engine_args:                    # vLLM engine-specific settings
+        max_model_len: 4096           # Maximum sequence length the engine supports
+        gpu_memory_utilization: 0.5   # Fraction of GPU memory to use (0.0-1.0)
+        enable_lora: true             # Allow loading LoRA adapters during inference
+        logprobs_mode: processed_logprobs # Logprobs mode for sampling results
+      device_group:                   # Logical device group for the sampler
+        name: sampler
+        ranks: [2]                    # GPU rank indices to use
+        device_type: cuda
+      device_mesh:
+        device_type: cuda
+        dp_size: 1
+      queue_config:
+        rps_limit: 100                # Max requests per second
+        tps_limit: 100000             # Max tokens per second
+    deployments:
+      - name: SamplerManagement
+        autoscaling_config:
+          min_replicas: 1
+          max_replicas: 1
+          target_ongoing_requests: 16
+        ray_actor_options:
+          num_cpus: 0.1
+          runtime_env:
+            env_vars:
+              TWINKLE_TRUST_REMOTE_CODE: "0"
+              DEVICE_COUNT_PER_PHYSICAL_NODE: "8"
diff --git a/cookbook/client/tinker/sample.py b/cookbook/client/tinker/sample.py
@@ -4,7 +4,6 @@
 # for text generation (sampling) via the Tinker-compatible client API.
 # The server must be running first (see server.py and server_config.yaml).
 
-from modelscope import AutoTokenizer
 from tinker import types
 
 from twinkle.data_format import Message, Trajectory
diff --git a/cookbook/client/tinker/short_math_grpo.py b/cookbook/client/tinker/short_math_grpo.py
@@ -21,15 +21,17 @@
 import numpy as np
 import os
 import re
-from modelscope import AutoTokenizer
 from tinker import types
 from typing import List, Tuple
 
+from twinkle_client import init_tinker_compat_client
 from twinkle import get_logger
 from twinkle.advantage import GRPOAdvantage
 from twinkle.data_format import Message, Trajectory
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.preprocessor import Preprocessor
+from twinkle.reward.base import Reward
 from twinkle.metric import CompletionRewardMetric
 from twinkle.template import Template
 
@@ -332,6 +334,10 @@ def main():
         ).tolist()
 
         frac_zero_std = (1.0 if all(abs(a) < 1e-8 for a in advantages) else 0.0)
+        if frac_zero_std == 1.0:
+            logger.info(f'Step {step}: All advantages are zero, skipping training')
+            step += 1
+            continue
 
         # ========== 6. Train the policies with GRPO loss ==========
         # Train the policies with the Advantage-Regularized policy