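fix: switch Tinker GRPO cookbook to Qwen3-30B-A3B-Instruct-2507 on the ModelScope Twinkle endpoint and resize the Megatron server config from 16 to 8 GPUs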

tastelikefeet · tastelikefeet · commit b6c05f0b7e84 · 2026-02-13T08:14:03.000+08:00
diff --git a/cookbook/client/tinker/grpo.py b/cookbook/client/tinker/grpo.py
@@ -19,6 +19,8 @@
 # Requires both model and sampler services to be configured.
 
 import gc
+import os
+
 import numpy as np
 from typing import List, Tuple
 
@@ -34,7 +36,7 @@
 logger = get_logger()
 
 # ========== Configuration ==========
-BASE_MODEL = 'Qwen/Qwen2.5-7B-Instruct'
+BASE_MODEL = "Qwen/Qwen3-30B-A3B-Instruct-2507"
 NUM_GENERATIONS = 4
 MAX_NEW_TOKENS = 1024
 LEARNING_RATE = 1e-5
@@ -84,8 +86,8 @@ def main():
 
     # Step 2: Initialize the Tinker-compatible client
     logger.info("Connecting to Tinker server...")
-    service_client = init_tinker_compat_client(
-        base_url='http://localhost:8000')
+    service_client = init_tinker_compat_client(base_url='http://www.modelscope.cn/twinkle',
+                                               api_key=os.environ.get('MODELSCOPE_SDK_TOKEN'))
     
     logger.info("Creating LoRA training client...")
     # Create a LoRA training client for GRPO
diff --git a/cookbook/client/tinker/megatron/server_config.yaml b/cookbook/client/tinker/megatron/server_config.yaml
@@ -45,18 +45,17 @@ applications:
       nproc_per_node: 4               # Number of GPU processes per node
       sampler_type: vllm              # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
       engine_args:                    # vLLM engine-specific settings
-        max_model_len: 4096           # Maximum sequence length the engine supports
-        gpu_memory_utilization: 0.7   # Fraction of GPU memory to use (0.0-1.0)
+        max_model_len: 8192           # Maximum sequence length the engine supports
+        gpu_memory_utilization: 0.85  # Fraction of GPU memory to use (0.0-1.0)
         enable_lora: true             # Allow loading LoRA adapters during inference
       device_group:                   # Logical device group for the sampler
         name: sampler
-        gpus_per_worker: 2
-        ranks: [0,1,2,3,4,5,6,7]                    # GPU rank indices to use
+        gpus_per_worker: 1
+        ranks: [0,1,2,3]                    # GPU rank indices to use
         device_type: cuda
       device_mesh:
         device_type: cuda
         dp_size: 4
-        tp_size: 2
     deployments:
       - name: SamplerManagement
         autoscaling_config:
@@ -68,7 +67,7 @@ applications:
           runtime_env:
             env_vars:
               TWINKLE_TRUST_REMOTE_CODE: "0"
-              DEVICE_COUNT_PER_PHYSICAL_NODE: "16"
+              DEVICE_COUNT_PER_PHYSICAL_NODE: "8"
 
   # 2. Model Service (commented out) - Would host the base model for training.
   #    Uncomment and configure if you need a training model worker.
@@ -81,18 +80,16 @@ applications:
       nproc_per_node: 4                            # Number of GPU processes per node
       device_group:
         name: model
-        ranks: [8,9,10,11,12,13,14,15]                              # GPU rank indices
+        ranks: [4,5,6,7]                              # GPU rank indices
         device_type: cuda
       device_mesh:
         device_type: cuda
-        dp_size: 2
-        tp_size: 2
-        pp:size: 2
+        dp_size: 4
         ep_size: 2
 
       queue_config:
         rps_limit: 100                             # Max requests per second
-        tps_limit: 10000                           # Max tokens per second
+        tps_limit: 100000                           # Max tokens per second
       adapter_config:
         per_token_adapter_limit: 30                # Max concurrent LoRA adapters
         adapter_timeout: 1800                      # Seconds before idle adapter unload
@@ -101,10 +98,10 @@ applications:
         autoscaling_config:
           min_replicas: 1
           max_replicas: 1
-          target_ongoing_requests: 16
+          target_ongoing_requests: 8
         ray_actor_options:
           num_cpus: 0.1
           runtime_env:
             env_vars:
               TWINKLE_TRUST_REMOTE_CODE: "0"
-              DEVICE_COUNT_PER_PHYSICAL_NODE: "16"
+              DEVICE_COUNT_PER_PHYSICAL_NODE: "8"