Skip to content

Commit b6c05f0

Browse files
committed
fix
1 parent 3cc2189 commit b6c05f0

File tree

2 files changed

+15
-16
lines changed

2 files changed

+15
-16
lines changed

cookbook/client/tinker/grpo.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
# Requires both model and sampler services to be configured.
2020

2121
import gc
22+
import os
23+
2224
import numpy as np
2325
from typing import List, Tuple
2426

@@ -34,7 +36,7 @@
3436
logger = get_logger()
3537

3638
# ========== Configuration ==========
37-
BASE_MODEL = 'Qwen/Qwen2.5-7B-Instruct'
39+
BASE_MODEL = "Qwen/Qwen3-30B-A3B-Instruct-2507"
3840
NUM_GENERATIONS = 4
3941
MAX_NEW_TOKENS = 1024
4042
LEARNING_RATE = 1e-5
@@ -84,8 +86,8 @@ def main():
8486

8587
# Step 2: Initialize the Tinker-compatible client
8688
logger.info("Connecting to Tinker server...")
87-
service_client = init_tinker_compat_client(
88-
base_url='http://localhost:8000')
89+
service_client = init_tinker_compat_client(base_url='http://www.modelscope.cn/twinkle',
90+
api_key=os.environ.get('MODELSCOPE_SDK_TOKEN'))
8991

9092
logger.info("Creating LoRA training client...")
9193
# Create a LoRA training client for GRPO

cookbook/client/tinker/megatron/server_config.yaml

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -45,18 +45,17 @@ applications:
4545
nproc_per_node: 4 # Number of GPU processes per node
4646
sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
4747
engine_args: # vLLM engine-specific settings
48-
max_model_len: 4096 # Maximum sequence length the engine supports
49-
gpu_memory_utilization: 0.7 # Fraction of GPU memory to use (0.0-1.0)
48+
max_model_len: 8192 # Maximum sequence length the engine supports
49+
gpu_memory_utilization: 0.85 # Fraction of GPU memory to use (0.0-1.0)
5050
enable_lora: true # Allow loading LoRA adapters during inference
5151
device_group: # Logical device group for the sampler
5252
name: sampler
53-
gpus_per_worker: 2
54-
ranks: [0,1,2,3,4,5,6,7] # GPU rank indices to use
53+
gpus_per_worker: 1
54+
ranks: [0,1,2,3] # GPU rank indices to use
5555
device_type: cuda
5656
device_mesh:
5757
device_type: cuda
5858
dp_size: 4
59-
tp_size: 2
6059
deployments:
6160
- name: SamplerManagement
6261
autoscaling_config:
@@ -68,7 +67,7 @@ applications:
6867
runtime_env:
6968
env_vars:
7069
TWINKLE_TRUST_REMOTE_CODE: "0"
71-
DEVICE_COUNT_PER_PHYSICAL_NODE: "16"
70+
DEVICE_COUNT_PER_PHYSICAL_NODE: "8"
7271

7372
# 2. Model Service - Hosts the base model for training.
7473
# Adjust the device group and device mesh below to match the GPUs available to the training model worker.
@@ -81,18 +80,16 @@ applications:
8180
nproc_per_node: 4 # Number of GPU processes per node
8281
device_group:
8382
name: model
84-
ranks: [8,9,10,11,12,13,14,15] # GPU rank indices
83+
ranks: [4,5,6,7] # GPU rank indices
8584
device_type: cuda
8685
device_mesh:
8786
device_type: cuda
88-
dp_size: 2
89-
tp_size: 2
90-
pp:size: 2
87+
dp_size: 4
9188
ep_size: 2
9289

9390
queue_config:
9491
rps_limit: 100 # Max requests per second
95-
tps_limit: 10000 # Max tokens per second
92+
tps_limit: 100000 # Max tokens per second
9693
adapter_config:
9794
per_token_adapter_limit: 30 # Max concurrent LoRA adapters
9895
adapter_timeout: 1800 # Seconds before idle adapter unload
@@ -101,10 +98,10 @@ applications:
10198
autoscaling_config:
10299
min_replicas: 1
103100
max_replicas: 1
104-
target_ongoing_requests: 16
101+
target_ongoing_requests: 8
105102
ray_actor_options:
106103
num_cpus: 0.1
107104
runtime_env:
108105
env_vars:
109106
TWINKLE_TRUST_REMOTE_CODE: "0"
110-
DEVICE_COUNT_PER_PHYSICAL_NODE: "16"
107+
DEVICE_COUNT_PER_PHYSICAL_NODE: "8"

0 commit comments

Comments
 (0)