
Commit d77153b

change server_config

1 parent: aaddedf

4 files changed: +64 −16 lines


cookbook/client/tinker/megatron/server_config.yaml
52 additions, 13 deletions
@@ -1,4 +1,4 @@
-# Twinkle Server Configuration - Tinker-Compatible Megatron Backend
+# Twinkle Server Configuration - Tinker-Compatible Transformers Backend
 
 # Server protocol type: "tinker" enables the Tinker-compatible API
 server_type: tinker
@@ -31,23 +31,30 @@ applications:
         ray_actor_options:
           num_cpus: 0.1 # CPU resources allocated to this actor
 
-  # 2. Model Service - Hosts the base model for training (Megatron backend)
-  # This is the actual model worker that performs forward/backward passes.
-  - name: models-Qwen2.5-0.5B-Instruct
-    route_prefix: /api/v1/model/Qwen/Qwen2.5-0.5B-Instruct # REST path for this model
+  # 2. Model Service - Hosts the base model for training.
+  # Comment out this block if you do not need a training model worker.
+  - name: models-Qwen3-30B-A3B-Instruct-2507
+    route_prefix: /api/v1/model/Qwen/Qwen3-30B-A3B-Instruct-2507
     import_path: model
     args:
-      use_megatron: true # Use Megatron-LM backend (not HuggingFace)
-      model_id: "ms://Qwen/Qwen2.5-0.5B-Instruct" # ModelScope model identifier to load
-      nproc_per_node: 2 # Number of GPU processes per node
-      device_group: # Logical device group for this model
+      use_megatron: true # true = Megatron-LM backend, false = HuggingFace Transformers
+      model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier
+      nproc_per_node: 4 # Number of GPU processes per node
+      device_group:
         name: model
-        ranks: [0, 1] # GPU rank indices to use
+        ranks: [0, 1, 2, 3] # GPU rank indices
         device_type: cuda
-      device_mesh: # Distributed training mesh configuration
+      device_mesh:
         device_type: cuda
-        mesh: [0, 1] # Device indices in the mesh
-        mesh_dim_names: ['dp'] # Mesh dimension names: 'dp' = data parallel
+        dp_size: 2 # Data-parallel size
+        tp_size: 2 # Tensor-parallel size
+
+      queue_config:
+        rps_limit: 100 # Max requests per second
+        tps_limit: 10000 # Max tokens per second
+      adapter_config:
+        per_token_adapter_limit: 30 # Max concurrent LoRA adapters
+        adapter_timeout: 1800 # Seconds before an idle adapter is unloaded
     deployments:
       - name: ModelManagement
         autoscaling_config:
@@ -56,3 +63,35 @@ applications:
           target_ongoing_requests: 16
         ray_actor_options:
           num_cpus: 0.1
+
+  # 3. Sampler Service - Runs inference/sampling with the vLLM engine.
+  # Used for generating text from the model (e.g., evaluating LoRA results).
+  - name: sampler-Qwen3-30B-A3B-Instruct-2507
+    route_prefix: /api/v1/sampler/Qwen/Qwen3-30B-A3B-Instruct-2507
+    import_path: sampler
+    args:
+      model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier
+      nproc_per_node: 4 # Number of GPU processes per node
+      sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
+      engine_args: # vLLM engine-specific settings
+        max_model_len: 4096 # Maximum sequence length the engine supports
+        gpu_memory_utilization: 0.7 # Fraction of GPU memory to use (0.0-1.0)
+        enable_lora: true # Allow loading LoRA adapters during inference
+      device_group: # Logical device group for the sampler
+        name: sampler
+        gpus_per_worker: 2 # GPUs assigned to each sampler worker
+        ranks: [4, 5, 6, 7] # GPU rank indices to use
+        device_type: cuda
+      device_mesh:
+        device_type: cuda
+        dp_size: 4 # Data-parallel size
+        tp_size: 1 # Tensor-parallel size
+    deployments:
+      - name: SamplerManagement
+        autoscaling_config:
+          min_replicas: 1
+          max_replicas: 1
+          target_ongoing_requests: 16
+        ray_actor_options:
+          num_cpus: 0.1
+          num_gpus: 1 # Sampler needs a full GPU for inference
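The `device_mesh` blocks above now describe the mesh by parallelism sizes (`dp_size`, `tp_size`) rather than by an explicit `mesh` plus `mesh_dim_names`; the Python changes below accept either shape. A minimal sketch of the two forms as plain dicts, with illustrative values (the exact field semantics are inferred from this diff):

```python
# Two config shapes the updated __init__ methods accept (values illustrative).
explicit_mesh = {               # pre-existing form -> DeviceMesh(**cfg)
    "device_type": "cuda",
    "mesh": [0, 1],             # device indices in the mesh
    "mesh_dim_names": ["dp"],   # one name per mesh dimension
}
mesh_by_sizes = {               # new form -> DeviceMesh.from_sizes(**cfg)
    "device_type": "cuda",
    "dp_size": 2,               # data-parallel size
    "tp_size": 2,               # tensor-parallel size
}
```

In both services above, `dp_size * tp_size` matches the number of ranks in the corresponding `device_group` (2 × 2 = 4 ranks for the model, 4 × 1 = 4 ranks for the sampler).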

src/twinkle/server/twinkle/model.py
4 additions, 1 deletion
@@ -154,7 +154,10 @@ class ModelManagement(AdapterManagerMixin):
     def __init__(self, nproc_per_node: int, device_group: Dict[str, Any], device_mesh: Dict[str, Any]):
         self.device_group = DeviceGroup(**device_group)
         twinkle.initialize(mode='ray', nproc_per_node=nproc_per_node, groups=[self.device_group], lazy_collect=False)
-        self.device_mesh = DeviceMesh(**device_mesh)
+        if 'mesh_dim_names' in device_mesh:
+            self.device_mesh = DeviceMesh(**device_mesh)
+        else:
+            self.device_mesh = DeviceMesh.from_sizes(**device_mesh)
         if use_megatron:
             from twinkle.model import MultiLoraMegatronModel
             self.model = MultiLoraMegatronModel(
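The diff does not show `DeviceMesh.from_sizes` itself. A plausible sketch of what such a constructor does, assuming a row-major layout with `dp` as the outer dimension (hypothetical stand-in; the real twinkle implementation may differ):

```python
import numpy as np

class DeviceMesh:
    """Hypothetical stand-in for twinkle's DeviceMesh; illustration only."""

    def __init__(self, device_type, mesh, mesh_dim_names):
        self.device_type = device_type
        self.mesh = np.asarray(mesh)
        self.mesh_dim_names = tuple(mesh_dim_names)

    @classmethod
    def from_sizes(cls, device_type, dp_size=1, tp_size=1):
        # Lay ranks out row-major: dp is the outer dimension, tp the inner,
        # so ranks in the same tensor-parallel group are adjacent.
        mesh = np.arange(dp_size * tp_size).reshape(dp_size, tp_size)
        return cls(device_type, mesh, ("dp", "tp"))

# dp_size=2, tp_size=2 (the model config above) -> [[0, 1], [2, 3]]
print(DeviceMesh.from_sizes("cuda", dp_size=2, tp_size=2).mesh)
```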

src/twinkle/server/twinkle/processor.py
4 additions, 1 deletion
@@ -61,7 +61,10 @@ def __init__(self, nproc_per_node: int, ncpu_proc_per_node:int, device_group: Di
         self.device_group = DeviceGroup(**device_group)
         twinkle.initialize(mode='ray', nproc_per_node=nproc_per_node, groups=[self.device_group],
                            lazy_collect=False, ncpu_proc_per_node=ncpu_proc_per_node)
-        self.device_mesh = DeviceMesh(**device_mesh)
+        if 'mesh_dim_names' in device_mesh:
+            self.device_mesh = DeviceMesh(**device_mesh)
+        else:
+            self.device_mesh = DeviceMesh.from_sizes(**device_mesh)
         self.resource_dict = {}
         self.resource_records: Dict[str, int] = {}
         self.hb_thread = threading.Thread(target=self.countdown, daemon=True)

src/twinkle/server/twinkle/sampler.py
4 additions, 1 deletion
@@ -274,7 +274,10 @@ def __init__(self, nproc_per_node: int, device_group: Dict[str, Any],
                             nproc_per_node=nproc_per_node,
                             groups=[self.device_group],
                             lazy_collect=False)
-        self.device_mesh = DeviceMesh(**device_mesh)
+        if 'mesh_dim_names' in device_mesh:
+            self.device_mesh = DeviceMesh(**device_mesh)
+        else:
+            self.device_mesh = DeviceMesh.from_sizes(**device_mesh)
         self.sampler_type = sampler_type
 
         # Initialize sampler based on type
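The same four-line branch is now repeated verbatim in model.py, processor.py, and sampler.py. One way to keep the three call sites in sync would be a shared helper; this is not part of the commit, and the import path is an assumption:

```python
from typing import Any, Dict

from twinkle import DeviceMesh  # assumed import path

def build_device_mesh(cfg: Dict[str, Any]) -> DeviceMesh:
    """Build a DeviceMesh from either config shape.

    An explicit mesh (with 'mesh_dim_names') is constructed directly;
    otherwise the keys are treated as parallelism sizes and the mesh
    is derived via DeviceMesh.from_sizes.
    """
    if 'mesh_dim_names' in cfg:
        return DeviceMesh(**cfg)
    return DeviceMesh.from_sizes(**cfg)

# Each __init__ would then reduce to:
# self.device_mesh = build_device_mesh(device_mesh)
```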
