Skip to content

Commit 417a57e

Browse files
committed
update server
1 parent a68d8fa commit 417a57e

File tree

20 files changed

+543
-165
lines changed

20 files changed

+543
-165
lines changed

cookbook/client/tinker/megatron/server_config.yaml

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,13 @@ applications:
3333

3434
# 2. Model Service - Hosts the base model for training (Megatron backend)
3535
# This is the actual model worker that performs forward/backward passes.
36-
- name: models-Qwen2.5-0.5B-Instruct
37-
route_prefix: /api/v1/model/Qwen/Qwen2.5-0.5B-Instruct # REST path for this model
36+
- name: models-Qwen2.5-3B-Instruct
37+
route_prefix: /api/v1/model/Qwen/Qwen2.5-3B-Instruct # REST path for this model
3838
import_path: model
3939
args:
4040
use_megatron: true # Use Megatron-LM backend (not HuggingFace)
41-
model_id: "ms://Qwen/Qwen2.5-0.5B-Instruct" # ModelScope model identifier to load
41+
mixed_precision: bf16 # Use bfloat16 precision for training
42+
model_id: "ms://Qwen/Qwen2.5-3B-Instruct" # ModelScope model identifier to load
4243
nproc_per_node: 2 # Number of GPU processes per node
4344
device_group: # Logical device group for this model
4445
name: model
@@ -48,6 +49,9 @@ applications:
4849
device_type: cuda
4950
mesh: [0, 1] # Device indices in the mesh
5051
mesh_dim_names: ['dp'] # Mesh dimension names: 'dp' = data parallel
52+
adapter_config:
53+
per_token_adapter_limit: 30 # Max concurrent LoRA adapters allowed per token
54+
adapter_timeout: 1800 # Seconds of inactivity before an idle adapter is unloaded
5155
deployments:
5256
- name: ModelManagement
5357
autoscaling_config:
@@ -56,3 +60,34 @@ applications:
5660
target_ongoing_requests: 16
5761
ray_actor_options:
5862
num_cpus: 0.1
63+
64+
# 3. Sampler Service - Runs inference / sampling using vLLM engine
65+
# Used for generating text from the model (e.g., evaluating LoRA results).
66+
- name: sampler-Qwen2.5-3B-Instruct
67+
route_prefix: /api/v1/sampler/Qwen/Qwen2.5-3B-Instruct
68+
import_path: sampler
69+
args:
70+
model_id: "ms://Qwen/Qwen2.5-3B-Instruct" # ModelScope model identifier
71+
nproc_per_node: 1 # Number of GPU processes per node
72+
sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
73+
engine_args: # vLLM engine-specific settings
74+
max_model_len: 4096 # Maximum sequence length the engine supports
75+
gpu_memory_utilization: 0.5 # Fraction of GPU memory to use (0.0-1.0)
76+
enable_lora: true # Allow loading LoRA adapters during inference
77+
device_group: # Logical device group for the sampler
78+
name: sampler
79+
ranks: [0] # GPU rank indices to use
80+
device_type: cuda
81+
device_mesh:
82+
device_type: cuda
83+
mesh: [0]
84+
mesh_dim_names: ['dp']
85+
deployments:
86+
- name: SamplerManagement
87+
autoscaling_config:
88+
min_replicas: 1
89+
max_replicas: 1
90+
target_ongoing_requests: 16
91+
ray_actor_options:
92+
num_cpus: 0.1
93+
num_gpus: 1 # Sampler needs a full GPU for inference

cookbook/client/tinker/transformer/grpo.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
logger = get_logger()
3535

3636
# ========== Configuration ==========
37-
BASE_MODEL = 'Qwen/Qwen2.5-7B-Instruct'
37+
BASE_MODEL = 'Qwen/Qwen2.5-3B-Instruct'
3838
NUM_GENERATIONS = 4
3939
MAX_NEW_TOKENS = 1024
4040
LEARNING_RATE = 1e-5

0 commit comments

Comments
 (0)