Skip to content

Commit 946810a

Browse files
committed
wip
1 parent 96112a8 commit 946810a

File tree

1 file changed

+13
-14
lines changed

1 file changed

+13
-14
lines changed

cookbook/client/tinker/megatron/server_config.yaml

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -33,26 +33,25 @@ applications:
3333
runtime_env:
3434
env_vars:
3535
TWINKLE_TRUST_REMOTE_CODE: "0"
36-
DEVICE_COUNT_PER_PHYSICAL_NODE: "8"
3736

3837
# 3. Sampler Service - Runs inference / sampling using vLLM engine
3938
# Used for generating text from the model (e.g., evaluating LoRA results).
4039
- name: sampler-Qwen2.5-7B-Instruct  # NOTE(review): renamed for consistency — route_prefix/model_id below were switched to Qwen2.5-7B-Instruct but the app name still said Qwen3-30B-A3B-Instruct-2507; confirm nothing references the old app name
41-
route_prefix: /api/v1/sampler/Qwen/Qwen3-30B-A3B-Instruct-2507
40+
route_prefix: /api/v1/sampler/Qwen/Qwen2.5-7B-Instruct
4241
import_path: sampler
4342
args:
44-
model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier
43+
model_id: "ms://Qwen/Qwen2.5-7B-Instruct" # ModelScope model identifier
4544
nproc_per_node: 4 # Number of GPU processes per node
4645
sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
4746
engine_args: # vLLM engine-specific settings
4847
max_model_len: 16000 # Maximum sequence length the engine supports
49-
gpu_memory_utilization: 0.85 # Fraction of GPU memory to use (0.0-1.0)
48+
gpu_memory_utilization: 0.7 # Fraction of GPU memory to use (0.0-1.0)
5049
enable_lora: true # Allow loading LoRA adapters during inference
5150
max_loras: 5 # Max allowed loras working on vLLM at the same time
5251
device_group: # Logical device group for the sampler
5352
name: sampler
5453
gpus_per_worker: 1
55-
ranks: [0,1,2,3] # GPU rank indices to use
54+
ranks: 4 # Number of GPU ranks to use (changed from an explicit index list)
5655
device_type: cuda
5756
device_mesh:
5857
device_type: cuda
@@ -63,30 +62,30 @@ applications:
6362
deployments:
6463
- name: SamplerManagement
6564
autoscaling_config:
66-
min_replicas: 1
67-
max_replicas: 1
65+
min_replicas: 2
66+
max_replicas: 2
6867
target_ongoing_requests: 16
6968
ray_actor_options:
7069
num_cpus: 0.1
7170
runtime_env:
7271
env_vars:
7372
TWINKLE_TRUST_REMOTE_CODE: "0"
74-
DEVICE_COUNT_PER_PHYSICAL_NODE: "8"
73+
DEVICE_COUNT_PER_PHYSICAL_NODE: "16"
7574

7675
# 2. Model Service (commented out) - Would host the base model for training.
7776
# Uncomment and configure if you need a training model worker.
7877
- name: models-Qwen2.5-7B-Instruct  # NOTE(review): renamed for consistency — route_prefix/model_id below were switched to Qwen2.5-7B-Instruct but the app name still said Qwen3-30B-A3B-Instruct-2507; confirm nothing references the old app name
79-
route_prefix: /api/v1/model/Qwen/Qwen3-30B-A3B-Instruct-2507
78+
route_prefix: /api/v1/model/Qwen/Qwen2.5-7B-Instruct
8079
import_path: model
8180
args:
8281
use_megatron: true # Use Megatron backend (not HuggingFace Transformers)
83-
model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier
82+
model_id: "ms://Qwen/Qwen2.5-7B-Instruct" # ModelScope model identifier
8483
max_length: 16000 # model max length
8584
max_loras: 5 # model max loras
8685
nproc_per_node: 4 # Number of GPU processes per node
8786
device_group:
8887
name: model
89-
ranks: [4,5,6,7] # GPU rank indices
88+
ranks: 4 # Number of GPU ranks to use (changed from an explicit index list)
9089
device_type: cuda
9190
device_mesh:
9291
device_type: cuda
@@ -103,12 +102,12 @@ applications:
103102
deployments:
104103
- name: ModelManagement
105104
autoscaling_config:
106-
min_replicas: 1
107-
max_replicas: 1
105+
min_replicas: 2
106+
max_replicas: 2
108107
target_ongoing_requests: 8
109108
ray_actor_options:
110109
num_cpus: 0.1
111110
runtime_env:
112111
env_vars:
113112
TWINKLE_TRUST_REMOTE_CODE: "0"
114-
DEVICE_COUNT_PER_PHYSICAL_NODE: "8"
113+
DEVICE_COUNT_PER_PHYSICAL_NODE: "16"

0 commit comments

Comments (0)