Skip to content

Commit 6488986

Browse files
committed
wip
1 parent f8ae876 commit 6488986

File tree

3 files changed

+72
-42
lines changed

3 files changed

+72
-42
lines changed

cookbook/client/tinker/megatron/server_config.yaml

Lines changed: 36 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -31,48 +31,13 @@ applications:
3131
ray_actor_options:
3232
num_cpus: 0.1 # CPU resources allocated to this actor
3333

34-
# 2. Model Service (commented out) - Would host the base model for training.
35-
# Uncomment and configure if you need a training model worker.
36-
- name: models-Qwen3-30B-A3B-Instruct-2507
37-
route_prefix: /api/v1/model/Qwen/Qwen3-30B-A3B-Instruct-2507
38-
import_path: model
39-
args:
40-
use_megatron: true # Use HuggingFace Transformers backend
41-
model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier
42-
nproc_per_node: 4 # Number of GPU processes per node
43-
device_group:
44-
name: model
45-
ranks: [0, 1, 2, 3] # GPU rank indices
46-
device_type: cuda
47-
device_mesh:
48-
device_type: cuda
49-
dp_size: 2
50-
tp_size: 2
51-
ep_size: 2
52-
53-
queue_config:
54-
rps_limit: 100 # Max requests per second
55-
tps_limit: 10000 # Max tokens per second
56-
adapter_config:
57-
per_token_adapter_limit: 30 # Max concurrent LoRA adapters
58-
adapter_timeout: 1800 # Seconds before idle adapter unload
59-
deployments:
60-
- name: ModelManagement
61-
autoscaling_config:
62-
min_replicas: 1
63-
max_replicas: 1
64-
target_ongoing_requests: 16
65-
ray_actor_options:
66-
num_cpus: 0.1
67-
num_gpus: 1
68-
6934
# 3. Sampler Service - Runs inference / sampling using vLLM engine
7035
# Used for generating text from the model (e.g., evaluating LoRA results).
71-
- name: sampler-Qwen3-30B-A3B-Instruct-2507
72-
route_prefix: /api/v1/sampler/Qwen/Qwen3-30B-A3B-Instruct-2507
36+
- name: sampler-Qwen2.5-3B-Instruct
37+
route_prefix: /api/v1/sampler/Qwen/Qwen2.5-3B-Instruct
7338
import_path: sampler
7439
args:
75-
model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier
40+
model_id: "ms://Qwen/Qwen2.5-3B-Instruct" # ModelScope model identifier
7641
nproc_per_node: 4 # Number of GPU processes per node
7742
sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
7843
engine_args: # vLLM engine-specific settings
@@ -82,12 +47,11 @@ applications:
8247
device_group: # Logical device group for the sampler
8348
name: sampler
8449
gpus_per_worker: 1
85-
ranks: [4,5,6,7] # GPU rank indices to use
50+
ranks: [0,1,2,3] # GPU rank indices to use
8651
device_type: cuda
8752
device_mesh:
8853
device_type: cuda
8954
dp_size: 4
90-
tp_size: 1
9155
deployments:
9256
- name: SamplerManagement
9357
autoscaling_config:
@@ -96,4 +60,35 @@ applications:
9660
target_ongoing_requests: 16
9761
ray_actor_options:
9862
num_cpus: 0.1
99-
num_gpus: 1 # Sampler needs a full GPU for inference
63+
64+
# 2. Model Service - Hosts the base model for training.
65+
# Remove or comment out this entry if you do not need a training model worker.
66+
- name: models-Qwen2.5-3B-Instruct
67+
route_prefix: /api/v1/model/Qwen/Qwen2.5-3B-Instruct
68+
import_path: model
69+
args:
70+
use_megatron: false # false = HuggingFace Transformers backend (set to true for Megatron)
71+
model_id: "ms://Qwen/Qwen2.5-3B-Instruct" # ModelScope model identifier
72+
nproc_per_node: 4 # Number of GPU processes per node
73+
device_group:
74+
name: model
75+
ranks: [4,5,6,7] # GPU rank indices
76+
device_type: cuda
77+
device_mesh:
78+
device_type: cuda
79+
dp_size: 4
80+
81+
queue_config:
82+
rps_limit: 100 # Max requests per second
83+
tps_limit: 10000 # Max tokens per second
84+
adapter_config:
85+
per_token_adapter_limit: 30 # Max concurrent LoRA adapters
86+
adapter_timeout: 1800 # Seconds before idle adapter unload
87+
deployments:
88+
- name: ModelManagement
89+
autoscaling_config:
90+
min_replicas: 1
91+
max_replicas: 1
92+
target_ongoing_requests: 16
93+
ray_actor_options:
94+
num_cpus: 0.1

src/twinkle/infra/_ray/resource_manager.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,8 @@ def __init__(self,
8888

8989
# GPU/NPU placement groups: keep existing strategy (CPU= node_cpu//2) to avoid affecting training/inference throughput assumptions.
9090
for i in range(self.nnodes):
91-
node = self.nodes[i]
91+
node_idx = self.min_node_idx + i if device_type != 'CPU' else i
92+
node = self.nodes[node_idx]
9293
node_cpu = int(node['Resources']['CPU'])
9394
if device_type != 'CPU':
9495
bundles.append({device_type: nproc_per_node, 'CPU': max(node_cpu // 2, 1)}) # create bundles
@@ -217,6 +218,39 @@ def __init__(self,
217218
global_cpu_proc_idx += 1
218219
self.device_groups[group.name] = local_device_groups
219220

221+
import ray
222+
223+
@ray.remote(num_gpus=1)
224+
def check_gpu_info():
225+
import os
226+
import torch
227+
228+
node_id = ray.get_runtime_context().get_node_id()
229+
cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "not set")
230+
231+
# Get the actual GPU information
232+
if torch.cuda.is_available():
233+
gpu_name = torch.cuda.get_device_name(0)
234+
gpu_uuid = torch.cuda.get_device_properties(0).uuid # may not be available
235+
else:
236+
gpu_name = "N/A"
237+
238+
return {
239+
"node_id": node_id,
240+
"CUDA_VISIBLE_DEVICES": cuda_visible,
241+
"gpu_name": gpu_name,
242+
}
243+
244+
# Run on the specified placement group
245+
result = ray.get(
246+
check_gpu_info.options(
247+
placement_group=self.placement_groups[0],
248+
placement_group_bundle_index=0
249+
).remote()
250+
)
251+
print(result)
252+
breakpoint()
253+
220254
self.group_configs = groups
221255
logger.info(f"nodes: {[n['NodeID'][:8] for n in self.nodes]}")
222256
logger.info(f"node_ranks: {self.node_ranks}")

src/twinkle/server/tinker/sampler.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ def __init__(self, nproc_per_node: int, device_group: Dict[str, Any],
110110
model_id=model_id,
111111
engine_args=sampler_kwargs,
112112
device_mesh=self.device_mesh,
113+
remote_group=self.device_group.name,
113114
**{k: v for k, v in kwargs.items() if k not in ['engine_args']}
114115
)
115116
else: # torch sampler

0 commit comments

Comments
 (0)