@@ -31,48 +31,13 @@ applications:
3131 ray_actor_options :
3232 num_cpus : 0.1 # CPU resources allocated to this actor
3333
34- # 2. Model Service (commented out) - Would host the base model for training.
35- # Uncomment and configure if you need a training model worker.
36- - name : models-Qwen3-30B-A3B-Instruct-2507
37- route_prefix : /api/v1/model/Qwen/Qwen3-30B-A3B-Instruct-2507
38- import_path : model
39- args :
40- use_megatron : true # Use HuggingFace Transformers backend
41- model_id : " ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier
42- nproc_per_node : 4 # Number of GPU processes per node
43- device_group :
44- name : model
45- ranks : [0, 1, 2, 3] # GPU rank indices
46- device_type : cuda
47- device_mesh :
48- device_type : cuda
49- dp_size : 2
50- tp_size : 2
51- ep_size : 2
52-
53- queue_config :
54- rps_limit : 100 # Max requests per second
55- tps_limit : 10000 # Max tokens per second
56- adapter_config :
57- per_token_adapter_limit : 30 # Max concurrent LoRA adapters
58- adapter_timeout : 1800 # Seconds before idle adapter unload
59- deployments :
60- - name : ModelManagement
61- autoscaling_config :
62- min_replicas : 1
63- max_replicas : 1
64- target_ongoing_requests : 16
65- ray_actor_options :
66- num_cpus : 0.1
67- num_gpus : 1
68-
6934 # 3. Sampler Service - Runs inference / sampling using vLLM engine
7035 # Used for generating text from the model (e.g., evaluating LoRA results).
71- - name : sampler-Qwen3-30B-A3B- Instruct-2507
72- route_prefix : /api/v1/sampler/Qwen/Qwen3-30B-A3B- Instruct-2507
36+ - name : sampler-Qwen2.5-3B-Instruct
37+ route_prefix : /api/v1/sampler/Qwen/Qwen2.5-3B-Instruct
7338 import_path : sampler
7439 args :
75- model_id : " ms://Qwen/Qwen3-30B-A3B- Instruct-2507 " # ModelScope model identifier
40+ model_id : " ms://Qwen/Qwen2.5-3B-Instruct" # ModelScope model identifier
7641 nproc_per_node : 4 # Number of GPU processes per node
7742 sampler_type : vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
7843 engine_args : # vLLM engine-specific settings
@@ -82,12 +47,11 @@ applications:
8247 device_group : # Logical device group for the sampler
8348 name : sampler
8449 gpus_per_worker : 1
85- ranks : [4,5,6,7 ] # GPU rank indices to use
50+ ranks : [0,1,2,3 ] # GPU rank indices to use
8651 device_type : cuda
8752 device_mesh :
8853 device_type : cuda
8954 dp_size : 4
90- tp_size : 1
9155 deployments :
9256 - name : SamplerManagement
9357 autoscaling_config :
@@ -96,4 +60,35 @@ applications:
9660 target_ongoing_requests : 16
9761 ray_actor_options :
9862 num_cpus : 0.1
99- num_gpus : 1 # Sampler needs a full GPU for inference
63+
64+ # 2. Model Service - Hosts the base model for training.
65+ # Comment out this section if you do not need a training model worker.
66+ - name : models-Qwen2.5-3B-Instruct
67+ route_prefix : /api/v1/model/Qwen/Qwen2.5-3B-Instruct
68+ import_path : model
69+ args :
70+ use_megatron : false # Use HuggingFace Transformers backend
71+ model_id : " ms://Qwen/Qwen2.5-3B-Instruct" # ModelScope model identifier
72+ nproc_per_node : 4 # Number of GPU processes per node
73+ device_group :
74+ name : model
75+ ranks : [4,5,6,7] # GPU rank indices
76+ device_type : cuda
77+ device_mesh :
78+ device_type : cuda
79+ dp_size : 4
80+
81+ queue_config :
82+ rps_limit : 100 # Max requests per second
83+ tps_limit : 10000 # Max tokens per second
84+ adapter_config :
85+ per_token_adapter_limit : 30 # Max concurrent LoRA adapters
86+ adapter_timeout : 1800 # Seconds before idle adapter unload
87+ deployments :
88+ - name : ModelManagement
89+ autoscaling_config :
90+ min_replicas : 1
91+ max_replicas : 1
92+ target_ongoing_requests : 16
93+ ray_actor_options :
94+ num_cpus : 0.1
0 commit comments