@@ -37,11 +37,11 @@ applications:
3737
3838 # 3. Sampler Service - Runs inference / sampling using vLLM engine
3939 # Used for generating text from the model (e.g., evaluating LoRA results).
40- - name: sampler-Qwen2.5-3B-Instruct
41- route_prefix: /api/v1/sampler/Qwen/Qwen2.5-3B-Instruct
40+ - name: sampler-Qwen3-30B-A3B-Instruct-2507
41+ route_prefix: /api/v1/sampler/Qwen/Qwen3-30B-A3B-Instruct-2507
4242 import_path : sampler
4343 args :
44- model_id: "ms://Qwen/Qwen2.5-3B-Instruct" # ModelScope model identifier
44+ model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier
4545 nproc_per_node : 4 # Number of GPU processes per node
4646 sampler_type : vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
4747 engine_args : # vLLM engine-specific settings
@@ -71,20 +71,22 @@ applications:
7171
7272 # 2. Model Service (commented out) - Would host the base model for training.
7373 # Uncomment and configure if you need a training model worker.
74- - name: models-Qwen2.5-3B-Instruct
75- route_prefix: /api/v1/model/Qwen/Qwen2.5-3B-Instruct
74+ - name: models-Qwen3-30B-A3B-Instruct-2507
75+ route_prefix: /api/v1/model/Qwen/Qwen3-30B-A3B-Instruct-2507
7676 import_path : model
7777 args :
78- use_megatron: false # Use HuggingFace Transformers backend
79- model_id: "ms://Qwen/Qwen2.5-3B-Instruct" # ModelScope model identifier
78+ use_megatron: true # Use Megatron backend (true = Megatron, false = HuggingFace Transformers)
79+ model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier
8080 nproc_per_node : 4 # Number of GPU processes per node
8181 device_group :
8282 name : model
8383 ranks : [4,5,6,7] # GPU rank indices
8484 device_type : cuda
8585 device_mesh :
8686 device_type : cuda
87- dp_size : 4
87+ dp_size : 2
88+ tp_size : 2
89+ ep_size : 2
8890
8991 queue_config :
9092 rps_limit : 100 # Max requests per second
0 commit comments