@@ -33,23 +33,24 @@ applications:
3333
3434 # 2. Model Service - Hosts the base model for training.
3535 # Configure the arguments below if you need a training model worker.
36- - name : models-Qwen2.5-3B -Instruct
37- route_prefix : /api/v1/model/Qwen/Qwen2.5-3B -Instruct
36+ - name : models-Qwen2.5-7B -Instruct
37+ route_prefix : /api/v1/model/Qwen/Qwen2.5-7B -Instruct
3838 import_path : model
3939 args :
40- use_megatron : true # Use HuggingFace Transformers backend
41- model_id : " ms://Qwen/Qwen2.5-3B-Instruct" # ModelScope model identifier
40+ use_megatron : true
41+ model_id : " ms://Qwen/Qwen2.5-7B-Instruct" # ModelScope model identifier
42+ max_length : 10240
4243 nproc_per_node : 2 # Number of GPU processes per node
4344 device_group :
4445 name : model
45- ranks : [0, 1] # GPU rank indices
46+ ranks : [0,1] # GPU rank indices
4647 device_type : cuda
4748 device_mesh :
4849 device_type : cuda
49-
50+ dp_size : 2
5051 queue_config :
5152 rps_limit : 100 # Max requests per second
52- tps_limit : 10000 # Max tokens per second
53+ tps_limit : 100000 # Max tokens per second
5354 adapter_config :
5455 per_token_adapter_limit : 30 # Max concurrent LoRA adapters
5556 adapter_timeout : 1800 # Seconds before idle adapter unload
@@ -68,24 +69,28 @@ applications:
6869
6970 # 3. Sampler Service - Runs inference / sampling using vLLM engine
7071 # Used for generating text from the model (e.g., evaluating LoRA results).
71- - name : sampler-Qwen2.5-3B -Instruct
72- route_prefix : /api/v1/sampler/Qwen/Qwen2.5-3B -Instruct
72+ - name : sampler-Qwen2.5-7B -Instruct
73+ route_prefix : /api/v1/sampler/Qwen/Qwen2.5-7B -Instruct
7374 import_path : sampler
7475 args :
75- model_id : " ms://Qwen/Qwen2.5-3B -Instruct" # ModelScope model identifier
76- nproc_per_node : 1 # Number of GPU processes per node
76+ model_id : " ms://Qwen/Qwen2.5-7B -Instruct" # ModelScope model identifier
77+ nproc_per_node : 2 # Number of GPU processes per node
7778 sampler_type : vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
7879 engine_args : # vLLM engine-specific settings
7980 max_model_len : 4096 # Maximum sequence length the engine supports
80- gpu_memory_utilization : 0.7 # Fraction of GPU memory to use (0.0-1.0)
81+ gpu_memory_utilization : 0.5 # Fraction of GPU memory to use (0.0-1.0)
8182 enable_lora : true # Allow loading LoRA adapters during inference
83+ logprobs_mode : processed_logprobs # Logprobs mode for sampling results
8284 device_group : # Logical device group for the sampler
8385 name : sampler
84- gpus_per_worker : 1
85- ranks : [0] # GPU rank indices to use
86+ ranks : [2] # GPU rank indices to use
8687 device_type : cuda
8788 device_mesh :
8889 device_type : cuda
90+ dp_size : 1
91+ queue_config :
92+ rps_limit : 100 # Max requests per second
93+ tps_limit : 100000 # Max tokens per second
8994 deployments :
9095 - name : SamplerManagement
9196 autoscaling_config :
0 commit comments