@@ -36,29 +36,32 @@ applications:
 
   # 3. Sampler Service - Runs inference / sampling using vLLM engine
   # Used for generating text from the model (e.g., evaluating LoRA results).
-  - name: sampler-Qwen3.5-4B
-    route_prefix: /api/v1/sampler/Qwen/Qwen3.5-4B
+  # Config: TP=2 x DP=2 on 4 GPUs, ~27GB weights/GPU, ~37GB for KV cache + LoRA
+  - name: sampler-Qwen3.5-27B
+    route_prefix: /api/v1/sampler/Qwen/Qwen3.5-27B
     import_path: sampler
     args:
-      model_id: "ms://Qwen/Qwen3.5-4B"  # ModelScope model identifier
-      nproc_per_node: 4  # Number of GPU processes per node
+      model_id: "ms://Qwen/Qwen3.5-27B"  # ModelScope model identifier
+      nproc_per_node: 8  # Number of GPU processes per node
       sampler_type: vllm  # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
       engine_args:  # vLLM engine-specific settings
-        max_model_len: 16000  # Maximum sequence length the engine supports
-        gpu_memory_utilization: 0.85  # Fraction of GPU memory to use (0.0-1.0)
+        max_model_len: 32000  # Maximum sequence length the engine supports
+        gpu_memory_utilization: 0.80  # 80% utilization (~64GB/GPU) leaves a safety buffer
         enable_lora: true  # Allow loading LoRA adapters during inference
         max_loras: 5  # Max LoRA adapters active in vLLM at the same time
+        max_lora_rank: 32  # Support LoRA adapters up to rank 32
       device_group:  # Logical device group for the sampler
         name: sampler
-        gpus_per_worker: 1
+        gpus_per_worker: 2
         ranks: 4  # GPU rank indices to use
         device_type: cuda
         device_mesh:
           device_type: cuda
-          dp_size: 4
+          dp_size: 2
+          tp_size: 2  # 2-way tensor parallel; with DP=2, two replicas for multi-tenant throughput
       queue_config:
         rps_limit: 20  # Max requests per second
-        tps_limit: 16000  # Max tokens per second
+        tps_limit: 32000  # Max tokens per second
     deployments:
       - name: SamplerManagement
         autoscaling_config:
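
As a sanity check on the `# Config:` numbers in this hunk, here is a minimal sketch of the per-GPU memory budget. It assumes bf16 weights (~2 bytes per parameter) and 80GB cards; neither assumption is stated in the config, so treat the figures as approximate:

```python
# Rough per-GPU memory budget for the sampler (assumptions: bf16 weights, 80GB GPUs).
params_b = 27        # Qwen3.5-27B, billions of parameters
bytes_per_param = 2  # bf16 (assumption)
gpu_mem_gb = 80      # assumed A100/H100-class card
tp_size = 2          # device_mesh.tp_size: weights are sharded 2 ways

weights_per_gpu = params_b * bytes_per_param / tp_size  # 54 GB / 2 = 27 GB per GPU
budget_per_gpu = 0.80 * gpu_mem_gb                      # gpu_memory_utilization: 0.80 -> 64 GB
kv_cache_and_lora = budget_per_gpu - weights_per_gpu    # 64 - 27 = 37 GB

print(f"~{weights_per_gpu:.0f}GB weights/GPU, ~{kv_cache_and_lora:.0f}GB for KV cache + LoRA")
```

Those are the ~27GB and ~37GB figures from the added comment; the unallocated 20% of each card (~16GB here) is the safety buffer that `gpu_memory_utilization: 0.80` leaves for the runtime.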
@@ -71,29 +74,29 @@ applications:
         env_vars:
           TWINKLE_TRUST_REMOTE_CODE: "0"
 
-  # 2. Model Service (commented out) - Would host the base model for training.
-  # Uncomment and configure if you need a training model worker.
-  - name: models-Qwen3.5-4B
-    route_prefix: /api/v1/model/Qwen/Qwen3.5-4B
+  # 2. Model Service - Hosts the base model for training.
+  # Config: PP=2 x DP=2 on 4 GPUs, ~27GB weights/GPU, comfortable for LoRA training
+  - name: models-Qwen3.5-27B
+    route_prefix: /api/v1/model/Qwen/Qwen3.5-27B
     import_path: model
     args:
-      use_megatron: true  # Use HuggingFace Transformers backend
-      model_id: "ms://Qwen/Qwen3.5-4B"  # ModelScope model identifier
-      max_length: 16000  # Model max sequence length
+      use_megatron: true  # Use Megatron-LM backend
+      model_id: "ms://Qwen/Qwen3.5-27B"  # ModelScope model identifier
+      max_length: 32000  # Model max sequence length
       max_loras: 5  # Max LoRA adapters on the model
-      nproc_per_node: 4  # Number of GPU processes per node
+      nproc_per_node: 8  # Number of GPU processes per node
       device_group:
         name: model
         ranks: 4  # GPU rank indices
         device_type: cuda
         device_mesh:
           device_type: cuda
-          dp_size: 4
-          ep_size: 2
+          dp_size: 2  # 2-way data parallel
+          pp_size: 2  # 2-way pipeline parallel (~27GB/GPU)
 
       queue_config:
         rps_limit: 20  # Max requests per second
-        tps_limit: 16000  # Max tokens per second
+        tps_limit: 32000  # Max tokens per second
       adapter_config:
         adapter_timeout: 30  # Seconds before an idle adapter is unloaded
         adapter_max_lifetime: 36000  # Maximum lifetime of an adapter in seconds (i.e., 10 hours)
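
The model service's `# Config:` line can be checked the same way. A minimal sketch under the same bf16 assumption, where pipeline parallelism splits the layers across GPUs and data parallelism replicates each stage:

```python
# Per-GPU weight footprint for the training model service (assumption: bf16 weights).
params_b = 27
bytes_per_param = 2
pp_size = 2  # device_mesh.pp_size: each pipeline stage holds roughly half the layers
dp_size = 2  # device_mesh.dp_size: each data-parallel replica is a full copy of a stage

weights_per_gpu = params_b * bytes_per_param / pp_size  # 54 GB / 2 = 27 GB per GPU
gpus_used = pp_size * dp_size                           # 2 x 2 = 4 GPUs, as the comment says

print(f"~{weights_per_gpu:.0f}GB weights/GPU across {gpus_used} GPUs")
```

Since LoRA training freezes the base weights, the 27GB shard carries no optimizer state, which is presumably why the comment calls this layout comfortable for LoRA training.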