@@ -45,7 +45,7 @@ applications:
4545 nproc_per_node : 4 # Number of GPU processes per node
4646 sampler_type : vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
4747 engine_args : # vLLM engine-specific settings
48- max_model_len : 14336 # Maximum sequence length the engine supports
48+ max_model_len : 16000 # Maximum sequence length the engine supports
4949 gpu_memory_utilization : 0.85 # Fraction of GPU memory to use (0.0-1.0)
5050 enable_lora : true # Allow loading LoRA adapters during inference
5151 device_group : # Logical device group for the sampler
@@ -58,7 +58,7 @@ applications:
5858 dp_size : 4
5959 queue_config :
6060 rps_limit : 20 # Max requests per second
61- tps_limit : 14336 # Max tokens per second
61+ tps_limit : 16000 # Max tokens per second
6262 deployments :
6363 - name : SamplerManagement
6464 autoscaling_config :
@@ -80,7 +80,7 @@ applications:
8080 args :
8181 use_megatron : true # Use Megatron backend (instead of HuggingFace Transformers)
8282 model_id : "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier
83- max_length : 14336 # model max length
83+ max_length : 16000 # model max length
8484 max_loras : 5 # model max loras
8585 nproc_per_node : 4 # Number of GPU processes per node
8686 device_group :
@@ -94,7 +94,7 @@ applications:
9494
9595 queue_config :
9696 rps_limit : 20 # Max requests per second
97- tps_limit : 14336 # Max tokens per second
97+ tps_limit : 16000 # Max tokens per second
9898 adapter_config :
9999 per_token_adapter_limit : 3 # Max concurrent LoRA adapters
100100 adapter_timeout : 30 # Seconds before idle adapter unload
0 commit comments