@@ -7,7 +7,7 @@ proxy_location: EveryNode
 # HTTP listener settings
 http_options:
   host: 0.0.0.0  # Listen on all network interfaces
-  port: 8000  # Port number for the server
+  port: 9000  # Port number for the server

 # Applications: each entry defines a service component deployed on the server
 applications:
@@ -39,7 +39,6 @@ applications:
     import_path: model
     args:
       use_megatron: true
-      model_cls: Qwen3_5ForConditionalGeneration
       model_id: "ms://Qwen/Qwen3.5-4B"  # ModelScope model identifier
       max_length: 10240
       nproc_per_node: 2  # Number of GPU processes per node
@@ -52,8 +51,8 @@ applications:
       dp_size: 2
       queue_config:
         rps_limit: 100  # Max requests per second
-        tps_limit: 10000  # Max tokens per second for a single user
-        max_input_tokens: 10000  # Maximum input tokens per request
+        tps_limit: 100000  # Max tokens per second for a single user
+        max_input_tokens: 60000  # Maximum input tokens per request
       adapter_config:
         adapter_timeout: 30  # Seconds before idle adapter unload
         adapter_max_lifetime: 36000  # Maximum lifetime of an adapter in seconds (e.g., 10 hours)
@@ -80,8 +79,8 @@ applications:
       nproc_per_node: 2  # Number of GPU processes per node
       sampler_type: vllm  # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
       engine_args:  # vLLM engine-specific settings
-        max_model_len: 4096  # Maximum sequence length the engine supports
-        gpu_memory_utilization: 0.5  # Fraction of GPU memory to use (0.0-1.0)
+        max_model_len: 16000  # Maximum sequence length the engine supports
+        gpu_memory_utilization: 0.7  # Fraction of GPU memory to use (0.0-1.0)
       enable_lora: true  # Allow loading LoRA adapters during inference
       logprobs_mode: processed_logprobs  # Logprobs mode for sampling results
       device_group:  # Logical device group for the sampler