@@ -45,18 +45,17 @@ applications:
4545 nproc_per_node : 4 # Number of GPU processes per node
4646 sampler_type : vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
4747 engine_args : # vLLM engine-specific settings
48- max_model_len : 4096 # Maximum sequence length the engine supports
49- gpu_memory_utilization : 0.7 # Fraction of GPU memory to use (0.0-1.0)
48+ max_model_len : 8192 # Maximum sequence length the engine supports
49+ gpu_memory_utilization : 0.85 # Fraction of GPU memory to use (0.0-1.0)
5050 enable_lora : true # Allow loading LoRA adapters during inference
5151 device_group : # Logical device group for the sampler
5252 name : sampler
53- gpus_per_worker : 2
54- ranks : [0,1,2,3,4,5,6,7 ] # GPU rank indices to use
53+ gpus_per_worker : 1
54+ ranks : [0,1,2,3] # GPU rank indices to use
5555 device_type : cuda
5656 device_mesh :
5757 device_type : cuda
5858 dp_size : 4
59- tp_size : 2
6059 deployments :
6160 - name : SamplerManagement
6261 autoscaling_config :
@@ -68,7 +67,7 @@ applications:
6867 runtime_env :
6968 env_vars :
7069 TWINKLE_TRUST_REMOTE_CODE : " 0"
71- DEVICE_COUNT_PER_PHYSICAL_NODE : " 16 "
70+ DEVICE_COUNT_PER_PHYSICAL_NODE : " 8 "
7271
7372 # 2. Model Service (commented out) - Would host the base model for training.
7473 # Uncomment and configure if you need a training model worker.
@@ -81,18 +80,16 @@ applications:
8180 nproc_per_node : 4 # Number of GPU processes per node
8281 device_group :
8382 name : model
84- ranks : [8,9,10,11,12,13,14,15 ] # GPU rank indices
83+ ranks : [4,5,6,7 ] # GPU rank indices
8584 device_type : cuda
8685 device_mesh :
8786 device_type : cuda
88- dp_size : 2
89- tp_size : 2
90- pp:size : 2
87+ dp_size : 4
9188 ep_size : 2
9289
9390 queue_config :
9491 rps_limit : 100 # Max requests per second
95- tps_limit : 10000 # Max tokens per second
92+ tps_limit : 100000 # Max tokens per second
9693 adapter_config :
9794 per_token_adapter_limit : 30 # Max concurrent LoRA adapters
9895 adapter_timeout : 1800 # Seconds before idle adapter unload
@@ -101,10 +98,10 @@ applications:
10198 autoscaling_config :
10299 min_replicas : 1
103100 max_replicas : 1
104- target_ongoing_requests : 16
101+ target_ongoing_requests : 8
105102 ray_actor_options :
106103 num_cpus : 0.1
107104 runtime_env :
108105 env_vars :
109106 TWINKLE_TRUST_REMOTE_CODE : " 0"
110- DEVICE_COUNT_PER_PHYSICAL_NODE : " 16 "
107+ DEVICE_COUNT_PER_PHYSICAL_NODE : " 8 "
0 commit comments