Skip to content

Commit c685f06

Browse files
committed
fix
1 parent 01c4b89 commit c685f06

File tree

1 file changed

+4
-4
lines changed

1 file changed

+4
-4
lines changed

cookbook/client/tinker/megatron/server_config.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ applications:
4545
nproc_per_node: 4 # Number of GPU processes per node
4646
sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
4747
engine_args: # vLLM engine-specific settings
48-
max_model_len: 8192 # Maximum sequence length the engine supports
48+
max_model_len: 14336 # Maximum sequence length the engine supports
4949
gpu_memory_utilization: 0.85 # Fraction of GPU memory to use (0.0-1.0)
5050
enable_lora: true # Allow loading LoRA adapters during inference
5151
device_group: # Logical device group for the sampler
@@ -58,7 +58,7 @@ applications:
5858
dp_size: 4
5959
queue_config:
6060
rps_limit: 20 # Max requests per second
61-
tps_limit: 10000 # Max tokens per second
61+
tps_limit: 14336 # Max tokens per second
6262
deployments:
6363
- name: SamplerManagement
6464
autoscaling_config:
@@ -80,7 +80,7 @@ applications:
8080
args:
8181
use_megatron: true # Use the Megatron backend (instead of HuggingFace Transformers)
8282
model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier
83-
max_length: 10240 # model max length
83+
max_length: 14336 # model max length
8484
max_loras: 5 # model max loras
8585
nproc_per_node: 4 # Number of GPU processes per node
8686
device_group:
@@ -94,7 +94,7 @@ applications:
9494

9595
queue_config:
9696
rps_limit: 20 # Max requests per second
97-
tps_limit: 10000 # Max tokens per second
97+
tps_limit: 14336 # Max tokens per second
9898
adapter_config:
9999
per_token_adapter_limit: 3 # Max concurrent LoRA adapters
100100
adapter_timeout: 30 # Seconds before idle adapter unload

0 commit comments

Comments
 (0)