Skip to content

Commit 92717fa

Browse files
committed
fix config
1 parent c685f06 commit 92717fa

File tree

2 files changed

+6
-6
lines changed

2 files changed

+6
-6
lines changed

cookbook/client/tinker/megatron/server_config.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ applications:
4545
nproc_per_node: 4 # Number of GPU processes per node
4646
sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
4747
engine_args: # vLLM engine-specific settings
48-
max_model_len: 14336 # Maximum sequence length the engine supports
48+
max_model_len: 16000 # Maximum sequence length the engine supports
4949
gpu_memory_utilization: 0.85 # Fraction of GPU memory to use (0.0-1.0)
5050
enable_lora: true # Allow loading LoRA adapters during inference
5151
device_group: # Logical device group for the sampler
@@ -58,7 +58,7 @@ applications:
5858
dp_size: 4
5959
queue_config:
6060
rps_limit: 20 # Max requests per second
61-
tps_limit: 14336 # Max tokens per second
61+
tps_limit: 16000 # Max tokens per second
6262
deployments:
6363
- name: SamplerManagement
6464
autoscaling_config:
@@ -80,7 +80,7 @@ applications:
8080
args:
8181
use_megatron: true # Use Megatron backend (instead of HuggingFace Transformers)
8282
model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier
83-
max_length: 14336 # model max length
83+
max_length: 16000 # model max length
8484
max_loras: 5 # model max loras
8585
nproc_per_node: 4 # Number of GPU processes per node
8686
device_group:
@@ -94,7 +94,7 @@ applications:
9494

9595
queue_config:
9696
rps_limit: 20 # Max requests per second
97-
tps_limit: 14336 # Max tokens per second
97+
tps_limit: 16000 # Max tokens per second
9898
adapter_config:
9999
per_token_adapter_limit: 3 # Max concurrent LoRA adapters
100100
adapter_timeout: 30 # Seconds before idle adapter unload

src/twinkle/server/utils/task_queue.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,14 +64,14 @@ class TaskQueueConfig:
6464
max_input_tokens: Maximum allowed input tokens per request (default 16000).
6565
"""
6666
rps_limit: float = 100.0 # 100 requests per second
67-
tps_limit: float = 10000.0 # 10000 input tokens per second
67+
tps_limit: float = 16000.0 # 16000 input tokens per second
6868
window_seconds: float = 1.0 # 1 second sliding window
6969
queue_timeout: float = 300.0 # 5 minutes queue timeout
7070
enabled: bool = True # Rate limiting enabled by default
7171
# Remove tokens after 10x window inactivity
7272
token_cleanup_multiplier: float = 10.0
7373
token_cleanup_interval: float = 60.0 # Run cleanup every 60 seconds
74-
max_input_tokens: int = 10000 # Maximum input tokens per request
74+
max_input_tokens: int = 16000 # Maximum input tokens per request
7575

7676
@classmethod
7777
def from_dict(cls, config_dict: dict[str, Any] | None = None) -> TaskQueueConfig:

0 commit comments

Comments
 (0)