@@ -45,7 +45,7 @@ applications:
4545 nproc_per_node : 4 # Number of GPU processes per node
4646 sampler_type : vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
4747 engine_args : # vLLM engine-specific settings
48- max_model_len : 14336 # Maximum sequence length the engine supports
48+ max_model_len : 16000 # Maximum sequence length the engine supports
4949 gpu_memory_utilization : 0.85 # Fraction of GPU memory to use (0.0-1.0)
5050 enable_lora : true # Allow loading LoRA adapters during inference
5151 device_group : # Logical device group for the sampler
@@ -58,7 +58,7 @@ applications:
5858 dp_size : 4
5959 queue_config :
6060 rps_limit : 20 # Max requests per second
61- tps_limit : 14336 # Max tokens per second
61+ tps_limit : 16000 # Max tokens per second
6262 deployments :
6363 - name : SamplerManagement
6464 autoscaling_config :
@@ -80,7 +80,7 @@ applications:
8080 args :
8181 use_megatron : true # Use Megatron backend (instead of HuggingFace Transformers)
8282 model_id : "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier
83- max_length : 14336 # model max length
83+ max_length : 16000 # model max length
8484 max_loras : 5 # model max loras
8585 nproc_per_node : 4 # Number of GPU processes per node
8686 device_group :
@@ -94,7 +94,7 @@ applications:
9494
9595 queue_config :
9696 rps_limit : 20 # Max requests per second
97- tps_limit : 14336 # Max tokens per second
97+ tps_limit : 16000 # Max tokens per second
9898 adapter_config :
9999 per_token_adapter_limit : 3 # Max concurrent LoRA adapters
100100 adapter_timeout : 30 # Seconds before idle adapter unload
0 commit comments