modelscope · Yunnglin · Feb 13, 2026 · Feb 11, 2026 · Feb 12, 2026 · Feb 12, 2026
diff --git a/.gitignore b/.gitignore
@@ -152,3 +152,4 @@ megatron_output/
 ast_index_file.py
 test_cookbook/
 /test*.py
+swanlog/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -22,23 +22,23 @@ repos:
     hooks:
       - id: pyupgrade
         args: [--py38-plus]
-        exclude: ^client_tools/
+        exclude: ^(examples/|cookbook/|client_tools/|src/twinkle_client/)
 
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v6.0.0
     hooks:
       - id: trailing-whitespace
-        exclude: ^client_tools/
+        exclude: ^(client_tools/|src/twinkle_client/)
       - id: check-yaml
-        exclude: ^client_tools/
+        exclude: ^(client_tools/|src/twinkle_client/)
       - id: end-of-file-fixer
-        exclude: ^client_tools/
+        exclude: ^(client_tools/|src/twinkle_client/)
       - id: requirements-txt-fixer
-        exclude: ^client_tools/
+        exclude: ^(client_tools/|src/twinkle_client/)
       - id: double-quote-string-fixer
-        exclude: ^client_tools/
+        exclude: ^(client_tools/|src/twinkle_client/)
       - id: check-merge-conflict
-        exclude: ^client_tools/
+        exclude: ^(client_tools/|src/twinkle_client/)
       - id: mixed-line-ending
         args: ["--fix=lf"]
-        exclude: ^client_tools/
+        exclude: ^(client_tools/|src/twinkle_client/)
diff --git a/cookbook/client/tinker/grpo.py b/cookbook/client/tinker/grpo.py
diff --git a/cookbook/client/tinker/megatron/server_config.yaml b/cookbook/client/tinker/megatron/server_config.yaml
@@ -56,6 +56,9 @@ applications:
       device_mesh:
         device_type: cuda
         dp_size: 4
+      queue_config:
+        rps_limit: 20                               # Max requests per second
+        tps_limit: 10000                            # Max tokens per second
     deployments:
       - name: SamplerManagement
         autoscaling_config:
@@ -77,7 +80,9 @@ applications:
     args:
       use_megatron: true                          # Use HuggingFace Transformers backend
       model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier
-      nproc_per_node: 4                            # Number of GPU processes per node
+      max_length: 10240                           # Model max length (presumably in tokens — confirm)
+      max_loras: 5                                # Max number of LoRAs for the model
+      nproc_per_node: 4                           # Number of GPU processes per node
       device_group:
         name: model
         ranks: [4,5,6,7]                              # GPU rank indices
@@ -88,11 +93,12 @@ applications:
         ep_size: 2
 
       queue_config:
-        rps_limit: 100                             # Max requests per second
-        tps_limit: 100000                           # Max tokens per second
+        rps_limit: 20                               # Max requests per second
+        tps_limit: 10000                            # Max tokens per second
       adapter_config:
-        per_token_adapter_limit: 30                # Max concurrent LoRA adapters
-        adapter_timeout: 1800                      # Seconds before idle adapter unload
+        per_token_adapter_limit: 3                # Max concurrent LoRA adapters
+        adapter_timeout: 30                       # Seconds before idle adapter unload
+        adapter_max_lifetime: 36000               # Maximum lifetime of an adapter in seconds (e.g., 10 hours)
     deployments:
       - name: ModelManagement
         autoscaling_config:

diff --git a/cookbook/client/tinker/megatron/server_config_7b.yaml b/cookbook/client/tinker/megatron/server_config_7b.yaml
@@ -50,10 +50,12 @@ applications:
         dp_size: 2
       queue_config:
         rps_limit: 100                             # Max requests per second
-        tps_limit: 100000                           # Max tokens per second
+        tps_limit: 10000                           # Max tokens per second for a single user
+        max_input_tokens: 10000                    # Maximum input tokens per request
       adapter_config:
-        per_token_adapter_limit: 30                # Max concurrent LoRA adapters
-        adapter_timeout: 1800                      # Seconds before idle adapter unload
+        adapter_timeout: 30                        # Seconds before idle adapter unload
+        adapter_max_lifetime: 36000                # Maximum lifetime of an adapter in seconds (e.g., 10 hours)
+        per_token_adapter_limit: 30                # Max concurrent LoRA adapters
     deployments:
       - name: ModelManagement
         autoscaling_config: