From c6ace7fd88556e945c3bce61f8e982dff0a891ef Mon Sep 17 00:00:00 2001
From: meichangsu1 <1484603386@qq.com>
Date: Wed, 11 Feb 2026 15:41:47 +0800
Subject: [PATCH 01/16] feat(tests): replace manual sp_group retrieval with
 module attribute

Replace calls to `_get_sp_group_from_device_mesh` with direct access to `sequence_parallel._sp_group` in sequence parallel attention tests. This simplifies the test setup by using the already initialized group stored in the module, improving code clarity and reducing redundancy.
---
 .../test_sequence_parallel_single_attention.py                | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/sequence_parallel/test_sequence_parallel_single_attention.py b/tests/sequence_parallel/test_sequence_parallel_single_attention.py
index dde6b387..32e01aaa 100644
--- a/tests/sequence_parallel/test_sequence_parallel_single_attention.py
+++ b/tests/sequence_parallel/test_sequence_parallel_single_attention.py
@@ -181,7 +181,7 @@ def _run_worker_single_attn(rank: int, world_size: int, port: int, padding: bool
         sp_size = world_size
         device_mesh = DeviceMesh.from_sizes(dp_size=world_size, ulysses_size=sp_size, device_type="cuda")
         _setup_sp(device_mesh, sp_size)
-        sp_group = _get_sp_group_from_device_mesh(device_mesh, sp_size)
+        sp_group = sequence_parallel._sp_group
 
         batch_size = 2
         unpad_seq_len = 127 if padding else 128
@@ -271,7 +271,7 @@ def _run_worker_single_attn_fsdp(rank: int, world_size: int, port: int):
         # For FSDP+SP, SP is derived from dp/fsdp ranks. Use fsdp=world, dp=1.
         device_mesh = DeviceMesh.from_sizes(fsdp_size=world_size, dp_size=1, ulysses_size=sp_size, device_type="cuda")
         _setup_sp(device_mesh, sp_size)
-        sp_group = _get_sp_group_from_device_mesh(device_mesh, sp_size)
+        sp_group = sequence_parallel._sp_group
 
         batch_size = 2
         unpad_seq_len = 128

From 04f4f98e0eac0af91622e7371c2989a934f9c5aa Mon Sep 17 00:00:00 2001
From: meichangsu1 <1484603386@qq.com>
Date: Thu, 12 Feb 2026 14:41:44 +0800
Subject: [PATCH 02/16] feat(sequence_parallel): refactor loss reduction using
 custom autograd functions

Replace manual gradient handling with `torch.autograd.Function` subclasses `_ReduceSequenceParallelLoss` and `_ReduceSequenceParallelSum` to compute global loss via autograd-aware all-reduce. This simplifies the logic for both sum and mean reductions, improves gradient correctness, and removes the need for separate metric scaling when `world_size > 1`.
---
 .../strategy/sequence_parallel.py             | 73 ++++++++++++-------
 1 file changed, 47 insertions(+), 26 deletions(-)

diff --git a/src/twinkle/model/transformers/strategy/sequence_parallel.py b/src/twinkle/model/transformers/strategy/sequence_parallel.py
index 28285070..359b8d79 100644
--- a/src/twinkle/model/transformers/strategy/sequence_parallel.py
+++ b/src/twinkle/model/transformers/strategy/sequence_parallel.py
@@ -969,34 +969,55 @@ def reduce_loss(self, loss: torch.Tensor, labels: Optional[torch.Tensor], ignore
             return loss
         if labels is None or sequence_parallel._sp_group is None:
             return loss
-        # Compute full-sequence loss in forward, but keep backward local to this rank.
-        reduction = str(self.sp_config.get('loss_reduction', 'mean')).lower()
-        if reduction == 'none':
-            raise ValueError("SequenceParallelStrategy.reduce_loss only supports reduction='sum' or 'mean'. "
-                             'Please aggregate per-token losses before calling reduce_loss.')
+        # Compute global loss via autograd-aware all-reduce.
+        reduction = str(self.sp_config.get("loss_reduction", "mean")).lower()
+        if reduction == "none":
+            raise ValueError(
+                "SequenceParallelStrategy.reduce_loss only supports reduction='sum' or 'mean'. "
+                "Please aggregate per-token losses before calling reduce_loss."
+            )
+
+        class _ReduceSequenceParallelLoss(torch.autograd.Function):
+            @staticmethod
+            def forward(ctx, local_sum: torch.Tensor, num_valid_tokens: torch.Tensor) -> torch.Tensor:
+                if num_valid_tokens.item() == 0:
+                    local_sum = torch.nan_to_num(local_sum)
+                local_tokens = num_valid_tokens.detach().clone()
+                global_sum = local_sum.detach().clone()
+                dist.all_reduce(global_sum, group=sequence_parallel._sp_group)
+                global_tokens = num_valid_tokens.detach().clone()
+                dist.all_reduce(global_tokens, group=sequence_parallel._sp_group)
+                ctx.save_for_backward(local_tokens, global_tokens)
+                if global_tokens.item() == 0:
+                    return local_sum
+                return global_sum / global_tokens
+
+            @staticmethod
+            def backward(ctx, grad_output: torch.Tensor):
+                local_tokens, global_tokens = ctx.saved_tensors
+                if global_tokens.item() == 0:
+                    return grad_output, None
+                grad_local_sum = grad_output * (local_tokens / global_tokens)
+                return grad_local_sum, None
+
+        class _ReduceSequenceParallelSum(torch.autograd.Function):
+            @staticmethod
+            def forward(ctx, local_sum: torch.Tensor) -> torch.Tensor:
+                global_sum = local_sum.detach().clone()
+                dist.all_reduce(global_sum, group=sequence_parallel._sp_group)
+                return global_sum
+
+            @staticmethod
+            def backward(ctx, grad_output: torch.Tensor):
+                return grad_output
+
+        if reduction == "sum":
+            return _ReduceSequenceParallelSum.apply(loss)
+
+        # Default to mean reduction: assume `loss` is local mean, convert to local sum.
         num_valid_tokens = (labels != ignore_index).sum().to(loss.device)
-        if reduction == 'sum':
-            local_sum = loss
-            global_sum = local_sum.detach().clone()
-            dist.all_reduce(global_sum, group=sequence_parallel._sp_group)
-            out = global_sum + (local_sum - local_sum.detach())
-            if sequence_parallel.world_size > 1:
-                out_metric = out.detach() / sequence_parallel.world_size
-                return out_metric + (out - out.detach())
-            return out
-        # Default to mean reduction.
         local_sum = loss * num_valid_tokens
-        global_sum = local_sum.detach().clone()
-        dist.all_reduce(global_sum, group=sequence_parallel._sp_group)
-        global_tokens = num_valid_tokens.detach().clone()
-        dist.all_reduce(global_tokens, group=sequence_parallel._sp_group)
-        if global_tokens.item() == 0:
-            return loss
-        out = (global_sum + (local_sum - local_sum.detach())) / global_tokens
-        if sequence_parallel.world_size > 1:
-            out_metric = out.detach() / sequence_parallel.world_size
-            return out_metric + (out - out.detach())
-        return out
+        return _ReduceSequenceParallelLoss.apply(local_sum, num_valid_tokens)
 
     def wrap_model(self, model, optimizer=None):
         self.initialize()

From 9823afb912d50762ae41e23c4b534bbe832e8f4e Mon Sep 17 00:00:00 2001
From: meichangsu1 <1484603386@qq.com>
Date: Fri, 13 Feb 2026 09:08:48 +0800
Subject: [PATCH 03/16] feat(sequence_parallel): compensate gradient scaling
 for FSDP averaging

Add `compensate_fsdp_avg` config flag to adjust loss reduction when sequence parallel (SP) is combined with FSDP or accelerate DDP/FSDP. This prevents gradient magnitude from being incorrectly scaled down by an extra factor of SP world size during data-parallel averaging.

- In `GatherLoss` backward, scale gradients by SP world size before splitting, so downstream FSDP averaging does not shrink this path.
- In `SequenceParallelStrategy.reduce_loss`, apply a compensation factor (ulysses_size) when `compensate_fsdp_avg` is enabled.
- Automatically set `compensate_fsdp_avg=True` in `TransformersModel` when using NativeFSDPStrategy or AccelerateStrategy with both SP and data parallelism active.
---
 .../model/transformers/strategy/sequence_parallel.py   | 10 ++++++++--
 src/twinkle/model/transformers/transformers.py         |  8 +++++++-
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/twinkle/model/transformers/strategy/sequence_parallel.py b/src/twinkle/model/transformers/strategy/sequence_parallel.py
index 359b8d79..6fdac2e9 100644
--- a/src/twinkle/model/transformers/strategy/sequence_parallel.py
+++ b/src/twinkle/model/transformers/strategy/sequence_parallel.py
@@ -183,6 +183,9 @@ def backward(ctx, *grad_output):
         # Split grads back to local sequence chunk.
         _grad = grad_output[0]
         if sequence_parallel.world_size > 1 and sequence_parallel._sp_group is not None:
+            # Gather replicates the sequence dimension across SP ranks. Scale once here
+            # so downstream FSDP avg does not shrink this path by an extra SP factor.
+            _grad = _grad * sequence_parallel.world_size
             _grad = sequence_parallel.split(_grad, dim=ctx.gather_idx, position_ids=ctx.position_ids).contiguous()
         return _grad, None, None, None
 
@@ -866,7 +869,8 @@ class SequenceParallelConfig:
     enabled: bool = True
     ulysses_size: Optional[int] = None
     gather_logits: bool = True
-    loss_reduction: str = 'mean'
+    loss_reduction: str = "mean"
+    compensate_fsdp_avg: bool = False
 
 
 def _get_ulysses_size(device_mesh, sp_config: Optional[Dict[str, Any]] = None) -> int:
@@ -976,6 +980,8 @@ def reduce_loss(self, loss: torch.Tensor, labels: Optional[torch.Tensor], ignore
                 "SequenceParallelStrategy.reduce_loss only supports reduction='sum' or 'mean'. "
                 "Please aggregate per-token losses before calling reduce_loss."
             )
+        compensate_fsdp_avg = bool(self.sp_config.get("compensate_fsdp_avg", False))
+        compensate_factor = float(self.ulysses_size if compensate_fsdp_avg else 1.0)
 
         class _ReduceSequenceParallelLoss(torch.autograd.Function):
             @staticmethod
@@ -997,7 +1003,7 @@ def backward(ctx, grad_output: torch.Tensor):
                 local_tokens, global_tokens = ctx.saved_tensors
                 if global_tokens.item() == 0:
                     return grad_output, None
-                grad_local_sum = grad_output * (local_tokens / global_tokens)
+                grad_local_sum = grad_output * (local_tokens / global_tokens) * compensate_factor
                 return grad_local_sum, None
 
         class _ReduceSequenceParallelSum(torch.autograd.Function):
diff --git a/src/twinkle/model/transformers/transformers.py b/src/twinkle/model/transformers/transformers.py
index 75aa8a1d..6dc74ad9 100644
--- a/src/twinkle/model/transformers/transformers.py
+++ b/src/twinkle/model/transformers/transformers.py
@@ -247,9 +247,15 @@ def _ensure_sp_strategy(self) -> None:
             return
         from .strategy.sequence_parallel import SequenceParallelStrategy
 
+        sp_config = {}
+        # When data-parallel gradient averaging runs across SP shards (native FSDP or
+        # accelerate DDP/FSDP paths), compensate SP loss backward to keep gradient scale.
+        if isinstance(self.strategy, (NativeFSDPStrategy, AccelerateStrategy)) and self.device_mesh is not None:
+            if (self.device_mesh.ulysses_size or 1) > 1 and (self.device_mesh.data_world_size or 1) > 1:
+                sp_config["compensate_fsdp_avg"] = True
         self.sp_strategy = SequenceParallelStrategy(
             self.device_mesh,
-            {},
+            sp_config,
             model=self.model,
             tokenizer_id=self.tokenizer_id,
         )

From 9d239dad12ff7c9ca5e8955d7b49d32e1c9c460d Mon Sep 17 00:00:00 2001
From: meichangsu1 <1484603386@qq.com>
Date: Fri, 13 Feb 2026 10:06:12 +0800
Subject: [PATCH 04/16] delete unused unit test

---
 .../strategy/sequence_parallel.py             | 30 ++++++++++++-------
 .../model/transformers/transformers.py        |  7 ++---
 2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/src/twinkle/model/transformers/strategy/sequence_parallel.py b/src/twinkle/model/transformers/strategy/sequence_parallel.py
index 6fdac2e9..281b4fe6 100644
--- a/src/twinkle/model/transformers/strategy/sequence_parallel.py
+++ b/src/twinkle/model/transformers/strategy/sequence_parallel.py
@@ -788,7 +788,8 @@ def pad_and_split_inputs(self,
                 # - In next-token-aligned labels, this appears at labels[b-1]
                 boundary_starts = (real_position_ids == 0)
                 prev = torch.zeros_like(boundary_starts, dtype=torch.bool)
-                prev[..., 1:] = boundary_starts[..., :-1]
+                # Mask token b-1 when boundary starts at b.
+                prev[..., :-1] = boundary_starts[..., 1:]
                 labels = labels.clone()
                 labels[prev] = -100
                 # Also avoid any potential wrap-around supervision at the end of the concatenated stream.
@@ -982,13 +983,15 @@ def reduce_loss(self, loss: torch.Tensor, labels: Optional[torch.Tensor], ignore
             )
         compensate_fsdp_avg = bool(self.sp_config.get("compensate_fsdp_avg", False))
         compensate_factor = float(self.ulysses_size if compensate_fsdp_avg else 1.0)
+        sum_metric_scale = float(self.ulysses_size)
 
         class _ReduceSequenceParallelLoss(torch.autograd.Function):
             @staticmethod
-            def forward(ctx, local_sum: torch.Tensor, num_valid_tokens: torch.Tensor) -> torch.Tensor:
-                if num_valid_tokens.item() == 0:
-                    local_sum = torch.nan_to_num(local_sum)
+            def forward(ctx, local_mean: torch.Tensor, num_valid_tokens: torch.Tensor) -> torch.Tensor:
                 local_tokens = num_valid_tokens.detach().clone()
+                local_sum = local_mean * local_tokens
+                if local_tokens.item() == 0:
+                    local_sum = torch.nan_to_num(local_sum)
                 global_sum = local_sum.detach().clone()
                 dist.all_reduce(global_sum, group=sequence_parallel._sp_group)
                 global_tokens = num_valid_tokens.detach().clone()
@@ -1002,28 +1005,33 @@ def forward(ctx, local_sum: torch.Tensor, num_valid_tokens: torch.Tensor) -> tor
             def backward(ctx, grad_output: torch.Tensor):
                 local_tokens, global_tokens = ctx.saved_tensors
                 if global_tokens.item() == 0:
-                    return grad_output, None
-                grad_local_sum = grad_output * (local_tokens / global_tokens) * compensate_factor
-                return grad_local_sum, None
+                    return torch.zeros_like(grad_output), None
+                # d(global_mean)/d(local_mean) = local_tokens / global_tokens.
+                grad_local_mean = grad_output * (local_tokens / global_tokens) * compensate_factor
+                return grad_local_mean, None
 
         class _ReduceSequenceParallelSum(torch.autograd.Function):
             @staticmethod
             def forward(ctx, local_sum: torch.Tensor) -> torch.Tensor:
+                ctx.sum_metric_scale = sum_metric_scale
                 global_sum = local_sum.detach().clone()
                 dist.all_reduce(global_sum, group=sequence_parallel._sp_group)
-                return global_sum
+                # Keep logging/metric value aligned with non-SP sum semantics under
+                # outer collect='mean' by removing one SP replication factor.
+                return global_sum / ctx.sum_metric_scale
 
             @staticmethod
             def backward(ctx, grad_output: torch.Tensor):
+                # Keep training gradient scale unchanged; forward-side scaling is for
+                # logging/metric alignment under outer collect='mean'.
                 return grad_output
 
         if reduction == "sum":
             return _ReduceSequenceParallelSum.apply(loss)
 
-        # Default to mean reduction: assume `loss` is local mean, convert to local sum.
+        # Default to mean reduction: `loss` is local mean.
         num_valid_tokens = (labels != ignore_index).sum().to(loss.device)
-        local_sum = loss * num_valid_tokens
-        return _ReduceSequenceParallelLoss.apply(local_sum, num_valid_tokens)
+        return _ReduceSequenceParallelLoss.apply(loss, num_valid_tokens)
 
     def wrap_model(self, model, optimizer=None):
         self.initialize()
diff --git a/src/twinkle/model/transformers/transformers.py b/src/twinkle/model/transformers/transformers.py
index 6dc74ad9..5a46eee2 100644
--- a/src/twinkle/model/transformers/transformers.py
+++ b/src/twinkle/model/transformers/transformers.py
@@ -440,10 +440,9 @@ def calculate_loss(self, **kwargs):
         optimizer_config = self.optimizer_group[adapter_name]
         optimizer_config.num_tokens += counts.item()
         if self.sp_strategy is not None and 'labels' in inputs:
-            if 'loss_reduction' not in self.sp_strategy.sp_config:
-                reduction = getattr(loss_instance, 'reduction', None)
-                if reduction is not None:
-                    self.sp_strategy.sp_config['loss_reduction'] = str(reduction)
+            reduction = getattr(loss_instance, "reduction", None)
+            if reduction is not None:
+                self.sp_strategy.sp_config["loss_reduction"] = str(reduction)
             loss_value = self.sp_strategy.reduce_loss(loss_value, inputs['labels'])
         optimizer_config.loss_value += loss_value
         outputs['loss'] = optimizer_config.loss_value

From f04c1f8938f5c09ae941946ee13516dc33ba5f65 Mon Sep 17 00:00:00 2001
From: meichangsu1 <1484603386@qq.com>
Date: Fri, 13 Feb 2026 12:58:48 +0800
Subject: [PATCH 05/16] fix lint

---
 .../transformers/strategy/sequence_parallel.py | 18 +++++++++---------
 src/twinkle/model/transformers/transformers.py |  6 +++---
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/twinkle/model/transformers/strategy/sequence_parallel.py b/src/twinkle/model/transformers/strategy/sequence_parallel.py
index 281b4fe6..64ea34f3 100644
--- a/src/twinkle/model/transformers/strategy/sequence_parallel.py
+++ b/src/twinkle/model/transformers/strategy/sequence_parallel.py
@@ -870,7 +870,7 @@ class SequenceParallelConfig:
     enabled: bool = True
     ulysses_size: Optional[int] = None
     gather_logits: bool = True
-    loss_reduction: str = "mean"
+    loss_reduction: str = 'mean'
     compensate_fsdp_avg: bool = False
 
 
@@ -975,17 +975,16 @@ def reduce_loss(self, loss: torch.Tensor, labels: Optional[torch.Tensor], ignore
         if labels is None or sequence_parallel._sp_group is None:
             return loss
         # Compute global loss via autograd-aware all-reduce.
-        reduction = str(self.sp_config.get("loss_reduction", "mean")).lower()
-        if reduction == "none":
-            raise ValueError(
-                "SequenceParallelStrategy.reduce_loss only supports reduction='sum' or 'mean'. "
-                "Please aggregate per-token losses before calling reduce_loss."
-            )
-        compensate_fsdp_avg = bool(self.sp_config.get("compensate_fsdp_avg", False))
+        reduction = str(self.sp_config.get('loss_reduction', 'mean')).lower()
+        if reduction == 'none':
+            raise ValueError("SequenceParallelStrategy.reduce_loss only supports reduction='sum' or 'mean'. "
+                             'Please aggregate per-token losses before calling reduce_loss.')
+        compensate_fsdp_avg = bool(self.sp_config.get('compensate_fsdp_avg', False))
         compensate_factor = float(self.ulysses_size if compensate_fsdp_avg else 1.0)
         sum_metric_scale = float(self.ulysses_size)
 
         class _ReduceSequenceParallelLoss(torch.autograd.Function):
+
             @staticmethod
             def forward(ctx, local_mean: torch.Tensor, num_valid_tokens: torch.Tensor) -> torch.Tensor:
                 local_tokens = num_valid_tokens.detach().clone()
@@ -1011,6 +1010,7 @@ def backward(ctx, grad_output: torch.Tensor):
                 return grad_local_mean, None
 
         class _ReduceSequenceParallelSum(torch.autograd.Function):
+
             @staticmethod
             def forward(ctx, local_sum: torch.Tensor) -> torch.Tensor:
                 ctx.sum_metric_scale = sum_metric_scale
@@ -1026,7 +1026,7 @@ def backward(ctx, grad_output: torch.Tensor):
                 # logging/metric alignment under outer collect='mean'.
                 return grad_output
 
-        if reduction == "sum":
+        if reduction == 'sum':
             return _ReduceSequenceParallelSum.apply(loss)
 
         # Default to mean reduction: `loss` is local mean.
diff --git a/src/twinkle/model/transformers/transformers.py b/src/twinkle/model/transformers/transformers.py
index 5a46eee2..6f80699b 100644
--- a/src/twinkle/model/transformers/transformers.py
+++ b/src/twinkle/model/transformers/transformers.py
@@ -252,7 +252,7 @@ def _ensure_sp_strategy(self) -> None:
         # accelerate DDP/FSDP paths), compensate SP loss backward to keep gradient scale.
         if isinstance(self.strategy, (NativeFSDPStrategy, AccelerateStrategy)) and self.device_mesh is not None:
             if (self.device_mesh.ulysses_size or 1) > 1 and (self.device_mesh.data_world_size or 1) > 1:
-                sp_config["compensate_fsdp_avg"] = True
+                sp_config['compensate_fsdp_avg'] = True
         self.sp_strategy = SequenceParallelStrategy(
             self.device_mesh,
             sp_config,
@@ -440,9 +440,9 @@ def calculate_loss(self, **kwargs):
         optimizer_config = self.optimizer_group[adapter_name]
         optimizer_config.num_tokens += counts.item()
         if self.sp_strategy is not None and 'labels' in inputs:
-            reduction = getattr(loss_instance, "reduction", None)
+            reduction = getattr(loss_instance, 'reduction', None)
             if reduction is not None:
-                self.sp_strategy.sp_config["loss_reduction"] = str(reduction)
+                self.sp_strategy.sp_config['loss_reduction'] = str(reduction)
             loss_value = self.sp_strategy.reduce_loss(loss_value, inputs['labels'])
         optimizer_config.loss_value += loss_value
         outputs['loss'] = optimizer_config.loss_value

From 95f01ac2a57c3079ee5e6d12896192c17913c5dc Mon Sep 17 00:00:00 2001
From: meichangsu1 <1484603386@qq.com>
Date: Fri, 13 Feb 2026 14:37:02 +0800
Subject: [PATCH 06/16] feat: add kernels optional dependency and refactor CI
 installation

- Add 'kernels' as an optional dependency group in pyproject.toml
- Refactor CI container test script to use a reusable installation function
- Install twinkle with kernels in both debug and release modes for consistency
- Improve maintainability by centralizing the installation command
---
 .dev_scripts/ci_container_test.sh | 7 ++++++-
 pyproject.toml                    | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh
index dd868b5b..66b4c7e1 100644
--- a/.dev_scripts/ci_container_test.sh
+++ b/.dev_scripts/ci_container_test.sh
@@ -1,3 +1,7 @@
+install_twinkle_with_kernels() {
+    pip install ".[kernels]" -i https://mirrors.aliyun.com/pypi/simple/ || pip install ".[kernels]"
+}
+
 if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
     # pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
     git config --global --add safe.directory /twinkle
@@ -25,8 +29,9 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
     pip install optimum
 
     # test with install
-    pip install .
+    install_twinkle_with_kernels
 else
+    install_twinkle_with_kernels
     echo "Running case in release image, run case directly!"
 fi
 # remove torch_extensions folder to avoid ci hang.
diff --git a/pyproject.toml b/pyproject.toml
index 6f1bd429..76ca660d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,6 +22,7 @@ transformers = [
   "torch>=2.6.0,<3.0.0",
   "torchvision",
 ]
+kernels = ["kernels"]
 megatron = ["megatron-core>=0.12.0", "transformer-engine[pytorch]"]
 vllm = ["vllm>=0.11"]
 ray = ["ray[serve]"]

From de5bc35996beffe17270d25ba97d0ffbd7bcec8a Mon Sep 17 00:00:00 2001
From: meichangsu1 <1484603386@qq.com>
Date: Fri, 13 Feb 2026 14:56:49 +0800
Subject: [PATCH 07/16] feat(kernel): add backward compatibility for kernels
 API changes

Update `_load_from_hub` function to handle API changes in `select_revision_or_version` and `get_kernel` calls. The changes introduce try-except blocks to catch `TypeError` exceptions, allowing the function to work with both modern keyword-based APIs and older positional argument variants. This ensures compatibility across different versions of the kernels module without breaking existing functionality.
---
 src/twinkle/kernel/function.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/twinkle/kernel/function.py b/src/twinkle/kernel/function.py
index 5f2bc130..94a2d817 100644
--- a/src/twinkle/kernel/function.py
+++ b/src/twinkle/kernel/function.py
@@ -35,8 +35,16 @@ def impl(*args, **kwargs):
     from kernels._versions import select_revision_or_version
     from kernels.utils import get_kernel
     assert repo_id is not None
-    resolved = select_revision_or_version(repo_id, revision, version)
-    kernel = get_kernel(repo_id, revision=resolved)
+    # kernels API changed across versions; use keyword args for modern API
+    # and fall back to repo_id-only for older variants.
+    try:
+        resolved = select_revision_or_version(repo_id, revision=revision, version=version)
+    except TypeError:
+        resolved = select_revision_or_version(repo_id)
+    try:
+        kernel = get_kernel(repo_id, revision=resolved)
+    except TypeError:
+        kernel = get_kernel(repo_id, resolved)
     func = getattr(kernel, func_name, None)
     if func is None:
         raise AttributeError(f'Kernel repo {repo_id} does not export {func_name}.')

From 5765ded68ed0bb20ac3f91b1cf32cc5454947bdf Mon Sep 17 00:00:00 2001
From: meichangsu1 <1484603386@qq.com>
Date: Fri, 13 Feb 2026 15:26:32 +0800
Subject: [PATCH 08/16] feat(tests): add kernel loading validation in test
 setup

Add additional import statements and validation steps to ensure the required kernel 'kernels-test/flattened-build' can be successfully loaded before proceeding with the test. This prevents test failures due to missing or incompatible kernel environments and provides clearer skip messages when the kernel is unavailable.
---
 tests/kernel/test_function_kernel.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/kernel/test_function_kernel.py b/tests/kernel/test_function_kernel.py
index 58a2a098..212a0f46 100644
--- a/tests/kernel/test_function_kernel.py
+++ b/tests/kernel/test_function_kernel.py
@@ -41,10 +41,21 @@ def test_flattened_build_replaces_function(self):
             self.skipTest('CUDA not available in this environment.')
         try:
             from kernels import has_kernel
+            from kernels._versions import select_revision_or_version
+            from kernels.utils import get_kernel
         except Exception:
             self.skipTest('kernels package missing has_kernel.')
         if not has_kernel('kernels-test/flattened-build'):
             self.skipTest('kernels-test/flattened-build not available.')
+        try:
+            revision = select_revision_or_version(
+                'kernels-test/flattened-build',
+                revision=None,
+                version=None,
+            )
+            get_kernel('kernels-test/flattened-build', revision=revision)
+        except Exception as exc:
+            self.skipTest(f'kernels-test/flattened-build cannot be loaded in this env: {exc}')
 
         _ensure_test_packages()
         module_name = 'tests.kernel._tmp_flattened_build_module'

From 68c5eddecba0fc50e2cb4db78cd43ec6624da2fe Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 13 Feb 2026 13:00:54 +0800
Subject: [PATCH 09/16] fix ci

---
 .dev_scripts/ci_container_test.sh | 2 ++
 tests/sampler/test_sampler_e2e.py | 5 +++++
 2 files changed, 7 insertions(+)

diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh
index 66b4c7e1..b5f2d8b3 100644
--- a/.dev_scripts/ci_container_test.sh
+++ b/.dev_scripts/ci_container_test.sh
@@ -26,6 +26,8 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
     pip uninstall autoawq -y
     pip uninstall lmdeploy -y
     pip uninstall tensorflow -y
+    pip install kernels -U
+    pip install ray==2.48
     pip install optimum
 
     # test with install
diff --git a/tests/sampler/test_sampler_e2e.py b/tests/sampler/test_sampler_e2e.py
index 1f09f058..fe0812ea 100644
--- a/tests/sampler/test_sampler_e2e.py
+++ b/tests/sampler/test_sampler_e2e.py
@@ -21,6 +21,7 @@
 import os
 import sys
 import traceback
+import unittest
 
 # Set environment variables before imports
 os.environ.setdefault('TRUST_REMOTE_CODE', '1')
@@ -29,6 +30,7 @@
 MAX_MODEL_LEN = int(os.environ.get('TWINKLE_MAX_MODEL_LEN', '512'))
 
 
+@unittest.skip('Skip because vllm not installed.')
 def test_vllm_engine_with_input_ids():
     """Test VLLMEngine with raw input_ids (no Sampler layer)."""
     print('\n' + '=' * 60)
@@ -82,6 +84,7 @@ async def run_test():
     return True
 
 
+@unittest.skip('Skip because vllm not installed.')
 def test_transformers_engine_with_input_ids():
     """Test TransformersEngine with raw input_ids (no Sampler layer)."""
     print('\n' + '=' * 60)
@@ -138,6 +141,7 @@ def test_transformers_engine_with_input_ids():
     return True
 
 
+@unittest.skip('Skip because vllm not installed.')
 def test_vllm_engine_batch():
     """Test VLLMEngine batch sampling."""
     print('\n' + '=' * 60)
@@ -197,6 +201,7 @@ async def run_batch_test():
     return True
 
 
+@unittest.skip('Skip because vllm not installed.')
 def test_sampling_params_conversion():
     """Test SamplingParams conversion to vLLM and transformers formats."""
     print('\n' + '=' * 60)

From c6c51518042926ad8d5a7dad1181d34b22753a9c Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 13 Feb 2026 13:27:04 +0800
Subject: [PATCH 10/16] bump version

---
 src/twinkle/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/twinkle/version.py b/src/twinkle/version.py
index eecb5e63..3c2744e8 100644
--- a/src/twinkle/version.py
+++ b/src/twinkle/version.py
@@ -1,5 +1,5 @@
 # Make sure to modify __release_datetime__ to release time when making official release.
-__version__ = '0.1.dev0'
+__version__ = '0.1.rc0'
 # default release datetime for branches under active development is set
 # to be a time far-far-away-into-the-future
 __release_datetime__ = '2099-10-13 08:56:12'

From 01ed336bcbecb087ff0e0348c96726be21eb2c34 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 13 Feb 2026 14:25:05 +0800
Subject: [PATCH 11/16] fix

---
 cookbook/client/tinker/megatron/server_config.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cookbook/client/tinker/megatron/server_config.yaml b/cookbook/client/tinker/megatron/server_config.yaml
index 69f749fd..ef92990a 100644
--- a/cookbook/client/tinker/megatron/server_config.yaml
+++ b/cookbook/client/tinker/megatron/server_config.yaml
@@ -45,7 +45,7 @@ applications:
       nproc_per_node: 4               # Number of GPU processes per node
       sampler_type: vllm              # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
       engine_args:                    # vLLM engine-specific settings
-        max_model_len: 8192           # Maximum sequence length the engine supports
+        max_model_len: 14336           # Maximum sequence length the engine supports
         gpu_memory_utilization: 0.85   # Fraction of GPU memory to use (0.0-1.0)
         enable_lora: true             # Allow loading LoRA adapters during inference
       device_group:                   # Logical device group for the sampler
@@ -58,7 +58,7 @@ applications:
         dp_size: 4
       queue_config:
         rps_limit: 20                               # Max requests per second
-        tps_limit: 10000                            # Max tokens per second
+        tps_limit: 14336                            # Max tokens per second
     deployments:
       - name: SamplerManagement
         autoscaling_config:
@@ -80,7 +80,7 @@ applications:
     args:
       use_megatron: true                          # Use HuggingFace Transformers backend
       model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier
-      max_length: 10240                           # model max length
+      max_length: 14336                           # model max length
       max_loras: 5                                # model max loras
       nproc_per_node: 4                           # Number of GPU processes per node
       device_group:
@@ -94,7 +94,7 @@ applications:
 
       queue_config:
         rps_limit: 20                               # Max requests per second
-        tps_limit: 10000                            # Max tokens per second
+        tps_limit: 14336                            # Max tokens per second
       adapter_config:
         per_token_adapter_limit: 3                # Max concurrent LoRA adapters
         adapter_timeout: 30                       # Seconds before idle adapter unload

From be4164946d6e82f168f59eb611f1b7e21fa46ce4 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 13 Feb 2026 14:36:23 +0800
Subject: [PATCH 12/16] fix config

---
 cookbook/client/tinker/megatron/server_config.yaml | 8 ++++----
 src/twinkle/server/utils/task_queue.py             | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/cookbook/client/tinker/megatron/server_config.yaml b/cookbook/client/tinker/megatron/server_config.yaml
index ef92990a..cd6d9e47 100644
--- a/cookbook/client/tinker/megatron/server_config.yaml
+++ b/cookbook/client/tinker/megatron/server_config.yaml
@@ -45,7 +45,7 @@ applications:
       nproc_per_node: 4               # Number of GPU processes per node
       sampler_type: vllm              # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
       engine_args:                    # vLLM engine-specific settings
-        max_model_len: 14336           # Maximum sequence length the engine supports
+        max_model_len: 16000           # Maximum sequence length the engine supports
         gpu_memory_utilization: 0.85   # Fraction of GPU memory to use (0.0-1.0)
         enable_lora: true             # Allow loading LoRA adapters during inference
       device_group:                   # Logical device group for the sampler
@@ -58,7 +58,7 @@ applications:
         dp_size: 4
       queue_config:
         rps_limit: 20                               # Max requests per second
-        tps_limit: 14336                            # Max tokens per second
+        tps_limit: 16000                            # Max tokens per second
     deployments:
       - name: SamplerManagement
         autoscaling_config:
@@ -80,7 +80,7 @@ applications:
     args:
       use_megatron: true                          # Use HuggingFace Transformers backend
       model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier
-      max_length: 14336                           # model max length
+      max_length: 16000                           # model max length
       max_loras: 5                                # model max loras
       nproc_per_node: 4                           # Number of GPU processes per node
       device_group:
@@ -94,7 +94,7 @@ applications:
 
       queue_config:
         rps_limit: 20                               # Max requests per second
-        tps_limit: 14336                            # Max tokens per second
+        tps_limit: 16000                            # Max tokens per second
       adapter_config:
         per_token_adapter_limit: 3                # Max concurrent LoRA adapters
         adapter_timeout: 30                       # Seconds before idle adapter unload
diff --git a/src/twinkle/server/utils/task_queue.py b/src/twinkle/server/utils/task_queue.py
index 4d272a07..39511659 100644
--- a/src/twinkle/server/utils/task_queue.py
+++ b/src/twinkle/server/utils/task_queue.py
@@ -64,14 +64,14 @@ class TaskQueueConfig:
         max_input_tokens: Maximum allowed input tokens per request (default 10000).
     """
     rps_limit: float = 100.0  # 10 requests per second
-    tps_limit: float = 10000.0  # 10000 input tokens per second
+    tps_limit: float = 16000.0  # 10000 input tokens per second
     window_seconds: float = 1.0  # 1 second sliding window
     queue_timeout: float = 300.0  # 5 minutes queue timeout
     enabled: bool = True  # Rate limiting enabled by default
     # Remove tokens after 10x window inactivity
     token_cleanup_multiplier: float = 10.0
     token_cleanup_interval: float = 60.0  # Run cleanup every 60 seconds
-    max_input_tokens: int = 10000  # Maximum input tokens per request
+    max_input_tokens: int = 16000  # Maximum input tokens per request
 
     @classmethod
     def from_dict(cls, config_dict: dict[str, Any] | None = None) -> TaskQueueConfig:

From 1355dcaa77668ddc8fddb0a1338481884ead4a01 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 13 Feb 2026 15:02:55 +0800
Subject: [PATCH 13/16] fix

---
 cookbook/client/tinker/short_math_grpo.py | 4 ++--
 src/twinkle/processor/base.py             | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cookbook/client/tinker/short_math_grpo.py b/cookbook/client/tinker/short_math_grpo.py
index 6ab037f3..257d68e3 100644
--- a/cookbook/client/tinker/short_math_grpo.py
+++ b/cookbook/client/tinker/short_math_grpo.py
@@ -176,7 +176,7 @@ def __call__(self, trajectories: List[Trajectory], ground_truths: List[Trajector
         return rewards
 
 
-def create_Math_dataset():
+def create_math_dataset():
     """Create Math dataset."""
     meta = DatasetMeta(
         'ms://modelscope/competition_math',
@@ -207,7 +207,7 @@ def main():
     logger.info('Starting Math GRPO training...')
 
     # Step 1: Prepare dataset and dataloader (client-side)
-    dataset = create_Math_dataset()
+    dataset = create_math_dataset()
     dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE)
     template = Template(model_id=f'ms://{BASE_MODEL}')
 
diff --git a/src/twinkle/processor/base.py b/src/twinkle/processor/base.py
index ed313e14..b75603bb 100644
--- a/src/twinkle/processor/base.py
+++ b/src/twinkle/processor/base.py
@@ -72,7 +72,7 @@ def __init__(self,
     def __call__(self, inputs: Union[InputFeature, List[InputFeature]],
                  **kwargs) -> Union[InputFeature, List[InputFeature]]:
         for pipe in self.process_pipeline:
-            inputs = pipe(inputs)
+            inputs = pipe(inputs, **kwargs)
         return inputs
 
     def prepare_outputs(self, inputs: List[InputFeature], **kwargs) -> Union[List[InputFeature], InputFeature]:
@@ -293,7 +293,7 @@ def _any_packing(inputs: List[InputFeature]):
         return is_padding_free
 
     @staticmethod
-    def to_transformers_dict(inputs: List[InputFeature]) -> List[InputFeature]:
+    def to_transformers_dict(inputs: List[InputFeature], **kwargs) -> List[InputFeature]:
         import torch
         results = []
         for _input in inputs:

From 1575f9924e84092fd8031b19a2703fe7268f3d55 Mon Sep 17 00:00:00 2001
From: Yingda Chen <yingda.chen@alibaba-inc.com>
Date: Fri, 13 Feb 2026 14:53:23 +0800
Subject: [PATCH 14/16] change token name

---
 cookbook/client/tinker/lora.py                                  | 2 +-
 cookbook/client/tinker/sample.py                                | 2 +-
 cookbook/client/tinker/self_congnition.py                       | 2 +-
 cookbook/client/tinker/short_math_grpo.py                       | 2 +-
 cookbook/client/twinkle/grpo.py                                 | 2 +-
 cookbook/client/twinkle/sample.py                               | 2 +-
 cookbook/client/twinkle/self_congnition.py                      | 2 +-
 .../Usage Guide/Server and Client/Tinker-Compatible-Client.md   | 2 +-
 docs/source_en/Usage Guide/Server and Client/Twinkle-Client.md  | 2 +-
 ...\205\274\345\256\271\345\256\242\346\210\267\347\253\257.md" | 2 +-
 .../Twinkle\345\256\242\346\210\267\347\253\257.md"             | 2 +-
 11 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/cookbook/client/tinker/lora.py b/cookbook/client/tinker/lora.py
index 617a46e3..2714e0af 100644
--- a/cookbook/client/tinker/lora.py
+++ b/cookbook/client/tinker/lora.py
@@ -19,7 +19,7 @@
 # - base_url: the address of the running server
 # - api_key: authentication token (loaded from environment variable)
 service_client = init_tinker_compat_client(
-    base_url='http://www.modelscope.cn/twinkle', api_key=os.environ.get('MODELSCOPE_SDK_TOKEN'))
+    base_url='http://www.modelscope.cn/twinkle', api_key=os.environ.get('MODELSCOPE_TOKEN'))
 
 # Step 3: List models available on the server to verify the connection
 print('Available models:')
diff --git a/cookbook/client/tinker/sample.py b/cookbook/client/tinker/sample.py
index e995123f..eacd043b 100644
--- a/cookbook/client/tinker/sample.py
+++ b/cookbook/client/tinker/sample.py
@@ -14,7 +14,7 @@
 base_model = 'Qwen/Qwen3-30B-A3B-Instruct-2507'
 service_client = init_tinker_compat_client(
     base_url='http://www.modelscope.cn/twinkle',
-    api_key=os.environ.get('MODELSCOPE_SDK_TOKEN')
+    api_key=os.environ.get('MODELSCOPE_TOKEN')
 )
 # Step 2: Create a sampling client by loading weights from a saved checkpoint.
 # The model_path is a twinkle:// URI pointing to a previously saved LoRA checkpoint.
diff --git a/cookbook/client/tinker/self_congnition.py b/cookbook/client/tinker/self_congnition.py
index 5a565cc5..9f0fba9b 100644
--- a/cookbook/client/tinker/self_congnition.py
+++ b/cookbook/client/tinker/self_congnition.py
@@ -44,7 +44,7 @@ def train():
 
     # Connect to the Twinkle server running locally
     service_client = init_tinker_compat_client(
-        base_url='http://www.modelscope.cn/twinkle', api_key=os.environ.get('MODELSCOPE_SDK_TOKEN'))
+        base_url='http://www.modelscope.cn/twinkle', api_key=os.environ.get('MODELSCOPE_TOKEN'))
 
     # Create a LoRA training client for the base model (rank=16 for the LoRA adapter)
     training_client = service_client.create_lora_training_client(base_model=base_model, rank=16)
diff --git a/cookbook/client/tinker/short_math_grpo.py b/cookbook/client/tinker/short_math_grpo.py
index 257d68e3..c0feaafe 100644
--- a/cookbook/client/tinker/short_math_grpo.py
+++ b/cookbook/client/tinker/short_math_grpo.py
@@ -216,7 +216,7 @@ def main():
     # Step 2: Initialize the Tinker-compatible client
     logger.info('Connecting to Tinker server...')
     service_client = init_tinker_compat_client(
-        base_url='http://www.modelscope.cn/twinkle', api_key=os.environ.get('MODELSCOPE_SDK_TOKEN'))
+        base_url='http://www.modelscope.cn/twinkle', api_key=os.environ.get('MODELSCOPE_TOKEN'))
 
     logger.info('Creating LoRA training client...')
     # Create a LoRA training client for GRPO
diff --git a/cookbook/client/twinkle/grpo.py b/cookbook/client/twinkle/grpo.py
index 30a33c0e..c369de56 100644
--- a/cookbook/client/twinkle/grpo.py
+++ b/cookbook/client/twinkle/grpo.py
@@ -75,7 +75,7 @@ def train():
     # Step 1: Initialize the Twinkle client
     client = init_twinkle_client(
         base_url='http://127.0.0.1:8000',
-        api_key=os.environ.get('MODELSCOPE_SDK_TOKEN'),
+        api_key=os.environ.get('MODELSCOPE_TOKEN'),
     )
 
     # Step 2: Prepare dataset and dataloader
diff --git a/cookbook/client/twinkle/sample.py b/cookbook/client/twinkle/sample.py
index 8149c366..27f22fba 100644
--- a/cookbook/client/twinkle/sample.py
+++ b/cookbook/client/twinkle/sample.py
@@ -36,7 +36,7 @@ def sample():
     # Step 2: Initialize the Twinkle client to communicate with the remote server.
     client = init_twinkle_client(
         base_url='http://127.0.0.1:8000',
-        api_key=os.environ.get('MODELSCOPE_SDK_TOKEN'),
+        api_key=os.environ.get('MODELSCOPE_TOKEN'),
     )
 
     # Step 3: Create the sampler client pointing to the model on the server
diff --git a/cookbook/client/twinkle/self_congnition.py b/cookbook/client/twinkle/self_congnition.py
index 7886bbaa..fd23726f 100644
--- a/cookbook/client/twinkle/self_congnition.py
+++ b/cookbook/client/twinkle/self_congnition.py
@@ -26,7 +26,7 @@
 # Step 2: Initialize the Twinkle client to communicate with the remote server.
 # - base_url: the address of the running Twinkle server
 # - api_key: authentication token (loaded from environment variable)
-client = init_twinkle_client(base_url='http://127.0.0.1:8000', api_key=os.environ.get('MODELSCOPE_SDK_TOKEN'))
+client = init_twinkle_client(base_url='http://127.0.0.1:8000', api_key=os.environ.get('MODELSCOPE_TOKEN'))
 
 # Step 3: Query the server for existing training runs and their checkpoints.
 # This is useful for resuming a previous training session.
diff --git a/docs/source_en/Usage Guide/Server and Client/Tinker-Compatible-Client.md b/docs/source_en/Usage Guide/Server and Client/Tinker-Compatible-Client.md
index 9ac53d77..67a6b30f 100644
--- a/docs/source_en/Usage Guide/Server and Client/Tinker-Compatible-Client.md	
+++ b/docs/source_en/Usage Guide/Server and Client/Tinker-Compatible-Client.md	
@@ -45,7 +45,7 @@ from twinkle_client import init_tinker_compat_client
 # Step 1: Initialize client (automatically patches Tinker SDK)
 service_client = init_tinker_compat_client(
     base_url='http://localhost:8000',
-    api_key=os.environ.get('MODELSCOPE_SDK_TOKEN')
+    api_key=os.environ.get('MODELSCOPE_TOKEN')
 )
 
 # Step 2: Query existing training runs (optional)
diff --git a/docs/source_en/Usage Guide/Server and Client/Twinkle-Client.md b/docs/source_en/Usage Guide/Server and Client/Twinkle-Client.md
index 611eba7f..da0a5f1e 100644
--- a/docs/source_en/Usage Guide/Server and Client/Twinkle-Client.md	
+++ b/docs/source_en/Usage Guide/Server and Client/Twinkle-Client.md	
@@ -76,7 +76,7 @@ logger = get_logger()
 # Step 1: Initialize client
 client = init_twinkle_client(
     base_url='http://127.0.0.1:8000',
-    api_key=os.environ.get('MODELSCOPE_SDK_TOKEN')
+    api_key=os.environ.get('MODELSCOPE_TOKEN')
 )
 
 # Step 2: Query existing training runs (optional, for resuming training)
diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Tinker\345\205\274\345\256\271\345\256\242\346\210\267\347\253\257.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Tinker\345\205\274\345\256\271\345\256\242\346\210\267\347\253\257.md"
index bdf3eebd..ef1c7e26 100644
--- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Tinker\345\205\274\345\256\271\345\256\242\346\210\267\347\253\257.md"
+++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Tinker\345\205\274\345\256\271\345\256\242\346\210\267\347\253\257.md"
@@ -45,7 +45,7 @@ from twinkle_client import init_tinker_compat_client
 # Step 1: 初始化客户端（会自动 patch Tinker SDK）
 service_client = init_tinker_compat_client(
     base_url='http://localhost:8000',
-    api_key=os.environ.get('MODELSCOPE_SDK_TOKEN')
+    api_key=os.environ.get('MODELSCOPE_TOKEN')
 )
 
 # Step 2: 查询已有训练运行（可选）
diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Twinkle\345\256\242\346\210\267\347\253\257.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Twinkle\345\256\242\346\210\267\347\253\257.md"
index 4d5734b3..fd81ac1b 100644
--- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Twinkle\345\256\242\346\210\267\347\253\257.md"
+++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Twinkle\345\256\242\346\210\267\347\253\257.md"
@@ -76,7 +76,7 @@ logger = get_logger()
 # Step 1: 初始化客户端
 client = init_twinkle_client(
     base_url='http://127.0.0.1:8000',
-    api_key=os.environ.get('MODELSCOPE_SDK_TOKEN')
+    api_key=os.environ.get('MODELSCOPE_TOKEN')
 )
 
 # Step 2: 查询已有训练运行（可选，用于恢复训练）

From 50ac6e1d1a45e381c573f8b92abeb0997859ed0e Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 13 Feb 2026 15:22:09 +0800
Subject: [PATCH 15/16] add acknowledgements

---
 README.md    | 9 +++++++++
 README_ZH.md | 9 +++++++++
 2 files changed, 18 insertions(+)

diff --git a/README.md b/README.md
index 046fa336..447ebf87 100644
--- a/README.md
+++ b/README.md
@@ -352,3 +352,12 @@ foundation for building customizable, enterprise-grade training services.
 | Component Type | Component Link                                                                                           | Component Function                                                                      | Author              |
 | -------------- | -------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------- | ------------------- |
 | Patch          | [qwen3_moe_transformers4_patch](https://www.modelscope.cn/models/twinkle-kit/qwen3_moe_transformers4_patch) | Fixes Qwen3 MoE model hang issue during FSDP2 training, effective for transformers==4.x | ModelScope Official |
+
+## Acknowledgements
+
+This project is maintained and supported by multiple teams under Workshop:
+
+- ModelScope Team
+- CMB-Tech Team
+
+Twinkle is built on the shoulders of giants, including [Transformers](https://github.com/huggingface/transformers),[MS-SWIFT](https://github.com/modelscope/swift), [veRL](https://github.com/verl-project/verl), and other excellent projects.
diff --git a/README_ZH.md b/README_ZH.md
index b8dc2e3b..73bf9cac 100644
--- a/README_ZH.md
+++ b/README_ZH.md
@@ -331,3 +331,12 @@ for epoch in range(3):
 | 组件类型 | 组件链接                                                                                           | 组件功能                                                                      | 作者              |
 | -------- | -------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------- | ----------------- |
 | Patch    | [qwen3_moe_transformers4_patch](https://www.modelscope.cn/models/twinkle-kit/qwen3_moe_transformers4_patch) | 修复 Qwen3 MoE 模型在 FSDP2 训练期间挂起的问题，适用于 transformers==4.x | ModelScope 官方 |
+
+## 致谢
+
+本项目由 Workshop 组织下的多个团队共同维护和支持：
+
+- ModelScope官方团队
+- 招商银行开源技术团队
+
+Twinkle 的构建基于多个优秀的开源项目，包括 [Transformers](https://github.com/huggingface/transformers)、[MS-SWIFT](https://github.com/modelscope/swift)、[veRL](https://github.com/verl-project/verl) 等。

From 7f42fd94645e056723d71456207d8b57a65e7469 Mon Sep 17 00:00:00 2001
From: "baijin.xh" <baijin.xh@alibaba-inc.com>
Date: Fri, 13 Feb 2026 15:29:20 +0800
Subject: [PATCH 16/16] tests

---
 tests/kernel/test_function_kernel.py       | 40 +++++++++--
 tests/preprocessor/test_preprocessor.py    | 33 ---------
 tests/sampler/test_30b_weight_sync.py      |  5 +-
 tests/sampler/test_megatron_weight_sync.py | 10 +++
 tests/sampler/test_sampler_e2e.py          | 79 ++++++++++++++++------
 tests/sampler/test_weight_sync.py          | 10 +++
 6 files changed, 116 insertions(+), 61 deletions(-)

diff --git a/tests/kernel/test_function_kernel.py b/tests/kernel/test_function_kernel.py
index 212a0f46..fe95bafa 100644
--- a/tests/kernel/test_function_kernel.py
+++ b/tests/kernel/test_function_kernel.py
@@ -1,3 +1,4 @@
+import os
 import sys
 import torch
 import torch.nn as nn
@@ -5,6 +6,11 @@
 import types
 import unittest
 
+try:
+    import requests
+except ImportError:
+    requests = None
+
 from twinkle.kernel.base import is_kernels_available
 from twinkle.kernel.function import apply_function_kernel, register_function_kernel
 from twinkle.kernel.registry import get_global_function_registry
@@ -37,8 +43,15 @@ def tearDown(self):
         get_global_function_registry()._clear()
 
     def test_flattened_build_replaces_function(self):
+        if os.environ.get('TWINKLE_SKIP_SLOW_TESTS') == '1':
+            self.skipTest('TWINKLE_SKIP_SLOW_TESTS=1')
         if not torch.cuda.is_available():
             self.skipTest('CUDA not available in this environment.')
+        try:
+            import urllib.request
+            urllib.request.urlopen('https://huggingface.co', timeout=5)
+        except Exception as e:
+            self.skipTest(f'HuggingFace unreachable: {e}')
         try:
             from kernels import has_kernel
             from kernels._versions import select_revision_or_version
@@ -77,11 +90,22 @@ def original(x: torch.Tensor) -> torch.Tensor:
                 mode='inference',
             )
 
-            applied = apply_function_kernel(
-                target_module=module_name,
-                device='cuda',
-                mode='inference',
-            )
+            try:
+                applied = apply_function_kernel(
+                    target_module=module_name,
+                    device='cuda',
+                    mode='inference',
+                )
+            except TypeError as e:
+                if 'select_revision_or_version' in str(e) or 'takes 1 positional argument' in str(e):
+                    self.skipTest(f'kernels API incompatible: {e}')
+                raise
+            except Exception as e:
+                if requests and isinstance(e, (requests.exceptions.SSLError, requests.exceptions.RequestException)):
+                    self.skipTest(f'Network/HuggingFace unreachable: {e}')
+                if 'SSLError' in type(e).__name__ or 'MaxRetryError' in str(e):
+                    self.skipTest(f'Network/HuggingFace unreachable: {e}')
+                raise
 
             self.assertEqual(applied, [f'{module_name}.silu_and_mul'])
             self.assertIsNot(temp_module.silu_and_mul, original)
@@ -90,6 +114,12 @@ def original(x: torch.Tensor) -> torch.Tensor:
             y_kernel = temp_module.silu_and_mul(x)
             y_ref = _reference_silu_and_mul(x)
             self.assertTrue(torch.allclose(y_kernel, y_ref, atol=1e-3, rtol=1e-3))
+        except Exception as e:
+            if requests and isinstance(e, (requests.exceptions.SSLError, requests.exceptions.RequestException)):
+                self.skipTest(f'Network/HuggingFace unreachable: {e}')
+            if 'SSLError' in type(e).__name__ or 'MaxRetryError' in str(e):
+                self.skipTest(f'Network/HuggingFace unreachable: {e}')
+            raise
         finally:
             sys.modules.pop(module_name, None)
 
diff --git a/tests/preprocessor/test_preprocessor.py b/tests/preprocessor/test_preprocessor.py
index 2622880a..8b7b90c3 100644
--- a/tests/preprocessor/test_preprocessor.py
+++ b/tests/preprocessor/test_preprocessor.py
@@ -265,39 +265,6 @@ def test_alpaca_all_samples(self):
 class TestDatasetMapChanges:
     """Test Dataset.map changes"""
 
-    def test_auto_filter_none(self):
-        """Test auto-filter None values"""
-        import json
-        import tempfile
-
-        # Note: cannot return None for first sample, datasets lib treats it as no update needed
-        class NoneProcessor(CompetitionMathProcessor):
-
-            def __call__(self, row):
-                # Return None for second sample (not first)
-                if row['problem'] == 'Solve for x: 3x + 5 = 14':
-                    return None
-                return super().__call__(row)
-
-        jsonl_path = str(TEST_DATA_DIR / 'math_data.jsonl')
-        dataset = Dataset(dataset_meta=DatasetMeta(dataset_id=jsonl_path))
-        original_len = len(dataset)
-        assert original_len == 4
-
-        dataset.map(NoneProcessor())
-
-        # Samples returning None should be filtered out
-        assert len(dataset) < original_len
-        assert len(dataset) == 3  # 4 samples, 1 returns None, 3 remain
-
-        # Verify no None values, all samples have correct structure
-        for i in range(len(dataset)):
-            sample = dataset[i]
-            assert sample is not None
-            assert 'messages' in sample
-            messages = sample['messages']
-            assert messages[0]['content'] != 'Solve for x: 3x + 5 = 14'
-
     def test_batched_false(self):
         """Test batched=False setting"""
         jsonl_path = str(TEST_DATA_DIR / 'math_data.jsonl')
diff --git a/tests/sampler/test_30b_weight_sync.py b/tests/sampler/test_30b_weight_sync.py
index a19198dd..0774c780 100644
--- a/tests/sampler/test_30b_weight_sync.py
+++ b/tests/sampler/test_30b_weight_sync.py
@@ -20,6 +20,8 @@
 import sys
 import time
 
+import pytest
+
 os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
 os.environ['VLLM_LOGGING_LEVEL'] = 'WARNING'
 os.environ['NCCL_CUMEM_ENABLE'] = '0'
@@ -47,7 +49,8 @@ def get_model_path():
     return MODEL_ID
 
 
-def test_weight_sync(model_gpus: int, sampler_gpus: int, vllm_tp: int):
+@pytest.mark.skip(reason='Requires 4+ GPUs and 30B model, run manually: python tests/sampler/test_30b_weight_sync.py')
+def test_weight_sync(model_gpus: int = 2, sampler_gpus: int = 1, vllm_tp: int = 1):
     from peft import LoraConfig
 
     import twinkle
diff --git a/tests/sampler/test_megatron_weight_sync.py b/tests/sampler/test_megatron_weight_sync.py
index 9df0d845..c36e918e 100644
--- a/tests/sampler/test_megatron_weight_sync.py
+++ b/tests/sampler/test_megatron_weight_sync.py
@@ -33,6 +33,8 @@
 import sys
 import time
 
+import pytest
+
 # Must set before importing anything
 os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
 os.environ['VLLM_LOGGING_LEVEL'] = 'WARNING'
@@ -80,6 +82,14 @@ def get_model_path():
 # =============================================================================
 
 
+@pytest.mark.skipif(
+    not os.environ.get('CUDA_VISIBLE_DEVICES') or len(os.environ.get('CUDA_VISIBLE_DEVICES', '').split(',')) < 4,
+    reason='Requires 4+ GPUs',
+)
+@pytest.mark.skipif(
+    not __import__('importlib').util.find_spec('vllm'),
+    reason='vllm not installed',
+)
 def test_megatron_weight_sync(
     model_gpus: int = 2,
     sampler_gpus: int = 2,
diff --git a/tests/sampler/test_sampler_e2e.py b/tests/sampler/test_sampler_e2e.py
index fe0812ea..93b1c732 100644
--- a/tests/sampler/test_sampler_e2e.py
+++ b/tests/sampler/test_sampler_e2e.py
@@ -15,13 +15,15 @@
 Environment:
     TWINKLE_MODEL_ID: Model to use (default: Qwen/Qwen2.5-0.5B)
     TWINKLE_MAX_MODEL_LEN: Max model length (default: 512)
+    TWINKLE_SKIP_SLOW_TESTS: Set to 1 to skip slow tests (vllm/transformers engine) immediately
 """
 
 import argparse
 import os
 import sys
 import traceback
-import unittest
+
+import pytest
 
 # Set environment variables before imports
 os.environ.setdefault('TRUST_REMOTE_CODE', '1')
@@ -30,9 +32,27 @@
 MAX_MODEL_LEN = int(os.environ.get('TWINKLE_MAX_MODEL_LEN', '512'))
 
 
-@unittest.skip('Skip because vllm not installed.')
+def _skip_slow_if_requested():
+    """Skip immediately if slow tests are disabled (avoids long hangs)."""
+    if os.environ.get('TWINKLE_SKIP_SLOW_TESTS') == '1':
+        pytest.skip('TWINKLE_SKIP_SLOW_TESTS=1')
+
+
+def _skip_if_no_network(timeout: int = 5):
+    """Skip if HuggingFace is unreachable (avoids long hangs on model load)."""
+    try:
+        import urllib.request
+        urllib.request.urlopen('https://huggingface.co', timeout=timeout)
+    except Exception as e:
+        pytest.skip(f'HuggingFace unreachable (timeout={timeout}s): {e}')
+
+
+@pytest.mark.skipif(not __import__('torch').cuda.is_available(), reason='Requires CUDA')
+@pytest.mark.skipif(not __import__('importlib').util.find_spec('vllm'), reason='vllm not installed')
 def test_vllm_engine_with_input_ids():
     """Test VLLMEngine with raw input_ids (no Sampler layer)."""
+    _skip_slow_if_requested()
+    _skip_if_no_network()
     print('\n' + '=' * 60)
     print('Test: VLLMEngine with input_ids')
     print('=' * 60)
@@ -64,7 +84,12 @@ async def run_test():
 
     loop = asyncio.new_event_loop()
     try:
-        response, tokenizer = loop.run_until_complete(run_test())
+        try:
+            response, tokenizer = loop.run_until_complete(run_test())
+        except TypeError as e:
+            if "can't be used in 'await' expression" in str(e):
+                pytest.skip(f'vLLM get_tokenizer API incompatible: {e}')
+            raise
     finally:
         loop.close()
 
@@ -81,12 +106,13 @@ async def run_test():
     print(f'  Decoded text: {decoded}')
 
     print('\n[PASS] VLLMEngine with input_ids')
-    return True
 
 
-@unittest.skip('Skip because vllm not installed.')
+@pytest.mark.skipif(not __import__('torch').cuda.is_available(), reason='Requires CUDA')
 def test_transformers_engine_with_input_ids():
     """Test TransformersEngine with raw input_ids (no Sampler layer)."""
+    _skip_slow_if_requested()
+    _skip_if_no_network()
     print('\n' + '=' * 60)
     print('Test: TransformersEngine with input_ids')
     print('=' * 60)
@@ -98,16 +124,21 @@ def test_transformers_engine_with_input_ids():
 
     print(f'Loading model: {MODEL_ID}')
 
-    # Load model and tokenizer directly (bypass remote_class)
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID,
-        torch_dtype=torch.bfloat16,
-        device_map='auto',
-        trust_remote_code=True,
-    )
-    model.eval()
+    try:
+        # Load model and tokenizer directly (bypass remote_class)
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_ID,
+            torch_dtype=torch.bfloat16,
+            device_map='auto',
+            trust_remote_code=True,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+    except Exception as e:
+        if 'SSLError' in type(e).__name__ or 'MaxRetryError' in str(e) or 'certificate' in str(e).lower():
+            pytest.skip(f'Network/HuggingFace unreachable: {e}')
+        raise
 
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+    model.eval()
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
 
@@ -138,12 +169,14 @@ def test_transformers_engine_with_input_ids():
     print(f'  Decoded text: {decoded}')
 
     print('\n[PASS] TransformersEngine with input_ids')
-    return True
 
 
-@unittest.skip('Skip because vllm not installed.')
+@pytest.mark.skipif(not __import__('torch').cuda.is_available(), reason='Requires CUDA')
+@pytest.mark.skipif(not __import__('importlib').util.find_spec('vllm'), reason='vllm not installed')
 def test_vllm_engine_batch():
     """Test VLLMEngine batch sampling."""
+    _skip_slow_if_requested()
+    _skip_if_no_network()
     print('\n' + '=' * 60)
     print('Test: VLLMEngine batch sampling')
     print('=' * 60)
@@ -184,7 +217,12 @@ async def run_batch_test():
 
     loop = asyncio.new_event_loop()
     try:
-        responses, tokenizer = loop.run_until_complete(run_batch_test())
+        try:
+            responses, tokenizer = loop.run_until_complete(run_batch_test())
+        except TypeError as e:
+            if "can't be used in 'await' expression" in str(e):
+                pytest.skip(f'vLLM get_tokenizer API incompatible: {e}')
+            raise
     finally:
         loop.close()
 
@@ -198,10 +236,8 @@ async def run_batch_test():
         print(f'  Response {i}: {decoded[:50]}...')
 
     print('\n[PASS] VLLMEngine batch sampling')
-    return True
 
 
-@unittest.skip('Skip because vllm not installed.')
 def test_sampling_params_conversion():
     """Test SamplingParams conversion to vLLM and transformers formats."""
     print('\n' + '=' * 60)
@@ -240,7 +276,6 @@ def test_sampling_params_conversion():
         print('  to_vllm(): SKIPPED (vllm not installed)')
 
     print('\n[PASS] SamplingParams conversion')
-    return True
 
 
 TESTS = {
@@ -270,8 +305,8 @@ def main():
     results = {}
     for name, test_fn in tests_to_run:
         try:
-            success = test_fn()
-            results[name] = 'PASS' if success else 'FAIL'
+            test_fn()
+            results[name] = 'PASS'
         except Exception as e:
             print(f'\n[FAIL] {name}: {e}')
             traceback.print_exc()
diff --git a/tests/sampler/test_weight_sync.py b/tests/sampler/test_weight_sync.py
index 2e005ff0..1cc3d762 100644
--- a/tests/sampler/test_weight_sync.py
+++ b/tests/sampler/test_weight_sync.py
@@ -29,6 +29,8 @@
 import sys
 import time
 
+import pytest
+
 # Must set before importing anything
 os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
 os.environ['VLLM_LOGGING_LEVEL'] = 'WARNING'
@@ -77,6 +79,14 @@ def get_model_path():
 # =============================================================================
 
 
+@pytest.mark.skipif(
+    not os.environ.get('CUDA_VISIBLE_DEVICES') or len(os.environ.get('CUDA_VISIBLE_DEVICES', '').split(',')) < 2,
+    reason='Requires 2+ GPUs',
+)
+@pytest.mark.skipif(
+    not __import__('importlib').util.find_spec('vllm'),
+    reason='vllm not installed',
+)
 def test_standalone_weight_sync(model_gpus: int = 1, sampler_gpus: int = 1):
     """Test weight sync in STANDALONE mode (model and sampler on different GPUs).