From 8c20837ea7e7a2bef0dca12dbdf4f32c12a63a3a Mon Sep 17 00:00:00 2001
From: tastelikefeet
Date: Fri, 27 Feb 2026 16:30:17 +0800
Subject: [PATCH 1/3] fix single gpu bug

---
 src/twinkle/processor/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/twinkle/processor/base.py b/src/twinkle/processor/base.py
index fe9733f1..1bc5ac0c 100644
--- a/src/twinkle/processor/base.py
+++ b/src/twinkle/processor/base.py
@@ -109,8 +109,8 @@ def _pad_cp(_input: InputFeature) -> InputFeature:
             # Pad sequence for parallel compatibility
             # 1. For CP > 1: Megatron's RoPE requires seq_len % (2 * cp_size) == 0
             # 2. For sequence_parallel with TP > 1: seq_len must be divisible by TP size
-            cp_size = self.device_mesh.cp_world_size
-            tp_size = self.device_mesh.tp_world_size
+            cp_size = self.device_mesh.cp_world_size if self.device_mesh is not None else 1
+            tp_size = self.device_mesh.tp_world_size if self.device_mesh is not None else 1
             position_ids = _input.get('position_ids')
 
             def pad_cp_inputs(input_tensor: torch.Tensor, padding_value: int) -> torch.Tensor:

From 084050140290c4a6d3ec11e30970fa2b912a496d6 Mon Sep 17 00:00:00 2001
From: tastelikefeet
Date: Fri, 27 Feb 2026 16:33:58 +0800
Subject: [PATCH 2/3] fix

---
 src/twinkle/processor/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/twinkle/processor/base.py b/src/twinkle/processor/base.py
index 1bc5ac0c..2dad49fb 100644
--- a/src/twinkle/processor/base.py
+++ b/src/twinkle/processor/base.py
@@ -122,7 +122,7 @@ def pad_cp_inputs(input_tensor: torch.Tensor, padding_value: int) -> torch.Tenso
                 # Calculate required divisor based on parallelism settings
                 if cp_size > 1:
                     divisor = 2 * cp_size
-                elif self.device_mesh.sequence_parallel and tp_size > 1:
+                elif self.device_mesh is not None and self.device_mesh.sequence_parallel and tp_size > 1:
                     divisor = tp_size
                 else:
                     divisor = 1
@@ -150,7 +150,7 @@ def pad_cp_inputs(input_tensor: torch.Tensor, padding_value: int) -> torch.Tenso
                         result.append(pad_cp_inputs(_value_slice, padding_value=self.padding_map[key]))
                     value = torch.cat(result, dim=1)
                     _input[key] = value
-            elif self.device_mesh.sequence_parallel and tp_size > 1:
+            elif self.device_mesh is not None and self.device_mesh.sequence_parallel and tp_size > 1:
                 # Sequence parallel without CP still requires seq_len % TP == 0
                 for key in ['input_ids', 'position_ids', 'attention_mask', 'labels']:
                     value = _input.get(key)

From a48861c1452c9a81cfd208917208e8e38688b6ef Mon Sep 17 00:00:00 2001
From: tastelikefeet
Date: Fri, 27 Feb 2026 16:37:40 +0800
Subject: [PATCH 3/3] fix

---
 src/twinkle/processor/base.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/twinkle/processor/base.py b/src/twinkle/processor/base.py
index 2dad49fb..ff0fbabf 100644
--- a/src/twinkle/processor/base.py
+++ b/src/twinkle/processor/base.py
@@ -105,12 +105,15 @@ def to_tensor(_input):
 
     def pad_cp(self, inputs: List[InputFeature], **kwargs) -> List[InputFeature]:
+        if self.device_mesh is None:
+            return inputs
+
         def _pad_cp(_input: InputFeature) -> InputFeature:
             # Pad sequence for parallel compatibility
             # 1. For CP > 1: Megatron's RoPE requires seq_len % (2 * cp_size) == 0
             # 2. For sequence_parallel with TP > 1: seq_len must be divisible by TP size
-            cp_size = self.device_mesh.cp_world_size if self.device_mesh is not None else 1
-            tp_size = self.device_mesh.tp_world_size if self.device_mesh is not None else 1
+            cp_size = self.device_mesh.cp_world_size
+            tp_size = self.device_mesh.tp_world_size
             position_ids = _input.get('position_ids')
 
             def pad_cp_inputs(input_tensor: torch.Tensor, padding_value: int) -> torch.Tensor:
@@ -122,7 +125,7 @@ def pad_cp_inputs(input_tensor: torch.Tensor, padding_value: int) -> torch.Tenso
                 # Calculate required divisor based on parallelism settings
                 if cp_size > 1:
                     divisor = 2 * cp_size
-                elif self.device_mesh is not None and self.device_mesh.sequence_parallel and tp_size > 1:
+                elif self.device_mesh.sequence_parallel and tp_size > 1:
                     divisor = tp_size
                 else:
                     divisor = 1
@@ -150,7 +153,7 @@ def pad_cp_inputs(input_tensor: torch.Tensor, padding_value: int) -> torch.Tenso
                         result.append(pad_cp_inputs(_value_slice, padding_value=self.padding_map[key]))
                     value = torch.cat(result, dim=1)
                     _input[key] = value
-            elif self.device_mesh is not None and self.device_mesh.sequence_parallel and tp_size > 1:
+            elif self.device_mesh.sequence_parallel and tp_size > 1:
                 # Sequence parallel without CP still requires seq_len % TP == 0
                 for key in ['input_ids', 'position_ids', 'attention_mask', 'labels']:
                     value = _input.get(key)
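Note for reviewers (not part of the patches): after PATCH 3, pad_cp() simply
returns its inputs when self.device_mesh is None (the single-GPU case), so the
per-line is-not-None guards introduced in PATCH 1 and PATCH 2 become redundant
and are reverted. Below is a minimal standalone sketch of the resulting
padding rule. The DeviceMesh dataclass and pad_seq helper are hypothetical
stand-ins that only mirror the attribute names used in
src/twinkle/processor/base.py (cp_world_size, tp_world_size,
sequence_parallel); they are not twinkle's real API.

from dataclasses import dataclass
from typing import Optional

import torch
import torch.nn.functional as F


@dataclass
class DeviceMesh:  # hypothetical stand-in, not twinkle's device mesh
    cp_world_size: int = 1
    tp_world_size: int = 1
    sequence_parallel: bool = False


def pad_seq(input_ids: torch.Tensor, mesh: Optional[DeviceMesh],
            pad_id: int = 0) -> torch.Tensor:
    # Single-GPU path: no device mesh at all, so return the batch untouched
    # (this is the early return PATCH 3 adds to pad_cp).
    if mesh is None:
        return input_ids
    if mesh.cp_world_size > 1:
        # Megatron's RoPE with context parallel needs seq_len % (2 * cp) == 0.
        divisor = 2 * mesh.cp_world_size
    elif mesh.sequence_parallel and mesh.tp_world_size > 1:
        # Sequence parallel splits the sequence dim across TP ranks.
        divisor = mesh.tp_world_size
    else:
        divisor = 1
    pad_len = (-input_ids.shape[1]) % divisor  # distance to the next multiple
    return F.pad(input_ids, (0, pad_len), value=pad_id) if pad_len else input_ids


# (1, 10) with cp=2 pads to the next multiple of 4 -> (1, 12);
# with no mesh (single GPU) it stays (1, 10).
assert pad_seq(torch.ones(1, 10, dtype=torch.long), DeviceMesh(cp_world_size=2)).shape == (1, 12)
assert pad_seq(torch.ones(1, 10, dtype=torch.long), None).shape == (1, 10)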