Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,6 @@ supported on Twinkle✨ framework.
| | [deepseek-ai/DeepSeek-R1](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1) | - | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) |
| DeepSeek-R1-Distill | [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | 1.5B/7B/14B/32B | transformers>=4.37 | ✔ | [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) |

For more detailed model support list 👉 [Quick Start](docs/source_en/Usage%20Guide/Quick-Start.md)

## Sample Code

Below are some of the capabilities demonstrated in the example code. For a complete introduction to training capabilities,
Expand Down
2 changes: 0 additions & 2 deletions README_ZH.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,6 @@ Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Cl
| | [deepseek-ai/DeepSeek-R1](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1) | - | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) |
| DeepSeek-R1-Distill | [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | 1.5B/7B/14B/32B | transformers>=4.37 | ✔ | [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) |

更详细的模型支持列表 👉 [快速开始.md](docs/source_zh/使用指引/快速开始.md)

## 示例代码

下面列出了示例代码的一部分能力。完整的训练能力介绍请参考[快速开始](docs/source_zh/使用指引/快速开始.md)以及[cookbook](cookbook)。
Expand Down
97 changes: 97 additions & 0 deletions cookbook/mm/fsdp2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
from peft import LoraConfig
from tqdm import tqdm

import twinkle
from twinkle import DeviceMesh, get_device_placement, get_logger
from twinkle.data_format import Trajectory, Message
from twinkle.dataloader import DataLoader
from twinkle.dataset import LazyDataset, DatasetMeta
from twinkle.model import TransformersModel
from twinkle.preprocessor import Preprocessor

# Build the global device mesh: shard model parameters across 2 GPUs (FSDP).
device_mesh = DeviceMesh.from_sizes(fsdp_size=2)
# Initialize the framework in 'local' mode; the script itself is launched via
# torchrun with one process per GPU (see fsdp2.sh).
twinkle.initialize(mode='local', global_device_mesh=device_mesh)

logger = get_logger()


class LatexOCRProcessor(Preprocessor):
    """Convert one raw LaTeX-OCR dataset row into a standard two-turn Trajectory.

    The user turn carries the image and an OCR instruction; the assistant
    turn carries the ground-truth LaTeX string from the row.
    """

    def __call__(self, row) -> Trajectory:
        prompt = Message(
            role='user',
            content='<image>Using LaTeX to perform OCR on the image.',
            images=[row['image']],
        )
        answer = Message(role='assistant', content=row['text'])
        return Trajectory(messages=[prompt, answer])


def eval(model):
    """Run a forward-only evaluation pass over 100 held-out samples.

    NOTE(review): the name shadows the builtin ``eval``; kept as-is because
    ``train`` in this script calls it by this name.
    """
    eval_dataset = LazyDataset(
        dataset_meta=DatasetMeta('ms://AI-ModelScope/LaTeX_OCR', data_slice=range(100)))
    eval_dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
    eval_dataset.map(LatexOCRProcessor)
    eval_dataset.encode()
    loader = DataLoader(dataset=eval_dataset, batch_size=8)
    # Forward each batch without gradients, accumulating loss on the model,
    # then reduce everything into a single metrics dict.
    for _step, batch in tqdm(enumerate(loader)):
        model.forward_only(inputs=batch)
        model.calculate_loss()
    return model.calculate_metric(is_training=False)


def train():
    """Fine-tune Qwen3.5-4B on the LaTeX-OCR dataset with a LoRA adapter under FSDP2.

    Evaluates every 40 steps, saving a checkpoint whenever the eval loss
    improves, and always saves a final ``last-checkpoint`` at the end.
    """
    # 2000 samples
    dataset = LazyDataset(dataset_meta=DatasetMeta('ms://AI-ModelScope/LaTeX_OCR', data_slice=range(2000)))
    # Set template to prepare encoding
    dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=1024)
    # Preprocess the dataset to standard format
    dataset.map(LatexOCRProcessor)
    # Encode dataset
    dataset.encode()
    # Global batch size = 4 across the 2 GPUs, i.e. 2 samples per GPU
    dataloader = DataLoader(dataset=dataset, batch_size=4)
    # Use a TransformersModel with an explicit model class for the VL model
    from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5ForConditionalGeneration
    model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B', model_cls=Qwen3_5ForConditionalGeneration)
    # Keep whole decoder layers together when FSDP shards the model
    model.model._no_split_modules = {'Qwen3_5DecoderLayer'}

    lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear')

    # Add a lora to model, with name `default`
    # Comment this to use full-parameter training
    model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2)
    model.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
    # Add Optimizer for lora `default`
    model.set_optimizer(optimizer_cls='AdamW', lr=1e-4)
    # Add LRScheduler for lora `default`
    model.set_lr_scheduler(
        scheduler_cls='CosineWarmupScheduler', num_warmup_steps=5, num_training_steps=len(dataloader))
    logger.info(get_device_placement())
    # Print the training config
    logger.info(model.get_train_configs())
    logger.info(f'Total steps: {len(dataloader)}')
    # Best (lowest) eval loss seen so far; initialized above any realistic loss.
    loss_metric = 99.0
    for step, batch in enumerate(dataloader):
        # Do forward and backward
        model.forward_backward(inputs=batch)
        # Clip gradients and apply the optimizer/scheduler step
        model.clip_grad_and_step()
        if step % 20 == 0:
            # Print metric
            metric = model.calculate_metric(is_training=True)
            logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}')
        if step > 0 and step % 40 == 0:
            metrics = eval(model)
            logger.info(f'Eval metric: {metrics}')
            metrics['step'] = step
            # Save only when the eval loss improves on the best seen so far
            if loss_metric > float(metrics['loss']):
                model.save(f'checkpoint-{step}')
                loss_metric = float(metrics['loss'])
    model.save('last-checkpoint')


if __name__ == '__main__':
    # Entry point when launched via torchrun (see fsdp2.sh).
    train()
1 change: 1 addition & 0 deletions cookbook/mm/fsdp2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Launch the FSDP2 cookbook example on 2 GPUs, one torchrun worker per GPU.
CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 fsdp2.py
7 changes: 3 additions & 4 deletions cookbook/transformers/fsdp2.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
def eval(model):
# 100 Samples
dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100)))
dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B')
dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B')
dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
dataset.encode()
dataloader = DataLoader(dataset=dataset, batch_size=8)
Expand All @@ -35,16 +35,15 @@ def train():
# 1000 samples
dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000)))
# Set template to prepare encoding
dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B')
dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B')
# Preprocess the dataset to standard format
dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
# Encode dataset
dataset.encode()
# Global batch size = 8, for GPUs, so 1 sample per GPU
dataloader = DataLoader(dataset=dataset, batch_size=8)
# Use a TransformersModel
model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B')
model.model._no_split_modules = {'Qwen3_5DecoderLayer'}
model = TransformersModel(model_id='ms://Qwen/Qwen3-4B')

lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear')

Expand Down
2 changes: 1 addition & 1 deletion cookbook/transformers/sp_fsdp_dense.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from twinkle.preprocessor import SelfCognitionProcessor

logger = get_logger()
MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
MODEL_ID = 'ms://Qwen/Qwen3-4B'
DATASETS = 'ms://swift/self-cognition'

device_group = [DeviceGroup(
Expand Down
5 changes: 3 additions & 2 deletions src/twinkle/hub/hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,8 @@ def load_dataset(cls,
subset_name: str,
split: str,
streaming: bool = False,
revision: Optional[str] = None):
revision: Optional[str] = None,
**kwargs):
"""Load a dataset from the repo

Args:
Expand All @@ -179,7 +180,7 @@ def load_dataset(cls,
The Dataset instance
"""
hub = cls._get_hub_class(dataset_id)
return hub.load_dataset(cls.remove_source_type(dataset_id), subset_name, split, streaming, revision)
return hub.load_dataset(cls.remove_source_type(dataset_id), subset_name, split, streaming, revision, **kwargs)

@classmethod
def download_model(cls,
Expand Down
4 changes: 3 additions & 1 deletion src/twinkle/model/multi_lora.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,9 @@ def _patch_peft(_module):
if isinstance(_module, PeftModel):
_module.add_adapter(lora_tenant.adapter_name, config)
else:
_module = get_peft_model(_module, config, lora_tenant.adapter_name)
_peft_model: PeftModel = get_peft_model(_module, config, lora_tenant.adapter_name)
_module.active_adapters = _peft_model.active_adapters
_module = _peft_model

for name, submodule in _module.named_modules():
if isinstance(submodule, LoraLayer):
Expand Down
20 changes: 20 additions & 0 deletions src/twinkle/model/transformers/multi_lora_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,31 @@ def __init__(
self.multi_adapter.save_initial_weights()
# Active group for compatibility with single adapter
self.active_group = None
self.handler = self.register_global_mm_forward_hook()

def _check_adapter_valid(self, adapter_name: str):
    # Guard: the adapter must be non-empty and already registered (i.e. it has
    # an entry in self.optimizer_group).
    # NOTE(review): uses `assert`, which is stripped under `python -O`; this
    # check only fires in non-optimized runs.
    assert adapter_name and adapter_name in self.optimizer_group, (f'Use a valid adapter_name first, '
                                                                   f'current is: {adapter_name}')

def register_global_mm_forward_hook(self):
    """Install one forward pre-hook that routes multimodal inputs through the
    template of whichever LoRA adapter is currently active.

    Returns the hook handle so the caller can remove it later.
    """

    def _pre_hook(module, args, kwargs):
        # Translate the peft-level adapter name back to the tenant adapter
        # name, then delegate to that tenant's template.
        peft_name = module.active_adapters[0]
        tenant_name = self.multi_adapter.find_lora(peft_name).tenant_adapter_name
        group = self.optimizer_group[tenant_name]
        tmpl = group.template
        assert tmpl is not None
        return tmpl.pre_forward_hook(module, args, kwargs)

    base_model = self.strategy.unwrap_model(self.model)
    return base_model.register_forward_pre_hook(_pre_hook, with_kwargs=True)

def register_mm_forward_hook(self, optimizer_group: OptimizerGroup):
    # Intentionally a no-op: this class installs a single global forward hook
    # in __init__ (register_global_mm_forward_hook) instead of one hook per
    # optimizer group.
    pass

def unregister_mm_forward_hook(self, optimizer_group: OptimizerGroup):
    # Intentionally a no-op: per-group hooks are never registered here, so
    # there is nothing to remove (the global hook handle lives in self.handler).
    pass

def _lazy_wrap_model(self):
    # No-op override — presumably the multi-LoRA model is wrapped eagerly
    # elsewhere rather than lazily. TODO(review): confirm against the base
    # class's lazy-wrap behavior.
    pass

Expand Down
13 changes: 13 additions & 0 deletions src/twinkle/model/transformers/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ class OptimizerGroup:
checkpoint_engine: CheckpointEngine = None
_dp_group = None
_device_mesh: DeviceMesh = None
_handler: Any = None

def do_grad_sync(self, gradient_accumulation_steps: Optional[int] = None) -> bool:
if gradient_accumulation_steps is None:
Expand Down Expand Up @@ -284,11 +285,23 @@ def _lazy_wrap_model(self):
assert optimizer is not None
self.model, optimizer = self.strategy.wrap_model(self.model, optimizer)
optimizer_group.optimizer = optimizer
self.register_mm_forward_hook(optimizer_group)
else:
# maybe forward_only, no optimizer_group available
self.model = self.strategy.wrap_model(self.model)
self._model_wrapped = True

def register_mm_forward_hook(self, optimizer_group: OptimizerGroup):
    """Attach the group's template as a forward pre-hook on the unwrapped
    model and stash the handle on the group so it can be removed later."""
    template = optimizer_group.template
    assert template is not None
    unwrapped = self.strategy.unwrap_model(self.model)
    handle = unwrapped.register_forward_pre_hook(template.pre_forward_hook, with_kwargs=True)
    optimizer_group._handler = handle

def unregister_mm_forward_hook(self, optimizer_group: OptimizerGroup):
    """Remove the group's forward pre-hook, if one was registered."""
    handle = optimizer_group._handler
    if handle is None:
        return
    handle.remove()
    optimizer_group._handler = None

@staticmethod
def _should_enable_expert_parallel(expert_parallel_config: Optional[Dict[str, Any]],
device_mesh: Optional[DeviceMesh]) -> bool:
Expand Down
15 changes: 13 additions & 2 deletions src/twinkle/processor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,10 @@ def to_tensor(_input):
value = torch.from_numpy(value)
elif isinstance(value, list) and isinstance(value[0], (int, float, np.number)):
value = torch.tensor(value)
elif key in self.VLM_CONCAT_FIELDS:
if not isinstance(value[0], torch.Tensor):
value = [torch.tensor(v) for v in value]
value = torch.cat(value, dim=0)
if isinstance(value, torch.Tensor):
value = value.to(Platform.get_local_device())
if value.dim() == 1:
Expand Down Expand Up @@ -260,7 +264,8 @@ def _create_4d_attention_mask(attention_mask):

@staticmethod
def _get_packed_seq_params(position_ids):
assert position_ids.shape[0] == 1
if position_ids.shape[0] > 1:
position_ids = position_ids[:1]
position_ids_f = position_ids.flatten()
indices_q = torch.arange(position_ids_f.shape[0], device=position_ids_f.device, dtype=torch.int32)

Expand Down Expand Up @@ -305,7 +310,10 @@ def to_transformers_dict(inputs: List[InputFeature], **kwargs) -> List[InputFeat
results = []
for _input in inputs:
output = {}
_keys = ['input_ids', 'input_embeddings', 'attention_mask', 'position_ids', 'labels', 'completion_mask']
_keys = [
'input_ids', 'input_embeddings', 'attention_mask', 'position_ids', 'labels', 'completion_mask',
'pixel_values', 'image_grid_thw'
]
for key in list(_input.keys()):
if key in _keys:
output[key] = np.array(_input[key]) if not isinstance(_input[key], torch.Tensor) else _input[key]
Expand Down Expand Up @@ -361,6 +369,9 @@ def _collate_macro_batch(self, inputs: List[InputFeature]) -> InputFeature:

for field, values in vlm_fields.items():
if values:
if values[0].dim() == 1:
# image_thw may be squeezed
values = [value.unsqueeze(0) for value in values]
result[field] = torch.cat(values, dim=0)

return result
Expand Down
2 changes: 1 addition & 1 deletion src/twinkle/template/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
from .base import Template
from .qwen3_vl import Qwen3VLTemplate
from .qwen3_5_vl import Qwen3_5Template
Loading
Loading