diff --git a/README.md b/README.md index 92cf6ec6..92e807c2 100644 --- a/README.md +++ b/README.md @@ -138,6 +138,9 @@ For more detailed model support list 👉 [Quick Start](docs/source_en/Usage%20 ## Sample Code +Below are some of the capabilities demonstrated in the example code. For a complete introduction to training capabilities, +please refer to [Quick Start](docs/source_en/Usage%20Guide/Quick-Start.md) and [cookbook](cookbook). + ### Train with Ray ```python @@ -157,7 +160,7 @@ twinkle.initialize(mode='ray', groups=device_group, global_device_mesh=device_me def train(): # to load model from Hugging Face, use 'hf://...' - base_model = 'ms://Qwen/Qwen2.5-7B-Instruct' + base_model = 'ms://Qwen/Qwen3-4B' # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding @@ -214,13 +217,13 @@ from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.tinker.common import input_feature_to_datum base_model = 'ms://Qwen/Qwen3-30B-A3B-Instruct-2507' -base_url='http://www.modelscope.cn/twinkle' -api_key=os.environ.get('MODELSCOPE_TOKEN') +base_url='your-base-url' +api_key='your-api-key' # Use twinkle dataset to load the data dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) dataset.set_template('Template', model_id=base_model, max_length=256) -dataset.map(SelfCognitionProcessor('twinkle Model', 'twinkle Team'), load_from_cache_file=False) +dataset.map(SelfCognitionProcessor('twinkle Model', 'ModelScope Team'), load_from_cache_file=False) dataset.encode(batched=True, load_from_cache_file=False) dataloader = DataLoader(dataset=dataset, batch_size=8) diff --git a/README_ZH.md b/README_ZH.md index 8462f148..e2b60dcc 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -67,9 +67,11 @@ pip install -e . | twinkle 客户端微调 | megatron | [脚本](cookbook/client/twinkle/megatron) | | twinkle 客户端微调 | transformer | [脚本](cookbook/client/twinkle/transformer) | +Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Client等各场景下。其算法过程是外露的,非常便于修改和调试。完整的框架介绍请查看[快速开始](docs/source_zh/使用指引/快速开始.md) + ## 更新日志 -- 🎉2026-02-13 Twinkle✨ 初始版本发布,包括对文本模型的 SFT/PT/RL 支持以及在 [ModelScope](https://modelscope.cn) 上的无服务器训练能力。 +🎉2026-02-13 Twinkle✨ 初始版本发布,支持文本模型的SFT/PT/RL训练。我们还通过兼容Tinker的API,在魔搭社区上提供了无服务器训练功能。 ## ModelScope 的训练服务 @@ -88,8 +90,8 @@ pip install -e . 随着新模型的发布,我们将添加对更多模型的支持。下表列出了 Twinkle✨ 框架当前支持的模型。 ->[!注意] -> 对于通过 `base_url=https://www.modelscope.cn/twinkle` 访问的无服务器训练服务,目前一次只支持一个训练基座,当前是 [Qwen3-30B-A3B-Instruct-2507](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Instruct-2507)。 +>[!Note] +> 通过 `base_url=https://www.modelscope.cn/twinkle` 访问的无服务器训练服务,目前是通过兼容Tinker的API提供的。我们将陆续推出同时支持Tinker API和完整Twinkle✨原生 API的服务。无服务器端点每次由一个训练基座支持,目前使用的是[Qwen3-30B-A3B-Instruct-2507](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Instruct-2507)。 | 模型类型 | [ModelScope](https://modelscope.cn) 上的模型 ID | 要求 | Megatron 支持 | HF 模型 ID | @@ -117,6 +119,8 @@ pip install -e . ## 示例代码 +下面列出了示例代码的一部分能力。完整的训练能力介绍请参考[快速开始](docs/source_zh/使用指引/快速开始.md)以及[cookbook](cookbook)。 + ### 使用 Ray 训练 ```python @@ -136,7 +140,7 @@ twinkle.initialize(mode='ray', groups=device_group, global_device_mesh=device_me def train(): # to load model from Hugging Face, use 'hf://...' 
- base_model = 'ms://Qwen/Qwen2.5-7B-Instruct' + base_model = 'ms://Qwen/Qwen3-4B' # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding @@ -180,7 +184,7 @@ if __name__ == '__main__': train() ``` -### 使用类 Tinker API +### 使用类 Tinker API实现无服务器式训练 ```python import os @@ -193,13 +197,13 @@ from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.tinker.common import input_feature_to_datum base_model = 'ms://Qwen/Qwen3-30B-A3B-Instruct-2507' -base_url='http://www.modelscope.cn/twinkle' -api_key=os.environ.get('MODELSCOPE_TOKEN') +base_url='your-base-url' +api_key='your-api-key' # Use twinkle dataset to load the data dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) dataset.set_template('Template', model_id=base_model, max_length=256) -dataset.map(SelfCognitionProcessor('twinkle Model', 'twinkle Team'), load_from_cache_file=False) +dataset.map(SelfCognitionProcessor('twinkle Model', 'ModelScope Team'), load_from_cache_file=False) dataset.encode(batched=True, load_from_cache_file=False) dataloader = DataLoader(dataset=dataset, batch_size=8) diff --git a/assets/framework.jpg b/assets/framework.jpg index 38e5110a..1e7aee30 100644 Binary files a/assets/framework.jpg and b/assets/framework.jpg differ diff --git a/cookbook/megatron/qwen3_5.py b/cookbook/megatron/qwen3_5.py deleted file mode 100644 index 8807c066..00000000 --- a/cookbook/megatron/qwen3_5.py +++ /dev/null @@ -1,46 +0,0 @@ -from peft import LoraConfig - -import twinkle -from twinkle import DeviceMesh, get_device_placement, get_logger -from twinkle.dataloader import DataLoader -from twinkle.dataset import Dataset, DatasetMeta -from twinkle.model import MegatronModel -from twinkle.preprocessor import SelfCognitionProcessor - -device_mesh = DeviceMesh.from_sizes(dp_size=4, tp_size=1, pp_size=1, ep_size=4) -twinkle.initialize(mode='local', global_device_mesh=device_mesh) - -logger = get_logger() - -MODEL_ID = 'Qwen/Qwen3.5-35B-A3B' - -def train(): - dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) - dataset.set_template('Template', model_id=MODEL_ID) - dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) - dataset.encode() - dataloader = DataLoader(dataset=dataset, batch_size=4) - - model = MegatronModel(model_id=MODEL_ID) - lora_config = LoraConfig(r=8, lora_alpha=16, target_modules='all-linear') - model.add_adapter_to_model('default', lora_config) - model.set_optimizer(optimizer_cls='default', lr=1e-4) - model.set_lr_scheduler(scheduler_cls='default', lr_warmup_steps=2, lr_decay_steps=len(dataloader)) - logger.info(get_device_placement()) - logger.info(model.get_train_configs()) - logger.info(f'Total steps: {len(dataloader)}') - - for step, batch in enumerate(dataloader): - model.forward_backward(inputs=batch) - model.clip_grad_and_step() - if step % 5 == 0: - metric = model.calculate_metric(is_training=True) - logger.info(f'Step {step}/{len(dataloader)}, metric: {metric}') - - # NOTE: you should merge lora for Qwen3.5 model when using Megatron - model.save('last-checkpoint', merge_lora=True) - logger.info('Training completed.') - - -if __name__ == '__main__': - train() diff --git a/cookbook/megatron/tp.py b/cookbook/megatron/tp.py index 662bd50f..f985d740 100644 --- a/cookbook/megatron/tp.py +++ b/cookbook/megatron/tp.py @@ -8,6 +8,8 @@ from twinkle.dataset import Dataset, DatasetMeta from twinkle.model import MegatronModel 
from twinkle.preprocessor import SelfCognitionProcessor +from twinkle.server.tinker.common import input_feature_to_datum +from twinkle.server.tinker.common.compat_base import TwinkleCompatModelBase # Construct a device_mesh, tp=pp=cp=2, dp=1 device_mesh = DeviceMesh.from_sizes(dp_size=1, tp_size=2, pp_size=2, cp_size=2) @@ -20,7 +22,7 @@ def eval(model): # 100 Samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) - dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-7B-Instruct') + dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B') dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) dataset.encode() dataloader = DataLoader(dataset=dataset, batch_size=16) @@ -34,7 +36,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-7B-Instruct') + dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset @@ -42,7 +44,7 @@ def train(): # Global batch size = 1, dp_size = 1 dataloader = DataLoader(dataset=dataset, batch_size=16) # Use a MegatronModel - model = MegatronModel(model_id='ms://Qwen/Qwen2.5-7B-Instruct') + model = MegatronModel(model_id='ms://Qwen/Qwen3-4B') lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') @@ -63,6 +65,8 @@ def train(): for step, batch in enumerate(dataloader): # Do forward and backward model.forward_backward(inputs=batch) + _inputs = [input_feature_to_datum(b) for b in batch] + _temp = TwinkleCompatModelBase._get_forward_output(_inputs, model.optimizer_group['default'].outputs['logits']) # Step model.clip_grad_and_step() if step % 5 == 0: diff --git a/cookbook/megatron/tp_moe.py b/cookbook/megatron/tp_moe.py index 7de83962..364ac686 100644 --- a/cookbook/megatron/tp_moe.py +++ b/cookbook/megatron/tp_moe.py @@ -20,7 +20,7 @@ def eval(model): # 100 Samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3-30B-A3B-Instruct-2507') + dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-35B-A3B') dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) dataset.encode() dataloader = DataLoader(dataset=dataset, batch_size=16) @@ -34,7 +34,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3-30B-A3B-Instruct-2507') + dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-35B-A3B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset @@ -42,7 +42,7 @@ def train(): # Global batch size = 1, dp_size = 1 dataloader = DataLoader(dataset=dataset, batch_size=16) # Use a MegatronModel - model = MegatronModel(model_id='ms://Qwen/Qwen3-30B-A3B-Instruct-2507') + model = MegatronModel(model_id='ms://Qwen/Qwen3.5-35B-A3B') lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') @@ -75,7 +75,7 @@ def train(): if loss_metric > float(metrics['loss']): model.save(f'checkpoint-{step}') loss_metric = float(metrics['loss']) - model.save(f'last-checkpoint') + model.save('last-checkpoint', merge_lora=True) if 
__name__ == '__main__': diff --git a/cookbook/ray/single_controller.py b/cookbook/ray/single_controller.py index d0a0e730..cd1482a6 100644 --- a/cookbook/ray/single_controller.py +++ b/cookbook/ray/single_controller.py @@ -26,7 +26,7 @@ def eval(model): # 100 Samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) - dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-7B-Instruct') + dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-35B-A3B') dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) dataset.encode() dataloader = DataLoader(dataset=dataset, batch_size=8, min_batch_size=8) @@ -41,7 +41,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-7B-Instruct') + dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset @@ -49,7 +49,7 @@ def train(): # Global batch size = 8, for GPUs, so 1 sample per GPU dataloader = DataLoader(dataset=dataset, batch_size=8, min_batch_size=8) # Use a TransformersModel - model = TransformersModel(model_id='ms://Qwen/Qwen2.5-7B-Instruct', remote_group='default') + model = TransformersModel(model_id='ms://Qwen/Qwen3-4B', remote_group='default') lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') diff --git a/cookbook/rl/grpo.py b/cookbook/rl/grpo.py index 4b217725..29059fae 100644 --- a/cookbook/rl/grpo.py +++ b/cookbook/rl/grpo.py @@ -20,7 +20,7 @@ logger = get_logger() -MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen2.5-3B-Instruct') +MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3-4B') USE_MEGATRON = bool(int(os.environ.get('USE_MEGATRON', '1'))) MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4)) diff --git a/cookbook/transformers/fsdp2.py b/cookbook/transformers/fsdp2.py index 586000fc..ca37d724 100644 --- a/cookbook/transformers/fsdp2.py +++ b/cookbook/transformers/fsdp2.py @@ -20,7 +20,7 @@ def eval(model): # 100 Samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) - dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-7B-Instruct') + dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B') dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) dataset.encode() dataloader = DataLoader(dataset=dataset, batch_size=8) @@ -35,7 +35,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-7B-Instruct') + dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset @@ -43,7 +43,7 @@ def train(): # Global batch size = 8, for GPUs, so 1 sample per GPU dataloader = DataLoader(dataset=dataset, batch_size=8) # Use a TransformersModel - model = TransformersModel(model_id='ms://Qwen/Qwen2.5-7B-Instruct') + model = TransformersModel(model_id='ms://Qwen/Qwen3-4B') lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') diff --git a/cookbook/transformers/sp_fsdp_dense.py b/cookbook/transformers/sp_fsdp_dense.py index 7a563a2c..da6e2d28 100644 --- 
a/cookbook/transformers/sp_fsdp_dense.py +++ b/cookbook/transformers/sp_fsdp_dense.py @@ -10,7 +10,7 @@ from twinkle.preprocessor import SelfCognitionProcessor logger = get_logger() -MODEL_ID = 'ms://Qwen/Qwen2.5-7B-Instruct' +MODEL_ID = 'ms://Qwen/Qwen3-4B' DATASETS = 'ms://swift/self-cognition' device_group = [DeviceGroup( diff --git a/docs/source_en/Components/Advantage/GRPOAdvantage.md b/docs/source_en/Components/Advantage/GRPOAdvantage.md index 381b7605..ba90b4c0 100644 --- a/docs/source_en/Components/Advantage/GRPOAdvantage.md +++ b/docs/source_en/Components/Advantage/GRPOAdvantage.md @@ -41,8 +41,8 @@ from twinkle.sampler import vLLMSampler from twinkle.reward import MathReward # Create components -actor = TransformersModel(model_id='Qwen/Qwen2.5-7B-Instruct') -sampler = vLLMSampler(model_id='Qwen/Qwen2.5-7B-Instruct') +actor = TransformersModel(model_id='ms://Qwen/Qwen3-4B') +sampler = vLLMSampler(model_id='ms://Qwen/Qwen3-4B') reward_fn = MathReward() advantage_fn = GRPOAdvantage() diff --git a/docs/source_en/Components/Advantage/RLOOAdvantage.md b/docs/source_en/Components/Advantage/RLOOAdvantage.md index 19308d35..5479a4ce 100644 --- a/docs/source_en/Components/Advantage/RLOOAdvantage.md +++ b/docs/source_en/Components/Advantage/RLOOAdvantage.md @@ -29,7 +29,7 @@ RLOO advantages: - More accurate counterfactual baseline estimation - Better performance when there are more samples -## Complete Training Example +## Training Example ```python from twinkle.advantage import RLOOAdvantage @@ -38,10 +38,11 @@ from twinkle.sampler import vLLMSampler from twinkle.reward import MathReward # Create components -actor = TransformersModel(model_id='Qwen/Qwen2.5-7B-Instruct') -sampler = vLLMSampler(model_id='Qwen/Qwen2.5-7B-Instruct') +actor = TransformersModel(model_id='ms://Qwen/Qwen3-4B') +sampler = vLLMSampler(model_id='ms://Qwen/Qwen3-4B') reward_fn = MathReward() advantage_fn = RLOOAdvantage() +dataloader = ... # Training loop for batch in dataloader: diff --git a/docs/source_en/Components/Data Format/Sampling.md b/docs/source_en/Components/Data Format/Sampling.md index ea7db13a..cd21454a 100644 --- a/docs/source_en/Components/Data Format/Sampling.md +++ b/docs/source_en/Components/Data Format/Sampling.md @@ -62,7 +62,7 @@ Usage example: from twinkle.data_format import SamplingParams, SampleResponse from twinkle.sampler import vLLMSampler -sampler = vLLMSampler(model_id='Qwen/Qwen2.5-7B-Instruct') +sampler = vLLMSampler(model_id='ms://Qwen/Qwen3-4B') params = SamplingParams(max_tokens=512, temperature=0.7, top_p=0.9) response: SampleResponse = sampler.sample(trajectories, sampling_params=params, num_samples=4) diff --git a/docs/source_en/Components/Dataset/Dataset.md b/docs/source_en/Components/Dataset/Dataset.md index a68bd66e..fc75e6fc 100644 --- a/docs/source_en/Components/Dataset/Dataset.md +++ b/docs/source_en/Components/Dataset/Dataset.md @@ -60,7 +60,7 @@ dataset = Dataset(DatasetMeta(dataset_id='my/custom/dataset.jsonl', data_slice=r The Template component is responsible for converting string/image multimodal raw data into model input tokens. The dataset can set a Template to complete the `encode` process. ```python -dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-7B-Instruct', max_length=512) +dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B', max_length=512) ``` The set_template method supports passing `kwargs` (such as `max_length` in the example) to be used as constructor parameters for `Template`. 
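+
+As a minimal end-to-end sketch of this behavior (reusing the `swift/self-cognition` dataset from the other examples in these docs; the `max_length` value here is illustrative):
+
+```python
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.preprocessor import SelfCognitionProcessor
+
+dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100)))
+# kwargs beyond the template name and model_id (here `max_length`) are not
+# consumed by set_template itself; they are forwarded to the Template constructor.
+dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B', max_length=512)
+dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community'))
+dataset.encode()  # tokenization now runs with the configured Template
+dataloader = DataLoader(dataset=dataset, batch_size=8)
+```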
diff --git a/docs/source_en/Components/Model/MegatronModel.md b/docs/source_en/Components/Model/MegatronModel.md index 35030997..d6b26c77 100644 --- a/docs/source_en/Components/Model/MegatronModel.md +++ b/docs/source_en/Components/Model/MegatronModel.md @@ -35,7 +35,7 @@ from twinkle.model import MegatronModel from twinkle import DeviceMesh from twinkle.dataloader import DataLoader dataloader = DataLoader(...) -model = MegatronModel(model_id='ms://Qwen/Qwen2.5-7B-Instruct', device_mesh=DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2), remote_group='actor') +model = MegatronModel(model_id='ms://Qwen/Qwen3-4B', device_mesh=DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2), remote_group='actor') model.add_adapter_to_model(...) model.set_optimizer('default', adapter_name='...') for data in dataloader: diff --git a/docs/source_en/Components/Model/TransformersModel.md b/docs/source_en/Components/Model/TransformersModel.md index df3616bd..ff9eac7e 100644 --- a/docs/source_en/Components/Model/TransformersModel.md +++ b/docs/source_en/Components/Model/TransformersModel.md @@ -41,7 +41,7 @@ from twinkle.model import TransformersModel from twinkle import DeviceMesh from twinkle.dataloader import DataLoader dataloader = DataLoader(...) -model = TransformersModel(model_id='ms://Qwen/Qwen2.5-7B-Instruct', device_mesh=DeviceMesh.from_sizes(dp_size=2, fsdp_size=2), remote_group='actor') +model = TransformersModel(model_id='ms://Qwen/Qwen3-4B', device_mesh=DeviceMesh.from_sizes(dp_size=2, fsdp_size=2), remote_group='actor') model.add_adapter_to_model(...) model.set_optimizer(..., adapter_name='...') for data in dataloader: diff --git a/docs/source_en/Components/Reward/Reward.md b/docs/source_en/Components/Reward/Reward.md index add11213..0f9903b7 100644 --- a/docs/source_en/Components/Reward/Reward.md +++ b/docs/source_en/Components/Reward/Reward.md @@ -87,7 +87,7 @@ from twinkle.sampler import vLLMSampler from twinkle.reward import MathReward from twinkle.advantage import GRPOAdvantage -sampler = vLLMSampler(model_id='Qwen/Qwen2.5-7B-Instruct') +sampler = vLLMSampler(model_id='ms://Qwen/Qwen3-4B') reward_fn = MathReward() advantage_fn = GRPOAdvantage() diff --git a/docs/source_en/Components/Sampler/TorchSampler.md b/docs/source_en/Components/Sampler/TorchSampler.md index c302993e..93d1469c 100644 --- a/docs/source_en/Components/Sampler/TorchSampler.md +++ b/docs/source_en/Components/Sampler/TorchSampler.md @@ -9,7 +9,7 @@ from twinkle.sampler import TorchSampler from twinkle import DeviceMesh sampler = TorchSampler( - model_id='ms://Qwen/Qwen2.5-7B-Instruct', + model_id='ms://Qwen/Qwen3-4B', device_mesh=DeviceMesh.from_sizes(dp_size=1), ) diff --git a/docs/source_en/Components/Sampler/vLLMSampler.md b/docs/source_en/Components/Sampler/vLLMSampler.md index 83465207..53c034e7 100644 --- a/docs/source_en/Components/Sampler/vLLMSampler.md +++ b/docs/source_en/Components/Sampler/vLLMSampler.md @@ -11,7 +11,7 @@ from twinkle import DeviceMesh # Create sampler sampler = vLLMSampler( - model_id='ms://Qwen/Qwen2.5-7B-Instruct', + model_id='ms://Qwen/Qwen3-4B', device_mesh=DeviceMesh.from_sizes(dp_size=2, tp_size=2), remote_group='sampler_group' ) @@ -60,7 +60,7 @@ twinkle.initialize('ray', groups=device_groups) # Create remote sampler sampler = vLLMSampler( - model_id='ms://Qwen/Qwen2.5-7B-Instruct', + model_id='ms://Qwen/Qwen3-4B', device_mesh=DeviceMesh.from_sizes(dp_size=4), remote_group='sampler' ) diff --git a/docs/source_en/Usage Guide/Quick-Start.md b/docs/source_en/Usage Guide/Quick-Start.md 
index c47949e5..d00a3560 100644 --- a/docs/source_en/Usage Guide/Quick-Start.md +++ b/docs/source_en/Usage Guide/Quick-Start.md @@ -28,6 +28,803 @@ Twinkle and [ms-swift](https://github.com/modelscope/ms-swift) are both model tr - If you need other capabilities like inference, deployment, quantization - If you are sensitive to new model training support, Swift guarantees day-0 update capability +## Usage Patterns + +### Using Only Partial Components + +Developers can use only a portion of Twinkle's components, combining them with their own existing code to complete training work. For example, using only Dataset & DataLoader: + +```python +from twinkle.dataset import PackingDataset, DatasetMeta +from twinkle.dataloader import DataLoader +from twinkle.preprocessor import SelfCognitionProcessor + +def train(): + dataset_meta = DatasetMeta( + dataset_id='ms://swift/self-cognition', + ) + + dataset = PackingDataset(dataset_meta) + dataset.map(SelfCognitionProcessor(model_name='Twinkle Model', model_author='ModelScope Community')) + dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B', max_length=512) + dataset.encode() + dataset.pack_dataset() + + dataloader = DataLoader(dataset, batch_size=8) + for data in dataloader: + print(data) + """ + { + "input_ids": [...], + "position_ids": [...], + ... + } + """ + break + +if __name__ == '__main__': + train() +``` +In the code above, we use PackingDataset to load a dataset called `swift/self-cognition`. PackingDataset can be used to bin-pack data, ensuring that each batch has a length similar to the configured maximum length. +In the loop, we simply used print to display the output. In actual use, you can continue writing your custom training code below. + +All of Twinkle's components support being used separately. Please refer to the component list in the sections below. + +### Single GPU + +Twinkle supports running training on a single GPU. 
Here is an example: + +```python +from peft import LoraConfig + +from twinkle import get_device_placement, get_logger +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.model import TransformersModel +from twinkle.preprocessor import SelfCognitionProcessor + +logger = get_logger() + + +def train(): + # 1000 samples + dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) + # Set template to prepare encoding + dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B') + # Preprocess the dataset to standard format + dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) + # Encode dataset + dataset.encode() + # Global batch size = 8, for GPUs, so 1 sample per GPU + dataloader = DataLoader(dataset=dataset, batch_size=8) + # Use a TransformersModel + model = TransformersModel(model_id='ms://Qwen/Qwen3-4B') + + lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') + + # Add a lora to model, with name `default` + # Comment this to use full-parameter training + model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) + # Add Optimizer for lora `default` + model.set_optimizer(optimizer_cls='AdamW', lr=1e-4) + # Add LRScheduler for lora `default` + model.set_lr_scheduler( + scheduler_cls='CosineWarmupScheduler', num_warmup_steps=5, num_training_steps=len(dataloader)) + logger.info(get_device_placement()) + # Print the training config + logger.info(model.get_train_configs()) + logger.info(f'Total steps: {len(dataloader)}') + for step, batch in enumerate(dataloader): + # Do forward and backward + model.forward_backward(inputs=batch) + # Step + model.clip_grad_and_step() + if step % 20 == 0: + # Print metric + metric = model.calculate_metric(is_training=True) + logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}') + model.save(f'last-checkpoint') + + +if __name__ == '__main__': + train() +``` + +In this training code, we constructed a dataset and loaded the Qwen/Qwen3-4B model, used LoRA with the all-linear approach, and completed one training run. In the logs, you can observe the process of loss gradually converging. + +### torchrun + +Twinkle supports running training in torchrun mode. In this scenario, Ray-related dependencies do not need to be installed. 
+ +```python +from peft import LoraConfig + +import twinkle +from twinkle import DeviceMesh, get_device_placement, get_logger +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.model import TransformersModel +from twinkle.preprocessor import SelfCognitionProcessor + +# Construct a device_mesh, fsdp=4, dp=2 +device_mesh = DeviceMesh.from_sizes(fsdp_size=4, dp_size=2) +# use torchrun mode +twinkle.initialize(mode='local', global_device_mesh=device_mesh) + +logger = get_logger() + + +def train(): + # 1000 samples + dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) + # Set template to prepare encoding + dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B') + # Preprocess the dataset to standard format + dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) + # Encode dataset + dataset.encode() + # Global batch size = 8, for GPUs, so 1 sample per GPU + dataloader = DataLoader(dataset=dataset, batch_size=8) + # Use a TransformersModel + model = TransformersModel(model_id='ms://Qwen/Qwen3-4B') + + lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') + + # Add a lora to model, with name `default` + # Comment this to use full-parameter training + model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) + # Add Optimizer for lora `default` + model.set_optimizer(optimizer_cls='AdamW', lr=1e-4) + # Add LRScheduler for lora `default` + model.set_lr_scheduler( + scheduler_cls='CosineWarmupScheduler', num_warmup_steps=5, num_training_steps=len(dataloader)) + logger.info(get_device_placement()) + # Print the training config + logger.info(model.get_train_configs()) + logger.info(f'Total steps: {len(dataloader)}') + for step, batch in enumerate(dataloader): + # Do forward and backward + model.forward_backward(inputs=batch) + # Step + model.clip_grad_and_step() + if step % 20 == 0: + # Print metric + metric = model.calculate_metric(is_training=True) + logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}') + model.save(f'last-checkpoint') + + +if __name__ == '__main__': + train() +``` + +In the code above, we constructed a hybrid parallel mode combining FSDP2 and DP, and used 8 GPUs for training. You can see that it is basically the same as the single-GPU training code, except that `DeviceMesh` is used to declare the model layout. + +When running, you need to launch training like this: + +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 train.py +``` + +### Ray Training + +[Ray](https://github.com/ray-project/ray) is a commonly used scheduling middleware framework for multi-machine model training and inference scenarios. It provides additional optimizations for multi-model, multi-device execution and resource management, and supports integration with Kubernetes systems for production deployment. These characteristics make it particularly suitable for complex training scenarios such as RL and GKD. 
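+
+Before the full example, the sketch below isolates the only Ray-specific wiring it contains: declaring resource groups and initializing in `ray` mode (a minimal sketch; the group names and GPU counts are illustrative assumptions):
+
+```python
+import twinkle
+from twinkle import DeviceGroup
+
+# Two resource groups: 4 GPUs for the trainable model, 4 for the sampler.
+device_groups = [
+    DeviceGroup(name='model', ranks=list(range(4)), device_type='GPU'),
+    DeviceGroup(name='sampler', ranks=list(range(4, 8)), device_type='GPU'),
+]
+# Switching `mode` from 'local' to 'ray' is the only mode-specific change;
+# the component calls that follow initialization are identical in both modes.
+twinkle.initialize(mode='ray', nproc_per_node=8, groups=device_groups, lazy_collect=False)
+```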
+ +Twinkle supports using Ray for training and sampling, and its code is almost identical to the training API above: + +```python +import os +from typing import List, Tuple, Dict, Any +from peft import LoraConfig +import twinkle +from twinkle import DeviceMesh, DeviceGroup, get_device_placement +from twinkle.advantage import GRPOAdvantage +from twinkle.checkpoint_engine import CheckpointEngineManager +from twinkle.data_format import SamplingParams +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.model.megatron import MegatronModel +from twinkle.metric import CompletionRewardMetric +from twinkle.preprocessor.llm import GSM8KProcessor +from twinkle.processor import InputProcessor +from twinkle.reward import GSM8KAccuracyReward, GSM8KFormatReward +from twinkle.sampler import vLLMSampler +from twinkle.template import Template + +MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3-4B') +MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4)) +SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS',4)) +NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS +NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8)) +MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096)) +LEARNING_RATE = float(os.environ.get('LR', 1e-5)) +MAX_STEPS = int(os.environ.get('MAX_STEPS', 200)) +BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 16)) # global prompt-level, global completion-level batch size = BATCH_SIZE * num_generations * dp_size +MINI_BATCH_SIZE = int(os.environ.get('MINI_BATCH_SIZE', 16)) # global completion-level mini-batch-size +MICRO_BATCH_SIZE = int(os.environ.get('MICRO_BATCH_SIZE', 2)) # per-device-micro-batch-size (completion-level), batch_size in forward_backward +GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRADIENT_ACCUMULATION_STEPS', 1)) +ADAPTER_NAME = 'default' + +def create_gsm8k_dataset(): + dataset = Dataset(DatasetMeta('ms://modelscope/gsm8k', subset_name='main', split='train')) + dataset.set_template('Template', model_id=MODEL_ID, max_length=2048) + dataset.map(GSM8KProcessor()) + dataset.encode(add_generation_prompt=True) + return dataset + +def compute_rewards( + trajectories: List[Dict[str, Any]], +) -> Tuple[List[float], List[float], List[float]]: + accuracy_reward_fn = GSM8KAccuracyReward() + format_reward_fn = GSM8KFormatReward() + accuracy_rewards = accuracy_reward_fn(trajectories) + format_rewards = format_reward_fn(trajectories) + total_rewards = [a + f for a, f in zip(accuracy_rewards, format_rewards)] + return total_rewards, format_rewards, accuracy_rewards + +def main(): + # set sampler and model separate to use different gpus + device_groups = [ + DeviceGroup(name='model',ranks=list(range(MODEL_GPUS)),device_type='GPU'), + DeviceGroup(name='sampler',ranks=list(range(MODEL_GPUS, NUM_GPUS)),device_type='GPU'), + ] + model_mesh = DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS) + sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, dp_size=SAMPLER_GPUS) + twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS, groups=device_groups, lazy_collect=False) + + lora_config = LoraConfig(target_modules='all-linear', r=32, lora_alpha=64, lora_dropout=0.05) + model = MegatronModel(model_id=MODEL_ID, device_mesh=model_mesh, remote_group='model', mixed_precision='bf16') + model.add_adapter_to_model(ADAPTER_NAME, lora_config, gradient_accumulation_steps=1) + model.set_optimizer('default', lr=LEARNING_RATE) + model.set_lr_scheduler('default', lr_decay_steps=MAX_STEPS, max_lr=LEARNING_RATE) + model.set_loss('GRPOLoss', 
epsilon=0.2) + model.set_processor(InputProcessor) + model.set_template('Template', model_id=MODEL_ID) + + sampler = vLLMSampler( + model_id=MODEL_ID, + engine_args={ + 'gpu_memory_utilization': 0.8, + 'max_model_len': 4096, + 'max_lora_rank': 32, # save as lora_config + 'enable_lora': True, + }, + device_mesh=sampler_mesh, + remote_group='sampler', + ) + sampler.set_template(Template, model_id=MODEL_ID) + ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler) + dataloader = DataLoader( + dataset=create_gsm8k_dataset, + batch_size=BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS, + min_batch_size=BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS, + device_mesh=model_mesh, + remote_group='model', + ) + advantage_fn = GRPOAdvantage() + metrics = CompletionRewardMetric() + sampling_params = SamplingParams(max_tokens=MAX_NEW_TOKENS) + optim_step = 0 + print(get_device_placement()) + + for batch in dataloader: + if optim_step >= MAX_STEPS: + break + metrics.reset() + global_prompts = batch if isinstance(batch, list) else [batch] + ckpt_manager.sync_weights(merge_and_sync=False) + sampler.reset_prefix_cache() + sample_response = sampler.sample( + global_prompts*NUM_GENERATIONS, + sampling_params, + num_samples=1, + ) + all_input_data: List[Dict[str, Any]] = [] + all_old_logps: List[List[float]] = [] + all_completion_lengths: List[int] = [] + + for sequence in sample_response.sequences: + all_input_data.append(sequence.new_input_feature) + all_old_logps.append(sequence.logprobs) + all_completion_lengths.append(len(sequence.tokens)) + total_rewards, format_rewards, accuracy_rewards = compute_rewards( + all_input_data + ) + metrics.accumulate( + completion_lengths=all_completion_lengths, + rewards={ + 'total': total_rewards, + 'format': format_rewards, + 'accuracy': accuracy_rewards, + }, + ) + advantages = advantage_fn(total_rewards, num_generations=NUM_GENERATIONS, scale='group').tolist() + # Split completions into mini-batches and run one optim step per mini-batch. + total_completions = len(all_input_data) + for mb_start in range(0, total_completions, MINI_BATCH_SIZE): + mb_end = min(mb_start + MINI_BATCH_SIZE, total_completions) + mb_inputs = all_input_data[mb_start:mb_end] + mb_old_logps = all_old_logps[mb_start:mb_end] + mb_advantages = advantages[mb_start:mb_end] + + model.forward_backward( + inputs=mb_inputs, + old_logps=mb_old_logps, + advantages=mb_advantages, + micro_batch_size=MICRO_BATCH_SIZE, + ) + model.clip_grad_and_step() + optim_step += 1 + + if optim_step >= MAX_STEPS: + break + log_dict = metrics.calculate() + log_dict.update(model.calculate_metric(is_training=True)) + metrics.reset() + print(f'[Step {optim_step}/{MAX_STEPS}] {log_dict}') + + print(f'Training completed. optim_steps={optim_step}') + model.save('grpo-gsm8k-checkpoint') + +if __name__ == '__main__': + main() +``` + +In the code above, we provide an RL training example. We can clearly see in the code how data is constructed, how the sampler/model are declared and parameterized, and the construction process for advantage and loss. +There is no explicit reference to `ray` anywhere in this process. We only declared Ray mode during initialization: + +```python +twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS, groups=device_groups, lazy_collect=False) +``` + +Developers can customize the construction and invocation methods of components like models. All Transformers and Megatron model parameters can be passed in when constructing the model. + +All subsequent Ray calls and data distribution are performed implicitly. 
Running this script requires having Ray installed beforehand. Then run it like this: + +```shell +python train.py +``` + +### Remote Training + +A major feature of Twinkle is support for multi-tenant mixed training. Specifically, multiple users can use a single base model for LoRA training, which can greatly reduce server-side deployment costs. + +Suppose we start a service using eight GPUs. First, we need to start the Ray cluster: + +```shell +CUDA_VISIBLE_DEVICES=0,1 ray start --head --port=6379 --num-gpus=2 +CUDA_VISIBLE_DEVICES=2,3 ray start --address=127.0.0.1:6379 --num-gpus=2 +CUDA_VISIBLE_DEVICES="" ray start --address=127.0.0.1:6379 --num-gpus=0 +``` + +We started a Ray cluster containing three nodes: +- GPUs 0 and 1 as one node +- GPUs 2 and 3 as one node +- CPU resources as one node + +For production environments, you can start more nodes and deploy more replicas to accommodate larger user volumes. Here we only use four GPUs as an example. + +Next, start the server: +```shell + +cd cookbook/client/twinkle/transformer +python server.py +``` + +The server will start three services: a sampler cluster, a model cluster, and a utility cluster. + +Now you can perform client-side training: +```python +import dotenv +dotenv.load_dotenv('.env') +import re +from twinkle.data_format import Trajectory +from twinkle.reward.base import Reward +import gc +from peft import LoraConfig +from typing import List, Tuple + +from twinkle import get_logger +from twinkle.advantage import GRPOAdvantage +from twinkle.dataset import DatasetMeta +from twinkle.metric import CompletionRewardMetric +from twinkle_client import init_twinkle_client +from twinkle_client.dataloader import DataLoader +from twinkle_client.dataset import Dataset +from twinkle_client.model import MultiLoraTransformersModel +from twinkle_client.sampler import vLLMSampler + +logger = get_logger() + +# ========== Configuration ========== +MODEL_ID = 'ms://Qwen/Qwen3-4B' +NUM_GENERATIONS = 4 +MAX_NEW_TOKENS = 1024 +LEARNING_RATE = 1e-5 +MAX_STEPS = 10 +BATCH_SIZE = 2 +TEMPERATURE = 1.0 +SYNC_INTERVAL = 1 # Save weights for sampler every N steps +GRADIENT_ACCUMULATION_STEPS = 4 + + +def create_countdown_dataset(): + """Create Countdown Game dataset for GRPO training.""" + + dataset = Dataset(dataset_meta=DatasetMeta('ms://zouxuhong/Countdown-Tasks-3to4', data_slice=range(500))) + dataset.set_template('Template', model_id=MODEL_ID, max_length=8192) + dataset.map('CountdownProcessor') + dataset.encode(add_generation_prompt=True, batched=True) + return dataset + + +class CountDownAccuracy(Reward): + + @staticmethod + def countdown_accuracy_reward(completion: str, target: int, nums: List[int]) -> float: + """Accuracy reward: checks if equation is correct.""" + try: + match = re.search(r'(.*?)<\/answer>', completion) + if match is None: + return 0.0 + equation = match.group(1).strip() + if '=' in equation: + equation = equation.split('=')[0] + used_numbers = [int(n) for n in re.findall(r'\d+', equation)] + if sorted(used_numbers) != sorted(nums): + return 0.0 + if not re.match(r'^[\d+\-*/().\s]+$', equation): + return 0.0 + result = eval(equation, {'__builtins__': None}, {}) + return 1.0 if abs(float(result) - float(target)) < 1e-5 else 0.0 + except Exception: # noqa + return 0.0 + + def __call__(self, trajectories: List[Trajectory], ground_truths: List[Trajectory]): + rewards = [] + for trajectory in trajectories: + messages = trajectory.get('messages', []) + completion = '' + for msg in reversed(messages): + if msg.get('role') == 'assistant': + 
completion = msg.get('content', '') + break + user_data = trajectory.get('user_data', [{}]) + data = user_data[0] if isinstance(user_data, list) and user_data else {} + target = data.get('target', 0) + nums = data.get('nums', []) + acc_reward = self.countdown_accuracy_reward(completion, target, nums) + rewards.append(acc_reward) + return rewards + + +def compute_rewards(trajectories: List[dict], ) -> Tuple[List[float], List[float], List[float]]: + """Compute format and accuracy rewards for Countdown game.""" + from twinkle.reward import FormatReward + format_rewards = FormatReward()(trajectories, []) + accuracy_rewards = CountDownAccuracy()(trajectories, []) + total_rewards = [a + b for a, b in zip(accuracy_rewards, format_rewards)] + return total_rewards, format_rewards, accuracy_rewards + + +def train(): + # Step 1: Initialize the Twinkle client + client = init_twinkle_client( + base_url='http://localhost:8000', + api_key='', + ) + + # Step 2: Prepare dataset and dataloader + dataset = create_countdown_dataset() + dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE) + + # Step 3: Configure the training model + model = MultiLoraTransformersModel(model_id=MODEL_ID) + + lora_config = LoraConfig( + target_modules='all-linear', + r=8, + lora_alpha=32, + lora_dropout=0.05, + ) + model.add_adapter_to_model( + 'default', + lora_config, + gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, + ) + + # Set GRPO loss (the key difference from SFT training) + model.set_loss('GRPOLoss', epsilon=0.2, beta=0.0) + + # Set optimizer and LR scheduler + model.set_optimizer('AdamW', lr=LEARNING_RATE) + model.set_lr_scheduler( + 'CosineWarmupScheduler', + num_warmup_steps=500, + num_training_steps=MAX_STEPS, + ) + + # Set processor and template for encoding inputs + model.set_processor('InputProcessor') + model.set_template('Template', model_id=MODEL_ID) + + # Step 4: Configure the sampler + sampler = vLLMSampler(model_id=MODEL_ID) + sampler.set_template('Template', model_id=MODEL_ID) + + # Step 5: Setup metrics and advantage function + advantage_fn = GRPOAdvantage() + metrics = CompletionRewardMetric() + + sampling_params = { + 'max_tokens': MAX_NEW_TOKENS, + 'temperature': TEMPERATURE, + 'top_p': 0.95, + } + + # Track the current adapter path for sampling + current_adapter_uri = None + + step = 0 + for batch in dataloader: + if step >= MAX_STEPS: + break + + metrics.reset() + prompts = batch if isinstance(batch, list) else [batch] + + # ========== 1. Save weights and update adapter_uri ========== + # Instead of sync_weights, save the model checkpoint and pass + # the resulting path to the sampler as adapter_uri + if step % SYNC_INTERVAL == 0: + logger.info(f'Step {step}: Saving weights for sampler...') + twinkle_path = model.save( + name=f'grpo-sampler-step-{step}', + save_optimizer=False, + ) + current_adapter_uri = twinkle_path + logger.info(f'Step {step}: Saved weights to {current_adapter_uri}') + + # ========== 2. 
Sample completions ========== + sample_response = sampler.sample( + inputs=prompts, + sampling_params=sampling_params, + adapter_uri=current_adapter_uri, + num_samples=NUM_GENERATIONS, + ) + + input_features = [] + old_logps_list = [] + completion_lengths = [] + + sequences = sample_response.get('sequences', []) + for seq in sequences: + input_features.append(seq.get('new_input_feature', seq)) + old_logps_list.append(seq.get('logprobs', [])) + completion_lengths.append(len(seq.get('tokens', []))) + + if not input_features: + logger.warning(f'Step {step}: No valid samples, skipping') + step += 1 + continue + + # ========== 3. Compute rewards ========== + total_rewards, format_rewards, accuracy_rewards = compute_rewards(input_features) + metrics.accumulate( + None, + None, + completion_lengths=completion_lengths, + rewards={ + 'total': total_rewards, + 'format': format_rewards, + 'accuracy': accuracy_rewards, + }) + + # ========== 4. Compute advantages ========== + advantages = advantage_fn( + total_rewards, + num_generations=NUM_GENERATIONS, + scale='group', + ).tolist() + + frac_zero_std = (1.0 if all(abs(a) < 1e-8 for a in advantages) else 0.0) + if frac_zero_std == 1.0: + logger.info(f'Step {step}: All advantages are zero, skipping training') + step += 1 + continue + + # ========== 5. Training step (GRPO) ========== + # forward_backward with GRPO loss: passes advantages and old_logps + # to the server-side GRPOLoss for proper policy optimization + model.forward_backward( + inputs=input_features, + advantages=advantages, + old_logps=old_logps_list, + ) + + # Gradient clipping and optimizer step + model.clip_grad_norm(1.0) + model.step() + model.zero_grad() + model.lr_step() + + gc.collect() + + # ========== 6. Log ========== + log_dict = metrics.calculate() + log_dict.update(model.calculate_metric()) + log_dict['train/frac_reward_zero_std'] = frac_zero_std + logger.info(f'Step {step}: {log_dict}') + step += 1 + + # Save final checkpoint + twinkle_path = model.save(name='grpo-countdown-final', save_optimizer=True) + logger.info(f'Saved final checkpoint: {twinkle_path}') + + +if __name__ == '__main__': + train() +``` + +Multiple developers can use a single base model from this service for parallel training and sampling. Furthermore, the training methods they use are allowed to differ. For example, User A can perform SFT, User B can perform RL, and User C can perform sampling. Similarly, Twinkle also supports Tinker-like APIs for remote training: + +>[!Note] +> One important note: in the current Twinkle implementation, the client-side Twinkle API and Tinker API cannot be used simultaneously on the same server. When you need to provide the Tinker API, you need to start the service under cookbook/client/tinker. +> This issue will be addressed with high priority in upcoming iterations. 
+ +```python +from tinker import types +from tqdm import tqdm +from tinker import ServiceClient +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.preprocessor import SelfCognitionProcessor +from twinkle.server.tinker.common import input_feature_to_datum + +# The base model to fine-tune / evaluate +base_model = 'ms://Qwen/Qwen3-4B' + + +def train(): + # Step 1: Prepare the dataset + + # Load the self-cognition dataset from ModelScope (first 500 examples) + dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) + + # Apply the chat template matching the base model (max 256 tokens per sample) + dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256) + + # Replace placeholder names with custom model/author identity + dataset.map(SelfCognitionProcessor('twinkle model', 'twinkle team'), load_from_cache_file=False) + + # Tokenize and encode the dataset into model-ready input features + dataset.encode(batched=True, load_from_cache_file=False) + + # Wrap the dataset into a DataLoader that yields batches of size 8 + dataloader = DataLoader(dataset=dataset, batch_size=8) + + # Step 2: Initialize the training client + # Connect to the Twinkle server running locally + service_client = ServiceClient(base_url='http://localhost:8000', api_key='your-api-key') + # Create a LoRA training client for the base model (rank=16 for the LoRA adapter) + training_client = service_client.create_lora_training_client(base_model=base_model, rank=16) + + # Step 3: Run the training loop + for epoch in range(3): + print(f'Epoch {epoch}') + for step, batch in tqdm(enumerate(dataloader)): + # Convert each InputFeature into a Datum for the Tinker API + input_datum = [input_feature_to_datum(input_feature) for input_feature in batch] + + # Send data to server: forward + backward pass (computes gradients) + fwdbwd_future = training_client.forward_backward(input_datum, 'cross_entropy') + + # Optimizer step: update model weights with Adam + optim_future = training_client.optim_step(types.AdamParams(learning_rate=1e-4)) + + # Wait for both operations to complete + fwdbwd_future.result() + optim_result = optim_future.result() + print(f'Training Metrics: {optim_result}') + + # Save a checkpoint after each epoch + save_future = training_client.save_state(f'twinkle-lora-{epoch}') + save_result = save_future.result() + print(f'Saved checkpoint to {save_result.path}') + + +if __name__ == '__main__': + train() +``` + +### Using ModelScope Community's TaaS Training Service + +Concurrent with the open-source release of the Twinkle framework, we also provide a hosted Training as a Service (TaaS) powered by ModelScope's backend services. Developers can experience Twinkle's training API for free through this service. +This service shares the same code as the Tinker API section described above. The only difference is that the Endpoint and Token need to use the official ModelScope information. For details on how to use the official service, please refer to the detailed description in [Training Service](./Train-as-a-Service.md). + +## Using Hugging Face models + +Switch the prefix. + +```text +ms://Qwen/Qwen3-4B -> hf://Qwen/Qwen3-4B +``` + +## 🛠️ Twinkle✨ Modular Ecosystem + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Component | Description |
+|-----------|-------------|
+| Dataset | Data loading and preprocessing |
+| Template | Encoding and decoding |
+| DataLoader | Data distribution and batching |
+| Preprocessor | Data ETL |
+| InputProcessor | Task-specific input processing |
+| Model | Large models, supports multiple frameworks |
+| Sampler | Sampler logic |
+| Loss | Loss functions |
+| Metric | Training metrics collection |
+| Reward | Reward function |
+| Advantage | Advantage function |
+| CheckpointEngine | Weight synchronization |
+| Patch | Patches for model fixes |
+| Module | Components, e.g., Optimizer |
+| Kernel | Operators |
+| Server | Start backend cluster |
+| Client | Client code |
+| Infra | Isolate ray and torchrun differences |
+| Plugin | Use hub components |
+| Hub | Interface with HF/MS libraries |
+ ## Twinkle's Customizable Components In Twinkle's design, training using torchrun, Ray, and HTTP uses the same API and shares the same components and input/output structures. Therefore, many of its components can be customized by developers to implement new algorithm development. @@ -77,10 +874,10 @@ DeviceGroup: Define how many resource groups are needed for this training sessio ```python from twinkle.model import TransformersModel -model = TransformersModel(model_id='ms://Qwen/Qwen2.5-7B-Instruct', remote_group='default', device_mesh=device_mesh) +model = TransformersModel(model_id='Qwen/Qwen3-4B', remote_group='default', device_mesh=device_mesh) # Or from twinkle.model import MegatronModel -model = MegatronModel(model_id='ms://Qwen/Qwen2.5-7B-Instruct', remote_group='default', device_mesh=device_mesh) +model = MegatronModel(model_id='Qwen/Qwen3-4B', remote_group='default', device_mesh=device_mesh) ``` DeviceMesh specifies the topology of components like models within the resource group. It can be understood as how to perform parallelization. This affects a series of framework decisions, such as data acquisition, data consumption, data return, etc. @@ -106,7 +903,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-7B-Instruct') + dataset.set_template('Template', model_id='Qwen/Qwen3-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) # Encode dataset @@ -114,7 +911,7 @@ def train(): # Global batch size = 8, for GPUs, so 1 sample per GPU dataloader = DataLoader(dataset=dataset, batch_size=8, min_batch_size=8) # Use a TransformersModel - model = TransformersModel(model_id='ms://Qwen/Qwen2.5-7B-Instruct', remote_group='default') + model = TransformersModel(model_id='Qwen/Qwen3-4B', remote_group='default') lora_config = LoraConfig( r=8, @@ -154,54 +951,54 @@ python3 train.py ## Supported Large Language Models List -| Model Type | Model ID Example | Requires | Support Megatron | HF Model ID | -| ------------------- | ---------------------------------------------------------------------------------------------------------- | -------------------- | ---------------- | ---------------------------------------------------------------------------------------------------------- | -| qwen2 series | [Qwen/Qwen2-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-0.5B-Instruct) | transformers>=4.37 | ✔ | [Qwen/Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) | -| | [Qwen/Qwen2-72B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-72B-Instruct) | transformers>=4.37 | ✔ | [Qwen/Qwen2-72B-Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct) | -| | [Qwen/Qwen2-1.5B](https://modelscope.cn/models/Qwen/Qwen2-1.5B) | transformers>=4.37 | ✔ | [Qwen/Qwen2-1.5B](https://huggingface.co/Qwen/Qwen2-1.5B) | -| | [Qwen/Qwen2-7B](https://modelscope.cn/models/Qwen/Qwen2-7B) | transformers>=4.37 | ✔ | [Qwen/Qwen2-7B](https://huggingface.co/Qwen/Qwen2-7B) | -| | [Qwen/Qwen2-72B](https://modelscope.cn/models/Qwen/Qwen2-72B) | transformers>=4.37 | ✔ | [Qwen/Qwen2-72B](https://huggingface.co/Qwen/Qwen2-72B) | -| | [Qwen/Qwen2.5-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B-Instruct) | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | -| | 
[Qwen/Qwen2.5-1.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-1.5B-Instruct) | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) |
-| | [Qwen/Qwen2.5-72B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-72B-Instruct) | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) |
-| | [Qwen/Qwen2.5-0.5B](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B) | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-0.5B](https://huggingface.co/Qwen/Qwen2.5-0.5B) |
-| | [Qwen/Qwen2.5-32B](https://modelscope.cn/models/Qwen/Qwen2.5-32B) | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B) |
-| qwen2_moe series | [Qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B-Chat) | transformers>=4.40 | ✔ | [Qwen/Qwen1.5-MoE-A2.7B-Chat](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat) |
-| | [Qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B) | transformers>=4.40 | ✔ | [Qwen/Qwen1.5-MoE-A2.7B](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B) |
-| qwen3 series | [Qwen/Qwen3-0.6B-Base](https://modelscope.cn/models/Qwen/Qwen3-0.6B-Base) | transformers>=4.51 | ✔ | [Qwen/Qwen3-0.6B-Base](https://huggingface.co/Qwen/Qwen3-0.6B-Base) |
-| | [Qwen/Qwen3-14B-Base](https://modelscope.cn/models/Qwen/Qwen3-14B-Base) | transformers>=4.51 | ✔ | [Qwen/Qwen3-14B-Base](https://huggingface.co/Qwen/Qwen3-14B-Base) |
-| | [Qwen/Qwen3-0.6B](https://modelscope.cn/models/Qwen/Qwen3-0.6B) | transformers>=4.51 | ✔ | [Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B) |
-| | [Qwen/Qwen3-1.7B](https://modelscope.cn/models/Qwen/Qwen3-1.7B) | transformers>=4.51 | ✔ | [Qwen/Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B) |
-| | [Qwen/Qwen3-32B](https://modelscope.cn/models/Qwen/Qwen2.5-32B) | transformers>=4.51 | ✔ | [Qwen/Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) |
-| qwen3_moe series | [Qwen/Qwen3-30B-A3B-Base](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Base) | transformers>=4.51 | ✔ | [Qwen/Qwen3-30B-A3B-Base](https://huggingface.co/Qwen/Qwen3-30B-A3B-Base) |
-| | [Qwen/Qwen3-30B-A3B](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B) | transformers>=4.51 | ✔ | [Qwen/Qwen3-30B-A3B](https://huggingface.co/Qwen/Qwen3-30B-A3B) |
-| | [Qwen/Qwen3-235B-A22B](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B) | transformers>=4.51 | ✔ | [Qwen/Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B) |
-| chatglm2 series | [ZhipuAI/chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b) | transformers<4.42 | ✘ | [zai-org/chatglm2-6b](https://huggingface.co/zai-org/chatglm2-6b) |
-| | [ZhipuAI/chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k) | transformers<4.42 | ✘ | [zai-org/chatglm2-6b-32k](https://huggingface.co/zai-org/chatglm2-6b-32k) |
-| chatglm3 series | [ZhipuAI/chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b) | transformers<4.42 | ✘ | [zai-org/chatglm3-6b](https://huggingface.co/zai-org/chatglm3-6b) |
-| | [ZhipuAI/chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base) | transformers<4.42 | ✘ | [zai-org/chatglm3-6b-base](https://huggingface.co/zai-org/chatglm3-6b-base) |
-| | [ZhipuAI/chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k) | transformers<4.42 | ✘ | [zai-org/chatglm3-6b-32k](https://huggingface.co/zai-org/chatglm3-6b-32k) |
-| | [ZhipuAI/chatglm3-6b-128k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-128k) | transformers<4.42 | ✘ | [zai-org/chatglm3-6b-128k](https://huggingface.co/zai-org/chatglm3-6b-128k) |
-| chatglm4 series | [ZhipuAI/glm-4-9b-chat](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat) | transformers>=4.42 | ✘ | [zai-org/glm-4-9b-chat](https://huggingface.co/zai-org/glm-4-9b-chat) |
-| | [ZhipuAI/glm-4-9b](https://modelscope.cn/models/ZhipuAI/glm-4-9b) | transformers>=4.42 | ✘ | [zai-org/glm-4-9b](https://huggingface.co/zai-org/glm-4-9b) |
-| | [ZhipuAI/glm-4-9b-chat-1m](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat-1m) | transformers>=4.42 | ✘ | [zai-org/glm-4-9b-chat-1m](https://huggingface.co/zai-org/glm-4-9b-chat-1m) |
-| | [ZhipuAI/LongWriter-glm4-9b](https://modelscope.cn/models/ZhipuAI/LongWriter-glm4-9b) | transformers>=4.42 | ✘ | [zai-org/LongWriter-glm4-9b](https://huggingface.co/zai-org/LongWriter-glm4-9b) |
-| glm_edge series | [ZhipuAI/glm-edge-1.5b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-1.5b-chat) | transformers>=4.46 | ✘ | [zai-org/glm-edge-1.5b-chat](https://huggingface.co/zai-org/glm-edge-1.5b-chat) |
-| | [ZhipuAI/glm-edge-4b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-4b-chat) | transformers>=4.46 | ✘ | [zai-org/glm-edge-4b-chat](https://huggingface.co/zai-org/glm-edge-4b-chat) |
-| internlm2 series | [Shanghai_AI_Laboratory/internlm2-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-1_8b) | transformers>=4.38 | ✘ | [internlm/internlm2-1_8b](https://huggingface.co/internlm/internlm2-1_8b) |
-| | [Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft) | transformers>=4.38 | ✘ | [internlm/internlm2-chat-1_8b-sft](https://huggingface.co/internlm/internlm2-chat-1_8b-sft) |
-| | [Shanghai_AI_Laboratory/internlm2-base-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-base-7b) | transformers>=4.38 | ✘ | [internlm/internlm2-base-7b](https://huggingface.co/internlm/internlm2-base-7b) |
-| | [Shanghai_AI_Laboratory/internlm2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-7b) | transformers>=4.38 | ✘ | [internlm/internlm2-7b](https://huggingface.co/internlm/internlm2-7b) |
-| | [Shanghai_AI_Laboratory/internlm2-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b) | transformers>=4.38 | ✘ | [internlm/internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) |
-| deepseek_v1 | [deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat) | transformers>=4.39.4 | ✔ | |
-| | [deepseek-ai/DeepSeek-V2-Lite](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Lite) | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-V2-Lite](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite) |
-| | [deepseek-ai/DeepSeek-V2-Lite-Chat](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Lite-Chat) | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-V2-Lite-Chat](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat) |
-| | [deepseek-ai/DeepSeek-V2](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2) | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-V2](https://huggingface.co/deepseek-ai/DeepSeek-V2) |
-| | [deepseek-ai/DeepSeek-V2-Chat](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Chat) | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-V2-Chat](https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat) |
-| | [deepseek-ai/DeepSeek-V2.5](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2.5) | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-V2.5](https://huggingface.co/deepseek-ai/DeepSeek-V2.5) |
-| | [deepseek-ai/DeepSeek-Prover-V2-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-Prover-V2-7B) | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-Prover-V2-7B](https://huggingface.co/deepseek-ai/DeepSeek-Prover-V2-7B) |
-| | [deepseek-ai/DeepSeek-R1](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1) | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) |
-| deepSeek-r1-distill | [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) | transformers>=4.37 | ✔ | [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) |
-| | [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | transformers>=4.37 | ✔ | [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) |
-| | [deepseek-ai/DeepSeek-R1-Distill-Qwen-14B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B) | transformers>=4.37 | ✔ | [deepseek-ai/DeepSeek-R1-Distill-Qwen-14B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B) |
-| | [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) | transformers>=4.37 | ✔ | [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) |
+| Model Type | Model ID Example | Requires | Megatron Support | HF Model ID |
+|---------------------|--------------------------------------------------------------------------------------------------------------------------------|----------------------|------------------|-----------------------------------------------------------------------------------------------------------------|
+| qwen2 series | [Qwen/Qwen2-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-0.5B-Instruct) | transformers>=4.37 | ✔ | [Qwen/Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) |
+| | [Qwen/Qwen2-72B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-72B-Instruct) | transformers>=4.37 | ✔ | [Qwen/Qwen2-72B-Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct) |
+| | [Qwen/Qwen2-1.5B](https://modelscope.cn/models/Qwen/Qwen2-1.5B) | transformers>=4.37 | ✔ | [Qwen/Qwen2-1.5B](https://huggingface.co/Qwen/Qwen2-1.5B) |
+| | [Qwen/Qwen2-7B](https://modelscope.cn/models/Qwen/Qwen2-7B) | transformers>=4.37 | ✔ | [Qwen/Qwen2-7B](https://huggingface.co/Qwen/Qwen2-7B) |
+| | [Qwen/Qwen2-72B](https://modelscope.cn/models/Qwen/Qwen2-72B) | transformers>=4.37 | ✔ | [Qwen/Qwen2-72B](https://huggingface.co/Qwen/Qwen2-72B) |
+| | [Qwen/Qwen2.5-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B-Instruct) | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) |
+| | [Qwen/Qwen2.5-1.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-1.5B-Instruct) | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) |
+| | [Qwen/Qwen2.5-72B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-72B-Instruct) | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) |
+| | [Qwen/Qwen2.5-0.5B](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B) | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-0.5B](https://huggingface.co/Qwen/Qwen2.5-0.5B) |
+| | [Qwen/Qwen2.5-32B](https://modelscope.cn/models/Qwen/Qwen2.5-32B) | transformers>=4.37 | ✔ | [Qwen/Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B) |
+| qwen2_moe series | [Qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B-Chat) | transformers>=4.40 | ✔ | [Qwen/Qwen1.5-MoE-A2.7B-Chat](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat) |
+| | [Qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B) | transformers>=4.40 | ✔ | [Qwen/Qwen1.5-MoE-A2.7B](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B) |
+| qwen3 series | [Qwen/Qwen3-0.6B-Base](https://modelscope.cn/models/Qwen/Qwen3-0.6B-Base) | transformers>=4.51 | ✔ | [Qwen/Qwen3-0.6B-Base](https://huggingface.co/Qwen/Qwen3-0.6B-Base) |
+| | [Qwen/Qwen3-14B-Base](https://modelscope.cn/models/Qwen/Qwen3-14B-Base) | transformers>=4.51 | ✔ | [Qwen/Qwen3-14B-Base](https://huggingface.co/Qwen/Qwen3-14B-Base) |
+| | [Qwen/Qwen3-0.6B](https://modelscope.cn/models/Qwen/Qwen3-0.6B) | transformers>=4.51 | ✔ | [Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B) |
+| | [Qwen/Qwen3-1.7B](https://modelscope.cn/models/Qwen/Qwen3-1.7B) | transformers>=4.51 | ✔ | [Qwen/Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B) |
+| | [Qwen/Qwen3-32B](https://modelscope.cn/models/Qwen/Qwen3-32B) | transformers>=4.51 | ✔ | [Qwen/Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) |
+| qwen3_moe series | [Qwen/Qwen3-30B-A3B-Base](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Base) | transformers>=4.51 | ✔ | [Qwen/Qwen3-30B-A3B-Base](https://huggingface.co/Qwen/Qwen3-30B-A3B-Base) |
+| | [Qwen/Qwen3-30B-A3B](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B) | transformers>=4.51 | ✔ | [Qwen/Qwen3-30B-A3B](https://huggingface.co/Qwen/Qwen3-30B-A3B) |
+| | [Qwen/Qwen3-235B-A22B](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B) | transformers>=4.51 | ✔ | [Qwen/Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B) |
+| chatglm2 series | [ZhipuAI/chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b) | transformers<4.42 | ✘ | [zai-org/chatglm2-6b](https://huggingface.co/zai-org/chatglm2-6b) |
+| | [ZhipuAI/chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k) | transformers<4.42 | ✘ | [zai-org/chatglm2-6b-32k](https://huggingface.co/zai-org/chatglm2-6b-32k) |
+| chatglm3 series | [ZhipuAI/chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b) | transformers<4.42 | ✘ | [zai-org/chatglm3-6b](https://huggingface.co/zai-org/chatglm3-6b) |
+| | [ZhipuAI/chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base) | transformers<4.42 | ✘ | [zai-org/chatglm3-6b-base](https://huggingface.co/zai-org/chatglm3-6b-base) |
+| | [ZhipuAI/chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k) | transformers<4.42 | ✘ | [zai-org/chatglm3-6b-32k](https://huggingface.co/zai-org/chatglm3-6b-32k) |
+| | [ZhipuAI/chatglm3-6b-128k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-128k) | transformers<4.42 | ✘ | [zai-org/chatglm3-6b-128k](https://huggingface.co/zai-org/chatglm3-6b-128k) |
+| chatglm4 series | [ZhipuAI/glm-4-9b-chat](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat) | transformers>=4.42 | ✘ | [zai-org/glm-4-9b-chat](https://huggingface.co/zai-org/glm-4-9b-chat) |
+| | [ZhipuAI/glm-4-9b](https://modelscope.cn/models/ZhipuAI/glm-4-9b) | transformers>=4.42 | ✘ | [zai-org/glm-4-9b](https://huggingface.co/zai-org/glm-4-9b) |
+| | [ZhipuAI/glm-4-9b-chat-1m](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat-1m) | transformers>=4.42 | ✘ | [zai-org/glm-4-9b-chat-1m](https://huggingface.co/zai-org/glm-4-9b-chat-1m) |
+| | [ZhipuAI/LongWriter-glm4-9b](https://modelscope.cn/models/ZhipuAI/LongWriter-glm4-9b) | transformers>=4.42 | ✘ | [zai-org/LongWriter-glm4-9b](https://huggingface.co/zai-org/LongWriter-glm4-9b) |
+| glm_edge series | [ZhipuAI/glm-edge-1.5b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-1.5b-chat) | transformers>=4.46 | ✘ | [zai-org/glm-edge-1.5b-chat](https://huggingface.co/zai-org/glm-edge-1.5b-chat) |
+| | [ZhipuAI/glm-edge-4b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-4b-chat) | transformers>=4.46 | ✘ | [zai-org/glm-edge-4b-chat](https://huggingface.co/zai-org/glm-edge-4b-chat) |
+| internlm2 series | [Shanghai_AI_Laboratory/internlm2-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-1_8b) | transformers>=4.38 | ✘ | [internlm/internlm2-1_8b](https://huggingface.co/internlm/internlm2-1_8b) |
+| | [Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft) | transformers>=4.38 | ✘ | [internlm/internlm2-chat-1_8b-sft](https://huggingface.co/internlm/internlm2-chat-1_8b-sft) |
+| | [Shanghai_AI_Laboratory/internlm2-base-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-base-7b) | transformers>=4.38 | ✘ | [internlm/internlm2-base-7b](https://huggingface.co/internlm/internlm2-base-7b) |
+| | [Shanghai_AI_Laboratory/internlm2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-7b) | transformers>=4.38 | ✘ | [internlm/internlm2-7b](https://huggingface.co/internlm/internlm2-7b) |
+| | [Shanghai_AI_Laboratory/internlm2-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b) | transformers>=4.38 | ✘ | [internlm/internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) |
+| deepseek_v1 | [deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat) | transformers>=4.39.4 | ✔ | |
+| | [deepseek-ai/DeepSeek-V2-Lite](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Lite) | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-V2-Lite](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite) |
+| | [deepseek-ai/DeepSeek-V2-Lite-Chat](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Lite-Chat) | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-V2-Lite-Chat](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat) |
+| | [deepseek-ai/DeepSeek-V2](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2) | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-V2](https://huggingface.co/deepseek-ai/DeepSeek-V2) |
+| | [deepseek-ai/DeepSeek-V2-Chat](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Chat) | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-V2-Chat](https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat) |
+| | [deepseek-ai/DeepSeek-V2.5](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2.5) | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-V2.5](https://huggingface.co/deepseek-ai/DeepSeek-V2.5) |
+| | [deepseek-ai/DeepSeek-Prover-V2-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-Prover-V2-7B) | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-Prover-V2-7B](https://huggingface.co/deepseek-ai/DeepSeek-Prover-V2-7B) |
+| | [deepseek-ai/DeepSeek-R1](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1) | transformers>=4.39.3 | ✔ | [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) |
+| deepseek_r1_distill | [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) | transformers>=4.37 | ✔ | [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) |
+| | [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | transformers>=4.37 | ✔ | [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) |
+| | [deepseek-ai/DeepSeek-R1-Distill-Qwen-14B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B) | transformers>=4.37 | ✔ | [deepseek-ai/DeepSeek-R1-Distill-Qwen-14B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B) |
+| | [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) | transformers>=4.37 | ✔ | [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) |
diff --git a/docs/source_en/Usage Guide/Server and Client/Tinker-Compatible-Client.md b/docs/source_en/Usage Guide/Server and Client/Tinker-Compatible-Client.md
index 2e781ad9..57e86366 100644
--- a/docs/source_en/Usage Guide/Server and Client/Tinker-Compatible-Client.md
+++ b/docs/source_en/Usage Guide/Server and Client/Tinker-Compatible-Client.md
@@ -155,7 +155,7 @@ base_model = "Qwen/Qwen2.5-0.5B-Instruct"
# Use Twinkle's Dataset component to load and preprocess data
dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500)))
dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256)
-dataset.map(SelfCognitionProcessor('twinkle model', 'twinkle team'), load_from_cache_file=False)
+dataset.map(SelfCognitionProcessor('twinkle model', 'ModelScope Team'), load_from_cache_file=False)
dataset.encode(batched=True, load_from_cache_file=False)
dataloader = DataLoader(dataset=dataset, batch_size=8)
diff --git a/docs/source_en/Usage Guide/Server and Client/Twinkle-Client.md b/docs/source_en/Usage Guide/Server and Client/Twinkle-Client.md
index da0a5f1e..66d98eec 100644
--- a/docs/source_en/Usage Guide/Server and Client/Twinkle-Client.md
+++ b/docs/source_en/Usage Guide/Server and Client/Twinkle-Client.md
@@ -97,7 +97,7 @@ dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-7B-Instruct', max_l
# Data preprocessing: Replace placeholders with custom names
dataset.map('SelfCognitionProcessor',
-            init_args={'model_name': 'twinkle model', 'model_author': 'twinkle team'})
+            init_args={'model_name': 'twinkle model', 'model_author': 'ModelScope Team'})

# Encode dataset into tokens usable by the model
dataset.encode(batched=True)
diff --git a/docs/source_en/Usage Guide/Train-as-a-Service.md b/docs/source_en/Usage Guide/Train-as-a-Service.md
index ce6048b4..fd6c30f3 100644
--- a/docs/source_en/Usage Guide/Train-as-a-Service.md
+++ b/docs/source_en/Usage Guide/Train-as-a-Service.md
@@ -18,6 +18,50 @@ API endpoint: `base_url="https://www.modelscope.cn/twinkle"`

We strongly recommend that developers review our [cookbook](https://github.com/modelscope/twinkle/tree/main/cookbook/client/tinker) and build upon the training code provided there.
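+The sample below reads your ModelScope token from the `MODELSCOPE_TOKEN` environment variable. If it is not already exported in your shell, a minimal sketch of providing it from Python (the placeholder value is illustrative, not a real token):
+
+```python
+import os
+
+# Illustrative placeholder; substitute the access token from your ModelScope account page.
+os.environ.setdefault('MODELSCOPE_TOKEN', '<your-modelscope-token>')
+```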
+Sample code:
+
+```python
+import os
+from tqdm import tqdm
+from tinker import types
+from twinkle_client import init_tinker_client
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.preprocessor import SelfCognitionProcessor
+from twinkle.server.tinker.common import input_feature_to_datum
+
+base_model = 'ms://Qwen/Qwen3-30B-A3B-Instruct-2507'
+base_url = 'https://www.modelscope.cn/twinkle'
+api_key = os.environ.get('MODELSCOPE_TOKEN')
+
+# Use the twinkle Dataset to load the data
+dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500)))
+dataset.set_template('Template', model_id=base_model, max_length=256)
+dataset.map(SelfCognitionProcessor('Twinkle Model', 'ModelScope Team'), load_from_cache_file=False)
+dataset.encode(batched=True, load_from_cache_file=False)
+dataloader = DataLoader(dataset=dataset, batch_size=8)
+
+# Initialize the Tinker client before importing ServiceClient
+init_tinker_client()
+from tinker import ServiceClient
+
+service_client = ServiceClient(base_url=base_url, api_key=api_key)
+training_client = service_client.create_lora_training_client(base_model=base_model[len('ms://'):], rank=16)
+
+# Training loop: use input_feature_to_datum to convert the input format
+for epoch in range(3):
+    for step, batch in tqdm(enumerate(dataloader)):
+        input_datum = [input_feature_to_datum(input_feature) for input_feature in batch]
+
+        fwdbwd_future = training_client.forward_backward(input_datum, "cross_entropy")
+        optim_future = training_client.optim_step(types.AdamParams(learning_rate=1e-4))
+
+        fwdbwd_result = fwdbwd_future.result()
+        optim_result = optim_future.result()
+
+    training_client.save_state(f"twinkle-lora-{epoch}").result()
+```
+
> The ModelScope server is tinker-compatible, so use the tinker cookbooks. In a future version, we will support a server that works with both twinkle and tinker clients.

Developers can customize datasets, advantage functions, rewards, templates, and more. However, the Loss component is not currently customizable, since it needs to be executed on the server side (for security reasons). If you need support for additional Loss functions, you can upload your Loss implementation to ModelHub and contact us via the Q&A group or through an issue to have the corresponding component added to the whitelist.
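+For reference, a server-side loss implements the `Loss` interface from `twinkle.loss.base` and returns a `LossOutput` (both appear in the source changes later in this patch). A minimal sketch of what an uploaded implementation could look like; the class name and the sum-reduction choice are illustrative, modeled on the built-in cross-entropy loss, not a built-in themselves:
+
+```python
+# A minimal sketch of a custom Loss (hypothetical class, for illustration only).
+from twinkle.data_format import LossOutput
+from twinkle.loss.base import Loss
+
+
+class SumCrossEntropyLoss(Loss):
+    """Token-summed cross entropy with an explicit valid-token count."""
+
+    def __call__(self, inputs, outputs, **kwargs) -> LossOutput:
+        import torch
+        logits = outputs['logits']
+        logits = logits.view(-1, logits.shape[-1])
+        labels = inputs['labels'].view(-1)
+        # Sum over valid tokens and report the count, so the trainer can average correctly.
+        loss = torch.nn.CrossEntropyLoss(reduction='sum')(logits, labels)
+        return LossOutput(loss=loss, num_tokens=(labels != -100).sum())
+```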
diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" index f0185a4f..fa1da56d 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" @@ -28,6 +28,805 @@ Twinkle 和 [ms-swift](https://github.com/modelscope/ms-swift) 都是模型训 - 如果你需要推理、部署、量化等其他能力 - 如果你对新模型的训练支持敏感,Swift 会保证 day-0 的更新能力 +## 使用模式 + +### 仅使用部分组件 + +开发者可以仅使用Twinkle的一部分组件,结合自己的已有代码来完成训练工作。例如,仅使用Dataset&DataLoader: + +```python +from twinkle.dataset import PackingDataset, DatasetMeta +from twinkle.dataloader import DataLoader +from twinkle.preprocessor import SelfCognitionProcessor + +def train(): + dataset_meta = DatasetMeta( + dataset_id='ms://swift/self-cognition', + ) + + dataset = PackingDataset(dataset_meta) + dataset.map(SelfCognitionProcessor(model_name='Twinkle模型', model_author='ModelScope社区')) + dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B', max_length=512) + dataset.encode() + dataset.pack_dataset() + + dataloader = DataLoader(dataset, batch_size=8) + for data in dataloader: + print(data) + """ + { + "input_ids": [...], + "position_ids": [...], + ... + } + """ + break + +if __name__ == '__main__': + train() +``` +上面的代码中,使用PackingDataset加载了一个叫做`swift/self-cognition`的数据集。PackingDataset可以用于将数据进行装箱,保证每个batch的长度都与设置的最大长度相似。 +我们在循环中简单地使用了print打印了输出,在实际使用中,你可以在下面继续编写你的自定义训练代码。 + +Twinkle的所有组件都支持单独拆分使用,可以参考下面章节的组件列表。 + +### 单GPU + +Twinkle支持单GPU运行训练。下面是一个例子: + +```python +from peft import LoraConfig + +from twinkle import get_device_placement, get_logger +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.model import TransformersModel +from twinkle.preprocessor import SelfCognitionProcessor + +logger = get_logger() + + +def train(): + # 1000 samples + dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) + # Set template to prepare encoding + dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B') + # Preprocess the dataset to standard format + dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) + # Encode dataset + dataset.encode() + # Global batch size = 8, for GPUs, so 1 sample per GPU + dataloader = DataLoader(dataset=dataset, batch_size=8) + # Use a TransformersModel + model = TransformersModel(model_id='ms://Qwen/Qwen3-4B') + + lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') + + # Add a lora to model, with name `default` + # Comment this to use full-parameter training + model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) + # Add Optimizer for lora `default` + model.set_optimizer(optimizer_cls='AdamW', lr=1e-4) + # Add LRScheduler for lora `default` + model.set_lr_scheduler( + scheduler_cls='CosineWarmupScheduler', num_warmup_steps=5, num_training_steps=len(dataloader)) + logger.info(get_device_placement()) + # Print the training config + logger.info(model.get_train_configs()) + logger.info(f'Total steps: {len(dataloader)}') + for step, batch in enumerate(dataloader): + # Do forward and backward + model.forward_backward(inputs=batch) + # Step + model.clip_grad_and_step() + if step % 20 == 0: + # Print metric + metric = 
model.calculate_metric(is_training=True) + logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}') + model.save(f'last-checkpoint') + + +if __name__ == '__main__': + train() + +``` + +在这个训练代码中,我们构造了一个数据集并拉起了Qwen/Qwen3-4B模型,使用all-linear方式加载了lora,并完成了一次训练。在日志中,可以看到loss逐步收敛的过程。 + +### torchrun + +Twinkle支持以torchrun模式运行训练。在这种场景下,不需要安装ray相关的依赖。 + +```python +from peft import LoraConfig + +import twinkle +from twinkle import DeviceMesh, get_device_placement, get_logger +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.model import TransformersModel +from twinkle.preprocessor import SelfCognitionProcessor + +# Construct a device_mesh, fsdp=4, dp=2 +device_mesh = DeviceMesh.from_sizes(fsdp_size=4, dp_size=2) +# use torchrun mode +twinkle.initialize(mode='local', global_device_mesh=device_mesh) + +logger = get_logger() + + +def train(): + # 1000 samples + dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) + # Set template to prepare encoding + dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B') + # Preprocess the dataset to standard format + dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) + # Encode dataset + dataset.encode() + # Global batch size = 8, for GPUs, so 1 sample per GPU + dataloader = DataLoader(dataset=dataset, batch_size=8) + # Use a TransformersModel + model = TransformersModel(model_id='ms://Qwen/Qwen3-4B') + + lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear') + + # Add a lora to model, with name `default` + # Comment this to use full-parameter training + model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) + # Add Optimizer for lora `default` + model.set_optimizer(optimizer_cls='AdamW', lr=1e-4) + # Add LRScheduler for lora `default` + model.set_lr_scheduler( + scheduler_cls='CosineWarmupScheduler', num_warmup_steps=5, num_training_steps=len(dataloader)) + logger.info(get_device_placement()) + # Print the training config + logger.info(model.get_train_configs()) + logger.info(f'Total steps: {len(dataloader)}') + for step, batch in enumerate(dataloader): + # Do forward and backward + model.forward_backward(inputs=batch) + # Step + model.clip_grad_and_step() + if step % 20 == 0: + # Print metric + metric = model.calculate_metric(is_training=True) + logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}') + model.save(f'last-checkpoint') + + +if __name__ == '__main__': + train() +``` + +上面的代码中,构造了fsdp2和dp的hybrid并行模式,并使用了八张卡进行训练。可以看到它和单卡训练的代码基本相同,只是使用了`DeviceMesh`来声明模型布局。 + +运行时,需要这样拉起训练: + +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 train.py +``` + +### Ray训练 + +[Ray](https://github.com/ray-project/ray)是多机模型训练和推理场景中常用的调度中间件框架。它针对多模型、多设备的执行和资源管理进行了额外优化, +并支持对接kubernetes系统进行生产化。这样的特性使得它尤其适用于RL、GKD等复杂训练场景中。 + +Twinkle支持使用ray进行训练和采样,并且它的代码和上面的训练API几乎一致: + +```python +import os +from typing import List, Tuple, Dict, Any +from peft import LoraConfig +import twinkle +from twinkle import DeviceMesh, DeviceGroup, get_device_placement +from twinkle.advantage import GRPOAdvantage +from twinkle.checkpoint_engine import CheckpointEngineManager +from twinkle.data_format import SamplingParams +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.model.megatron import MegatronModel +from twinkle.metric import CompletionRewardMetric +from twinkle.preprocessor.llm import GSM8KProcessor +from 
twinkle.processor import InputProcessor +from twinkle.reward import GSM8KAccuracyReward, GSM8KFormatReward +from twinkle.sampler import vLLMSampler +from twinkle.template import Template + +MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3-4B') +MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4)) +SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS',4)) +NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS +NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8)) +MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096)) +LEARNING_RATE = float(os.environ.get('LR', 1e-5)) +MAX_STEPS = int(os.environ.get('MAX_STEPS', 200)) +BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 16)) # global prompt-level, global completion-level batch size = BATCH_SIZE * num_generations * dp_size +MINI_BATCH_SIZE = int(os.environ.get('MINI_BATCH_SIZE', 16)) # global completion-level mini-batch-size +MICRO_BATCH_SIZE = int(os.environ.get('MICRO_BATCH_SIZE', 2)) # per-device-micro-batch-size (completion-level), batch_size in forward_backward +GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRADIENT_ACCUMULATION_STEPS', 1)) +ADAPTER_NAME = 'default' + +def create_gsm8k_dataset(): + dataset = Dataset(DatasetMeta('ms://modelscope/gsm8k', subset_name='main', split='train')) + dataset.set_template('Template', model_id=MODEL_ID, max_length=2048) + dataset.map(GSM8KProcessor()) + dataset.encode(add_generation_prompt=True) + return dataset + +def compute_rewards( + trajectories: List[Dict[str, Any]], +) -> Tuple[List[float], List[float], List[float]]: + accuracy_reward_fn = GSM8KAccuracyReward() + format_reward_fn = GSM8KFormatReward() + accuracy_rewards = accuracy_reward_fn(trajectories) + format_rewards = format_reward_fn(trajectories) + total_rewards = [a + f for a, f in zip(accuracy_rewards, format_rewards)] + return total_rewards, format_rewards, accuracy_rewards + +def main(): + # set sampler and model separate to use different gpus + device_groups = [ + DeviceGroup(name='model',ranks=list(range(MODEL_GPUS)),device_type='GPU'), + DeviceGroup(name='sampler',ranks=list(range(MODEL_GPUS, NUM_GPUS)),device_type='GPU'), + ] + model_mesh = DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS) + sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, dp_size=SAMPLER_GPUS) + twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS, groups=device_groups, lazy_collect=False) + + lora_config = LoraConfig(target_modules='all-linear', r=32, lora_alpha=64, lora_dropout=0.05) + model = MegatronModel(model_id=MODEL_ID, device_mesh=model_mesh, remote_group='model', mixed_precision='bf16') + model.add_adapter_to_model(ADAPTER_NAME, lora_config, gradient_accumulation_steps=1) + model.set_optimizer('default', lr=LEARNING_RATE) + model.set_lr_scheduler('default', lr_decay_steps=MAX_STEPS, max_lr=LEARNING_RATE) + model.set_loss('GRPOLoss', epsilon=0.2) + model.set_processor(InputProcessor) + model.set_template('Template', model_id=MODEL_ID) + + sampler = vLLMSampler( + model_id=MODEL_ID, + engine_args={ + 'gpu_memory_utilization': 0.8, + 'max_model_len': 4096, + 'max_lora_rank': 32, # save as lora_config + 'enable_lora': True, + }, + device_mesh=sampler_mesh, + remote_group='sampler', + ) + sampler.set_template(Template, model_id=MODEL_ID) + ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler) + dataloader = DataLoader( + dataset=create_gsm8k_dataset, + batch_size=BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS, + min_batch_size=BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS, + device_mesh=model_mesh, + remote_group='model', + ) + 
advantage_fn = GRPOAdvantage() + metrics = CompletionRewardMetric() + sampling_params = SamplingParams(max_tokens=MAX_NEW_TOKENS) + optim_step = 0 + print(get_device_placement()) + + for batch in dataloader: + if optim_step >= MAX_STEPS: + break + metrics.reset() + global_prompts = batch if isinstance(batch, list) else [batch] + ckpt_manager.sync_weights(merge_and_sync=False) + sampler.reset_prefix_cache() + sample_response = sampler.sample( + global_prompts*NUM_GENERATIONS, + sampling_params, + num_samples=1, + ) + all_input_data: List[Dict[str, Any]] = [] + all_old_logps: List[List[float]] = [] + all_completion_lengths: List[int] = [] + + for sequence in sample_response.sequences: + all_input_data.append(sequence.new_input_feature) + all_old_logps.append(sequence.logprobs) + all_completion_lengths.append(len(sequence.tokens)) + total_rewards, format_rewards, accuracy_rewards = compute_rewards( + all_input_data + ) + metrics.accumulate( + completion_lengths=all_completion_lengths, + rewards={ + 'total': total_rewards, + 'format': format_rewards, + 'accuracy': accuracy_rewards, + }, + ) + advantages = advantage_fn(total_rewards, num_generations=NUM_GENERATIONS, scale='group').tolist() + # Split completions into mini-batches and run one optim step per mini-batch. + total_completions = len(all_input_data) + for mb_start in range(0, total_completions, MINI_BATCH_SIZE): + mb_end = min(mb_start + MINI_BATCH_SIZE, total_completions) + mb_inputs = all_input_data[mb_start:mb_end] + mb_old_logps = all_old_logps[mb_start:mb_end] + mb_advantages = advantages[mb_start:mb_end] + + model.forward_backward( + inputs=mb_inputs, + old_logps=mb_old_logps, + advantages=mb_advantages, + micro_batch_size=MICRO_BATCH_SIZE, + ) + model.clip_grad_and_step() + optim_step += 1 + + if optim_step >= MAX_STEPS: + break + log_dict = metrics.calculate() + log_dict.update(model.calculate_metric(is_training=True)) + metrics.reset() + print(f'[Step {optim_step}/{MAX_STEPS}] {log_dict}') + + print(f'Training completed. 
optim_steps={optim_step}') + model.save('grpo-gsm8k-checkpoint') + +if __name__ == '__main__': + main() +``` + +在上面的代码中,我们给出了一个RL的训练代码。我们可以在代码中清晰看到数据如何构造、sampler/model如何声明和传参,以及advantage和loss的构造过程。 +这个过程没有任何显示引用`ray`的地方。我们仅在初始化时声明了ray模式: + +```python +twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS, groups=device_groups, lazy_collect=False) +``` + +开发者可以定制模型等组件的构造和调用方式,所有transformers、Megatron的模型参数都可以在构造模型时传入。 + +后面所有的ray调用和数据分发,都是隐式进行的。运行这个脚本需要提前安装好ray。之后这样运行: + +```shell +python train.py +``` + +### 远程训练 + +Twinkle的一大特色是支持多租户用户混合训练。具体来说,多个用户可以使用一个基模进行lora训练,这样可以极大减小服务端部署成本。 + +假设我们使用八卡开启一个服务。首先我们需要启动ray集群: + +```shell +CUDA_VISIBLE_DEVICES=0,1 ray start --head --port=6379 --num-gpus=2 +CUDA_VISIBLE_DEVICES=2,3 ray start --address=127.0.0.1:6379 --num-gpus=2 +CUDA_VISIBLE_DEVICES="" ray start --address=127.0.0.1:6379 --num-gpus=0 +``` + +我们启动了一组包含三个node的ray集群: +- 01两张卡作为一个node +- 23两张卡作为一个node +- cpu资源作为一个node + +如果在生产环境使用,可以启动更多node,并部署更多replica以兼容更大的用户量。在这里我们仅以四卡作为例子。 + +下面,启动server: +```shell + +cd cookbook/client/twinkle/transformer +python server.py +``` + +服务端会启动一个包含了一个sampler集群、一个模型集群、一个工具集群的三个服务。 + +下面可以进行client端训练: +```python +import dotenv +dotenv.load_dotenv('.env') +import re +from twinkle.data_format import Trajectory +from twinkle.reward.base import Reward +import gc +from peft import LoraConfig +from typing import List, Tuple + +from twinkle import get_logger +from twinkle.advantage import GRPOAdvantage +from twinkle.dataset import DatasetMeta +from twinkle.metric import CompletionRewardMetric +from twinkle_client import init_twinkle_client +from twinkle_client.dataloader import DataLoader +from twinkle_client.dataset import Dataset +from twinkle_client.model import MultiLoraTransformersModel +from twinkle_client.sampler import vLLMSampler + +logger = get_logger() + +# ========== Configuration ========== +MODEL_ID = 'ms://Qwen/Qwen3-4B' +NUM_GENERATIONS = 4 +MAX_NEW_TOKENS = 1024 +LEARNING_RATE = 1e-5 +MAX_STEPS = 10 +BATCH_SIZE = 2 +TEMPERATURE = 1.0 +SYNC_INTERVAL = 1 # Save weights for sampler every N steps +GRADIENT_ACCUMULATION_STEPS = 4 + + +def create_countdown_dataset(): + """Create Countdown Game dataset for GRPO training.""" + + dataset = Dataset(dataset_meta=DatasetMeta('ms://zouxuhong/Countdown-Tasks-3to4', data_slice=range(500))) + dataset.set_template('Template', model_id=MODEL_ID, max_length=8192) + dataset.map('CountdownProcessor') + dataset.encode(add_generation_prompt=True, batched=True) + return dataset + + +class CountDownAccuracy(Reward): + + @staticmethod + def countdown_accuracy_reward(completion: str, target: int, nums: List[int]) -> float: + """Accuracy reward: checks if equation is correct.""" + try: + match = re.search(r'(.*?)<\/answer>', completion) + if match is None: + return 0.0 + equation = match.group(1).strip() + if '=' in equation: + equation = equation.split('=')[0] + used_numbers = [int(n) for n in re.findall(r'\d+', equation)] + if sorted(used_numbers) != sorted(nums): + return 0.0 + if not re.match(r'^[\d+\-*/().\s]+$', equation): + return 0.0 + result = eval(equation, {'__builtins__': None}, {}) + return 1.0 if abs(float(result) - float(target)) < 1e-5 else 0.0 + except Exception: # noqa + return 0.0 + + def __call__(self, trajectories: List[Trajectory], ground_truths: List[Trajectory]): + rewards = [] + for trajectory in trajectories: + messages = trajectory.get('messages', []) + completion = '' + for msg in reversed(messages): + if msg.get('role') == 'assistant': + completion = msg.get('content', '') + break + user_data = 
trajectory.get('user_data', [{}]) + data = user_data[0] if isinstance(user_data, list) and user_data else {} + target = data.get('target', 0) + nums = data.get('nums', []) + acc_reward = self.countdown_accuracy_reward(completion, target, nums) + rewards.append(acc_reward) + return rewards + + +def compute_rewards(trajectories: List[dict], ) -> Tuple[List[float], List[float], List[float]]: + """Compute format and accuracy rewards for Countdown game.""" + from twinkle.reward import FormatReward + format_rewards = FormatReward()(trajectories, []) + accuracy_rewards = CountDownAccuracy()(trajectories, []) + total_rewards = [a + b for a, b in zip(accuracy_rewards, format_rewards)] + return total_rewards, format_rewards, accuracy_rewards + + +def train(): + # Step 1: Initialize the Twinkle client + client = init_twinkle_client( + base_url='http://localhost:8000', + api_key='', + ) + + # Step 2: Prepare dataset and dataloader + dataset = create_countdown_dataset() + dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE) + + # Step 3: Configure the training model + model = MultiLoraTransformersModel(model_id=MODEL_ID) + + lora_config = LoraConfig( + target_modules='all-linear', + r=8, + lora_alpha=32, + lora_dropout=0.05, + ) + model.add_adapter_to_model( + 'default', + lora_config, + gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, + ) + + # Set GRPO loss (the key difference from SFT training) + model.set_loss('GRPOLoss', epsilon=0.2, beta=0.0) + + # Set optimizer and LR scheduler + model.set_optimizer('AdamW', lr=LEARNING_RATE) + model.set_lr_scheduler( + 'CosineWarmupScheduler', + num_warmup_steps=500, + num_training_steps=MAX_STEPS, + ) + + # Set processor and template for encoding inputs + model.set_processor('InputProcessor') + model.set_template('Template', model_id=MODEL_ID) + + # Step 4: Configure the sampler + sampler = vLLMSampler(model_id=MODEL_ID) + sampler.set_template('Template', model_id=MODEL_ID) + + # Step 5: Setup metrics and advantage function + advantage_fn = GRPOAdvantage() + metrics = CompletionRewardMetric() + + sampling_params = { + 'max_tokens': MAX_NEW_TOKENS, + 'temperature': TEMPERATURE, + 'top_p': 0.95, + } + + # Track the current adapter path for sampling + current_adapter_uri = None + + step = 0 + for batch in dataloader: + if step >= MAX_STEPS: + break + + metrics.reset() + prompts = batch if isinstance(batch, list) else [batch] + + # ========== 1. Save weights and update adapter_uri ========== + # Instead of sync_weights, save the model checkpoint and pass + # the resulting path to the sampler as adapter_uri + if step % SYNC_INTERVAL == 0: + logger.info(f'Step {step}: Saving weights for sampler...') + twinkle_path = model.save( + name=f'grpo-sampler-step-{step}', + save_optimizer=False, + ) + current_adapter_uri = twinkle_path + logger.info(f'Step {step}: Saved weights to {current_adapter_uri}') + + # ========== 2. Sample completions ========== + sample_response = sampler.sample( + inputs=prompts, + sampling_params=sampling_params, + adapter_uri=current_adapter_uri, + num_samples=NUM_GENERATIONS, + ) + + input_features = [] + old_logps_list = [] + completion_lengths = [] + + sequences = sample_response.get('sequences', []) + for seq in sequences: + input_features.append(seq.get('new_input_feature', seq)) + old_logps_list.append(seq.get('logprobs', [])) + completion_lengths.append(len(seq.get('tokens', []))) + + if not input_features: + logger.warning(f'Step {step}: No valid samples, skipping') + step += 1 + continue + + # ========== 3. 
Compute rewards ========== + total_rewards, format_rewards, accuracy_rewards = compute_rewards(input_features) + metrics.accumulate( + None, + None, + completion_lengths=completion_lengths, + rewards={ + 'total': total_rewards, + 'format': format_rewards, + 'accuracy': accuracy_rewards, + }) + + # ========== 4. Compute advantages ========== + advantages = advantage_fn( + total_rewards, + num_generations=NUM_GENERATIONS, + scale='group', + ).tolist() + + frac_zero_std = (1.0 if all(abs(a) < 1e-8 for a in advantages) else 0.0) + if frac_zero_std == 1.0: + logger.info(f'Step {step}: All advantages are zero, skipping training') + step += 1 + continue + + # ========== 5. Training step (GRPO) ========== + # forward_backward with GRPO loss: passes advantages and old_logps + # to the server-side GRPOLoss for proper policy optimization + model.forward_backward( + inputs=input_features, + advantages=advantages, + old_logps=old_logps_list, + ) + + # Gradient clipping and optimizer step + model.clip_grad_norm(1.0) + model.step() + model.zero_grad() + model.lr_step() + + gc.collect() + + # ========== 6. Log ========== + log_dict = metrics.calculate() + log_dict.update(model.calculate_metric()) + log_dict['train/frac_reward_zero_std'] = frac_zero_std + logger.info(f'Step {step}: {log_dict}') + step += 1 + + # Save final checkpoint + twinkle_path = model.save(name='grpo-countdown-final', save_optimizer=True) + logger.info(f'Saved final checkpoint: {twinkle_path}') + + +if __name__ == '__main__': + train() +``` + +多个开发者可以并行使用这个服务的单个基模并行训练和采样。并且,他们进行的训练方式允许不同。例如,A用户可以进行SFT,B用户可以进行RL,C用户可以进行采样。 同样,Twinkle也支持Tinker-like API进行远端训练: + +>[!Note] +> 需要注意的一点,在当前Twinkle的实现中,客户端的Twinkle API和Tinker API是无法同时在一个服务端使用的。当你需要提供Tinker API时,你需要启动cookbook/client/tinker下的服务。 +> 这个问题会在接下来的迭代高优解决。 + +```python +from tinker import types +from tqdm import tqdm +from tinker import ServiceClient +from twinkle.dataloader import DataLoader +from twinkle.dataset import Dataset, DatasetMeta +from twinkle.preprocessor import SelfCognitionProcessor +from twinkle.server.tinker.common import input_feature_to_datum + +# The base model to fine-tune / evaluate +base_model = 'Qwen/Qwen3-4B' + + +def train(): + # Step 1: Prepare the dataset + + # Load the self-cognition dataset from ModelScope (first 500 examples) + dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) + + # Apply the chat template matching the base model (max 256 tokens per sample) + dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256) + + # Replace placeholder names with custom model/author identity + dataset.map(SelfCognitionProcessor('twinkle模型', 'twinkle团队'), load_from_cache_file=False) + + # Tokenize and encode the dataset into model-ready input features + dataset.encode(batched=True, load_from_cache_file=False) + + # Wrap the dataset into a DataLoader that yields batches of size 8 + dataloader = DataLoader(dataset=dataset, batch_size=8) + + # Step 2: Initialize the training client + # Connect to the Twinkle server running locally + service_client = ServiceClient(base_url='http://localhost:8000', api_key='your-api-key') + # Create a LoRA training client for the base model (rank=16 for the LoRA adapter) + training_client = service_client.create_lora_training_client(base_model=base_model, rank=16) + + # Step 3: Run the training loop + for epoch in range(3): + print(f'Epoch {epoch}') + for step, batch in tqdm(enumerate(dataloader)): + # Convert each InputFeature into a Datum for the Tinker API + 
input_datum = [input_feature_to_datum(input_feature) for input_feature in batch] + + # Send data to server: forward + backward pass (computes gradients) + fwdbwd_future = training_client.forward_backward(input_datum, 'cross_entropy') + + # Optimizer step: update model weights with Adam + optim_future = training_client.optim_step(types.AdamParams(learning_rate=1e-4)) + + # Wait for both operations to complete + fwdbwd_future.result() + optim_result = optim_future.result() + print(f'Training Metrics: {optim_result}') + + # Save a checkpoint after each epoch + save_future = training_client.save_state(f'twinkle-lora-{epoch}') + save_result = save_future.result() + print(f'Saved checkpoint to {save_result.path}') + + +if __name__ == '__main__': + train() +``` + +### 使用魔搭社区提供的TaaS化训练服务 + +在 Twinkle 框架开源的同时,我们依托ModelScope的后台服务,也提供了托管的模型训练服务(Training as a Service),开发者可以通过这一服务, 免费体验Twinkle的训练API。 +该服务和上面叙述的Tinker API部分代码是相同的,唯一不同的是Endpoint和Token需要使用魔搭官方的对应信息。关于如何使用官方服务,请查看[训练服务](./训练服务.md)的详细描述。 + +## 使用Hugging Face的模型 + +切换前缀即可。 + +```text +ms://Qwen/Qwen3-4B -> hf://Qwen/Qwen3-4B +``` + +## 🛠️ Twinkle✨ 模块化生态系统 + +
+| Component | Responsibility |
+|------------------|----------------------------------------------------|
+| Dataset | Data loading and preprocessing |
+| Template | Encoding and decoding |
+| DataLoader | Data distribution and batching |
+| Preprocessor | Data ETL |
+| InputProcessor | Task-specific input processing |
+| Model | LLMs, with support for multiple frameworks |
+| Sampler | Sampling logic |
+| Loss | Loss functions |
+| Metric | Training metric collection |
+| Reward | Reward functions |
+| Advantage | Advantage functions |
+| CheckpointEngine | Weight synchronization |
+| Patch | Model fix patches |
+| Module | Building blocks such as optimizers |
+| Kernel | Kernels/operators |
+| Server | Launching backend clusters |
+| Client | Client-side code |
+| Infra | Isolating the differences between ray and torchrun |
+| Plugin | Using components from the hub |
+| Hub | Integration with the HF/MS hub libraries |
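+All of these components sit behind the same training API on every backend. As a quick illustration of what the Infra layer isolates, a script can switch between torchrun and Ray by changing only the `twinkle.initialize` call; this is a sketch assembled from the examples earlier on this page, with the Ray arguments mirroring the RL example above:
+
+```python
+import twinkle
+from twinkle import DeviceMesh
+
+# The rest of the training code stays the same; only initialization differs.
+device_mesh = DeviceMesh.from_sizes(fsdp_size=4, dp_size=2)
+
+# torchrun mode:
+twinkle.initialize(mode='local', global_device_mesh=device_mesh)
+
+# Ray mode (device_groups defined as in the RL example above):
+# twinkle.initialize(mode='ray', nproc_per_node=8, groups=device_groups, lazy_collect=False)
+```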
+

## Twinkle's Customizable Components

In Twinkle's design, torchrun, Ray, and HTTP training use the same API and share the same components and input/output structures. As a result, many of its components can be customized by developers to implement new algorithms.
@@ -77,10 +876,10 @@ DeviceGroup:定义本次训练需要多少个资源组。定义后,组件可以

```python
from twinkle.model import TransformersModel
-model = TransformersModel(model_id='ms://Qwen/Qwen2.5-7B-Instruct', remote_group='default', device_mesh=device_mesh)
+model = TransformersModel(model_id='ms://Qwen/Qwen3-4B', remote_group='default', device_mesh=device_mesh)
# or
from twinkle.model import MegatronModel
-model = MegatronModel(model_id='ms://Qwen/Qwen2.5-7B-Instruct', remote_group='default', device_mesh=device_mesh)
+model = MegatronModel(model_id='ms://Qwen/Qwen3-4B', remote_group='default', device_mesh=device_mesh)
```

DeviceMesh specifies the topology of components such as the model within the resource groups, that is, how parallelism is performed. This affects a series of framework decisions, such as how data is fetched, consumed, and returned.
@@ -106,7 +905,7 @@ def train():
    # 1000 samples
    dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000)))
    # Set template to prepare encoding
-    dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-7B-Instruct')
+    dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B')
    # Preprocess the dataset to standard format
    dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
    # Encode dataset
@@ -114,7 +913,7 @@ def train():
    # Global batch size = 8, for GPUs, so 1 sample per GPU
    dataloader = DataLoader(dataset=dataset, batch_size=8, min_batch_size=8)
    # Use a TransformersModel
-    model = TransformersModel(model_id='ms://Qwen/Qwen2.5-7B-Instruct', remote_group='default')
+    model = TransformersModel(model_id='ms://Qwen/Qwen3-4B', remote_group='default')

    lora_config = LoraConfig(
        r=8,
diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md"
index 24d50728..c0d5b68f 100644
--- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md"
+++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md"
@@ -21,6 +21,50 @@

We strongly recommend that developers review our [cookbook](https://github.com/modelscope/twinkle/tree/main/cookbook/client/tinker) and build upon the training code provided there.

+Sample code:
+
+```python
+import os
+from tqdm import tqdm
+from tinker import types
+from twinkle_client import init_tinker_client
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.preprocessor import SelfCognitionProcessor
+from twinkle.server.tinker.common import input_feature_to_datum
+
+base_model = 'ms://Qwen/Qwen3-30B-A3B-Instruct-2507'
+base_url = 'https://www.modelscope.cn/twinkle'
+api_key = os.environ.get('MODELSCOPE_TOKEN')
+
+# Use the twinkle Dataset to load the data
+dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500)))
+dataset.set_template('Template', model_id=base_model, max_length=256)
+dataset.map(SelfCognitionProcessor('Twinkle Model', 'ModelScope Team'), load_from_cache_file=False)
+dataset.encode(batched=True, load_from_cache_file=False)
+dataloader = DataLoader(dataset=dataset, batch_size=8)
+
+# Initialize the Tinker client before importing ServiceClient
+init_tinker_client()
+from tinker import ServiceClient
+
+service_client = ServiceClient(base_url=base_url, api_key=api_key)
+training_client = service_client.create_lora_training_client(base_model=base_model[len('ms://'):], rank=16)
+
+# Training loop: use input_feature_to_datum to convert the input format
+for epoch in range(3):
+    for step, batch in tqdm(enumerate(dataloader)):
+        input_datum = [input_feature_to_datum(input_feature) for input_feature in batch]
+
+        fwdbwd_future = training_client.forward_backward(input_datum, "cross_entropy")
+        optim_future = training_client.optim_step(types.AdamParams(learning_rate=1e-4))
+
+        fwdbwd_result = fwdbwd_future.result()
+        optim_result = optim_future.result()
+
+    training_client.save_state(f"twinkle-lora-{epoch}").result()
+```
+
> The current service is compatible with the tinker client, so please use the tinker cookbooks for training. In a future version, a single server will support both twinkle and tinker clients.

Developers can customize datasets, advantage functions, rewards, templates, and more. The Loss component is not yet customizable, since it needs to be executed on the server side (for security reasons).
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\344\274\230\345\212\277/GRPOAdvantage.md" "b/docs/source_zh/\347\273\204\344\273\266/\344\274\230\345\212\277/GRPOAdvantage.md"
index 574ad309..77f8441e 100644
--- "a/docs/source_zh/\347\273\204\344\273\266/\344\274\230\345\212\277/GRPOAdvantage.md"
+++ "b/docs/source_zh/\347\273\204\344\273\266/\344\274\230\345\212\277/GRPOAdvantage.md"
@@ -41,8 +41,8 @@ from twinkle.sampler import vLLMSampler
from twinkle.reward import MathReward

# Create the components
-actor = TransformersModel(model_id='Qwen/Qwen2.5-7B-Instruct')
-sampler = vLLMSampler(model_id='Qwen/Qwen2.5-7B-Instruct')
+actor = TransformersModel(model_id='ms://Qwen/Qwen3-4B')
+sampler = vLLMSampler(model_id='ms://Qwen/Qwen3-4B')
reward_fn = MathReward()
advantage_fn = GRPOAdvantage()
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\344\274\230\345\212\277/RLOOAdvantage.md" "b/docs/source_zh/\347\273\204\344\273\266/\344\274\230\345\212\277/RLOOAdvantage.md"
index c05d9362..50a7c53b 100644
--- "a/docs/source_zh/\347\273\204\344\273\266/\344\274\230\345\212\277/RLOOAdvantage.md"
+++ "b/docs/source_zh/\347\273\204\344\273\266/\344\274\230\345\212\277/RLOOAdvantage.md"
@@ -38,8 +38,8 @@ from twinkle.sampler import vLLMSampler
from twinkle.reward import MathReward

# Create the components
-actor = TransformersModel(model_id='Qwen/Qwen2.5-7B-Instruct')
-sampler = vLLMSampler(model_id='Qwen/Qwen2.5-7B-Instruct')
+actor = TransformersModel(model_id='ms://Qwen/Qwen3-4B')
+sampler = vLLMSampler(model_id='ms://Qwen/Qwen3-4B')
reward_fn = MathReward()
advantage_fn = RLOOAdvantage()
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/Reward.md" "b/docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/Reward.md"
index 70118a12..2728612c 100644
--- "a/docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/Reward.md"
+++ "b/docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/Reward.md"
@@ -87,7 +87,7 @@ from twinkle.sampler import vLLMSampler
from twinkle.reward import MathReward
from twinkle.advantage import GRPOAdvantage

-sampler = vLLMSampler(model_id='Qwen/Qwen2.5-7B-Instruct')
+sampler = vLLMSampler(model_id='ms://Qwen/Qwen3-4B')
reward_fn = MathReward()
advantage_fn = GRPOAdvantage()
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\346\240\274\345\274\217/Sampling.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\346\240\274\345\274\217/Sampling.md"
index c3d48352..71977be1 100644
--- "a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\346\240\274\345\274\217/Sampling.md"
+++ "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\346\240\274\345\274\217/Sampling.md"
@@ -62,7 +62,7 @@ class SampleResponse:
from twinkle.data_format import SamplingParams, SampleResponse
from twinkle.sampler import vLLMSampler

-sampler = vLLMSampler(model_id='Qwen/Qwen2.5-7B-Instruct')
+sampler = vLLMSampler(model_id='ms://Qwen/Qwen3-4B')

params = SamplingParams(max_tokens=512, temperature=0.7, top_p=0.9)
response: SampleResponse = sampler.sample(trajectories, sampling_params=params, num_samples=4)
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md"
index 86e580d2..11d8a716 100644
--- "a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md"
+++ "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md"
@@ -60,7 +60,7 @@ dataset = Dataset(DatasetMeta(dataset_id='my/custom/dataset.jsonl', data_slice=r

The Template component is responsible for converting raw strings and multimodal image data into model input tokens. A dataset can set a Template to perform the `encode` step.

```python
-dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-7B-Instruct', max_length=512)
+dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B', max_length=512)
```

The set_template method accepts `kwargs` (such as `max_length` in the example), which are passed through as constructor arguments for the `Template`.
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/MegatronModel.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/MegatronModel.md"
index a8037ae0..08986bbd 100644
--- "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/MegatronModel.md"
+++ "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/MegatronModel.md"
@@ -35,7 +35,7 @@ from twinkle.model import MegatronModel
from twinkle import DeviceMesh
from twinkle.dataloader import DataLoader
dataloader = DataLoader(...)
-model = MegatronModel(model_id='ms://Qwen/Qwen2.5-7B-Instruct', device_mesh=DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2), remote_group='actor')
+model = MegatronModel(model_id='ms://Qwen/Qwen3-4B', device_mesh=DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2), remote_group='actor')
model.add_adapter_to_model(...)
model.set_optimizer('default', adapter_name='...')
for data in dataloader:
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/TransformersModel.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/TransformersModel.md"
index 383f42ab..297ca54f 100644
--- "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/TransformersModel.md"
+++ "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/TransformersModel.md"
@@ -41,7 +41,7 @@ from twinkle.model import TransformersModel
from twinkle import DeviceMesh
from twinkle.dataloader import DataLoader
dataloader = DataLoader(...)
-model = TransformersModel(model_id='ms://Qwen/Qwen2.5-7B-Instruct', device_mesh=DeviceMesh.from_sizes(dp_size=2, fsdp_size=2), remote_group='actor')
+model = TransformersModel(model_id='ms://Qwen/Qwen3-4B', device_mesh=DeviceMesh.from_sizes(dp_size=2, fsdp_size=2), remote_group='actor')
model.add_adapter_to_model(...)
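# Each named adapter can be given its own optimizer; pass adapter_name to select which one it drives.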
model.set_optimizer(..., adapter_name='...') for data in dataloader: diff --git "a/docs/source_zh/\347\273\204\344\273\266/\351\207\207\346\240\267\345\231\250/TorchSampler.md" "b/docs/source_zh/\347\273\204\344\273\266/\351\207\207\346\240\267\345\231\250/TorchSampler.md" index 3c2a78ee..c23246f7 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\351\207\207\346\240\267\345\231\250/TorchSampler.md" +++ "b/docs/source_zh/\347\273\204\344\273\266/\351\207\207\346\240\267\345\231\250/TorchSampler.md" @@ -9,7 +9,7 @@ from twinkle.sampler import TorchSampler from twinkle import DeviceMesh sampler = TorchSampler( - model_id='ms://Qwen/Qwen2.5-7B-Instruct', + model_id='ms://Qwen/Qwen3-4B', device_mesh=DeviceMesh.from_sizes(dp_size=1), ) diff --git "a/docs/source_zh/\347\273\204\344\273\266/\351\207\207\346\240\267\345\231\250/vLLMSampler.md" "b/docs/source_zh/\347\273\204\344\273\266/\351\207\207\346\240\267\345\231\250/vLLMSampler.md" index 84fc584f..32fe18b6 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\351\207\207\346\240\267\345\231\250/vLLMSampler.md" +++ "b/docs/source_zh/\347\273\204\344\273\266/\351\207\207\346\240\267\345\231\250/vLLMSampler.md" @@ -11,7 +11,7 @@ from twinkle import DeviceMesh # 创建采样器 sampler = vLLMSampler( - model_id='ms://Qwen/Qwen2.5-7B-Instruct', + model_id='ms://Qwen/Qwen3-4B', device_mesh=DeviceMesh.from_sizes(dp_size=2, tp_size=2), remote_group='sampler_group' ) @@ -60,7 +60,7 @@ twinkle.initialize('ray', groups=device_groups) # 创建远程采样器 sampler = vLLMSampler( - model_id='ms://Qwen/Qwen2.5-7B-Instruct', + model_id='ms://Qwen/Qwen3-4B', device_mesh=DeviceMesh.from_sizes(dp_size=4), remote_group='sampler' ) diff --git a/src/twinkle/data_format/__init__.py b/src/twinkle/data_format/__init__.py index 19bc68a4..2b2c3cf0 100644 --- a/src/twinkle/data_format/__init__.py +++ b/src/twinkle/data_format/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) ModelScope Contributors. All rights reserved. from .input_feature import InputFeature from .message import Message, Tool, ToolCall -from .output import ModelOutput +from .output import LossOutput, ModelOutput from .sampling import SampledSequence, SampleResponse, SamplingParams from .trajectory import Trajectory diff --git a/src/twinkle/data_format/output.py b/src/twinkle/data_format/output.py index 2f723e35..366e37c3 100644 --- a/src/twinkle/data_format/output.py +++ b/src/twinkle/data_format/output.py @@ -1,7 +1,7 @@ # Copyright (c) ModelScope Contributors. All rights reserved. import numpy as np import sys -from typing import TYPE_CHECKING, Any, List, Union +from typing import Any, List, Optional, Union if sys.version_info[:2] <= (3, 11): # Pydantic requirements. @@ -9,10 +9,7 @@ else: from typing import TypedDict -if TYPE_CHECKING: - import torch - -OutputType = Union[np.ndarray, 'torch.Tensor', List[Any]] +OutputType = Union[np.ndarray, 'torch.Tensor', List[Any], float] class ModelOutput(TypedDict, total=False): @@ -21,6 +18,15 @@ class ModelOutput(TypedDict, total=False): Text-related fields: logits: The logits output by the model. loss: The loss calculated by the model. + logps: The log-probabilities of correct tokens by the model. 
""" - logits: OutputType - loss: OutputType + logits: Optional[OutputType] + loss: Optional[OutputType] + logps: Optional[OutputType] + + +class LossOutput(TypedDict, total=False): + """The output structure for the Losses""" + + loss: Optional[OutputType] + num_tokens: Optional[int] diff --git a/src/twinkle/infra/__init__.py b/src/twinkle/infra/__init__.py index cc5e4ec6..1e8f773f 100644 --- a/src/twinkle/infra/__init__.py +++ b/src/twinkle/infra/__init__.py @@ -103,6 +103,9 @@ def get_device_placement(device_group=None) -> str: if device_group is None: device_group = _device_group + if device_group is None: + return 'No device group provided.' + WIDTH = 80 def box_line(content='', align='left', prefix='│', suffix='│'): diff --git a/src/twinkle/loss/base.py b/src/twinkle/loss/base.py index 1d4c77ce..3981186c 100644 --- a/src/twinkle/loss/base.py +++ b/src/twinkle/loss/base.py @@ -1,8 +1,8 @@ # Copyright (c) ModelScope Contributors. All rights reserved. -from twinkle.data_format import InputFeature, ModelOutput +from twinkle.data_format import InputFeature, LossOutput, ModelOutput class Loss: - def __call__(self, inputs: InputFeature, outputs: ModelOutput, **kwargs): + def __call__(self, inputs: InputFeature, outputs: ModelOutput, **kwargs) -> LossOutput: ... diff --git a/src/twinkle/loss/chunked_cross_entropy.py b/src/twinkle/loss/chunked_cross_entropy.py index f8b60bc8..22d3d407 100644 --- a/src/twinkle/loss/chunked_cross_entropy.py +++ b/src/twinkle/loss/chunked_cross_entropy.py @@ -2,10 +2,12 @@ import math from typing import Any +from ..data_format import LossOutput from .base import Loss class ChunkedCrossEntropyLoss(Loss): + """TODO untested code""" def __init__(self, chunk_size): self.chunk_size = chunk_size @@ -58,4 +60,4 @@ def backward(ctx: Any, *grad_outputs: Any): logits = outputs['logits'] labels = inputs['labels'] - return ChunkedCrossEntropyLossFunc.apply(logits, labels, self.chunk_size) + return LossOutput(loss=ChunkedCrossEntropyLossFunc.apply(logits, labels, self.chunk_size), num_tokens=0) diff --git a/src/twinkle/loss/cross_entropy.py b/src/twinkle/loss/cross_entropy.py index c5f25b9c..12851d45 100644 --- a/src/twinkle/loss/cross_entropy.py +++ b/src/twinkle/loss/cross_entropy.py @@ -1,4 +1,6 @@ # Copyright (c) ModelScope Contributors. All rights reserved. +from twinkle.data_format import LossOutput +from twinkle.utils import selective_log_softmax from .base import Loss @@ -13,6 +15,6 @@ def __call__(self, inputs, outputs, **kwargs): labels = inputs['labels'].view(-1) loss = torch.nn.CrossEntropyLoss(reduction=self.reduction)(logits, labels) if self.reduction != 'sum': - return loss + return LossOutput(loss=loss, num_tokens=0) else: - return loss, (labels != -100).sum() + return LossOutput(loss=loss, num_tokens=(labels != -100).sum()) diff --git a/src/twinkle/loss/grpo.py b/src/twinkle/loss/grpo.py index ccd34fed..d4997710 100644 --- a/src/twinkle/loss/grpo.py +++ b/src/twinkle/loss/grpo.py @@ -2,7 +2,7 @@ import numpy as np from typing import TYPE_CHECKING, Dict, List, Optional, Union -from twinkle.data_format import Trajectory +from twinkle.data_format import LossOutput from twinkle.loss.base import Loss from twinkle.utils.torch_utils import selective_log_softmax @@ -263,7 +263,7 @@ def __call__( ref_logps: Optional['torch.Tensor'] = None, advantages: Optional[Union['torch.Tensor', List[float], np.ndarray]] = None, **kwargs, - ) -> 'torch.Tensor': + ): """ Compute GRPO loss. @@ -280,9 +280,6 @@ def __call__( Same padding/alignment rules as old_logps. 
             advantages: advantage values
             **kwargs: Additional arguments
 
         Returns:
-            loss: Scalar loss value
+            A LossOutput containing the scalar loss.
         """
         import torch
         labels = inputs.get('labels')
@@ -369,7 +369,7 @@ def __call__(
 
         loss = self._aggregate_loss(per_token_loss, loss_mask, **kwargs)
 
-        return loss
+        return LossOutput(loss=loss, num_tokens=0)
 
     def compute_metrics(
         self,
diff --git a/src/twinkle/loss/mse.py b/src/twinkle/loss/mse.py
index ffae868f..65801d42 100644
--- a/src/twinkle/loss/mse.py
+++ b/src/twinkle/loss/mse.py
@@ -1,4 +1,5 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
+from twinkle.data_format import LossOutput
 from .base import Loss
 
 
@@ -8,4 +9,4 @@ def __call__(self, inputs, outputs, **kwargs):
         import torch
         preds = outputs['logits']
         labels = inputs['labels']
-        return torch.nn.MSELoss()(preds, labels)
+        return LossOutput(loss=torch.nn.MSELoss()(preds, labels))
diff --git a/src/twinkle/loss/vocab_parallel_cross_entropy.py b/src/twinkle/loss/vocab_parallel_cross_entropy.py
index bc221afb..0a30429f 100644
--- a/src/twinkle/loss/vocab_parallel_cross_entropy.py
+++ b/src/twinkle/loss/vocab_parallel_cross_entropy.py
@@ -1,4 +1,5 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
+from ..data_format import LossOutput
 from .base import Loss
 
 
@@ -35,4 +36,7 @@ def __call__(self, inputs, outputs, **kwargs):
 
         # Apply loss mask
         loss_mask = (labels != self.ignore_index).float()
-        return (per_token_loss * loss_mask).sum(), loss_mask.sum().clamp(min=1)
+        return LossOutput(
+            loss=(per_token_loss * loss_mask).sum(),
+            num_tokens=loss_mask.sum().clamp(min=1),
+        )
diff --git a/src/twinkle/model/base.py b/src/twinkle/model/base.py
index b4550350..bee6c37d 100644
--- a/src/twinkle/model/base.py
+++ b/src/twinkle/model/base.py
@@ -23,92 +23,92 @@ class TwinkleModel(ABC):
     _checkpoint_engine = None
 
     @abstractmethod
-    def forward(self, *, inputs: Dict[str, Any], **kwargs):
+    def forward(self, *, inputs: Dict[str, Any], **kwargs) -> ModelOutput:
         ...
 
     @abstractmethod
-    def forward_only(self, *, inputs: Dict[str, Any], **kwargs):
+    def forward_only(self, *, inputs: Dict[str, Any], **kwargs) -> ModelOutput:
         ...
 
     @abstractmethod
-    def calculate_loss(self, **kwargs):
+    def calculate_loss(self, **kwargs) -> float:
         ...
 
     @abstractmethod
-    def backward(self, **kwargs):
+    def backward(self, **kwargs) -> None:
         ...
 
     @abstractmethod
-    def forward_backward(self, *, inputs: Dict[str, Any], **kwargs):
+    def forward_backward(self, *, inputs: Dict[str, Any], **kwargs) -> ModelOutput:
         ...
 
     @abstractmethod
-    def clip_grad_norm(self, max_grad_norm: float = 1.0, norm_type=2, **kwargs):
+    def clip_grad_norm(self, max_grad_norm: float = 1.0, norm_type=2, **kwargs) -> float:
         ...
 
     @abstractmethod
-    def step(self, **kwargs):
+    def step(self, **kwargs) -> None:
         ...
 
     @abstractmethod
-    def zero_grad(self, **kwargs):
+    def zero_grad(self, **kwargs) -> None:
         ...
 
     @abstractmethod
-    def lr_step(self, **kwargs):
+    def lr_step(self, **kwargs) -> None:
         ...
 
     @abstractmethod
-    def clip_grad_and_step(self, max_grad_norm: float = 1.0, norm_type=2, **kwargs):
+    def clip_grad_and_step(self, max_grad_norm: float = 1.0, norm_type=2, **kwargs) -> None:
         ...
 
     @abstractmethod
     def set_loss(self, loss_cls: Union[Loss, Type[Loss], str, Callable[[InputFeature, ModelOutput, ...],
-                                                                       'torch.Tensor']], **kwargs):
+                                                                       'torch.Tensor']], **kwargs) -> None:
         ...
 
     @abstractmethod
-    def set_optimizer(self, optimizer_cls: Union['Optimizer', Type['Optimizer'], str], **kwargs):
+    def set_optimizer(self, optimizer_cls: Union['Optimizer', Type['Optimizer'], str], **kwargs) -> None:
         ...
@abstractmethod - def set_lr_scheduler(self, scheduler_cls: Union['LRScheduler', Type['LRScheduler'], str], **kwargs): + def set_lr_scheduler(self, scheduler_cls: Union['LRScheduler', Type['LRScheduler'], str], **kwargs) -> None: ... @abstractmethod - def save(self, name: str, output_dir: Optional[str] = None, **kwargs): + def save(self, name: str, output_dir: Optional[str] = None, **kwargs) -> str: ... @abstractmethod - def load(self, name: str, output_dir: Optional[str] = None, **kwargs): + def load(self, name: str, output_dir: Optional[str] = None, **kwargs) -> None: ... @abstractmethod - def get_state_dict(self, **kwargs): + def get_state_dict(self, **kwargs) -> Dict[str, Any]: ... @abstractmethod - def apply_patch(self, patch_cls: Union[Patch, Type[Patch], str], **kwargs): + def apply_patch(self, patch_cls: Union[Patch, Type[Patch], str], **kwargs) -> None: ... @abstractmethod - def add_metric(self, metric_cls: Union[Metric, str], is_training: Optional[bool] = None, **kwargs): + def add_metric(self, metric_cls: Union[Metric, str], is_training: Optional[bool] = None, **kwargs) -> None: ... @abstractmethod - def calculate_metric(self, is_training: bool, **kwargs): + def calculate_metric(self, is_training: bool, **kwargs) -> Dict[str, Any]: ... @abstractmethod - def add_adapter_to_model(self, adapter_name: str, config_or_dir, **kwargs): + def add_adapter_to_model(self, adapter_name: str, config_or_dir, **kwargs) -> None: ... @abstractmethod - def set_template(self, template_cls: Union[Template, Type[Template], str], **kwargs): + def set_template(self, template_cls: Union[Template, Type[Template], str], **kwargs) -> None: ... @abstractmethod - def set_processor(self, processor_cls: Union[InputProcessor, Type[InputProcessor], str], **kwargs): + def set_processor(self, processor_cls: Union[InputProcessor, Type[InputProcessor], str], **kwargs) -> None: ... @abstractmethod @@ -119,7 +119,7 @@ def upload_to_hub(self, checkpoint_dir: str, hub_model_id: str, hub_token: Optional[str] = None, - async_upload: bool = True): + async_upload: bool = True) -> None: """Upload model checkpoint to hub. 
         Args:
diff --git a/src/twinkle/model/megatron/megatron.py b/src/twinkle/model/megatron/megatron.py
index 3df60298..512318bd 100644
--- a/src/twinkle/model/megatron/megatron.py
+++ b/src/twinkle/model/megatron/megatron.py
@@ -32,7 +32,7 @@
 from twinkle.patch import Patch, apply_patch
 from twinkle.processor import InputProcessor
 from twinkle.template import Template
-from twinkle.utils import construct_class, exists
+from twinkle.utils import construct_class, exists, selective_log_softmax
 
 from .strategy import MegatronStrategy
 
@@ -195,7 +195,7 @@ def __init__(
         self._seed = kwargs.pop('seed', None) or int(os.environ.get('TWINKLE_SEED', 42))
         self._default_tokenizer = None
         self.use_distributed_optimizer = kwargs.get('use_distributed_optimizer', True)
-        self.variable_seq_lengths = kwargs.get('variable_seq_lengths', True)
+        self.variable_seq_lengths = kwargs.get('variable_seq_lengths', False)
 
         torch_util.set_device()
         self.strategy = MegatronStrategy(self.device_mesh, mixed_precision=mixed_precision, **kwargs)
@@ -416,18 +416,17 @@ def forward_backward(self,
 
         _mb_counter = [0]  # mutable counter for closure
 
-        def post_loss_function(output_tensor, inputs):
+        def post_loss_function(output_tensor, inputs, logps):
             mb_idx = _mb_counter[0]
             _mb_counter[0] += 1
             current_kwargs = loss_extra_kwargs_per_mb[mb_idx % len(loss_extra_kwargs_per_mb)]
             outputs = ModelOutput(logits=output_tensor)
             result = loss_instance(inputs, outputs, **current_kwargs)
-            if isinstance(result, tuple):
-                losses, counts = result
-            else:
-                losses = result
+            losses = result['loss']
+            counts = result.get('num_tokens')
+            if not counts:
                 counts = torch.tensor(1, device=losses.device)
-            return self.strategy.gather_loss_for_cp(losses, counts, output_tensor)
+            return self.strategy.gather_loss_for_cp(losses, counts, output_tensor, logps)
 
         # Define forward step function for Megatron
         # forward_step_func(data_iterator, model) -> (output_tensor, partial(loss_func))
@@ -436,7 +435,13 @@ def forward_step_func(data_iterator, model):
             labels = batch.pop('labels', None)
             output_tensor = model(**batch)
             batch['labels'] = labels
-            return output_tensor, partial(post_loss_function, inputs=batch)
+            logps = None
+            if labels is not None:
+                loss_mask = (labels != -100).bool()
+                masked_labels = labels.clone()
+                masked_labels[~loss_mask] = 0
+                logps = selective_log_softmax(output_tensor, masked_labels)
+            return output_tensor, partial(post_loss_function, inputs=batch, logps=logps)
 
         # Get Megatron's forward-backward function
         # This automatically selects the right scheduler based on PP config:
@@ -467,6 +472,7 @@
         # Extract loss from results (only last PP stage returns non-empty)
         loss = torch.tensor(0.0).to(Platform.get_local_device())
         logits = []
+        logps = []
         count = 0
         if losses:
             for loss_dict in losses:
@@ -476,6 +482,8 @@
                     count += 1
                     if 'logits' in loss_dict:
                         logits.append(loss_dict['logits'])
+                    if 'logps' in loss_dict:
+                        logps.append(loss_dict['logps'])
                 elif isinstance(loss_dict, torch.Tensor):
                     loss += loss_dict
                     count += 1
@@ -507,23 +515,18 @@
             torch.distributed.all_reduce(loss, op=torch.distributed.ReduceOp.AVG, group=dp_cp_group)
 
         optimizer_config.inputs = inputs
-        if forward_only:
-            if len({logit.shape[0] for logit in logits}) == 1:
-                logits = torch.cat(logits, dim=0)
-            return {
-                'loss': loss,
-                'logits': logits,
-            }
-        else:
-            optimizer_config.outputs = ModelOutput(logits=logits, loss=loss)
-        if isinstance(loss, torch.Tensor):
-            return loss.detach().cpu().float().numpy()
-        return float(loss)
+        if len({_logps.shape[1] for _logps in logps}) == 1:
+            logps = torch.cat(logps, dim=0)
+        if isinstance(loss, torch.Tensor):
+            loss = loss.detach().cpu().float().numpy()
+        if not forward_only:
+            optimizer_config.outputs = ModelOutput(logits=None, loss=loss, logps=logps)
+        return ModelOutput(logits=None, loss=loss, logps=logps)
 
     @remote_function(dispatch='all')
     def clip_grad_norm(self, max_grad_norm: float = 1.0, norm_type: int = 2, **kwargs):
         # Megatron optimizer will cover this function.
-        pass
+        return 0.0
 
     @remote_function(dispatch='all')
     def step(self, **kwargs):
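Editor's note (not part of the patch): the `forward_step_func` hunk above and the `TransformersModel.forward`/`forward_only` hunks below repeat the same pattern: replace `-100` labels with a valid index, then gather per-token log-probabilities. A self-contained sketch of that computation, assuming `selective_log_softmax` gathers label log-probs from log-softmaxed logits (consistent with the `torch_utils.py` hunk at the end of this patch); `logps_from_labels` is a hypothetical helper:

```python
# Illustrative sketch only; not part of this patch.
import torch
import torch.nn.functional as F


def logps_from_labels(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    """logits: (batch, seq, vocab); labels: (batch, seq), with -100 marking ignored positions."""
    loss_mask = (labels != -100)
    masked_labels = labels.clone()
    masked_labels[~loss_mask] = 0  # any in-vocab index; downstream per-token weights mask these out
    # Assumed equivalent to selective_log_softmax(logits, masked_labels) in the non-parallel case:
    log_probs = F.log_softmax(logits.float(), dim=-1)
    return torch.gather(log_probs, dim=-1, index=masked_labels.unsqueeze(-1)).squeeze(-1)
```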
diff --git a/src/twinkle/model/megatron/strategy/megatron.py b/src/twinkle/model/megatron/strategy/megatron.py
index cddd6505..7dc8f4d6 100644
--- a/src/twinkle/model/megatron/strategy/megatron.py
+++ b/src/twinkle/model/megatron/strategy/megatron.py
@@ -133,7 +133,7 @@ def _wrap_with_megatron_ddp(
 
         return wrapped_models
 
-    def gather_loss_for_cp(self, local_loss_sum, local_count, logits):
+    def gather_loss_for_cp(self, local_loss_sum, local_count, logits, logps):
         import torch
         from megatron.core import parallel_state as mpu
         cp_size = mpu.get_context_parallel_world_size()
@@ -155,7 +155,10 @@ def gather_loss_for_cp(self, local_loss_sum, local_count, logits):
         else:
             loss = local_loss_sum / local_count.clamp(min=1)
 
-        return loss, {'loss': loss.detach(), 'logits': logits.detach()}
+        outputs = {'loss': loss.detach(), 'logits': logits.detach()}
+        if logps is not None:
+            outputs['logps'] = logps.detach()
+        return loss, outputs
 
     def get_model_config(
         self,
diff --git a/src/twinkle/model/transformers/transformers.py b/src/twinkle/model/transformers/transformers.py
index 1e8c1700..33f044af 100644
--- a/src/twinkle/model/transformers/transformers.py
+++ b/src/twinkle/model/transformers/transformers.py
@@ -34,7 +34,7 @@
 from twinkle.patch import Patch, apply_patch
 from twinkle.processor import InputProcessor
 from twinkle.template import Template
-from twinkle.utils import construct_class, torch_util
+from twinkle.utils import construct_class, selective_log_softmax, torch_util
 from twinkle.utils.framework import Torch
 from twinkle.utils.grad_clip import normalize_and_clip_grad_norm
 
@@ -369,6 +369,11 @@ def forward(self, *, inputs: Union[InputFeature, List[InputFeature], List[Trajectory]], **kwargs):
         optimizer_config.inputs = inputs
         optimizer_config.outputs = outputs
         optimizer_config.loss_value = outputs.get('aux_loss', 0)
+        if labels is not None:
+            loss_mask = (labels != -100).bool()
+            masked_labels = labels.clone()
+            masked_labels[~loss_mask] = 0
+            outputs['logps'] = selective_log_softmax(outputs['logits'], masked_labels)
         return outputs
 
     @remote_function(dispatch='slice_dp', collect='flatten')
@@ -411,6 +416,11 @@ def forward_only(self, *, inputs: Union[InputFeature, List[InputFeature], List[Trajectory]], **kwargs):
         optimizer_config.inputs = inputs
         optimizer_config.outputs = outputs
         optimizer_config.loss_value = outputs.get('aux_loss', 0)
+        if labels is not None:
+            loss_mask = (labels != -100).bool()
+            masked_labels = labels.clone()
+            masked_labels[~loss_mask] = 0
+            outputs['logps'] = selective_log_softmax(outputs['logits'], masked_labels)
         return outputs
 
     @remote_function(collect='mean')
@@ -432,10 +442,9 @@ def calculate_loss(self, **kwargs):
         outputs = optimizer_config.outputs
         assert inputs is not None and outputs is not None, 'Cannot calculate loss of empty inputs and outputs'
         result = loss_instance(inputs, outputs, **kwargs)
-        if isinstance(result, tuple):
-            loss_value, counts = result
-        else:
-            loss_value = result
+        loss_value = result['loss']
+        counts = result.get('num_tokens')
+        if not counts:
             counts = torch.tensor(0, device=loss_value.device)
         optimizer_config = self.optimizer_group[adapter_name]
         optimizer_config.num_tokens += counts.item()
@@ -487,10 +496,11 @@ def forward_backward(self, *, inputs: Union[InputFeature, List[InputFeature], Trajectory], **kwargs):
         Returns:
             The output of the model forward.
         """
-        self.forward(inputs=inputs, **kwargs)
+        outputs = self.forward(inputs=inputs, **kwargs)
         loss = self.calculate_loss(**kwargs)
+        outputs['loss'] = loss
         self.backward(**kwargs)
-        return loss
+        return outputs
 
     @remote_function()
     def clip_grad_norm(self, max_grad_norm: float = 1.0, norm_type=2, **kwargs):
@@ -538,11 +548,10 @@
 
     @remote_function(dispatch='all')
     def clip_grad_and_step(self, max_grad_norm: float = 1.0, norm_type=2, **kwargs):
-        grad_norm = self.clip_grad_norm(max_grad_norm, norm_type, **kwargs)
+        self.clip_grad_norm(max_grad_norm, norm_type, **kwargs)
         self.step(**kwargs)
         self.zero_grad(**kwargs)
         self.lr_step(**kwargs)
-        return grad_norm
 
     def _create_param_group(self,
                             adapter_name: str,
diff --git a/src/twinkle/processor/base.py b/src/twinkle/processor/base.py
index 6f10ff63..ca021ff4 100644
--- a/src/twinkle/processor/base.py
+++ b/src/twinkle/processor/base.py
@@ -165,6 +165,9 @@ def pad_cp_inputs(input_tensor: torch.Tensor, padding_value: int) -> torch.Tensor:
 
     def split_cp(self, inputs: List[Dict[str, Any]], **kwargs) -> List[Dict[str, Any]]:
+        if self.device_mesh is None:
+            return inputs
+
         def _split_cp(inputs: Dict[str, Any]) -> Dict[str, Any]:
             cp_size = self.device_mesh.cp_world_size
diff --git a/src/twinkle/reward/base.py b/src/twinkle/reward/base.py
index f80b2197..edf05a65 100644
--- a/src/twinkle/reward/base.py
+++ b/src/twinkle/reward/base.py
@@ -6,5 +6,5 @@
 
 class Reward:
 
-    def __call__(self, trajectories: List[Trajectory], ground_truths: List[Trajectory]):
+    def __call__(self, trajectories: List[Trajectory], ground_truths: List[Trajectory]) -> List[float]:
         ...
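Editor's note (not part of the patch): the `TransformersModel` hunks above change two call contracts. `forward_backward` now returns a `ModelOutput` carrying `loss` (and `logps` when labels are present) instead of a bare loss value, and `clip_grad_and_step` no longer returns the gradient norm. A minimal sketch of a training step against the revised API; `train_steps` is hypothetical and assumes a configured model and dataloader as in the cookbook scripts:

```python
# Illustrative sketch only; not part of this patch.
def train_steps(model, dataloader, log_every: int = 5):
    for step, batch in enumerate(dataloader):
        outputs = model.forward_backward(inputs=batch)  # ModelOutput, not a scalar
        model.clip_grad_and_step()  # now returns None; call clip_grad_norm directly for the norm
        if step % log_every == 0:
            print(f'step {step}: loss={outputs["loss"]}')
```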
diff --git a/src/twinkle/server/__main__.py b/src/twinkle/server/__main__.py
index 17fe87fc..d6d30582 100644
--- a/src/twinkle/server/__main__.py
+++ b/src/twinkle/server/__main__.py
@@ -10,7 +10,7 @@
     python -m twinkle.server --config server_config.yaml --server-type tinker
 
     # Quick start with minimal args
-    python -m twinkle.server --server-type tinker --port 8000 --model-id "Qwen/Qwen2.5-7B-Instruct"
+    python -m twinkle.server --server-type tinker --port 8000 --model-id "Qwen/Qwen3-4B"
 """
 
 from __future__ import annotations
diff --git a/src/twinkle/server/tinker/common/compat_base.py b/src/twinkle/server/tinker/common/compat_base.py
index 54d665e3..160e303a 100644
--- a/src/twinkle/server/tinker/common/compat_base.py
+++ b/src/twinkle/server/tinker/common/compat_base.py
@@ -114,7 +114,7 @@ def get_template(self, adapter_name: str) -> Template:
         return self.optimizer_group[adapter_name].template
 
     @staticmethod
-    def _get_forward_output(inputs: List[types.Datum], logits: torch.Tensor) -> List[dict]:
+    def _get_forward_output(inputs: List[types.Datum], logits: torch.Tensor, logps: torch.Tensor) -> List[dict]:
         """Convert raw logits to the expected output format with logprobs and elementwise_loss."""
         from twinkle.utils.torch_utils import selective_log_softmax
 
@@ -129,11 +129,16 @@
             # Labels are assumed to be already shifted/aligned with logits
             seq_len = labels.numel()
 
-            # Check if index is within logits bounds
-            feature_logits = logit[:seq_len, :]
+            if logps is None or len(logps) == 0:
+                assert logits is not None
+                # Check if index is within logits bounds
+                # Right padding
+                feature_logits = logit[:seq_len, :]
 
-            # Calculate log probs for all labels
-            token_log_probs = selective_log_softmax(feature_logits, labels)
+                # Calculate log probs for all labels
+                token_log_probs = selective_log_softmax(feature_logits, labels)
+            else:
+                token_log_probs = logps[:seq_len, :]
 
             # elementwise_loss: positive NLL loss (0.0 where masked)
             elementwise_loss = -token_log_probs * weights
diff --git a/src/twinkle/server/tinker/common/datum.py b/src/twinkle/server/tinker/common/datum.py
index fa707b93..289176fe 100644
--- a/src/twinkle/server/tinker/common/datum.py
+++ b/src/twinkle/server/tinker/common/datum.py
@@ -81,7 +81,7 @@ def input_feature_to_datum(input_feature: InputFeature) -> types.Datum:
         else:
             tokens = [int(t) for t in input_ids]
     else:
-        tokens = np.asarray(input_ids, dtype=np.int64).flatten().tolist()
+        tokens = np.asarray(input_ids.cpu(), dtype=np.int64).flatten().tolist()
 
     model_input = types.ModelInput.from_ints(tokens)
 
@@ -93,7 +93,7 @@
     if isinstance(labels_raw, np.ndarray):
         labels_arr = labels_raw.astype(np.int64)
     else:
-        labels_arr = np.asarray(labels_raw, dtype=np.int64)
+        labels_arr = np.asarray(labels_raw.cpu(), dtype=np.int64)
 
     labels_arr = labels_arr.reshape(-1)
 
diff --git a/src/twinkle/server/tinker/common/megatron_model.py b/src/twinkle/server/tinker/common/megatron_model.py
index 4b8be0a9..2d54c950 100644
--- a/src/twinkle/server/tinker/common/megatron_model.py
+++ b/src/twinkle/server/tinker/common/megatron_model.py
@@ -80,9 +80,10 @@ def forward_backward(self, *, inputs: List[types.Datum], adapter_name: str, loss_fn: str, **kwargs):
         optimizer_config = self.optimizer_group.get(adapter_name)
         outputs = optimizer_config.outputs if optimizer_config else {}
         logits_list = outputs.get('logits', [])
+        logps = outputs.get('logps', [])
 
         # When PP enabled, only logits from last stage are available
-        if not logits_list:
+        if not logits_list and not logps:
             return [None, None]
 
         # Process logits to match transformers output format
@@ -91,7 +92,13 @@
-        else:
+        elif logits_list:
             # Concatenate logits from multiple microbatches
             logits = torch.cat([logit.detach() for logit in logits_list], dim=0)
-        results = self._get_forward_output(inputs, logits)
+        else:
+            logits = None
+        if isinstance(logps, list):
+            logps = torch.cat([lp.detach() for lp in logps], dim=0) if logps else None
+        if logps is not None:
+            logps = logps.detach().cpu()
+        results = self._get_forward_output(inputs, logits, logps)
 
         # Convert loss to scalar
         if isinstance(loss, torch.Tensor):
@@ -112,14 +119,15 @@ def forward_only(self, *, inputs: List[types.Datum], **kwargs):
         outputs = super().forward_only(inputs=input_features, **kwargs)
 
         # Get logits
-        logits = outputs.get('logits', None) if isinstance(outputs, dict) else None
+        logits = outputs.get('logits', None)
+        logps = outputs.get('logps', None)
 
-        if logits is not None:
+        if logits is not None or (logps is not None and len(logps) > 0):
             if isinstance(logits, torch.Tensor):
                 logits = logits.detach().cpu()
             elif isinstance(logits, list) and len(logits) > 0:
                 logits = torch.cat([logit.detach().cpu() for logit in logits], dim=0)
-            results = self._get_forward_output(inputs, logits)
+            results = self._get_forward_output(inputs, logits, logps)
         else:
             # If no logits available (non-last PP stage), return empty results
             results = [{'logprobs': None, 'elementwise_loss': None} for _ in inputs]
diff --git a/src/twinkle/server/tinker/common/transformers_model.py b/src/twinkle/server/tinker/common/transformers_model.py
index 95151952..98ae0134 100644
--- a/src/twinkle/server/tinker/common/transformers_model.py
+++ b/src/twinkle/server/tinker/common/transformers_model.py
@@ -52,7 +52,10 @@ def forward_only(self, *, inputs: List[types.Datum], **kwargs):
         outputs = super().forward_only(inputs=input_features, **kwargs)
         # shape (batch_size, seq_len, vocab_size)
         logits = outputs['logits'].detach().cpu()
-        results = self._get_forward_output(inputs, logits)
+        logps = outputs.get('logps', None)
+        if logps is not None:
+            logps = logps.detach().cpu()
+        results = self._get_forward_output(inputs, logits, logps)
         return results
 
     @remote_function(dispatch='slice_dp', collect=collect_forward_backward_results)
@@ -89,7 +92,10 @@
 
         # shape (batch_size, seq_len, vocab_size)
         logits = outputs['logits'].detach()
-        results = self._get_forward_output(inputs, logits)
+        logps = outputs.get('logps', None)
+        if logps is not None:
+            logps = logps.detach().cpu()
+        results = self._get_forward_output(inputs, logits, logps)
         return [results, loss]
 
     @remote_function()
diff --git a/src/twinkle/server/twinkle/sampler.py b/src/twinkle/server/twinkle/sampler.py
index 62cb6a72..6efa5bd6 100644
--- a/src/twinkle/server/twinkle/sampler.py
+++ b/src/twinkle/server/twinkle/sampler.py
@@ -104,7 +104,7 @@ def build_sampler_app(model_id: str,
     """Build a sampler application for text generation inference.
Args: - model_id: Model identifier (e.g., "Qwen/Qwen2.5-7B-Instruct") + model_id: Model identifier (e.g., "Qwen/Qwen3-4B") nproc_per_node: Number of GPU processes per node device_group: Device group configuration dict device_mesh: Device mesh configuration dict for parallelism diff --git a/src/twinkle/utils/framework.py b/src/twinkle/utils/framework.py index 7d4f7bb6..5a23623e 100644 --- a/src/twinkle/utils/framework.py +++ b/src/twinkle/utils/framework.py @@ -39,10 +39,9 @@ def seed_everything(seed: Optional[int] = 42, full_determinism: bool = False): @staticmethod def gather_object(object: Any, device_mesh: DeviceMesh, process_group=None): - import torch import torch.distributed as dist output_objects = [object] - if device_mesh.data_world_size > 1: + if device_mesh is not None and device_mesh.data_world_size > 1: group_size = dist.get_world_size(group=process_group) output_objects = [None for _ in range(group_size)] dist.all_gather_object(output_objects, object, group=process_group) diff --git a/src/twinkle/utils/logger.py b/src/twinkle/utils/logger.py index 26a74b9a..d04beef8 100644 --- a/src/twinkle/utils/logger.py +++ b/src/twinkle/utils/logger.py @@ -8,6 +8,8 @@ from .platforms import Platform +log_level = os.getenv('LOG_LEVEL', 'INFO').upper() + # Avoid circular reference def _is_local_master(): @@ -116,10 +118,9 @@ def get_logger(log_file: Optional[str] = None, return logger -logger = get_logger() +logger = get_logger(log_level=log_level) logger.handlers[0].setFormatter(logger_format) -log_level = os.getenv('LOG_LEVEL', 'INFO').upper() @contextmanager diff --git a/src/twinkle/utils/torch_utils.py b/src/twinkle/utils/torch_utils.py index 91bf3569..35c636d8 100644 --- a/src/twinkle/utils/torch_utils.py +++ b/src/twinkle/utils/torch_utils.py @@ -77,10 +77,10 @@ def selective_log_softmax(logits, index) -> 'torch.Tensor': if mpu.get_tensor_model_parallel_world_size() >= 1: try: return _vocab_parallel_selective_log_softmax(logits, index) - except Exception: + except Exception: # noqa import traceback print(traceback.format_exc()) - except Exception: + except Exception: # noqa pass if logits.dtype in [torch.float32, torch.float64]: selected_logits = torch.gather(logits, dim=-1, index=index.unsqueeze(-1)).squeeze(-1)
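Editor's note (not part of the patch): with the `logger.py` hunk above, the default logger level is now read from the `LOG_LEVEL` environment variable when `twinkle` is first imported, rather than being looked up after the logger is created. A small usage sketch; setting the variable must happen before the first import:

```python
# Illustrative sketch only; not part of this patch.
import os

os.environ.setdefault('LOG_LEVEL', 'DEBUG')  # must be set before importing twinkle

from twinkle import get_logger  # noqa: E402  (import after env setup is intentional)

logger = get_logger()
logger.debug('debug output is visible because LOG_LEVEL=DEBUG')
```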