diff --git a/Dockerfile b/Dockerfile index a3aa4ff6..eeb03f8e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,8 +16,36 @@ RUN echo "Available release branches:" && git branch -r -l 'origin/release/*' -- echo "Checking out: $LATEST_RELEASE" && \ git checkout --track "$LATEST_RELEASE" -RUN sh INSTALL_MEGATRON.sh +ENV SETUPTOOLS_USE_DISTUTILS=local +# Install base packages +RUN pip install --upgrade peft accelerate transformers "modelscope[framework]" --no-cache-dir + +# Install vllm +RUN pip install --upgrade vllm --no-cache-dir + +# Install transformer_engine and megatron_core +RUN SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])") && \ + CUDNN_PATH=$SITE_PACKAGES/nvidia/cudnn \ + CPLUS_INCLUDE_PATH=$SITE_PACKAGES/nvidia/cudnn/include \ + pip install --no-build-isolation "transformer_engine[pytorch]" --no-cache-dir + +RUN pip install megatron_core mcore_bridge --no-cache-dir + +# Install flash-attention (default arch 8.0;9.0, override via build-arg if needed) +ARG TORCH_CUDA_ARCH_LIST="8.0;9.0" +RUN TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}" \ + MAX_JOBS=8 \ + FLASH_ATTENTION_FORCE_BUILD=TRUE \ + pip install flash-attn --no-build-isolation --no-cache-dir + +RUN pip install flash-linear-attention -U --no-cache-dir + +# Install numpy +RUN pip install numpy==2.2 --no-cache-dir + +# Install tinker, ray, and other deps RUN pip install --no-cache-dir tinker==0.14.0 "ray[serve]" transformers peft accelerate -U +# Install twinkle itself RUN pip install -e . --no-build-isolation diff --git a/INSTALL_MEGATRON.sh b/INSTALL_MEGATRON.sh index e86e5478..276598c5 100644 --- a/INSTALL_MEGATRON.sh +++ b/INSTALL_MEGATRON.sh @@ -56,12 +56,12 @@ echo "Using CUDA architecture: $TORCH_CUDA_ARCH_LIST" # Install latest base packages echo "" echo "Installing peft, accelerate, transformers, modelscope..." 
-pip install --upgrade peft accelerate transformers "modelscope[framework]" +pip install --upgrade peft accelerate transformers "modelscope[framework]" --no-cache-dir # Install latest vllm echo "" echo "Installing latest vllm..." -pip install --upgrade vllm +pip install --upgrade vllm --no-cache-dir # Get site-packages path and install transformer_engine and megatron_core echo "" @@ -83,12 +83,12 @@ MAX_JOBS=8 \ FLASH_ATTENTION_FORCE_BUILD=TRUE \ pip install flash-attn --no-build-isolation --no-cache-dir -pip install flash-linear-attention -U +pip install flash-linear-attention -U --no-cache-dir # Install numpy echo "" echo "Installing numpy==2.2 and deep_gemm..." -pip install numpy==2.2 +pip install numpy==2.2 --no-cache-dir # Verify installation echo "" diff --git a/README.md b/README.md index 35799b85..8c958f41 100644 --- a/README.md +++ b/README.md @@ -184,7 +184,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id=base_model) + dataset.set_template('Qwen3_5Template', model_id=base_model) # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) # Encode dataset @@ -242,7 +242,7 @@ api_key='your-api-key' # Use twinkle dataset to load the data dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) -dataset.set_template('Template', model_id=base_model, max_length=256) +dataset.set_template('Qwen3_5Template', model_id=base_model, max_length=256) dataset.map(SelfCognitionProcessor('twinkle Model', 'ModelScope Team'), load_from_cache_file=False) dataset.encode(batched=True, load_from_cache_file=False) dataloader = DataLoader(dataset=dataset, batch_size=8) diff --git a/README_ZH.md b/README_ZH.md index 352ebde0..6ae4f742 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -166,7 +166,7 @@ def train(): # 1000 samples 
dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id=base_model) + dataset.set_template('Qwen3_5Template', model_id=base_model) # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) # Encode dataset @@ -224,7 +224,7 @@ api_key='your-api-key' # Use twinkle dataset to load the data dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) -dataset.set_template('Template', model_id=base_model, max_length=256) +dataset.set_template('Qwen3_5Template', model_id=base_model, max_length=256) dataset.map(SelfCognitionProcessor('twinkle Model', 'ModelScope Team'), load_from_cache_file=False) dataset.encode(batched=True, load_from_cache_file=False) dataloader = DataLoader(dataset=dataset, batch_size=8) diff --git a/cookbook/client/tinker/modelscope/self_cognition.py b/cookbook/client/tinker/modelscope/self_cognition.py index 9f02ee40..2347c7fc 100644 --- a/cookbook/client/tinker/modelscope/self_cognition.py +++ b/cookbook/client/tinker/modelscope/self_cognition.py @@ -34,7 +34,7 @@ def train(): dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) # Apply the chat template matching the base model (max 256 tokens per sample) - dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256) + dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256) # Replace placeholder names with custom model/author identity dataset.map(SelfCognitionProcessor('twinkle模型', 'twinkle团队'), load_from_cache_file=False) diff --git a/cookbook/client/tinker/modelscope/short_math_grpo.py b/cookbook/client/tinker/modelscope/short_math_grpo.py index 6796b517..47a7d24a 100644 --- a/cookbook/client/tinker/modelscope/short_math_grpo.py +++ b/cookbook/client/tinker/modelscope/short_math_grpo.py @@ 
-182,7 +182,7 @@ def create_math_dataset(): data_slice=range(DATA_NUM), ) dataset = Dataset(meta) - dataset.set_template('Template', model_id=BASE_MODEL, max_length=4096, truncation_strategy='delete') + dataset.set_template('Qwen3_5Template', model_id=BASE_MODEL, max_length=4096, truncation_strategy='delete') dataset.map(MathPreprocessor()) dataset.filter(lambda row: bool(row['messages'])) dataset.encode(add_generation_prompt=True) diff --git a/cookbook/client/tinker/self_host/self_cognition.py b/cookbook/client/tinker/self_host/self_cognition.py index 6951760d..691662e6 100644 --- a/cookbook/client/tinker/self_host/self_cognition.py +++ b/cookbook/client/tinker/self_host/self_cognition.py @@ -36,7 +36,7 @@ def train(): dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) # Apply the chat template matching the base model (max 256 tokens per sample) - dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256) + dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256) # Replace placeholder names with custom model/author identity dataset.map(SelfCognitionProcessor('twinkle模型', 'twinkle团队'), load_from_cache_file=False) diff --git a/cookbook/client/tinker/self_host/short_math_grpo.py b/cookbook/client/tinker/self_host/short_math_grpo.py index 35b4d96d..f6fe8b45 100644 --- a/cookbook/client/tinker/self_host/short_math_grpo.py +++ b/cookbook/client/tinker/self_host/short_math_grpo.py @@ -182,7 +182,7 @@ def create_math_dataset(): data_slice=range(DATA_NUM), ) dataset = Dataset(meta) - dataset.set_template('Template', model_id=BASE_MODEL, max_length=4096, truncation_strategy='delete') + dataset.set_template('Qwen3_5Template', model_id=BASE_MODEL, max_length=4096, truncation_strategy='delete') dataset.map(MathPreprocessor()) dataset.filter(lambda row: bool(row['messages'])) dataset.encode(add_generation_prompt=True) diff --git a/cookbook/client/twinkle/modelscope/self_congnition.py 
b/cookbook/client/twinkle/modelscope/self_congnition.py index aafc5d14..81c5ab4d 100644 --- a/cookbook/client/twinkle/modelscope/self_congnition.py +++ b/cookbook/client/twinkle/modelscope/self_congnition.py @@ -52,7 +52,7 @@ def train(): dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) # Apply a chat template so the data matches the model's expected input format - dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=512) + dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=512) # Replace placeholder names in the dataset with custom model/author names dataset.map('SelfCognitionProcessor', init_args={'model_name': 'twinkle模型', 'model_author': 'ModelScope社区'}) @@ -77,7 +77,7 @@ def train(): model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) # Set the same chat template used during data preprocessing - model.set_template('Template') + model.set_template('Qwen3_5Template') # Set the input processor (pads sequences on the right side) model.set_processor('InputProcessor', padding_side='right') diff --git a/cookbook/client/twinkle/self_host/grpo.py b/cookbook/client/twinkle/self_host/grpo.py index cabce6ea..d87bfa77 100644 --- a/cookbook/client/twinkle/self_host/grpo.py +++ b/cookbook/client/twinkle/self_host/grpo.py @@ -55,7 +55,7 @@ def create_gsm8k_dataset(): dataset = Dataset(DatasetMeta('ms://modelscope/gsm8k', subset_name='main', split='train')) - dataset.set_template('Template', model_id=MODEL_ID, max_length=2048) + dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=2048) dataset.map('GSM8KProcessor') dataset.encode(add_generation_prompt=True) return dataset @@ -112,11 +112,11 @@ def train(): # Set processor and template for encoding inputs model.set_processor('InputProcessor') - model.set_template('Template', model_id=MODEL_ID) + model.set_template('Qwen3_5Template', model_id=MODEL_ID) # Step 4: Configure the sampler 
sampler = vLLMSampler(model_id=MODEL_ID) - sampler.set_template('Template', model_id=MODEL_ID) + sampler.set_template('Qwen3_5Template', model_id=MODEL_ID) # Step 5: Setup metrics and advantage function advantage_fn = GRPOAdvantage() diff --git a/cookbook/client/twinkle/self_host/sample.py b/cookbook/client/twinkle/self_host/sample.py index 3b02c4ec..f7925d4f 100644 --- a/cookbook/client/twinkle/self_host/sample.py +++ b/cookbook/client/twinkle/self_host/sample.py @@ -42,7 +42,7 @@ def sample(): sampler = vLLMSampler(model_id=MODEL_ID) # Step 4: Set the chat template so the sampler can encode Trajectory inputs - sampler.set_template('Template', model_id=MODEL_ID) + sampler.set_template('Qwen3_5Template', model_id=MODEL_ID) # Step 5: Prepare inputs as Trajectory dicts (messages format) # Each trajectory is a conversation with system and user messages diff --git a/cookbook/client/twinkle/self_host/self_congnition.py b/cookbook/client/twinkle/self_host/self_congnition.py index e31daaba..f382956f 100644 --- a/cookbook/client/twinkle/self_host/self_congnition.py +++ b/cookbook/client/twinkle/self_host/self_congnition.py @@ -59,7 +59,7 @@ def train(): dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) # Apply a chat template so the data matches the model's expected input format - dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=512) + dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=512) # Replace placeholder names in the dataset with custom model/author names dataset.map('SelfCognitionProcessor', init_args={'model_name': 'twinkle模型', 'model_author': 'ModelScope社区'}) @@ -84,7 +84,7 @@ def train(): model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) # Set the same chat template used during data preprocessing - model.set_template('Template') + model.set_template('Qwen3_5Template') # Set the input processor (pads sequences on the right side) 
model.set_processor('InputProcessor', padding_side='right') diff --git a/cookbook/megatron/tp_moe.py b/cookbook/megatron/tp_moe.py index 364ac686..b66b109f 100644 --- a/cookbook/megatron/tp_moe.py +++ b/cookbook/megatron/tp_moe.py @@ -20,7 +20,7 @@ def eval(model): # 100 Samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-35B-A3B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-35B-A3B') dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) dataset.encode() dataloader = DataLoader(dataset=dataset, batch_size=16) @@ -34,7 +34,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-35B-A3B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-35B-A3B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset diff --git a/cookbook/ray/single_controller.py b/cookbook/ray/single_controller.py index 39d99353..edb8d8e6 100644 --- a/cookbook/ray/single_controller.py +++ b/cookbook/ray/single_controller.py @@ -26,7 +26,7 @@ def eval(model): # 100 Samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-35B-A3B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-35B-A3B') dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) dataset.encode() dataloader = DataLoader(dataset=dataset, batch_size=8, min_batch_size=8) @@ -41,7 +41,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', 
model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset diff --git a/cookbook/rl/gkd_on_policy.py b/cookbook/rl/gkd_on_policy.py index f134f0de..f30df2ea 100644 --- a/cookbook/rl/gkd_on_policy.py +++ b/cookbook/rl/gkd_on_policy.py @@ -173,6 +173,9 @@ def main(): # ── Student vLLM sampler (for on-policy generation) ──────────────────────── student_sampler = vLLMSampler( model_id=STUDENT_MODEL_ID, + # enable_lora=True used with ckpt_manager.sync_weights(merge_and_sync=False) + # meaning only sync lora weights, if merge_and_sync=True, + # lora will be merged into the base model and sync all weights to vLLM engine_args={'gpu_memory_utilization': 0.85, 'max_model_len': 4096, 'enable_lora': True, 'max_loras': 1}, device_mesh=sampler_mesh, remote_group='student_sampler', @@ -210,6 +213,9 @@ def main(): break # 1. Sync student model weights to student sampler + # enable_lora=True used with ckpt_manager.sync_weights(merge_and_sync=False) + # meaning only sync lora weights, if merge_and_sync=True, + # lora will be merged into the base model and sync all weights to vLLM ckpt_manager.sync_weights(merge_and_sync=False) student_sampler.reset_prefix_cache() diff --git a/cookbook/rl/grpo.py b/cookbook/rl/grpo.py index bc864309..30d5d898 100644 --- a/cookbook/rl/grpo.py +++ b/cookbook/rl/grpo.py @@ -103,6 +103,9 @@ def main(): 'max_model_len': 4496, 'max_lora_rank': 32, # save as lora_config # NOTE: To use enable_lora with qwen3.5, ensure vLLM includes PR https://github.com/vllm-project/vllm/pull/36976 + # enable_lora=True used with ckpt_manager.sync_weights(merge_and_sync=False) + # meaning only sync lora weights, if merge_and_sync=True, + # lora will be merged into the base model and sync all weights to vLLM 'enable_lora': True, }, device_mesh=sampler_mesh, @@ -133,6 +136,9 @@ def main(): break 
metrics.reset() global_prompts = batch if isinstance(batch, list) else [batch] + # enable_lora=True used with ckpt_manager.sync_weights(merge_and_sync=False) + # meaning only sync lora weights, if merge_and_sync=True, + # lora will be merged into the base model and sync all weights to vLLM ckpt_manager.sync_weights(merge_and_sync=False) sampler.reset_prefix_cache() sample_responses = sampler.sample( diff --git a/cookbook/rl/grpo_mm.py b/cookbook/rl/grpo_mm.py index d6f934d5..0705febb 100644 --- a/cookbook/rl/grpo_mm.py +++ b/cookbook/rl/grpo_mm.py @@ -27,10 +27,6 @@ ) from twinkle.sampler import vLLMSampler -import swanlab -swanlab.init( - project='twinkle', -) logger = get_logger() # Model configuration @@ -184,6 +180,9 @@ def main(): 'gpu_memory_utilization': 0.8, 'max_model_len': 32000, 'max_lora_rank': 32, + # enable_lora=True used with ckpt_manager.sync_weights(merge_and_sync=False) + # meaning only sync lora weights, if merge_and_sync=True, + # lora will be merged into the base model and sync all weights to vLLM 'enable_lora': True, 'limit_mm_per_prompt': {'image': 9}, # OlympiadBench has up to 9 images }, @@ -221,6 +220,9 @@ def main(): metrics.reset() # Sync weights to sampler + # enable_lora=True used with ckpt_manager.sync_weights(merge_and_sync=False) + # meaning only sync lora weights, if merge_and_sync=True, + # lora will be merged into the base model and sync all weights to vLLM ckpt_manager.sync_weights(merge_and_sync=False) sampler.reset_prefix_cache() @@ -282,7 +284,6 @@ def main(): log_dict.update(model.calculate_metric(is_training=True, adapter_name=ADAPTER_NAME)) metrics.reset() logger.info(f'[Step {optim_step}/{MAX_STEPS}] {log_dict}') - swanlab.log(log_dict) logger.info(f'Training completed. 
optim_steps={optim_step}') model.save('olympiad-grpo-mixed-final', adapter_name=ADAPTER_NAME) diff --git a/cookbook/rl/short_math_grpo.py b/cookbook/rl/short_math_grpo.py index 55939cbd..8f498923 100644 --- a/cookbook/rl/short_math_grpo.py +++ b/cookbook/rl/short_math_grpo.py @@ -50,12 +50,6 @@ SYSTEM_PROMPT = ('You are a helpful math assistant. Solve the problem with minimal but correct reasoning ' 'and put your final answer within \\boxed{}.') -import swanlab -swanlab.init( - project='twinkle', -) - - # ========== Reward Functions ========== class GSM8KBrevityReward(Reward): """Brevity reward: rewards shorter completions that contain a valid answer. @@ -167,6 +161,9 @@ def main(): 'max_model_len': 8192, 'max_lora_rank': 32, # save as lora_config # NOTE: To use enable_lora with qwen3.5, ensure vLLM includes PR https://github.com/vllm-project/vllm/pull/36976 + # enable_lora=True used with ckpt_manager.sync_weights(merge_and_sync=False) + # meaning only sync lora weights, if merge_and_sync=True, + # lora will be merged into the base model and sync all weights to vLLM 'enable_lora': True, }, device_mesh=sampler_mesh, @@ -202,6 +199,9 @@ def main(): for prompt in batch: expand_prompts.extend([prompt] * NUM_GENERATIONS) + # enable_lora=True used with ckpt_manager.sync_weights(merge_and_sync=False) + # meaning only sync lora weights, if merge_and_sync=True, + # lora will be merged into the base model and sync all weights to vLLM ckpt_manager.sync_weights(merge_and_sync=False) sampler.reset_prefix_cache() @@ -256,7 +256,6 @@ def main(): log_dict = metrics.calculate() log_dict.update(model.calculate_metric(is_training=True)) - swanlab.log(log_dict) metrics.reset() logger.info(f'[Step {optim_step}/{MAX_STEPS}] {log_dict}') diff --git a/cookbook/transformers/ep_fsdp_qwen3_moe.py b/cookbook/transformers/ep_fsdp_qwen3_moe.py index 3c02b218..11855fae 100644 --- a/cookbook/transformers/ep_fsdp_qwen3_moe.py +++ b/cookbook/transformers/ep_fsdp_qwen3_moe.py @@ -13,7 +13,7 @@ 
MODEL_ID = os.environ.get('QWEN3_MODEL_ID', 'ms://Qwen/Qwen3.5-4B') DATASET_ID = os.environ.get('DATASET_ID', 'ms://swift/self-cognition') -TEMPLATE_ID = os.environ.get('TEMPLATE_ID', 'Template') +TEMPLATE_ID = os.environ.get('TEMPLATE_ID', 'Qwen3_5Template') _num_layers_env = os.environ.get('NUM_LAYERS') NUM_LAYERS = int(_num_layers_env) if _num_layers_env is not None else None BATCH_SIZE = int(os.environ.get('BATCH_SIZE', '4')) @@ -47,7 +47,7 @@ def train(): try: dataset.set_template(TEMPLATE_ID, model_id=MODEL_ID) except ValueError: - dataset.set_template('Template', model_id=MODEL_ID) + dataset.set_template('Qwen3_5Template', model_id=MODEL_ID) dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) dataset.encode(batched=True) diff --git a/cookbook/transformers/fsdp2_moe.py b/cookbook/transformers/fsdp2_moe.py index 2a92794a..23a53f4a 100644 --- a/cookbook/transformers/fsdp2_moe.py +++ b/cookbook/transformers/fsdp2_moe.py @@ -20,7 +20,7 @@ def eval(model): # 100 Samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) dataset.encode() dataloader = DataLoader(dataset=dataset, batch_size=4) @@ -35,7 +35,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset diff --git a/cookbook/transformers/sp_fsdp_dense.py b/cookbook/transformers/sp_fsdp_dense.py index 868b61c0..280ed6f1 100644 --- a/cookbook/transformers/sp_fsdp_dense.py +++ 
b/cookbook/transformers/sp_fsdp_dense.py @@ -49,7 +49,7 @@ def eval(model): def create_dataset(data_slice=None): dataset = Dataset(dataset_meta=DatasetMeta(DATASETS, data_slice=range(500))) - dataset.set_template('Template', model_id=MODEL_ID) + dataset.set_template('Qwen3_5Template', model_id=MODEL_ID) dataset.map(SelfCognitionProcessor('twinkle模型', 'twinkle团队')) dataset.encode(batched=True) return dataset diff --git a/docs/source_en/Components/Checkpoint Engine/CheckpointEngine.md b/docs/source_en/Components/Checkpoint Engine/CheckpointEngine.md index f72bec83..1a7c39bf 100644 --- a/docs/source_en/Components/Checkpoint Engine/CheckpointEngine.md +++ b/docs/source_en/Components/Checkpoint Engine/CheckpointEngine.md @@ -67,3 +67,6 @@ See: [HCCLCheckpointEngine](HCCLCheckpointEngine.md) - **HCCLCheckpointEngine**: Suitable for Ascend NPU environments > Checkpoint engine is a key component of RLHF training infrastructure, ensuring that trainers and samplers use consistent model weights. +> Currently, synchronization is divided into two cases based on merge_and_sync=True/False. When set to True, the LoRA is merged into the base model and then synchronized. +> When set to False, only the LoRA weights are synchronized. Additionally, for multi-tenant scenarios, LoRA files are directly attached to vLLM. +> When merge_and_sync=False or in multi-tenant mode, vLLM's startup parameter enable_lora=True needs to be enabled. When merge_and_sync=True or using full parameters, this value should be set to False. diff --git a/docs/source_en/Components/Dataset/Dataset.md b/docs/source_en/Components/Dataset/Dataset.md index 01ce46cf..3fb86119 100644 --- a/docs/source_en/Components/Dataset/Dataset.md +++ b/docs/source_en/Components/Dataset/Dataset.md @@ -66,7 +66,7 @@ If using a local path or a local file, please follow these instructions: The Template component is responsible for converting string/image multimodal raw data into model input tokens. 
The dataset can set a Template to complete the `encode` process. ```python -dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) +dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) ``` The set_template method supports passing `kwargs` (such as `max_length` in the example) to be used as constructor parameters for `Template`. diff --git a/docs/source_en/Components/Template/Template.md b/docs/source_en/Components/Template/Template.md index 4bd52722..32709361 100644 --- a/docs/source_en/Components/Template/Template.md +++ b/docs/source_en/Components/Template/Template.md @@ -50,3 +50,10 @@ > Template does not support using functions as replacements because it needs to support many functions internally. If you need to write a new Template, please inherit the `Template` class. > Generally speaking, using the Template base class is sufficient for pure text models. In the base class, we use tokenizer.apply_chat_template to encode the model, which is universal for general pure text models. + +# Template mapping + +Currently, the model-template mapping is simple: + +- Template class: Supported in all pure-text LLMs. +- Qwen3_5Template class: For Qwen3.5 MLLMs. 
diff --git a/docs/source_en/Usage Guide/Introduction-with-Qwen3.5.md b/docs/source_en/Usage Guide/Introduction-with-Qwen3.5.md index d1eba8cc..c5856fdc 100644 --- a/docs/source_en/Usage Guide/Introduction-with-Qwen3.5.md +++ b/docs/source_en/Usage Guide/Introduction-with-Qwen3.5.md @@ -48,7 +48,7 @@ logger = get_logger() def eval(model): # Validation set: 100 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) dataset.encode() dataloader = DataLoader(dataset=dataset, batch_size=8) @@ -63,7 +63,7 @@ def train(): # Training set: 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess: replace placeholders in self-cognition data dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) # Encode dataset @@ -188,7 +188,7 @@ ADAPTER_NAME = 'default' def create_gsm8k_dataset(): dataset = Dataset(DatasetMeta('ms://modelscope/gsm8k', subset_name='main', split='train')) - dataset.set_template('Template', model_id=MODEL_ID, max_length=2048) + dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=2048) dataset.map(GSM8KProcessor()) dataset.encode(add_generation_prompt=True) return dataset @@ -222,7 +222,7 @@ def main(): model.set_lr_scheduler('CosineAnnealingLR', T_max=MAX_STEPS, eta_min=0) model.set_loss('GRPOLoss', epsilon=0.2) model.set_processor(InputProcessor) - model.set_template('Template', model_id=MODEL_ID) + model.set_template('Qwen3_5Template', model_id=MODEL_ID) # Sampler deployed in the 'sampler' group sampler = vLLMSampler( 
@@ -236,7 +236,7 @@ def main(): device_mesh=sampler_mesh, remote_group='sampler', ) - sampler.set_template(Template, model_id=MODEL_ID) + sampler.set_template('Qwen3_5Template', model_id=MODEL_ID) ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler) @@ -393,7 +393,7 @@ for run in runs: def train(): # Prepare dataset dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) dataset.map('SelfCognitionProcessor', init_args={'model_name': 'twinkle model', 'model_author': 'ModelScope Community'}) dataset.encode(batched=True) dataloader = DataLoader(dataset=dataset, batch_size=4) @@ -403,7 +403,7 @@ def train(): lora_config = LoraConfig(target_modules='all-linear') model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) - model.set_template('Template') + model.set_template('Qwen3_5Template') model.set_processor('InputProcessor', padding_side='right') model.set_loss('CrossEntropyLoss') model.set_optimizer('AdamW', lr=1e-4) @@ -473,7 +473,7 @@ base_url = 'http://www.modelscope.cn/twinkle' def train(): # Prepare dataset dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) - dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256) + dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256) dataset.map(SelfCognitionProcessor('Twinkle Model', 'ModelScope Team'), load_from_cache_file=False) dataset.encode(batched=True, load_from_cache_file=False) dataloader = DataLoader(dataset=dataset, batch_size=8) diff --git a/docs/source_en/Usage Guide/Quick-Start.md b/docs/source_en/Usage Guide/Quick-Start.md index 2a2f0f31..4ffa9c86 100644 --- a/docs/source_en/Usage Guide/Quick-Start.md +++ b/docs/source_en/Usage Guide/Quick-Start.md @@ -70,7 
+70,7 @@ def train(): dataset = PackingDataset(dataset_meta) dataset.map(SelfCognitionProcessor(model_name='Twinkle Model', model_author='ModelScope Community')) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) dataset.encode() dataset.pack_dataset() @@ -114,7 +114,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) # Encode dataset @@ -182,7 +182,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) # Encode dataset @@ -271,7 +271,7 @@ ADAPTER_NAME = 'default' def create_gsm8k_dataset(): dataset = Dataset(DatasetMeta('ms://modelscope/gsm8k', subset_name='main', split='train')) - dataset.set_template('Template', model_id=MODEL_ID, max_length=2048) + dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=2048) dataset.map(GSM8KProcessor()) dataset.encode(add_generation_prompt=True) return dataset @@ -303,7 +303,7 @@ def main(): model.set_lr_scheduler('default', lr_decay_steps=MAX_STEPS, max_lr=LEARNING_RATE) model.set_loss('GRPOLoss', epsilon=0.2) model.set_processor(InputProcessor) - model.set_template('Template', model_id=MODEL_ID) + model.set_template('Qwen3_5Template', model_id=MODEL_ID) 
sampler = vLLMSampler( model_id=MODEL_ID, @@ -316,7 +316,7 @@ def main(): device_mesh=sampler_mesh, remote_group='sampler', ) - sampler.set_template(Template, model_id=MODEL_ID) + sampler.set_template('Qwen3_5Template', model_id=MODEL_ID) ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler) dataloader = DataLoader( dataset=create_gsm8k_dataset, @@ -476,7 +476,7 @@ def create_countdown_dataset(): """Create Countdown Game dataset for GRPO training.""" dataset = Dataset(dataset_meta=DatasetMeta('ms://zouxuhong/Countdown-Tasks-3to4', data_slice=range(500))) - dataset.set_template('Template', model_id=MODEL_ID, max_length=8192) + dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=8192) dataset.map('CountdownProcessor') dataset.encode(add_generation_prompt=True, batched=True) return dataset @@ -570,11 +570,11 @@ def train(): # Set processor and template for encoding inputs model.set_processor('InputProcessor') - model.set_template('Template', model_id=MODEL_ID) + model.set_template('Qwen3_5Template', model_id=MODEL_ID) # Step 4: Configure the sampler sampler = vLLMSampler(model_id=MODEL_ID) - sampler.set_template('Template', model_id=MODEL_ID) + sampler.set_template('Qwen3_5Template', model_id=MODEL_ID) # Step 5: Setup metrics and advantage function advantage_fn = GRPOAdvantage() @@ -712,7 +712,7 @@ def train(): dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) # Apply the chat template matching the base model (max 256 tokens per sample) - dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256) + dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256) # Replace placeholder names with custom model/author identity dataset.map(SelfCognitionProcessor('twinkle model', 'twinkle team'), load_from_cache_file=False) @@ -925,7 +925,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', 
data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) # Encode dataset diff --git a/docs/source_en/Usage Guide/Server and Client/Server.md b/docs/source_en/Usage Guide/Server and Client/Server.md index 141a730d..ff0918da 100644 --- a/docs/source_en/Usage Guide/Server and Client/Server.md +++ b/docs/source_en/Usage Guide/Server and Client/Server.md @@ -67,8 +67,8 @@ In the YAML configuration file, **each component needs to occupy a separate Node ```yaml applications: # Model service occupies GPU 0-3 (physical card numbers) - - name: models-Qwen2.5-7B-Instruct - route_prefix: /models/Qwen/Qwen2.5-7B-Instruct + - name: models-Qwen3.5-4B + route_prefix: /models/Qwen/Qwen3.5-4B import_path: model args: nproc_per_node: 4 @@ -84,8 +84,8 @@ applications: # ep_size: 1 # Expert parallel size (optional) # Sampler service occupies GPU 4-5 (physical card numbers) - - name: sampler-Qwen2.5-7B-Instruct - route_prefix: /sampler/Qwen/Qwen2.5-7B-Instruct + - name: sampler-Qwen3.5-4B + route_prefix: /sampler/Qwen/Qwen3.5-4B import_path: sampler args: nproc_per_node: 2 @@ -291,12 +291,12 @@ applications: The difference from the Megatron backend is only in the `use_megatron` parameter of the Model service: ```yaml - - name: models-Qwen2.5-7B-Instruct - route_prefix: /api/v1/model/Qwen/Qwen2.5-7B-Instruct + - name: models-Qwen3.5-4B + route_prefix: /api/v1/model/Qwen/Qwen3.5-4B import_path: model args: use_megatron: false # Use Transformers backend - model_id: "ms://Qwen/Qwen2.5-7B-Instruct" + model_id: "ms://Qwen/Qwen3.5-4B" nproc_per_node: 2 device_group: name: model diff --git a/docs/source_en/Usage Guide/Server and Client/Tinker-Compatible-Client.md b/docs/source_en/Usage Guide/Server and Client/Tinker-Compatible-Client.md index 
e44f3cea..77738bb7 100644 --- a/docs/source_en/Usage Guide/Server and Client/Tinker-Compatible-Client.md +++ b/docs/source_en/Usage Guide/Server and Client/Tinker-Compatible-Client.md @@ -58,7 +58,7 @@ response = rest_client.list_training_runs(limit=50).result() print(f"Found {len(response.training_runs)} training runs") # Step 3: Create training client -base_model = "Qwen/Qwen2.5-0.5B-Instruct" +base_model = "Qwen/Qwen3-4B" # Create new training session training_client = service_client.create_lora_training_client( @@ -137,6 +137,7 @@ for epoch in range(2): Tinker compatible mode can also leverage Twinkle's dataset components to simplify data preparation instead of manually constructing `Datum`: ```python +import os from tqdm import tqdm from tinker import types from twinkle import init_tinker_client @@ -150,11 +151,11 @@ init_tinker_client() from tinker import ServiceClient -base_model = "Qwen/Qwen2.5-0.5B-Instruct" +base_model = "Qwen/Qwen3.5-4B" # Use Twinkle's Dataset component to load and preprocess data dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) -dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256) +dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256) dataset.map(SelfCognitionProcessor('twinkle model', 'ModelScope Team'), load_from_cache_file=False) dataset.encode(batched=True, load_from_cache_file=False) dataloader = DataLoader(dataset=dataset, batch_size=8) @@ -223,7 +224,7 @@ init_tinker_client() from tinker import ServiceClient -base_model = "Qwen/Qwen2.5-0.5B-Instruct" +base_model = "Qwen/Qwen3.5-4B" service_client = ServiceClient( base_url='http://localhost:8000', diff --git a/docs/source_en/Usage Guide/Server and Client/Twinkle-Client.md b/docs/source_en/Usage Guide/Server and Client/Twinkle-Client.md index 66d98eec..85980986 100644 --- a/docs/source_en/Usage Guide/Server and Client/Twinkle-Client.md +++ b/docs/source_en/Usage Guide/Server 
and Client/Twinkle-Client.md @@ -93,7 +93,7 @@ for run in runs: dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition')) # Set chat template to match model's input format -dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-7B-Instruct', max_length=512) +dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) # Data preprocessing: Replace placeholders with custom names dataset.map('SelfCognitionProcessor', @@ -106,14 +106,14 @@ dataset.encode(batched=True) dataloader = DataLoader(dataset=dataset, batch_size=8) # Step 4: Configure model -model = MultiLoraTransformersModel(model_id='ms://Qwen/Qwen2.5-7B-Instruct') +model = MultiLoraTransformersModel(model_id='ms://Qwen/Qwen3.5-4B') # Configure LoRA lora_config = LoraConfig(target_modules='all-linear') model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) # Set template, processor, loss function -model.set_template('Template') +model.set_template('Qwen3_5Template') model.set_processor('InputProcessor', padding_side='right') model.set_loss('CrossEntropyLoss') diff --git a/docs/source_en/Usage Guide/Train-as-a-Service.md b/docs/source_en/Usage Guide/Train-as-a-Service.md index 38e46858..692ef3f4 100644 --- a/docs/source_en/Usage Guide/Train-as-a-Service.md +++ b/docs/source_en/Usage Guide/Train-as-a-Service.md @@ -36,7 +36,7 @@ api_key=os.environ.get('MODELSCOPE_TOKEN') # Use twinkle dataset to load the data dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) -dataset.set_template('Template', model_id=base_model, max_length=256) +dataset.set_template('Qwen3_5Template', model_id=base_model, max_length=256) dataset.map(SelfCognitionProcessor('Twinkle Model', 'ModelScope Team'), load_from_cache_file=False) dataset.encode(batched=True, load_from_cache_file=False) dataloader = DataLoader(dataset=dataset, batch_size=8) diff --git 
"a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/Qwen3.5\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/Qwen3.5\346\234\200\344\275\263\345\256\236\350\267\265.md" index ad78e28d..bd29a651 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/Qwen3.5\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/Qwen3.5\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -48,7 +48,7 @@ logger = get_logger() def eval(model): # 验证集:100 条样本 dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) dataset.encode() dataloader = DataLoader(dataset=dataset, batch_size=8) @@ -63,7 +63,7 @@ def train(): # 训练集:1000 条样本 dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # 设置模板,准备编码 - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') # 数据预处理:替换自我认知数据中的占位符 dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # 编码数据集 @@ -188,7 +188,7 @@ ADAPTER_NAME = 'default' def create_gsm8k_dataset(): dataset = Dataset(DatasetMeta('ms://modelscope/gsm8k', subset_name='main', split='train')) - dataset.set_template('Template', model_id=MODEL_ID, max_length=2048) + dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=2048) dataset.map(GSM8KProcessor()) dataset.encode(add_generation_prompt=True) return dataset @@ -222,7 +222,7 @@ def main(): model.set_lr_scheduler('CosineAnnealingLR', T_max=MAX_STEPS, eta_min=0) model.set_loss('GRPOLoss', epsilon=0.2) model.set_processor(InputProcessor) - 
model.set_template('Template', model_id=MODEL_ID) + model.set_template('Qwen3_5Template', model_id=MODEL_ID) # 采样器部署在 'sampler' 组 sampler = vLLMSampler( @@ -236,7 +236,7 @@ def main(): device_mesh=sampler_mesh, remote_group='sampler', ) - sampler.set_template(Template, model_id=MODEL_ID) + sampler.set_template('Qwen3_5Template', model_id=MODEL_ID) ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler) @@ -393,7 +393,7 @@ for run in runs: def train(): # 准备数据集 dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) dataset.map('SelfCognitionProcessor', init_args={'model_name': 'twinkle模型', 'model_author': 'ModelScope社区'}) dataset.encode(batched=True) dataloader = DataLoader(dataset=dataset, batch_size=4) @@ -403,7 +403,7 @@ def train(): lora_config = LoraConfig(target_modules='all-linear') model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) - model.set_template('Template') + model.set_template('Qwen3_5Template') model.set_processor('InputProcessor', padding_side='right') model.set_loss('CrossEntropyLoss') model.set_optimizer('AdamW', lr=1e-4) @@ -473,7 +473,7 @@ base_url = 'http://www.modelscope.cn/twinkle' def train(): # 准备数据集 dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) - dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256) + dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256) dataset.map(SelfCognitionProcessor('Twinkle模型', 'ModelScope团队'), load_from_cache_file=False) dataset.encode(batched=True, load_from_cache_file=False) dataloader = DataLoader(dataset=dataset, batch_size=8) diff --git 
"a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" index db8b8f43..b8161c81 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" @@ -70,7 +70,7 @@ def train(): dataset = PackingDataset(dataset_meta) dataset.map(SelfCognitionProcessor(model_name='Twinkle模型', model_author='ModelScope社区')) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) dataset.encode() dataset.pack_dataset() @@ -114,7 +114,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset @@ -183,7 +183,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset @@ -273,7 +273,7 @@ ADAPTER_NAME = 'default' def create_gsm8k_dataset(): dataset = Dataset(DatasetMeta('ms://modelscope/gsm8k', subset_name='main', split='train')) - dataset.set_template('Template', 
model_id=MODEL_ID, max_length=2048) + dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=2048) dataset.map(GSM8KProcessor()) dataset.encode(add_generation_prompt=True) return dataset @@ -305,7 +305,7 @@ def main(): model.set_lr_scheduler('default', lr_decay_steps=MAX_STEPS, max_lr=LEARNING_RATE) model.set_loss('GRPOLoss', epsilon=0.2) model.set_processor(InputProcessor) - model.set_template('Template', model_id=MODEL_ID) + model.set_template('Qwen3_5Template', model_id=MODEL_ID) sampler = vLLMSampler( model_id=MODEL_ID, @@ -318,7 +318,7 @@ def main(): device_mesh=sampler_mesh, remote_group='sampler', ) - sampler.set_template(Template, model_id=MODEL_ID) + sampler.set_template('Qwen3_5Template', model_id=MODEL_ID) ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler) dataloader = DataLoader( dataset=create_gsm8k_dataset, @@ -478,7 +478,7 @@ def create_countdown_dataset(): """Create Countdown Game dataset for GRPO training.""" dataset = Dataset(dataset_meta=DatasetMeta('ms://zouxuhong/Countdown-Tasks-3to4', data_slice=range(500))) - dataset.set_template('Template', model_id=MODEL_ID, max_length=8192) + dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=8192) dataset.map('CountdownProcessor') dataset.encode(add_generation_prompt=True, batched=True) return dataset @@ -572,11 +572,11 @@ def train(): # Set processor and template for encoding inputs model.set_processor('InputProcessor') - model.set_template('Template', model_id=MODEL_ID) + model.set_template('Qwen3_5Template', model_id=MODEL_ID) # Step 4: Configure the sampler sampler = vLLMSampler(model_id=MODEL_ID) - sampler.set_template('Template', model_id=MODEL_ID) + sampler.set_template('Qwen3_5Template', model_id=MODEL_ID) # Step 5: Setup metrics and advantage function advantage_fn = GRPOAdvantage() @@ -714,7 +714,7 @@ def train(): dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) # Apply the chat template matching 
the base model (max 256 tokens per sample) - dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256) + dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256) # Replace placeholder names with custom model/author identity dataset.map(SelfCognitionProcessor('twinkle模型', 'twinkle团队'), load_from_cache_file=False) @@ -927,7 +927,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Tinker\345\205\274\345\256\271\345\256\242\346\210\267\347\253\257.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Tinker\345\205\274\345\256\271\345\256\242\346\210\267\347\253\257.md" index 27db69b2..1340fc06 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Tinker\345\205\274\345\256\271\345\256\242\346\210\267\347\253\257.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Tinker\345\205\274\345\256\271\345\256\242\346\210\267\347\253\257.md" @@ -58,7 +58,7 @@ response = rest_client.list_training_runs(limit=50).result() print(f"Found {len(response.training_runs)} training runs") # Step 3: 创建训练客户端 -base_model = "Qwen/Qwen2.5-0.5B-Instruct" +base_model = "Qwen/Qwen3-4B" # 新建训练会话 
training_client = service_client.create_lora_training_client( @@ -137,6 +137,7 @@ for epoch in range(2): Tinker 兼容模式也可以利用 Twinkle 的数据集组件来简化数据准备,而不是手动构建 `Datum`: ```python +import os from tqdm import tqdm from tinker import types from twinkle import init_tinker_client @@ -150,11 +151,11 @@ init_tinker_client() from tinker import ServiceClient -base_model = "Qwen/Qwen2.5-0.5B-Instruct" +base_model = "Qwen/Qwen3.5-4B" # 使用 Twinkle 的 Dataset 组件加载和预处理数据 dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) -dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256) +dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256) dataset.map(SelfCognitionProcessor('twinkle模型', 'twinkle团队'), load_from_cache_file=False) dataset.encode(batched=True, load_from_cache_file=False) dataloader = DataLoader(dataset=dataset, batch_size=8) @@ -223,7 +224,7 @@ init_tinker_client() from tinker import ServiceClient -base_model = "Qwen/Qwen2.5-0.5B-Instruct" +base_model = "Qwen/Qwen3.5-4B" service_client = ServiceClient( base_url='http://localhost:8000', diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Twinkle\345\256\242\346\210\267\347\253\257.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Twinkle\345\256\242\346\210\267\347\253\257.md" index fd81ac1b..c9fded19 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Twinkle\345\256\242\346\210\267\347\253\257.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Twinkle\345\256\242\346\210\267\347\253\257.md" @@ -93,7 
+93,7 @@ for run in runs: dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition')) # 设置 chat 模板,使数据匹配模型的输入格式 -dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-7B-Instruct', max_length=512) +dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) # 数据预处理:替换占位符为自定义名称 dataset.map('SelfCognitionProcessor', @@ -106,14 +106,14 @@ dataset.encode(batched=True) dataloader = DataLoader(dataset=dataset, batch_size=8) # Step 4: 配置模型 -model = MultiLoraTransformersModel(model_id='ms://Qwen/Qwen2.5-7B-Instruct') +model = MultiLoraTransformersModel(model_id='ms://Qwen/Qwen3.5-4B') # 配置 LoRA lora_config = LoraConfig(target_modules='all-linear') model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) # 设置模板、处理器、损失函数 -model.set_template('Template') +model.set_template('Qwen3_5Template') model.set_processor('InputProcessor', padding_side='right') model.set_loss('CrossEntropyLoss') diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/\346\234\215\345\212\241\347\253\257.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/\346\234\215\345\212\241\347\253\257.md" index 1528a439..3a791b21 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/\346\234\215\345\212\241\347\253\257.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/\346\234\215\345\212\241\347\253\257.md" @@ -67,8 +67,8 @@ export TWINKLE_TRUST_REMOTE_CODE=0 # 是否信任远程代码(安全考 ```yaml applications: # 模型服务占用 GPU 0-3(物理卡号) - - name: models-Qwen2.5-7B-Instruct - route_prefix: /models/Qwen/Qwen2.5-7B-Instruct + - name: 
models-Qwen3.5-4B + route_prefix: /models/Qwen/Qwen3.5-4B import_path: model args: nproc_per_node: 4 @@ -84,8 +84,8 @@ applications: # ep_size: 1 # 专家并行大小(可选) # Sampler 服务占用 GPU 4-5(物理卡号) - - name: sampler-Qwen2.5-7B-Instruct - route_prefix: /sampler/Qwen/Qwen2.5-7B-Instruct + - name: sampler-Qwen3.5-4B + route_prefix: /sampler/Qwen/Qwen3.5-4B import_path: sampler args: nproc_per_node: 2 @@ -291,12 +291,12 @@ applications: Transformers 后端与 Megatron 后端的区别仅在 Model 服务的 `use_megatron` 参数: ```yaml - - name: models-Qwen2.5-7B-Instruct - route_prefix: /api/v1/model/Qwen/Qwen2.5-7B-Instruct + - name: models-Qwen3.5-4B + route_prefix: /api/v1/model/Qwen/Qwen3.5-4B import_path: model args: use_megatron: false # 使用 Transformers 后端 - model_id: "ms://Qwen/Qwen2.5-7B-Instruct" + model_id: "ms://Qwen/Qwen3.5-4B" nproc_per_node: 2 device_group: name: model diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" index 8dfc056e..0c7afc44 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" @@ -39,7 +39,7 @@ api_key=os.environ.get('MODELSCOPE_TOKEN') # Use twinkle dataset to load the data dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) -dataset.set_template('Template', model_id=base_model, max_length=256) +dataset.set_template('Qwen3_5Template', model_id=base_model, max_length=256) dataset.map(SelfCognitionProcessor('Twinkle Model', 'ModelScope Team'), load_from_cache_file=False) dataset.encode(batched=True, load_from_cache_file=False) dataloader = DataLoader(dataset=dataset, batch_size=8) diff --git 
"a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" index 322c0e34..812a7e7f 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" @@ -66,7 +66,7 @@ dataset = Dataset(DatasetMeta(dataset_id='my/custom/dataset.jsonl', data_slice=r Template 组件是负责将字符串/图片多模态原始数据转换为模型输入 token 的组件。数据集可以设置一个 Template 来完成 `encode` 过程。 ```python -dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) +dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) ``` set_template 方法支持传入 `kwargs`(例如例子中的 `max_length`),作为 `Template` 的构造参数使用。 diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\243\200\346\237\245\347\202\271\345\274\225\346\223\216/CheckpointEngine.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\243\200\346\237\245\347\202\271\345\274\225\346\223\216/CheckpointEngine.md" index b7acdef2..338be10d 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\346\243\200\346\237\245\347\202\271\345\274\225\346\223\216/CheckpointEngine.md" +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\243\200\346\237\245\347\202\271\345\274\225\346\223\216/CheckpointEngine.md" @@ -67,3 +67,5 @@ Twinkle 提供了两种检查点引擎实现: - **HCCLCheckpointEngine**: 适用于昇腾 NPU 环境 > 检查点引擎是 RLHF 训练基础设施的关键组件,确保训练器和采样器使用一致的模型权重。 +> 目前的同步分为merge_and_sync=True/False两种情况,为True时将lora合并进基模并同步,为False时仅同步lora权重。另外,多租户直接附加lora文件到vLLM上,在merge_and_sync=False,或使用多租户时, +> vLLM的启动参数需要开启`enable_lora=True`,在merge_and_sync=True或全参时,该值设置为False。 
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/Template.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/Template.md" index e58abeb4..d9cdba97 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/Template.md" +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/Template.md" @@ -50,3 +50,10 @@ class Template: > Template 不支持使用函数来代替,因为其内部要支持的功能较多。如果需要编写新的 Template,请继承 `Template` 类。 > 一般来说,纯文本模型使用 Template 基类就足够了,在基类中我们使用了 tokenizer.apply_chat_template 来编码模型,对一般的纯文本模型是通用的。 + +# 模板对应关系 + +目前模板关系较为简单: + +- Template类:纯文本模型通用 +- Qwen3_5Template类:Qwen3.5多模态模型使用 diff --git a/pyproject.toml b/pyproject.toml index af96f65e..a0cf3908 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,16 +1,16 @@ [project] name = "twinkle-kit" -version = "0.3.dev0" +version = "0.2.0" description = "Training API for large language models with efficient data handling and advanced optimization techniques." readme = "README.md" authors = [{ name = "ModelScope", email = "contact@modelscope.cn" }] requires-python = ">=3.11,<3.13" dependencies = [ "numpy>=2.0.0,<2.3.0", - "datasets>=3.0,<4.0", + "datasets", "omegaconf>=2.3.0,<3.0.0", "fastapi", - "modelscope[framework]>=1.35.0", + "modelscope[framework]>=1.34.0", "safetensors", "peft>=0.11.0,<=0.19.0", "transformers", diff --git a/src/twinkle/server/model/app.py b/src/twinkle/server/model/app.py index 8f0c6f77..41351811 100644 --- a/src/twinkle/server/model/app.py +++ b/src/twinkle/server/model/app.py @@ -143,7 +143,7 @@ def build_model_app(model_id: str, Supports both Tinker (polling-style) and Twinkle (synchronous) clients. 
Args: - model_id: Base model identifier (e.g., "Qwen/Qwen2.5-0.5B-Instruct") + model_id: Base model identifier (e.g., "Qwen/Qwen3.5-4B") nproc_per_node: Number of processes per node for distributed training device_group: Device group configuration dict device_mesh: Device mesh configuration dict for tensor parallelism diff --git a/src/twinkle/server/sampler/app.py b/src/twinkle/server/sampler/app.py index dc54e4f6..0443df94 100644 --- a/src/twinkle/server/sampler/app.py +++ b/src/twinkle/server/sampler/app.py @@ -111,7 +111,7 @@ def build_sampler_app(model_id: str, Twinkle (synchronous /twinkle/*) sampler clients. Args: - model_id: Model identifier (e.g., "Qwen/Qwen2.5-0.5B-Instruct") + model_id: Model identifier (e.g., "Qwen/Qwen3.5-4B") nproc_per_node: Number of processes per node device_group: Device group configuration dict device_mesh: Device mesh configuration dict for parallelism diff --git a/src/twinkle/version.py b/src/twinkle/version.py index 30f4428a..08a7c147 100644 --- a/src/twinkle/version.py +++ b/src/twinkle/version.py @@ -1,5 +1,5 @@ # Make sure to modify __release_datetime__ to release time when making official release. -__version__ = '0.3.dev0' +__version__ = '0.2.0' # default release datetime for branches under active development is set # to be a time far-far-away-into-the-future __release_datetime__ = '2099-10-13 08:56:12'