modelscope · tastelikefeet · Apr 9, 2026 · Apr 5, 2026 · Apr 5, 2026 · Apr 5, 2026
diff --git a/Dockerfile b/Dockerfile
@@ -16,8 +16,36 @@ RUN echo "Available release branches:" && git branch -r -l 'origin/release/*' --
     echo "Checking out: $LATEST_RELEASE" && \
     git checkout --track "$LATEST_RELEASE"
 
-RUN sh INSTALL_MEGATRON.sh
+ENV SETUPTOOLS_USE_DISTUTILS=local
 
+# Install base packages
+RUN pip install --upgrade peft accelerate transformers "modelscope[framework]" --no-cache-dir
+
+# Install vllm
+RUN pip install --upgrade vllm --no-cache-dir
+
+# Install transformer_engine and megatron_core
+RUN SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])") && \
+    CUDNN_PATH=$SITE_PACKAGES/nvidia/cudnn \
+    CPLUS_INCLUDE_PATH=$SITE_PACKAGES/nvidia/cudnn/include \
+    pip install --no-build-isolation "transformer_engine[pytorch]" --no-cache-dir
+
+RUN pip install megatron_core mcore_bridge --no-cache-dir
+
+# Install flash-attention (default arch 8.0;9.0, override via build-arg if needed)
+ARG TORCH_CUDA_ARCH_LIST="8.0;9.0"
+RUN TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}" \
+    MAX_JOBS=8 \
+    FLASH_ATTENTION_FORCE_BUILD=TRUE \
+    pip install flash-attn --no-build-isolation --no-cache-dir
+
+RUN pip install flash-linear-attention -U --no-cache-dir
+
+# Install numpy
+RUN pip install numpy==2.2 --no-cache-dir
+
+# Install tinker, ray, and other deps
 RUN pip install --no-cache-dir tinker==0.14.0 "ray[serve]" transformers peft accelerate -U
 
+# Install twinkle itself
 RUN pip install -e . --no-build-isolation
diff --git a/INSTALL_MEGATRON.sh b/INSTALL_MEGATRON.sh
@@ -56,12 +56,12 @@ echo "Using CUDA architecture: $TORCH_CUDA_ARCH_LIST"
 # Install latest base packages
 echo ""
 echo "Installing peft, accelerate, transformers, modelscope..."
-pip install --upgrade peft accelerate transformers "modelscope[framework]"
+pip install --upgrade peft accelerate transformers "modelscope[framework]" --no-cache-dir
 
 # Install latest vllm
 echo ""
 echo "Installing latest vllm..."
-pip install --upgrade vllm
+pip install --upgrade vllm --no-cache-dir
 
 # Get site-packages path and install transformer_engine and megatron_core
 echo ""
@@ -83,12 +83,12 @@ MAX_JOBS=8 \
 FLASH_ATTENTION_FORCE_BUILD=TRUE \
 pip install flash-attn --no-build-isolation --no-cache-dir
 
-pip install flash-linear-attention -U
+pip install flash-linear-attention -U --no-cache-dir
 
 # Install numpy
 echo ""
 echo "Installing numpy==2.2 and deep_gemm..."
-pip install numpy==2.2
+pip install numpy==2.2 --no-cache-dir
 
 # Verify installation
 echo ""

diff --git a/README.md b/README.md
@@ -184,7 +184,7 @@ def train():
     # 1000 samples
     dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000)))
     # Set template to prepare encoding
-    dataset.set_template('Template', model_id=base_model)
+    dataset.set_template('Qwen3_5Template', model_id=base_model)
     # Preprocess the dataset to standard format
     dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community'))
     # Encode dataset
@@ -242,7 +242,7 @@ api_key='your-api-key'
 
 # Use twinkle dataset to load the data
 dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500)))
-dataset.set_template('Template', model_id=base_model, max_length=256)
+dataset.set_template('Qwen3_5Template', model_id=base_model, max_length=256)
 dataset.map(SelfCognitionProcessor('twinkle Model', 'ModelScope Team'), load_from_cache_file=False)
 dataset.encode(batched=True, load_from_cache_file=False)
 dataloader = DataLoader(dataset=dataset, batch_size=8)

diff --git a/README_ZH.md b/README_ZH.md
@@ -166,7 +166,7 @@ def train():
     # 1000 samples
     dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000)))
     # Set template to prepare encoding
-    dataset.set_template('Template', model_id=base_model)
+    dataset.set_template('Qwen3_5Template', model_id=base_model)
     # Preprocess the dataset to standard format
     dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community'))
     # Encode dataset
@@ -224,7 +224,7 @@ api_key='your-api-key'
 
 # Use twinkle dataset to load the data
 dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500)))
-dataset.set_template('Template', model_id=base_model, max_length=256)
+dataset.set_template('Qwen3_5Template', model_id=base_model, max_length=256)
 dataset.map(SelfCognitionProcessor('twinkle Model', 'ModelScope Team'), load_from_cache_file=False)
 dataset.encode(batched=True, load_from_cache_file=False)
 dataloader = DataLoader(dataset=dataset, batch_size=8)

diff --git a/cookbook/client/tinker/modelscope/self_cognition.py b/cookbook/client/tinker/modelscope/self_cognition.py
@@ -34,7 +34,7 @@ def train():
     dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500)))
 
     # Apply the chat template matching the base model (max 256 tokens per sample)
-    dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256)
+    dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256)
 
     # Replace placeholder names with custom model/author identity
     dataset.map(SelfCognitionProcessor('twinkle模型', 'twinkle团队'), load_from_cache_file=False)

diff --git a/cookbook/client/tinker/modelscope/short_math_grpo.py b/cookbook/client/tinker/modelscope/short_math_grpo.py
@@ -182,7 +182,7 @@ def create_math_dataset():
         data_slice=range(DATA_NUM),
     )
     dataset = Dataset(meta)
-    dataset.set_template('Template', model_id=BASE_MODEL, max_length=4096, truncation_strategy='delete')
+    dataset.set_template('Qwen3_5Template', model_id=BASE_MODEL, max_length=4096, truncation_strategy='delete')
     dataset.map(MathPreprocessor())
     dataset.filter(lambda row: bool(row['messages']))
     dataset.encode(add_generation_prompt=True)

diff --git a/cookbook/client/tinker/self_host/self_cognition.py b/cookbook/client/tinker/self_host/self_cognition.py
@@ -36,7 +36,7 @@ def train():
     dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500)))
 
     # Apply the chat template matching the base model (max 256 tokens per sample)
-    dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256)
+    dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256)
 
     # Replace placeholder names with custom model/author identity
     dataset.map(SelfCognitionProcessor('twinkle模型', 'twinkle团队'), load_from_cache_file=False)

diff --git a/cookbook/client/tinker/self_host/short_math_grpo.py b/cookbook/client/tinker/self_host/short_math_grpo.py
@@ -182,7 +182,7 @@ def create_math_dataset():
         data_slice=range(DATA_NUM),
     )
     dataset = Dataset(meta)
-    dataset.set_template('Template', model_id=BASE_MODEL, max_length=4096, truncation_strategy='delete')
+    dataset.set_template('Qwen3_5Template', model_id=BASE_MODEL, max_length=4096, truncation_strategy='delete')
     dataset.map(MathPreprocessor())
     dataset.filter(lambda row: bool(row['messages']))
     dataset.encode(add_generation_prompt=True)

diff --git a/cookbook/client/twinkle/modelscope/self_congnition.py b/cookbook/client/twinkle/modelscope/self_congnition.py
@@ -52,7 +52,7 @@ def train():
     dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500)))
 
     # Apply a chat template so the data matches the model's expected input format
-    dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=512)
+    dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=512)
 
     # Replace placeholder names in the dataset with custom model/author names
     dataset.map('SelfCognitionProcessor', init_args={'model_name': 'twinkle模型', 'model_author': 'ModelScope社区'})
@@ -77,7 +77,7 @@ def train():
     model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2)
 
     # Set the same chat template used during data preprocessing
-    model.set_template('Template')
+    model.set_template('Qwen3_5Template')
 
     # Set the input processor (pads sequences on the right side)
     model.set_processor('InputProcessor', padding_side='right')

diff --git a/cookbook/client/twinkle/self_host/grpo.py b/cookbook/client/twinkle/self_host/grpo.py
@@ -55,7 +55,7 @@
 
 def create_gsm8k_dataset():
     dataset = Dataset(DatasetMeta('ms://modelscope/gsm8k', subset_name='main', split='train'))
-    dataset.set_template('Template', model_id=MODEL_ID, max_length=2048)
+    dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=2048)
     dataset.map('GSM8KProcessor')
     dataset.encode(add_generation_prompt=True)
     return dataset
@@ -112,11 +112,11 @@ def train():
 
     # Set processor and template for encoding inputs
     model.set_processor('InputProcessor')
-    model.set_template('Template', model_id=MODEL_ID)
+    model.set_template('Qwen3_5Template', model_id=MODEL_ID)
 
     # Step 4: Configure the sampler
     sampler = vLLMSampler(model_id=MODEL_ID)
-    sampler.set_template('Template', model_id=MODEL_ID)
+    sampler.set_template('Qwen3_5Template', model_id=MODEL_ID)
 
     # Step 5: Setup metrics and advantage function
     advantage_fn = GRPOAdvantage()

diff --git a/cookbook/client/twinkle/self_host/sample.py b/cookbook/client/twinkle/self_host/sample.py
@@ -42,7 +42,7 @@ def sample():
     sampler = vLLMSampler(model_id=MODEL_ID)
 
     # Step 4: Set the chat template so the sampler can encode Trajectory inputs
-    sampler.set_template('Template', model_id=MODEL_ID)
+    sampler.set_template('Qwen3_5Template', model_id=MODEL_ID)
 
     # Step 5: Prepare inputs as Trajectory dicts (messages format)
     # Each trajectory is a conversation with system and user messages

diff --git a/cookbook/client/twinkle/self_host/self_congnition.py b/cookbook/client/twinkle/self_host/self_congnition.py
@@ -59,7 +59,7 @@ def train():
     dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500)))
 
     # Apply a chat template so the data matches the model's expected input format
-    dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=512)
+    dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=512)
 
     # Replace placeholder names in the dataset with custom model/author names
     dataset.map('SelfCognitionProcessor', init_args={'model_name': 'twinkle模型', 'model_author': 'ModelScope社区'})
@@ -84,7 +84,7 @@ def train():
     model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2)
 
     # Set the same chat template used during data preprocessing
-    model.set_template('Template')
+    model.set_template('Qwen3_5Template')
 
     # Set the input processor (pads sequences on the right side)
     model.set_processor('InputProcessor', padding_side='right')

diff --git a/cookbook/megatron/tp_moe.py b/cookbook/megatron/tp_moe.py
@@ -20,7 +20,7 @@
 def eval(model):
     # 100 Samples
     dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100)))
-    dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-35B-A3B')
+    dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-35B-A3B')
     dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
     dataset.encode()
     dataloader = DataLoader(dataset=dataset, batch_size=16)
@@ -34,7 +34,7 @@ def train():
     # 1000 samples
     dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000)))
     # Set template to prepare encoding
-    dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-35B-A3B')
+    dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-35B-A3B')
     # Preprocess the dataset to standard format
     dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
     # Encode dataset

diff --git a/cookbook/ray/single_controller.py b/cookbook/ray/single_controller.py
@@ -26,7 +26,7 @@
 def eval(model):
     # 100 Samples
     dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100)))
-    dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-35B-A3B')
+    dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-35B-A3B')
     dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
     dataset.encode()
     dataloader = DataLoader(dataset=dataset, batch_size=8, min_batch_size=8)
@@ -41,7 +41,7 @@ def train():
     # 1000 samples
     dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000)))
     # Set template to prepare encoding
-    dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B')
+    dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
     # Preprocess the dataset to standard format
     dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
     # Encode dataset

diff --git a/cookbook/rl/gkd_on_policy.py b/cookbook/rl/gkd_on_policy.py
@@ -173,6 +173,9 @@ def main():
     # ── Student vLLM sampler (for on-policy generation) ────────────────────────
     student_sampler = vLLMSampler(
         model_id=STUDENT_MODEL_ID,
+        # enable_lora=True pairs with ckpt_manager.sync_weights(merge_and_sync=False):
+        # only the LoRA weights are synced. With merge_and_sync=True, the LoRA is
+        # merged into the base model and all weights are synced to vLLM.
         engine_args={'gpu_memory_utilization': 0.85, 'max_model_len': 4096, 'enable_lora': True, 'max_loras': 1},
         device_mesh=sampler_mesh,
         remote_group='student_sampler',
@@ -210,6 +213,9 @@ def main():
             break
 
         # 1. Sync student model weights to student sampler
+        # enable_lora=True pairs with ckpt_manager.sync_weights(merge_and_sync=False):
+        # only the LoRA weights are synced. With merge_and_sync=True, the LoRA is
+        # merged into the base model and all weights are synced to vLLM.
         ckpt_manager.sync_weights(merge_and_sync=False)
         student_sampler.reset_prefix_cache()
 

diff --git a/cookbook/rl/grpo.py b/cookbook/rl/grpo.py
@@ -103,6 +103,9 @@ def main():
             'max_model_len': 4496,
             'max_lora_rank': 32, # save as lora_config
             # NOTE: To use enable_lora with qwen3.5, ensure vLLM includes PR https://github.com/vllm-project/vllm/pull/36976
+            # enable_lora=True pairs with ckpt_manager.sync_weights(merge_and_sync=False):
+            # only the LoRA weights are synced. With merge_and_sync=True, the LoRA is
+            # merged into the base model and all weights are synced to vLLM.
             'enable_lora': True,
         },
         device_mesh=sampler_mesh,
@@ -133,6 +136,9 @@ def main():
             break
         metrics.reset()
         global_prompts = batch if isinstance(batch, list) else [batch]
+        # enable_lora=True pairs with ckpt_manager.sync_weights(merge_and_sync=False):
+        # only the LoRA weights are synced. With merge_and_sync=True, the LoRA is
+        # merged into the base model and all weights are synced to vLLM.
         ckpt_manager.sync_weights(merge_and_sync=False)
         sampler.reset_prefix_cache()
         sample_responses = sampler.sample(

diff --git a/cookbook/rl/grpo_mm.py b/cookbook/rl/grpo_mm.py
@@ -27,10 +27,6 @@
 )
 from twinkle.sampler import vLLMSampler
 
-import swanlab
-swanlab.init(
-    project='twinkle',
-)
 logger = get_logger()
 
 # Model configuration
@@ -184,6 +180,9 @@ def main():
             'gpu_memory_utilization': 0.8,
             'max_model_len': 32000,
             'max_lora_rank': 32,
+            # enable_lora=True pairs with ckpt_manager.sync_weights(merge_and_sync=False):
+            # only the LoRA weights are synced. With merge_and_sync=True, the LoRA is
+            # merged into the base model and all weights are synced to vLLM.
             'enable_lora': True,
             'limit_mm_per_prompt': {'image': 9},  # OlympiadBench has up to 9 images
         },
@@ -221,6 +220,9 @@ def main():
         metrics.reset()
 
         # Sync weights to sampler
+        # enable_lora=True pairs with ckpt_manager.sync_weights(merge_and_sync=False):
+        # only the LoRA weights are synced. With merge_and_sync=True, the LoRA is
+        # merged into the base model and all weights are synced to vLLM.
         ckpt_manager.sync_weights(merge_and_sync=False)
         sampler.reset_prefix_cache()
 
@@ -282,7 +284,6 @@ def main():
         log_dict.update(model.calculate_metric(is_training=True, adapter_name=ADAPTER_NAME))
         metrics.reset()
         logger.info(f'[Step {optim_step}/{MAX_STEPS}] {log_dict}')
-        swanlab.log(log_dict)
 
     logger.info(f'Training completed. optim_steps={optim_step}')
     model.save('olympiad-grpo-mixed-final', adapter_name=ADAPTER_NAME)

diff --git a/cookbook/rl/short_math_grpo.py b/cookbook/rl/short_math_grpo.py
@@ -50,12 +50,6 @@
 SYSTEM_PROMPT = ('You are a helpful math assistant. Solve the problem with minimal but correct reasoning '
                  'and put your final answer within \\boxed{}.')
 
-import swanlab
-swanlab.init(
-    project='twinkle',
-)
-
-
 # ========== Reward Functions ==========
 class GSM8KBrevityReward(Reward):
     """Brevity reward: rewards shorter completions that contain a valid answer.
@@ -167,6 +161,9 @@ def main():
             'max_model_len': 8192,
             'max_lora_rank': 32, # save as lora_config
             # NOTE: To use enable_lora with qwen3.5, ensure vLLM includes PR https://github.com/vllm-project/vllm/pull/36976
+            # enable_lora=True pairs with ckpt_manager.sync_weights(merge_and_sync=False):
+            # only the LoRA weights are synced. With merge_and_sync=True, the LoRA is
+            # merged into the base model and all weights are synced to vLLM.
             'enable_lora': True,
         },
         device_mesh=sampler_mesh,
@@ -202,6 +199,9 @@ def main():
         for prompt in batch:
             expand_prompts.extend([prompt] * NUM_GENERATIONS)
 
+        # enable_lora=True pairs with ckpt_manager.sync_weights(merge_and_sync=False):
+        # only the LoRA weights are synced. With merge_and_sync=True, the LoRA is
+        # merged into the base model and all weights are synced to vLLM.
         ckpt_manager.sync_weights(merge_and_sync=False)
         sampler.reset_prefix_cache()
 
@@ -256,7 +256,6 @@ def main():
 
         log_dict = metrics.calculate()
         log_dict.update(model.calculate_metric(is_training=True))
-        swanlab.log(log_dict)
         metrics.reset()
         logger.info(f'[Step {optim_step}/{MAX_STEPS}] {log_dict}')
 

diff --git a/cookbook/transformers/ep_fsdp_qwen3_moe.py b/cookbook/transformers/ep_fsdp_qwen3_moe.py
@@ -13,7 +13,7 @@
 
 MODEL_ID = os.environ.get('QWEN3_MODEL_ID', 'ms://Qwen/Qwen3.5-4B')
 DATASET_ID = os.environ.get('DATASET_ID', 'ms://swift/self-cognition')
-TEMPLATE_ID = os.environ.get('TEMPLATE_ID', 'Template')
+TEMPLATE_ID = os.environ.get('TEMPLATE_ID', 'Qwen3_5Template')
 _num_layers_env = os.environ.get('NUM_LAYERS')
 NUM_LAYERS = int(_num_layers_env) if _num_layers_env is not None else None
 BATCH_SIZE = int(os.environ.get('BATCH_SIZE', '4'))
@@ -47,7 +47,7 @@ def train():
     try:
         dataset.set_template(TEMPLATE_ID, model_id=MODEL_ID)
     except ValueError:
-        dataset.set_template('Template', model_id=MODEL_ID)
+        dataset.set_template('Qwen3_5Template', model_id=MODEL_ID)
 
     dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
     dataset.encode(batched=True)

diff --git a/cookbook/transformers/fsdp2_moe.py b/cookbook/transformers/fsdp2_moe.py
@@ -20,7 +20,7 @@
 def eval(model):
     # 100 Samples
     dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100)))
-    dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B')
+    dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
     dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
     dataset.encode()
     dataloader = DataLoader(dataset=dataset, batch_size=4)
@@ -35,7 +35,7 @@ def train():
     # 1000 samples
     dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000)))
     # Set template to prepare encoding
-    dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B')
+    dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
     # Preprocess the dataset to standard format
     dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
     # Encode dataset