Skip to content

Commit 37dc52b

Browse files
Remerge release/0.2 to main (#146)
1 parent 8c3dbc9 commit 37dc52b

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

43 files changed

+179
-118
lines changed

Dockerfile

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,36 @@ RUN echo "Available release branches:" && git branch -r -l 'origin/release/*' --
1616
echo "Checking out: $LATEST_RELEASE" && \
1717
git checkout --track "$LATEST_RELEASE"
1818

19-
RUN sh INSTALL_MEGATRON.sh
19+
ENV SETUPTOOLS_USE_DISTUTILS=local
2020

21+
# Install base packages
22+
RUN pip install --upgrade peft accelerate transformers "modelscope[framework]" --no-cache-dir
23+
24+
# Install vllm
25+
RUN pip install --upgrade vllm --no-cache-dir
26+
27+
# Install transformer_engine and megatron_core
28+
RUN SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])") && \
29+
CUDNN_PATH=$SITE_PACKAGES/nvidia/cudnn \
30+
CPLUS_INCLUDE_PATH=$SITE_PACKAGES/nvidia/cudnn/include \
31+
pip install --no-build-isolation "transformer_engine[pytorch]" --no-cache-dir
32+
33+
RUN pip install megatron_core mcore_bridge --no-cache-dir
34+
35+
# Install flash-attention (default arch 8.0;9.0, override via build-arg if needed)
36+
ARG TORCH_CUDA_ARCH_LIST="8.0;9.0"
37+
RUN TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}" \
38+
MAX_JOBS=8 \
39+
FLASH_ATTENTION_FORCE_BUILD=TRUE \
40+
pip install flash-attn --no-build-isolation --no-cache-dir
41+
42+
RUN pip install flash-linear-attention -U --no-cache-dir
43+
44+
# Install numpy
45+
RUN pip install numpy==2.2 --no-cache-dir
46+
47+
# Install tinker, ray, and other deps
2148
RUN pip install --no-cache-dir tinker==0.14.0 "ray[serve]" transformers peft accelerate -U
2249

50+
# Install twinkle itself
2351
RUN pip install -e . --no-build-isolation

INSTALL_MEGATRON.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,12 @@ echo "Using CUDA architecture: $TORCH_CUDA_ARCH_LIST"
5656
# Install latest base packages
5757
echo ""
5858
echo "Installing peft, accelerate, transformers, modelscope..."
59-
pip install --upgrade peft accelerate transformers "modelscope[framework]"
59+
pip install --upgrade peft accelerate transformers "modelscope[framework]" --no-cache-dir
6060

6161
# Install latest vllm
6262
echo ""
6363
echo "Installing latest vllm..."
64-
pip install --upgrade vllm
64+
pip install --upgrade vllm --no-cache-dir
6565

6666
# Get site-packages path and install transformer_engine and megatron_core
6767
echo ""
@@ -83,12 +83,12 @@ MAX_JOBS=8 \
8383
FLASH_ATTENTION_FORCE_BUILD=TRUE \
8484
pip install flash-attn --no-build-isolation --no-cache-dir
8585

86-
pip install flash-linear-attention -U
86+
pip install flash-linear-attention -U --no-cache-dir
8787

8888
# Install numpy
8989
echo ""
9090
echo "Installing numpy==2.2 and deep_gemm..."
91-
pip install numpy==2.2
91+
pip install numpy==2.2 --no-cache-dir
9292

9393
# Verify installation
9494
echo ""

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ def train():
184184
# 1000 samples
185185
dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000)))
186186
# Set template to prepare encoding
187-
dataset.set_template('Template', model_id=base_model)
187+
dataset.set_template('Qwen3_5Template', model_id=base_model)
188188
# Preprocess the dataset to standard format
189189
dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community'))
190190
# Encode dataset
@@ -242,7 +242,7 @@ api_key='your-api-key'
242242

243243
# Use twinkle dataset to load the data
244244
dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500)))
245-
dataset.set_template('Template', model_id=base_model, max_length=256)
245+
dataset.set_template('Qwen3_5Template', model_id=base_model, max_length=256)
246246
dataset.map(SelfCognitionProcessor('twinkle Model', 'ModelScope Team'), load_from_cache_file=False)
247247
dataset.encode(batched=True, load_from_cache_file=False)
248248
dataloader = DataLoader(dataset=dataset, batch_size=8)

README_ZH.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ def train():
166166
# 1000 samples
167167
dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000)))
168168
# Set template to prepare encoding
169-
dataset.set_template('Template', model_id=base_model)
169+
dataset.set_template('Qwen3_5Template', model_id=base_model)
170170
# Preprocess the dataset to standard format
171171
dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community'))
172172
# Encode dataset
@@ -224,7 +224,7 @@ api_key='your-api-key'
224224

225225
# Use twinkle dataset to load the data
226226
dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500)))
227-
dataset.set_template('Template', model_id=base_model, max_length=256)
227+
dataset.set_template('Qwen3_5Template', model_id=base_model, max_length=256)
228228
dataset.map(SelfCognitionProcessor('twinkle Model', 'ModelScope Team'), load_from_cache_file=False)
229229
dataset.encode(batched=True, load_from_cache_file=False)
230230
dataloader = DataLoader(dataset=dataset, batch_size=8)

cookbook/client/tinker/modelscope/self_cognition.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def train():
3434
dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500)))
3535

3636
# Apply the chat template matching the base model (max 256 tokens per sample)
37-
dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256)
37+
dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256)
3838

3939
# Replace placeholder names with custom model/author identity
4040
dataset.map(SelfCognitionProcessor('twinkle模型', 'twinkle团队'), load_from_cache_file=False)

cookbook/client/tinker/modelscope/short_math_grpo.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ def create_math_dataset():
182182
data_slice=range(DATA_NUM),
183183
)
184184
dataset = Dataset(meta)
185-
dataset.set_template('Template', model_id=BASE_MODEL, max_length=4096, truncation_strategy='delete')
185+
dataset.set_template('Qwen3_5Template', model_id=BASE_MODEL, max_length=4096, truncation_strategy='delete')
186186
dataset.map(MathPreprocessor())
187187
dataset.filter(lambda row: bool(row['messages']))
188188
dataset.encode(add_generation_prompt=True)

cookbook/client/tinker/self_host/self_cognition.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def train():
3636
dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500)))
3737

3838
# Apply the chat template matching the base model (max 256 tokens per sample)
39-
dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256)
39+
dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256)
4040

4141
# Replace placeholder names with custom model/author identity
4242
dataset.map(SelfCognitionProcessor('twinkle模型', 'twinkle团队'), load_from_cache_file=False)

cookbook/client/tinker/self_host/short_math_grpo.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ def create_math_dataset():
182182
data_slice=range(DATA_NUM),
183183
)
184184
dataset = Dataset(meta)
185-
dataset.set_template('Template', model_id=BASE_MODEL, max_length=4096, truncation_strategy='delete')
185+
dataset.set_template('Qwen3_5Template', model_id=BASE_MODEL, max_length=4096, truncation_strategy='delete')
186186
dataset.map(MathPreprocessor())
187187
dataset.filter(lambda row: bool(row['messages']))
188188
dataset.encode(add_generation_prompt=True)

cookbook/client/twinkle/modelscope/self_congnition.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def train():
5252
dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500)))
5353

5454
# Apply a chat template so the data matches the model's expected input format
55-
dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=512)
55+
dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=512)
5656

5757
# Replace placeholder names in the dataset with custom model/author names
5858
dataset.map('SelfCognitionProcessor', init_args={'model_name': 'twinkle模型', 'model_author': 'ModelScope社区'})
@@ -77,7 +77,7 @@ def train():
7777
model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2)
7878

7979
# Set the same chat template used during data preprocessing
80-
model.set_template('Template')
80+
model.set_template('Qwen3_5Template')
8181

8282
# Set the input processor (pads sequences on the right side)
8383
model.set_processor('InputProcessor', padding_side='right')

cookbook/client/twinkle/self_host/grpo.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555

5656
def create_gsm8k_dataset():
5757
dataset = Dataset(DatasetMeta('ms://modelscope/gsm8k', subset_name='main', split='train'))
58-
dataset.set_template('Template', model_id=MODEL_ID, max_length=2048)
58+
dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=2048)
5959
dataset.map('GSM8KProcessor')
6060
dataset.encode(add_generation_prompt=True)
6161
return dataset
@@ -112,11 +112,11 @@ def train():
112112

113113
# Set processor and template for encoding inputs
114114
model.set_processor('InputProcessor')
115-
model.set_template('Template', model_id=MODEL_ID)
115+
model.set_template('Qwen3_5Template', model_id=MODEL_ID)
116116

117117
# Step 4: Configure the sampler
118118
sampler = vLLMSampler(model_id=MODEL_ID)
119-
sampler.set_template('Template', model_id=MODEL_ID)
119+
sampler.set_template('Qwen3_5Template', model_id=MODEL_ID)
120120

121121
# Step 5: Setup metrics and advantage function
122122
advantage_fn = GRPOAdvantage()

0 commit comments

Comments (0)