From 9072c90a04d58560d244d090a702e45adbdd0694 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Sun, 5 Apr 2026 20:09:13 +0800 Subject: [PATCH 01/17] bump version to 0.2.0 --- pyproject.toml | 2 +- src/twinkle/version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 584099cb..ce392b26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "twinkle-kit" -version = "0.2.dev0" +version = "0.2.0" description = "Training API for large language models with efficient data handling and advanced optimization techniques." readme = "README.md" authors = [{ name = "ModelScope", email = "contact@modelscope.cn" }] diff --git a/src/twinkle/version.py b/src/twinkle/version.py index 05103d1d..08a7c147 100644 --- a/src/twinkle/version.py +++ b/src/twinkle/version.py @@ -1,5 +1,5 @@ # Make sure to modify __release_datetime__ to release time when making official release. -__version__ = '0.2.dev0' +__version__ = '0.2.0' # default release datetime for branches under active development is set # to be a time far-far-away-into-the-future __release_datetime__ = '2099-10-13 08:56:12' From 967093a26a252738eae339346916dfc6c1a0ad9f Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Sun, 5 Apr 2026 21:11:54 +0800 Subject: [PATCH 02/17] fix --- Dockerfile | 10 +++++----- INSTALL_MEGATRON.sh | 5 ----- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index 97c35113..d01937f5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,15 +6,15 @@ RUN curl -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.s rm Miniconda3-latest-Linux-x86_64.sh ENV PATH="/opt/conda/bin:${PATH}" RUN conda create -n twinkle python=3.12 -y --override-channels -c conda-forge -SHELL ["conda", "run", "-n", "twinkle", "/bin/bash", "-c"] +ENV PATH="/opt/conda/envs/twinkle/bin:${PATH}" # Clone and install twinkle, checkout to latest v-tag RUN git clone https://github.com/modelscope/twinkle.git 
WORKDIR /twinkle -RUN echo "Available v-tags:" && git tag -l 'v*' --sort=-v:refname && \ - LATEST_TAG=$(git tag -l 'v*' --sort=-v:refname | head -n 1) && \ - echo "Checking out: $LATEST_TAG" && \ - git checkout "$LATEST_TAG" +RUN echo "Available release branches:" && git branch -r -l 'origin/release/*' --sort=-v:refname && \ + LATEST_RELEASE=$(git branch -r -l 'origin/release/*' --sort=-v:refname | head -n 1 | tr -d ' ') && \ + echo "Checking out: $LATEST_RELEASE" && \ + git checkout --track "$LATEST_RELEASE" RUN sh INSTALL_MEGATRON.sh diff --git a/INSTALL_MEGATRON.sh b/INSTALL_MEGATRON.sh index 775f7588..cc7298d2 100644 --- a/INSTALL_MEGATRON.sh +++ b/INSTALL_MEGATRON.sh @@ -87,11 +87,6 @@ pip install flash-linear-attention -U echo "" echo "Installing numpy==2.2 and deep_gemm..." pip install numpy==2.2 -pip uninstall deep_gemm -y -cd /tmp -git clone --recursive https://github.com/deepseek-ai/DeepGEMM.git -cd DeepGEMM -pip install . --no-build-isolation # Verify installation echo "" From 68aa05b6db9277d9b850c3e4218d42ddcb728b78 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Sun, 5 Apr 2026 21:14:20 +0800 Subject: [PATCH 03/17] fix --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index d01937f5..ca6d39ed 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,7 @@ FROM modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.8.1-py311-torch2.9.1-1.35.0 +RUN echo '47.110.159.78 github.com' >> /etc/hosts + # Install miniconda with Python 3.12 RUN curl -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \ From 1eeb423ad2b426b91b8e5599db8c66cbcd95a3fb Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Sun, 5 Apr 2026 21:58:37 +0800 Subject: [PATCH 04/17] fix dockerfile --- Dockerfile | 5 ++--- INSTALL_MEGATRON.sh | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile 
index ca6d39ed..8bae7c6a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,5 @@ FROM modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.8.1-py311-torch2.9.1-1.35.0 -RUN echo '47.110.159.78 github.com' >> /etc/hosts - # Install miniconda with Python 3.12 RUN curl -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \ @@ -24,4 +22,5 @@ RUN pip install --no-cache-dir tinker==0.14.0 "ray[serve]" transformers peft acc RUN pip install -e . --no-build-isolation -CMD ["bash", "cookbook/client/server/megatron/run.sh"] +ENV TWINKLE_WORKDIR=/data +CMD ["bash", "-c", "mkdir -p $TWINKLE_WORKDIR && cd $TWINKLE_WORKDIR && bash /twinkle/cookbook/client/server/megatron/run.sh 2>&1 | tee $TWINKLE_WORKDIR/run.log"] diff --git a/INSTALL_MEGATRON.sh b/INSTALL_MEGATRON.sh index cc7298d2..fb23524b 100644 --- a/INSTALL_MEGATRON.sh +++ b/INSTALL_MEGATRON.sh @@ -95,7 +95,7 @@ echo "" python -c " import pkg_resources -packages = ['peft', 'accelerate', 'transformers', 'modelscope', 'oss2', 'vllm', 'transformer_engine', 'megatron_core', 'flash_attn', 'numpy'] +packages = ['peft', 'accelerate', 'transformers', 'modelscope', 'vllm', 'transformer_engine', 'megatron_core', 'flash_attn', 'numpy'] print('Installed package versions:') print('-' * 40) From b4c5fbf8bf971b8ff735d53522e231ff2b650957 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Sun, 5 Apr 2026 22:58:15 +0800 Subject: [PATCH 05/17] remove oss2 --- INSTALL_MEGATRON.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/INSTALL_MEGATRON.sh b/INSTALL_MEGATRON.sh index fb23524b..dd6266cc 100644 --- a/INSTALL_MEGATRON.sh +++ b/INSTALL_MEGATRON.sh @@ -55,8 +55,8 @@ echo "Using CUDA architecture: $TORCH_CUDA_ARCH_LIST" # Install latest base packages echo "" -echo "Installing peft, accelerate, transformers, modelscope, oss2..." 
-pip install --upgrade peft accelerate transformers "modelscope[framework]" oss2 +echo "Installing peft, accelerate, transformers, modelscope..." +pip install --upgrade peft accelerate transformers "modelscope[framework]" # Install latest vllm echo "" From fe6e8663012c31150125d131a25af4e4fd76a15f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= Date: Mon, 6 Apr 2026 10:52:46 +0800 Subject: [PATCH 06/17] fix install --- INSTALL_MEGATRON.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALL_MEGATRON.sh b/INSTALL_MEGATRON.sh index dd6266cc..e2d1db4b 100644 --- a/INSTALL_MEGATRON.sh +++ b/INSTALL_MEGATRON.sh @@ -4,7 +4,7 @@ # which always occur error set -e # Exit immediately on error - +export SETUPTOOLS_USE_DISTUTILS=local echo "==========================================" echo "Starting deep learning dependencies installation..." echo "==========================================" From b2d1466d20e277f1a49cf52d6c92e55010ee1eed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= Date: Mon, 6 Apr 2026 11:31:39 +0800 Subject: [PATCH 07/17] fix server config --- cookbook/client/server/megatron/server_config.yaml | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cookbook/client/server/megatron/server_config.yaml b/cookbook/client/server/megatron/server_config.yaml index 21d8a16b..0f66dd57 100644 --- a/cookbook/client/server/megatron/server_config.yaml +++ b/cookbook/client/server/megatron/server_config.yaml @@ -42,7 +42,7 @@ applications: import_path: sampler args: model_id: "ms://Qwen/Qwen3.5-27B" # ModelScope model identifier - nproc_per_node: 8 # Number of GPU processes per node + nproc_per_node: 4 # Number of GPU processes per node sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler) engine_args: # vLLM engine-specific settings max_model_len: 32000 # Maximum sequence length the engine supports @@ -84,7 +84,7 @@ applications: model_id: "ms://Qwen/Qwen3.5-27B" 
# ModelScope model identifier max_length: 32000 # model max length max_loras: 5 # model max loras - nproc_per_node: 8 # Number of GPU processes per node + nproc_per_node: 4 # Number of GPU processes per node device_group: name: model ranks: 4 # GPU rank indices diff --git a/pyproject.toml b/pyproject.toml index ce392b26..f3880b2f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ dependencies = [ "datasets>=3.0,<4.0", "omegaconf>=2.3.0,<3.0.0", "fastapi", - "modelscope[framework]>=1.34.0", + "modelscope[framework]>=1.35.0", "safetensors", "peft>=0.11.0,<=0.19.0", "transformers", From 1c20c5eacb0b2333715afbf4d16e5887c66eb000 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= Date: Mon, 6 Apr 2026 11:47:58 +0800 Subject: [PATCH 08/17] fix trust_remote_code --- Dockerfile | 3 --- cookbook/client/server/megatron/run.sh | 2 +- cookbook/client/server/megatron/server.py | 2 +- src/twinkle/dataset/base.py | 6 ++++++ src/twinkle/hub/hub.py | 4 +++- 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8bae7c6a..a3aa4ff6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -21,6 +21,3 @@ RUN sh INSTALL_MEGATRON.sh RUN pip install --no-cache-dir tinker==0.14.0 "ray[serve]" transformers peft accelerate -U RUN pip install -e . 
--no-build-isolation - -ENV TWINKLE_WORKDIR=/data -CMD ["bash", "-c", "mkdir -p $TWINKLE_WORKDIR && cd $TWINKLE_WORKDIR && bash /twinkle/cookbook/client/server/megatron/run.sh 2>&1 | tee $TWINKLE_WORKDIR/run.log"] diff --git a/cookbook/client/server/megatron/run.sh b/cookbook/client/server/megatron/run.sh index 38befef2..c7db36d1 100644 --- a/cookbook/client/server/megatron/run.sh +++ b/cookbook/client/server/megatron/run.sh @@ -3,4 +3,4 @@ export RAY_ROTATION_BACKUP_COUNT=1 CUDA_VISIBLE_DEVICES=0,1,2,3 ray start --head --port=6379 --num-gpus=4 --disable-usage-stats --include-dashboard=false CUDA_VISIBLE_DEVICES=4,5,6,7 ray start --address=127.0.0.1:6379 --num-gpus=4 CUDA_VISIBLE_DEVICES="" ray start --address=127.0.0.1:6379 --num-gpus=0 -python server.py +python "$(dirname "$0")/server.py" diff --git a/cookbook/client/server/megatron/server.py b/cookbook/client/server/megatron/server.py index e38f43a4..d6cb87c5 100644 --- a/cookbook/client/server/megatron/server.py +++ b/cookbook/client/server/megatron/server.py @@ -9,7 +9,7 @@ import os # Enable Ray debug mode for verbose logging during development -os.environ['TWINKLE_TRUST_REMOTE_CODE'] = '1' +os.environ['TWINKLE_TRUST_REMOTE_CODE'] = '0' from twinkle.server import launch_server diff --git a/src/twinkle/dataset/base.py b/src/twinkle/dataset/base.py index 8ffe5cea..98bb9c8f 100644 --- a/src/twinkle/dataset/base.py +++ b/src/twinkle/dataset/base.py @@ -51,6 +51,9 @@ class Dataset(TorchDataset): """ def __init__(self, dataset_meta: DatasetMeta, **kwargs): + trust_remote_code = bool(os.environ.get('TWINKLE_TRUST_REMOTE_CODE', '1')) + if not trust_remote_code: + kwargs['trust_remote_code'] = False dataset = self._load_dataset(dataset_meta, **kwargs) self.datasets = {dataset_meta.get_id(): dataset} self.dataset = dataset @@ -247,6 +250,9 @@ def add_dataset(self, dataset_meta: DatasetMeta, **kwargs): Args: dataset_meta: The dataset_meta information of the loaded dataset. 
""" + trust_remote_code = bool(os.environ.get('TWINKLE_TRUST_REMOTE_CODE', '1')) + if not trust_remote_code: + kwargs['trust_remote_code'] = False dataset = self._load_dataset(dataset_meta, **kwargs) self.datasets[dataset_meta.get_id()] = dataset diff --git a/src/twinkle/hub/hub.py b/src/twinkle/hub/hub.py index 916a42b2..15fc1ef5 100644 --- a/src/twinkle/hub/hub.py +++ b/src/twinkle/hub/hub.py @@ -401,7 +401,7 @@ def load_dataset(cls, cls.try_login(token) if revision is None or revision == 'main': revision = 'master' - load_kwargs = {'trust_remote_code': True} + load_kwargs = {'trust_remote_code': kwargs.get('trust_remote_code', True)} return MsDataset.load( dataset_id, subset_name=subset_name, @@ -595,6 +595,7 @@ def load_dataset(cls, from datasets import load_dataset if revision is None or revision == 'master': revision = 'main' + trust_remote_code = kwargs.get('trust_remote_code', True) return load_dataset( dataset_id, name=subset_name, @@ -602,6 +603,7 @@ def load_dataset(cls, streaming=streaming, revision=revision, download_mode=download_mode, + trust_remote_code=trust_remote_code, num_proc=num_proc) @classmethod From 4c7cdfa133090d4a009952610d0d635058e713f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= Date: Mon, 6 Apr 2026 11:49:42 +0800 Subject: [PATCH 09/17] fix install script --- INSTALL_MEGATRON.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/INSTALL_MEGATRON.sh b/INSTALL_MEGATRON.sh index e2d1db4b..e86e5478 100644 --- a/INSTALL_MEGATRON.sh +++ b/INSTALL_MEGATRON.sh @@ -71,7 +71,9 @@ echo "Site-packages path: $SITE_PACKAGES" CUDNN_PATH=$SITE_PACKAGES/nvidia/cudnn \ CPLUS_INCLUDE_PATH=$SITE_PACKAGES/nvidia/cudnn/include \ -pip install --no-build-isolation "transformer_engine[pytorch]" megatron_core --no-cache-dir +pip install --no-build-isolation "transformer_engine[pytorch]" --no-cache-dir + +pip install megatron_core mcore_bridge --no-cache-dir # Install flash-attention (force local build) echo "" From 
4086a54945e3153f78c75ee9e54f27ca676984d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= Date: Mon, 6 Apr 2026 12:03:49 +0800 Subject: [PATCH 10/17] fix support models --- cookbook/client/tinker/modelscope/sample.py | 2 +- src/twinkle/server/gateway/server.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cookbook/client/tinker/modelscope/sample.py b/cookbook/client/tinker/modelscope/sample.py index 72bd9f24..40c9b327 100644 --- a/cookbook/client/tinker/modelscope/sample.py +++ b/cookbook/client/tinker/modelscope/sample.py @@ -45,7 +45,7 @@ ] ) -input_feature = template.encode(trajectory, add_generation_prompt=True) +input_feature = template.batch_encode([trajectory], add_generation_prompt=True)[0] input_ids = input_feature['input_ids'].tolist() diff --git a/src/twinkle/server/gateway/server.py b/src/twinkle/server/gateway/server.py index cd942e61..79a90349 100644 --- a/src/twinkle/server/gateway/server.py +++ b/src/twinkle/server/gateway/server.py @@ -36,7 +36,7 @@ def __init__(self, self.http_options = http_options or {} self.proxy = ServiceProxy(http_options=http_options, route_prefix=self.route_prefix) self.supported_models = self._normalize_models(supported_models) or [ - types.SupportedModel(model_name='Qwen/Qwen3.5-4B'), + types.SupportedModel(model_name='Qwen/Qwen3.5-27B'), ] self._modelscope_config_lock = asyncio.Lock() From b55ba2c82cb2da7ae2d7a89ca46ee036e5715a83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= Date: Mon, 6 Apr 2026 12:50:02 +0800 Subject: [PATCH 11/17] fix torchrun --- src/twinkle/processor/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/twinkle/processor/base.py b/src/twinkle/processor/base.py index da6bb3d5..3269a574 100644 --- a/src/twinkle/processor/base.py +++ b/src/twinkle/processor/base.py @@ -97,7 +97,8 @@ def to_tensor(_input): # so tensor ops like labels != ignore_index or .to(device) would fail without this. 
if isinstance(value, np.ndarray): value = torch.from_numpy(value) - elif isinstance(value, list) and isinstance(value[0], (int, float, np.number)): + elif (isinstance(value, list) and isinstance(value[0], + (int, float, np.number))) or key == 'position_ids': value = torch.tensor(value) elif key in self.VLM_CONCAT_FIELDS: if not isinstance(value[0], torch.Tensor): From 6558796da2649979f808eb642d1f674477ced02b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= Date: Wed, 8 Apr 2026 19:21:12 +0800 Subject: [PATCH 12/17] fix version --- pyproject.toml | 2 +- src/twinkle/version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index af96f65e..f3880b2f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "twinkle-kit" -version = "0.3.dev0" +version = "0.2.0" description = "Training API for large language models with efficient data handling and advanced optimization techniques." readme = "README.md" authors = [{ name = "ModelScope", email = "contact@modelscope.cn" }] diff --git a/src/twinkle/version.py b/src/twinkle/version.py index 30f4428a..08a7c147 100644 --- a/src/twinkle/version.py +++ b/src/twinkle/version.py @@ -1,5 +1,5 @@ # Make sure to modify __release_datetime__ to release time when making official release. 
-__version__ = '0.3.dev0' +__version__ = '0.2.0' # default release datetime for branches under active development is set # to be a time far-far-away-into-the-future __release_datetime__ = '2099-10-13 08:56:12' From 031be8f1e322fe1696ad1c3cc2752935fb5bb7a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= Date: Wed, 8 Apr 2026 22:01:59 +0800 Subject: [PATCH 13/17] add no-cache-dir --- INSTALL_MEGATRON.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/INSTALL_MEGATRON.sh b/INSTALL_MEGATRON.sh index e86e5478..276598c5 100644 --- a/INSTALL_MEGATRON.sh +++ b/INSTALL_MEGATRON.sh @@ -56,12 +56,12 @@ echo "Using CUDA architecture: $TORCH_CUDA_ARCH_LIST" # Install latest base packages echo "" echo "Installing peft, accelerate, transformers, modelscope..." -pip install --upgrade peft accelerate transformers "modelscope[framework]" +pip install --upgrade peft accelerate transformers "modelscope[framework]" --no-cache-dir # Install latest vllm echo "" echo "Installing latest vllm..." -pip install --upgrade vllm +pip install --upgrade vllm --no-cache-dir # Get site-packages path and install transformer_engine and megatron_core echo "" @@ -83,12 +83,12 @@ MAX_JOBS=8 \ FLASH_ATTENTION_FORCE_BUILD=TRUE \ pip install flash-attn --no-build-isolation --no-cache-dir -pip install flash-linear-attention -U +pip install flash-linear-attention -U --no-cache-dir # Install numpy echo "" echo "Installing numpy==2.2 and deep_gemm..." 
-pip install numpy==2.2 +pip install numpy==2.2 --no-cache-dir # Verify installation echo "" From 80e9e581b0a9f90a10be7ab7840c238d7601731f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= Date: Wed, 8 Apr 2026 23:19:29 +0800 Subject: [PATCH 14/17] fix deps version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f3880b2f..ce392b26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ dependencies = [ "datasets>=3.0,<4.0", "omegaconf>=2.3.0,<3.0.0", "fastapi", - "modelscope[framework]>=1.35.0", + "modelscope[framework]>=1.34.0", "safetensors", "peft>=0.11.0,<=0.19.0", "transformers", From e3f37c986f02a73a0215d205e51d022ee08e4019 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= Date: Wed, 8 Apr 2026 23:21:24 +0800 Subject: [PATCH 15/17] fix deps version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ce392b26..a0cf3908 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ authors = [{ name = "ModelScope", email = "contact@modelscope.cn" }] requires-python = ">=3.11,<3.13" dependencies = [ "numpy>=2.0.0,<2.3.0", - "datasets>=3.0,<4.0", + "datasets", "omegaconf>=2.3.0,<3.0.0", "fastapi", "modelscope[framework]>=1.34.0", From 9342d745518e3d5384a264c041b60fe6f374ef07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= Date: Wed, 8 Apr 2026 23:38:38 +0800 Subject: [PATCH 16/17] update dockerfile --- Dockerfile | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index a3aa4ff6..eeb03f8e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,8 +16,36 @@ RUN echo "Available release branches:" && git branch -r -l 'origin/release/*' -- echo "Checking out: $LATEST_RELEASE" && \ git checkout --track "$LATEST_RELEASE" -RUN sh INSTALL_MEGATRON.sh +ENV SETUPTOOLS_USE_DISTUTILS=local +# Install base packages +RUN pip 
install --upgrade peft accelerate transformers "modelscope[framework]" --no-cache-dir + +# Install vllm +RUN pip install --upgrade vllm --no-cache-dir + +# Install transformer_engine and megatron_core +RUN SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])") && \ + CUDNN_PATH=$SITE_PACKAGES/nvidia/cudnn \ + CPLUS_INCLUDE_PATH=$SITE_PACKAGES/nvidia/cudnn/include \ + pip install --no-build-isolation "transformer_engine[pytorch]" --no-cache-dir + +RUN pip install megatron_core mcore_bridge --no-cache-dir + +# Install flash-attention (default arch 8.0;9.0, override via build-arg if needed) +ARG TORCH_CUDA_ARCH_LIST="8.0;9.0" +RUN TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}" \ + MAX_JOBS=8 \ + FLASH_ATTENTION_FORCE_BUILD=TRUE \ + pip install flash-attn --no-build-isolation --no-cache-dir + +RUN pip install flash-linear-attention -U --no-cache-dir + +# Install numpy +RUN pip install numpy==2.2 --no-cache-dir + +# Install tinker, ray, and other deps RUN pip install --no-cache-dir tinker==0.14.0 "ray[serve]" transformers peft accelerate -U +# Install twinkle itself RUN pip install -e . 
--no-build-isolation From d1c2d8e04818ad7a860d073118db6337f9dbff99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= Date: Thu, 9 Apr 2026 17:10:56 +0800 Subject: [PATCH 17/17] fix docs --- README.md | 4 ++-- README_ZH.md | 4 ++-- .../tinker/modelscope/self_cognition.py | 2 +- .../tinker/modelscope/short_math_grpo.py | 2 +- .../client/tinker/self_host/self_cognition.py | 2 +- .../tinker/self_host/short_math_grpo.py | 2 +- .../twinkle/modelscope/self_congnition.py | 4 ++-- cookbook/client/twinkle/self_host/grpo.py | 6 ++--- cookbook/client/twinkle/self_host/sample.py | 2 +- .../twinkle/self_host/self_congnition.py | 4 ++-- cookbook/megatron/tp_moe.py | 4 ++-- cookbook/ray/single_controller.py | 4 ++-- cookbook/rl/gkd_on_policy.py | 6 +++++ cookbook/rl/grpo.py | 6 +++++ cookbook/rl/grpo_mm.py | 11 +++++----- cookbook/rl/short_math_grpo.py | 13 +++++------ cookbook/transformers/ep_fsdp_qwen3_moe.py | 4 ++-- cookbook/transformers/fsdp2_moe.py | 4 ++-- cookbook/transformers/sp_fsdp_dense.py | 2 +- .../Checkpoint Engine/CheckpointEngine.md | 3 +++ docs/source_en/Components/Dataset/Dataset.md | 2 +- .../source_en/Components/Template/Template.md | 7 ++++++ .../Usage Guide/Introduction-with-Qwen3.5.md | 16 +++++++------- docs/source_en/Usage Guide/Quick-Start.md | 22 +++++++++---------- .../Usage Guide/Server and Client/Server.md | 14 ++++++------ .../Tinker-Compatible-Client.md | 9 ++++---- .../Server and Client/Twinkle-Client.md | 6 ++--- .../Usage Guide/Train-as-a-Service.md | 2 +- ...00\344\275\263\345\256\236\350\267\265.md" | 16 +++++++------- ...53\351\200\237\345\274\200\345\247\213.md" | 22 +++++++++---------- ...71\345\256\242\346\210\267\347\253\257.md" | 9 ++++---- ...le\345\256\242\346\210\267\347\253\257.md" | 6 ++--- .../\346\234\215\345\212\241\347\253\257.md" | 14 ++++++------ ...55\347\273\203\346\234\215\345\212\241.md" | 2 +- .../Dataset.md" | 2 +- .../CheckpointEngine.md" | 2 ++ .../\346\250\241\346\235\277/Template.md" | 7 ++++++ 
src/twinkle/server/model/app.py | 2 +- src/twinkle/server/sampler/app.py | 2 +- 39 files changed, 142 insertions(+), 109 deletions(-) diff --git a/README.md b/README.md index 35799b85..8c958f41 100644 --- a/README.md +++ b/README.md @@ -184,7 +184,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id=base_model) + dataset.set_template('Qwen3_5Template', model_id=base_model) # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) # Encode dataset @@ -242,7 +242,7 @@ api_key='your-api-key' # Use twinkle dataset to load the data dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) -dataset.set_template('Template', model_id=base_model, max_length=256) +dataset.set_template('Qwen3_5Template', model_id=base_model, max_length=256) dataset.map(SelfCognitionProcessor('twinkle Model', 'ModelScope Team'), load_from_cache_file=False) dataset.encode(batched=True, load_from_cache_file=False) dataloader = DataLoader(dataset=dataset, batch_size=8) diff --git a/README_ZH.md b/README_ZH.md index 352ebde0..6ae4f742 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -166,7 +166,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id=base_model) + dataset.set_template('Qwen3_5Template', model_id=base_model) # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) # Encode dataset @@ -224,7 +224,7 @@ api_key='your-api-key' # Use twinkle dataset to load the data dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) -dataset.set_template('Template', model_id=base_model, 
max_length=256) +dataset.set_template('Qwen3_5Template', model_id=base_model, max_length=256) dataset.map(SelfCognitionProcessor('twinkle Model', 'ModelScope Team'), load_from_cache_file=False) dataset.encode(batched=True, load_from_cache_file=False) dataloader = DataLoader(dataset=dataset, batch_size=8) diff --git a/cookbook/client/tinker/modelscope/self_cognition.py b/cookbook/client/tinker/modelscope/self_cognition.py index 9f02ee40..2347c7fc 100644 --- a/cookbook/client/tinker/modelscope/self_cognition.py +++ b/cookbook/client/tinker/modelscope/self_cognition.py @@ -34,7 +34,7 @@ def train(): dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) # Apply the chat template matching the base model (max 256 tokens per sample) - dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256) + dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256) # Replace placeholder names with custom model/author identity dataset.map(SelfCognitionProcessor('twinkle模型', 'twinkle团队'), load_from_cache_file=False) diff --git a/cookbook/client/tinker/modelscope/short_math_grpo.py b/cookbook/client/tinker/modelscope/short_math_grpo.py index 6796b517..47a7d24a 100644 --- a/cookbook/client/tinker/modelscope/short_math_grpo.py +++ b/cookbook/client/tinker/modelscope/short_math_grpo.py @@ -182,7 +182,7 @@ def create_math_dataset(): data_slice=range(DATA_NUM), ) dataset = Dataset(meta) - dataset.set_template('Template', model_id=BASE_MODEL, max_length=4096, truncation_strategy='delete') + dataset.set_template('Qwen3_5Template', model_id=BASE_MODEL, max_length=4096, truncation_strategy='delete') dataset.map(MathPreprocessor()) dataset.filter(lambda row: bool(row['messages'])) dataset.encode(add_generation_prompt=True) diff --git a/cookbook/client/tinker/self_host/self_cognition.py b/cookbook/client/tinker/self_host/self_cognition.py index 6951760d..691662e6 100644 --- 
a/cookbook/client/tinker/self_host/self_cognition.py +++ b/cookbook/client/tinker/self_host/self_cognition.py @@ -36,7 +36,7 @@ def train(): dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) # Apply the chat template matching the base model (max 256 tokens per sample) - dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256) + dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256) # Replace placeholder names with custom model/author identity dataset.map(SelfCognitionProcessor('twinkle模型', 'twinkle团队'), load_from_cache_file=False) diff --git a/cookbook/client/tinker/self_host/short_math_grpo.py b/cookbook/client/tinker/self_host/short_math_grpo.py index 35b4d96d..f6fe8b45 100644 --- a/cookbook/client/tinker/self_host/short_math_grpo.py +++ b/cookbook/client/tinker/self_host/short_math_grpo.py @@ -182,7 +182,7 @@ def create_math_dataset(): data_slice=range(DATA_NUM), ) dataset = Dataset(meta) - dataset.set_template('Template', model_id=BASE_MODEL, max_length=4096, truncation_strategy='delete') + dataset.set_template('Qwen3_5Template', model_id=BASE_MODEL, max_length=4096, truncation_strategy='delete') dataset.map(MathPreprocessor()) dataset.filter(lambda row: bool(row['messages'])) dataset.encode(add_generation_prompt=True) diff --git a/cookbook/client/twinkle/modelscope/self_congnition.py b/cookbook/client/twinkle/modelscope/self_congnition.py index aafc5d14..81c5ab4d 100644 --- a/cookbook/client/twinkle/modelscope/self_congnition.py +++ b/cookbook/client/twinkle/modelscope/self_congnition.py @@ -52,7 +52,7 @@ def train(): dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) # Apply a chat template so the data matches the model's expected input format - dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=512) + dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=512) # 
Replace placeholder names in the dataset with custom model/author names dataset.map('SelfCognitionProcessor', init_args={'model_name': 'twinkle模型', 'model_author': 'ModelScope社区'}) @@ -77,7 +77,7 @@ def train(): model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) # Set the same chat template used during data preprocessing - model.set_template('Template') + model.set_template('Qwen3_5Template') # Set the input processor (pads sequences on the right side) model.set_processor('InputProcessor', padding_side='right') diff --git a/cookbook/client/twinkle/self_host/grpo.py b/cookbook/client/twinkle/self_host/grpo.py index cabce6ea..d87bfa77 100644 --- a/cookbook/client/twinkle/self_host/grpo.py +++ b/cookbook/client/twinkle/self_host/grpo.py @@ -55,7 +55,7 @@ def create_gsm8k_dataset(): dataset = Dataset(DatasetMeta('ms://modelscope/gsm8k', subset_name='main', split='train')) - dataset.set_template('Template', model_id=MODEL_ID, max_length=2048) + dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=2048) dataset.map('GSM8KProcessor') dataset.encode(add_generation_prompt=True) return dataset @@ -112,11 +112,11 @@ def train(): # Set processor and template for encoding inputs model.set_processor('InputProcessor') - model.set_template('Template', model_id=MODEL_ID) + model.set_template('Qwen3_5Template', model_id=MODEL_ID) # Step 4: Configure the sampler sampler = vLLMSampler(model_id=MODEL_ID) - sampler.set_template('Template', model_id=MODEL_ID) + sampler.set_template('Qwen3_5Template', model_id=MODEL_ID) # Step 5: Setup metrics and advantage function advantage_fn = GRPOAdvantage() diff --git a/cookbook/client/twinkle/self_host/sample.py b/cookbook/client/twinkle/self_host/sample.py index 3b02c4ec..f7925d4f 100644 --- a/cookbook/client/twinkle/self_host/sample.py +++ b/cookbook/client/twinkle/self_host/sample.py @@ -42,7 +42,7 @@ def sample(): sampler = vLLMSampler(model_id=MODEL_ID) # Step 4: Set the chat template so the 
sampler can encode Trajectory inputs - sampler.set_template('Template', model_id=MODEL_ID) + sampler.set_template('Qwen3_5Template', model_id=MODEL_ID) # Step 5: Prepare inputs as Trajectory dicts (messages format) # Each trajectory is a conversation with system and user messages diff --git a/cookbook/client/twinkle/self_host/self_congnition.py b/cookbook/client/twinkle/self_host/self_congnition.py index e31daaba..f382956f 100644 --- a/cookbook/client/twinkle/self_host/self_congnition.py +++ b/cookbook/client/twinkle/self_host/self_congnition.py @@ -59,7 +59,7 @@ def train(): dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) # Apply a chat template so the data matches the model's expected input format - dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=512) + dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=512) # Replace placeholder names in the dataset with custom model/author names dataset.map('SelfCognitionProcessor', init_args={'model_name': 'twinkle模型', 'model_author': 'ModelScope社区'}) @@ -84,7 +84,7 @@ def train(): model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) # Set the same chat template used during data preprocessing - model.set_template('Template') + model.set_template('Qwen3_5Template') # Set the input processor (pads sequences on the right side) model.set_processor('InputProcessor', padding_side='right') diff --git a/cookbook/megatron/tp_moe.py b/cookbook/megatron/tp_moe.py index 364ac686..b66b109f 100644 --- a/cookbook/megatron/tp_moe.py +++ b/cookbook/megatron/tp_moe.py @@ -20,7 +20,7 @@ def eval(model): # 100 Samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-35B-A3B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-35B-A3B') dataset.map(SelfCognitionProcessor('twinkle大模型', 
'ModelScope社区')) dataset.encode() dataloader = DataLoader(dataset=dataset, batch_size=16) @@ -34,7 +34,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-35B-A3B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-35B-A3B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset diff --git a/cookbook/ray/single_controller.py b/cookbook/ray/single_controller.py index 39d99353..edb8d8e6 100644 --- a/cookbook/ray/single_controller.py +++ b/cookbook/ray/single_controller.py @@ -26,7 +26,7 @@ def eval(model): # 100 Samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-35B-A3B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-35B-A3B') dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) dataset.encode() dataloader = DataLoader(dataset=dataset, batch_size=8, min_batch_size=8) @@ -41,7 +41,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset diff --git a/cookbook/rl/gkd_on_policy.py b/cookbook/rl/gkd_on_policy.py index f134f0de..f30df2ea 100644 --- a/cookbook/rl/gkd_on_policy.py +++ b/cookbook/rl/gkd_on_policy.py @@ -173,6 +173,9 @@ def main(): # ── Student vLLM sampler (for on-policy generation) ──────────────────────── student_sampler = vLLMSampler( model_id=STUDENT_MODEL_ID, + # 
enable_lora=True used with ckpt_manager.sync_weights(merge_and_sync=False) + # meaning only sync lora weights, if merge_and_sync=True, + # lora will be merged into the base model and sync all weights to vLLM engine_args={'gpu_memory_utilization': 0.85, 'max_model_len': 4096, 'enable_lora': True, 'max_loras': 1}, device_mesh=sampler_mesh, remote_group='student_sampler', @@ -210,6 +213,9 @@ def main(): break # 1. Sync student model weights to student sampler + # enable_lora=True used with ckpt_manager.sync_weights(merge_and_sync=False) + # meaning only sync lora weights, if merge_and_sync=True, + # lora will be merged into the base model and sync all weights to vLLM ckpt_manager.sync_weights(merge_and_sync=False) student_sampler.reset_prefix_cache() diff --git a/cookbook/rl/grpo.py b/cookbook/rl/grpo.py index bc864309..30d5d898 100644 --- a/cookbook/rl/grpo.py +++ b/cookbook/rl/grpo.py @@ -103,6 +103,9 @@ def main(): 'max_model_len': 4496, 'max_lora_rank': 32, # save as lora_config # NOTE: To use enable_lora with qwen3.5, ensure vLLM includes PR https://github.com/vllm-project/vllm/pull/36976 + # enable_lora=True used with ckpt_manager.sync_weights(merge_and_sync=False) + # meaning only sync lora weights, if merge_and_sync=True, + # lora will be merged into the base model and sync all weights to vLLM 'enable_lora': True, }, device_mesh=sampler_mesh, @@ -133,6 +136,9 @@ def main(): break metrics.reset() global_prompts = batch if isinstance(batch, list) else [batch] + # enable_lora=True used with ckpt_manager.sync_weights(merge_and_sync=False) + # meaning only sync lora weights, if merge_and_sync=True, + # lora will be merged into the base model and sync all weights to vLLM ckpt_manager.sync_weights(merge_and_sync=False) sampler.reset_prefix_cache() sample_responses = sampler.sample( diff --git a/cookbook/rl/grpo_mm.py b/cookbook/rl/grpo_mm.py index d6f934d5..0705febb 100644 --- a/cookbook/rl/grpo_mm.py +++ b/cookbook/rl/grpo_mm.py @@ -27,10 +27,6 @@ ) from 
twinkle.sampler import vLLMSampler -import swanlab -swanlab.init( - project='twinkle', -) logger = get_logger() # Model configuration @@ -184,6 +180,9 @@ def main(): 'gpu_memory_utilization': 0.8, 'max_model_len': 32000, 'max_lora_rank': 32, + # enable_lora=True used with ckpt_manager.sync_weights(merge_and_sync=False) + # meaning only sync lora weights, if merge_and_sync=True, + # lora will be merged into the base model and sync all weights to vLLM 'enable_lora': True, 'limit_mm_per_prompt': {'image': 9}, # OlympiadBench has up to 9 images }, @@ -221,6 +220,9 @@ def main(): metrics.reset() # Sync weights to sampler + # enable_lora=True used with ckpt_manager.sync_weights(merge_and_sync=False) + # meaning only sync lora weights, if merge_and_sync=True, + # lora will be merged into the base model and sync all weights to vLLM ckpt_manager.sync_weights(merge_and_sync=False) sampler.reset_prefix_cache() @@ -282,7 +284,6 @@ def main(): log_dict.update(model.calculate_metric(is_training=True, adapter_name=ADAPTER_NAME)) metrics.reset() logger.info(f'[Step {optim_step}/{MAX_STEPS}] {log_dict}') - swanlab.log(log_dict) logger.info(f'Training completed. optim_steps={optim_step}') model.save('olympiad-grpo-mixed-final', adapter_name=ADAPTER_NAME) diff --git a/cookbook/rl/short_math_grpo.py b/cookbook/rl/short_math_grpo.py index 55939cbd..8f498923 100644 --- a/cookbook/rl/short_math_grpo.py +++ b/cookbook/rl/short_math_grpo.py @@ -50,12 +50,6 @@ SYSTEM_PROMPT = ('You are a helpful math assistant. Solve the problem with minimal but correct reasoning ' 'and put your final answer within \\boxed{}.') -import swanlab -swanlab.init( - project='twinkle', -) - - # ========== Reward Functions ========== class GSM8KBrevityReward(Reward): """Brevity reward: rewards shorter completions that contain a valid answer. 
@@ -167,6 +161,9 @@ def main(): 'max_model_len': 8192, 'max_lora_rank': 32, # save as lora_config # NOTE: To use enable_lora with qwen3.5, ensure vLLM includes PR https://github.com/vllm-project/vllm/pull/36976 + # enable_lora=True used with ckpt_manager.sync_weights(merge_and_sync=False) + # meaning only sync lora weights, if merge_and_sync=True, + # lora will be merged into the base model and sync all weights to vLLM 'enable_lora': True, }, device_mesh=sampler_mesh, @@ -202,6 +199,9 @@ def main(): for prompt in batch: expand_prompts.extend([prompt] * NUM_GENERATIONS) + # enable_lora=True used with ckpt_manager.sync_weights(merge_and_sync=False) + # meaning only sync lora weights, if merge_and_sync=True, + # lora will be merged into the base model and sync all weights to vLLM ckpt_manager.sync_weights(merge_and_sync=False) sampler.reset_prefix_cache() @@ -256,7 +256,6 @@ def main(): log_dict = metrics.calculate() log_dict.update(model.calculate_metric(is_training=True)) - swanlab.log(log_dict) metrics.reset() logger.info(f'[Step {optim_step}/{MAX_STEPS}] {log_dict}') diff --git a/cookbook/transformers/ep_fsdp_qwen3_moe.py b/cookbook/transformers/ep_fsdp_qwen3_moe.py index 3c02b218..11855fae 100644 --- a/cookbook/transformers/ep_fsdp_qwen3_moe.py +++ b/cookbook/transformers/ep_fsdp_qwen3_moe.py @@ -13,7 +13,7 @@ MODEL_ID = os.environ.get('QWEN3_MODEL_ID', 'ms://Qwen/Qwen3.5-4B') DATASET_ID = os.environ.get('DATASET_ID', 'ms://swift/self-cognition') -TEMPLATE_ID = os.environ.get('TEMPLATE_ID', 'Template') +TEMPLATE_ID = os.environ.get('TEMPLATE_ID', 'Qwen3_5Template') _num_layers_env = os.environ.get('NUM_LAYERS') NUM_LAYERS = int(_num_layers_env) if _num_layers_env is not None else None BATCH_SIZE = int(os.environ.get('BATCH_SIZE', '4')) @@ -47,7 +47,7 @@ def train(): try: dataset.set_template(TEMPLATE_ID, model_id=MODEL_ID) except ValueError: - dataset.set_template('Template', model_id=MODEL_ID) + dataset.set_template('Qwen3_5Template', model_id=MODEL_ID) 
dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) dataset.encode(batched=True) diff --git a/cookbook/transformers/fsdp2_moe.py b/cookbook/transformers/fsdp2_moe.py index 2a92794a..23a53f4a 100644 --- a/cookbook/transformers/fsdp2_moe.py +++ b/cookbook/transformers/fsdp2_moe.py @@ -20,7 +20,7 @@ def eval(model): # 100 Samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) dataset.encode() dataloader = DataLoader(dataset=dataset, batch_size=4) @@ -35,7 +35,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset diff --git a/cookbook/transformers/sp_fsdp_dense.py b/cookbook/transformers/sp_fsdp_dense.py index 868b61c0..280ed6f1 100644 --- a/cookbook/transformers/sp_fsdp_dense.py +++ b/cookbook/transformers/sp_fsdp_dense.py @@ -49,7 +49,7 @@ def eval(model): def create_dataset(data_slice=None): dataset = Dataset(dataset_meta=DatasetMeta(DATASETS, data_slice=range(500))) - dataset.set_template('Template', model_id=MODEL_ID) + dataset.set_template('Qwen3_5Template', model_id=MODEL_ID) dataset.map(SelfCognitionProcessor('twinkle模型', 'twinkle团队')) dataset.encode(batched=True) return dataset diff --git a/docs/source_en/Components/Checkpoint Engine/CheckpointEngine.md b/docs/source_en/Components/Checkpoint Engine/CheckpointEngine.md index f72bec83..1a7c39bf 100644 --- a/docs/source_en/Components/Checkpoint Engine/CheckpointEngine.md 
+++ b/docs/source_en/Components/Checkpoint Engine/CheckpointEngine.md @@ -67,3 +67,6 @@ See: [HCCLCheckpointEngine](HCCLCheckpointEngine.md) - **HCCLCheckpointEngine**: Suitable for Ascend NPU environments > Checkpoint engine is a key component of RLHF training infrastructure, ensuring that trainers and samplers use consistent model weights. +> Currently, synchronization is divided into two cases based on merge_and_sync=True/False. When set to True, the LoRA is merged into the base model and then synchronized. +> When set to False, only the LoRA weights are synchronized. Additionally, for multi-tenant scenarios, LoRA files are directly attached to vLLM. +> When merge_and_sync=False or in multi-tenant mode, vLLM's startup parameter enable_lora=True needs to be enabled. When merge_and_sync=True or using full parameters, this value should be set to False. diff --git a/docs/source_en/Components/Dataset/Dataset.md b/docs/source_en/Components/Dataset/Dataset.md index 01ce46cf..3fb86119 100644 --- a/docs/source_en/Components/Dataset/Dataset.md +++ b/docs/source_en/Components/Dataset/Dataset.md @@ -66,7 +66,7 @@ If using a local path or a local file, please follow these instructions: The Template component is responsible for converting string/image multimodal raw data into model input tokens. The dataset can set a Template to complete the `encode` process. ```python -dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) +dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) ``` The set_template method supports passing `kwargs` (such as `max_length` in the example) to be used as constructor parameters for `Template`. 
diff --git a/docs/source_en/Components/Template/Template.md b/docs/source_en/Components/Template/Template.md index 4bd52722..32709361 100644 --- a/docs/source_en/Components/Template/Template.md +++ b/docs/source_en/Components/Template/Template.md @@ -50,3 +50,10 @@ class Template: > Template does not support using functions as replacements because it needs to support many functions internally. If you need to write a new Template, please inherit the `Template` class. > Generally speaking, using the Template base class is sufficient for pure text models. In the base class, we use tokenizer.apply_chat_template to encode the model, which is universal for general pure text models. + +# Template mapping + +Currently, the model-template mapping is simple: + +- Template class:Supported in all pure text LLMs. +- Qwen3_5Template class: For Qwen3.5 MLLMs. diff --git a/docs/source_en/Usage Guide/Introduction-with-Qwen3.5.md b/docs/source_en/Usage Guide/Introduction-with-Qwen3.5.md index d1eba8cc..c5856fdc 100644 --- a/docs/source_en/Usage Guide/Introduction-with-Qwen3.5.md +++ b/docs/source_en/Usage Guide/Introduction-with-Qwen3.5.md @@ -48,7 +48,7 @@ logger = get_logger() def eval(model): # Validation set: 100 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) dataset.encode() dataloader = DataLoader(dataset=dataset, batch_size=8) @@ -63,7 +63,7 @@ def train(): # Training set: 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess: replace placeholders in 
self-cognition data dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) # Encode dataset @@ -188,7 +188,7 @@ ADAPTER_NAME = 'default' def create_gsm8k_dataset(): dataset = Dataset(DatasetMeta('ms://modelscope/gsm8k', subset_name='main', split='train')) - dataset.set_template('Template', model_id=MODEL_ID, max_length=2048) + dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=2048) dataset.map(GSM8KProcessor()) dataset.encode(add_generation_prompt=True) return dataset @@ -222,7 +222,7 @@ def main(): model.set_lr_scheduler('CosineAnnealingLR', T_max=MAX_STEPS, eta_min=0) model.set_loss('GRPOLoss', epsilon=0.2) model.set_processor(InputProcessor) - model.set_template('Template', model_id=MODEL_ID) + model.set_template('Qwen3_5Template', model_id=MODEL_ID) # Sampler deployed in the 'sampler' group sampler = vLLMSampler( @@ -236,7 +236,7 @@ def main(): device_mesh=sampler_mesh, remote_group='sampler', ) - sampler.set_template(Template, model_id=MODEL_ID) + sampler.set_template('Qwen3_5Template', model_id=MODEL_ID) ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler) @@ -393,7 +393,7 @@ for run in runs: def train(): # Prepare dataset dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) dataset.map('SelfCognitionProcessor', init_args={'model_name': 'twinkle model', 'model_author': 'ModelScope Community'}) dataset.encode(batched=True) dataloader = DataLoader(dataset=dataset, batch_size=4) @@ -403,7 +403,7 @@ def train(): lora_config = LoraConfig(target_modules='all-linear') model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) - model.set_template('Template') + model.set_template('Qwen3_5Template') model.set_processor('InputProcessor', padding_side='right') 
model.set_loss('CrossEntropyLoss') model.set_optimizer('AdamW', lr=1e-4) @@ -473,7 +473,7 @@ base_url = 'http://www.modelscope.cn/twinkle' def train(): # Prepare dataset dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) - dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256) + dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256) dataset.map(SelfCognitionProcessor('Twinkle Model', 'ModelScope Team'), load_from_cache_file=False) dataset.encode(batched=True, load_from_cache_file=False) dataloader = DataLoader(dataset=dataset, batch_size=8) diff --git a/docs/source_en/Usage Guide/Quick-Start.md b/docs/source_en/Usage Guide/Quick-Start.md index 2a2f0f31..4ffa9c86 100644 --- a/docs/source_en/Usage Guide/Quick-Start.md +++ b/docs/source_en/Usage Guide/Quick-Start.md @@ -70,7 +70,7 @@ def train(): dataset = PackingDataset(dataset_meta) dataset.map(SelfCognitionProcessor(model_name='Twinkle Model', model_author='ModelScope Community')) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) dataset.encode() dataset.pack_dataset() @@ -114,7 +114,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) # Encode dataset @@ -182,7 +182,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + 
dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) # Encode dataset @@ -271,7 +271,7 @@ ADAPTER_NAME = 'default' def create_gsm8k_dataset(): dataset = Dataset(DatasetMeta('ms://modelscope/gsm8k', subset_name='main', split='train')) - dataset.set_template('Template', model_id=MODEL_ID, max_length=2048) + dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=2048) dataset.map(GSM8KProcessor()) dataset.encode(add_generation_prompt=True) return dataset @@ -303,7 +303,7 @@ def main(): model.set_lr_scheduler('default', lr_decay_steps=MAX_STEPS, max_lr=LEARNING_RATE) model.set_loss('GRPOLoss', epsilon=0.2) model.set_processor(InputProcessor) - model.set_template('Template', model_id=MODEL_ID) + model.set_template('Qwen3_5Template', model_id=MODEL_ID) sampler = vLLMSampler( model_id=MODEL_ID, @@ -316,7 +316,7 @@ def main(): device_mesh=sampler_mesh, remote_group='sampler', ) - sampler.set_template(Template, model_id=MODEL_ID) + sampler.set_template('Qwen3_5Template', model_id=MODEL_ID) ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler) dataloader = DataLoader( dataset=create_gsm8k_dataset, @@ -476,7 +476,7 @@ def create_countdown_dataset(): """Create Countdown Game dataset for GRPO training.""" dataset = Dataset(dataset_meta=DatasetMeta('ms://zouxuhong/Countdown-Tasks-3to4', data_slice=range(500))) - dataset.set_template('Template', model_id=MODEL_ID, max_length=8192) + dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=8192) dataset.map('CountdownProcessor') dataset.encode(add_generation_prompt=True, batched=True) return dataset @@ -570,11 +570,11 @@ def train(): # Set processor and template for encoding inputs model.set_processor('InputProcessor') - model.set_template('Template', model_id=MODEL_ID) + model.set_template('Qwen3_5Template', model_id=MODEL_ID) # Step 4: Configure the 
sampler sampler = vLLMSampler(model_id=MODEL_ID) - sampler.set_template('Template', model_id=MODEL_ID) + sampler.set_template('Qwen3_5Template', model_id=MODEL_ID) # Step 5: Setup metrics and advantage function advantage_fn = GRPOAdvantage() @@ -712,7 +712,7 @@ def train(): dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) # Apply the chat template matching the base model (max 256 tokens per sample) - dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256) + dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256) # Replace placeholder names with custom model/author identity dataset.map(SelfCognitionProcessor('twinkle model', 'twinkle team'), load_from_cache_file=False) @@ -925,7 +925,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle LLM', 'ModelScope Community')) # Encode dataset diff --git a/docs/source_en/Usage Guide/Server and Client/Server.md b/docs/source_en/Usage Guide/Server and Client/Server.md index 141a730d..ff0918da 100644 --- a/docs/source_en/Usage Guide/Server and Client/Server.md +++ b/docs/source_en/Usage Guide/Server and Client/Server.md @@ -67,8 +67,8 @@ In the YAML configuration file, **each component needs to occupy a separate Node ```yaml applications: # Model service occupies GPU 0-3 (physical card numbers) - - name: models-Qwen2.5-7B-Instruct - route_prefix: /models/Qwen/Qwen2.5-7B-Instruct + - name: models-Qwen3.5-4B + route_prefix: /models/Qwen/Qwen3.5-4B import_path: model args: nproc_per_node: 4 @@ -84,8 +84,8 @@ applications: # ep_size: 1 # Expert parallel size (optional) # Sampler service occupies GPU 4-5 
(physical card numbers) - - name: sampler-Qwen2.5-7B-Instruct - route_prefix: /sampler/Qwen/Qwen2.5-7B-Instruct + - name: sampler-Qwen3.5-4B + route_prefix: /sampler/Qwen/Qwen3.5-4B import_path: sampler args: nproc_per_node: 2 @@ -291,12 +291,12 @@ applications: The difference from the Megatron backend is only in the `use_megatron` parameter of the Model service: ```yaml - - name: models-Qwen2.5-7B-Instruct - route_prefix: /api/v1/model/Qwen/Qwen2.5-7B-Instruct + - name: models-Qwen3.5-4B + route_prefix: /api/v1/model/Qwen/Qwen3.5-4B import_path: model args: use_megatron: false # Use Transformers backend - model_id: "ms://Qwen/Qwen2.5-7B-Instruct" + model_id: "ms://Qwen/Qwen3.5-4B" nproc_per_node: 2 device_group: name: model diff --git a/docs/source_en/Usage Guide/Server and Client/Tinker-Compatible-Client.md b/docs/source_en/Usage Guide/Server and Client/Tinker-Compatible-Client.md index e44f3cea..77738bb7 100644 --- a/docs/source_en/Usage Guide/Server and Client/Tinker-Compatible-Client.md +++ b/docs/source_en/Usage Guide/Server and Client/Tinker-Compatible-Client.md @@ -58,7 +58,7 @@ response = rest_client.list_training_runs(limit=50).result() print(f"Found {len(response.training_runs)} training runs") # Step 3: Create training client -base_model = "Qwen/Qwen2.5-0.5B-Instruct" +base_model = "Qwen/Qwen3-4B" # Create new training session training_client = service_client.create_lora_training_client( @@ -137,6 +137,7 @@ for epoch in range(2): Tinker compatible mode can also leverage Twinkle's dataset components to simplify data preparation instead of manually constructing `Datum`: ```python +import os from tqdm import tqdm from tinker import types from twinkle import init_tinker_client @@ -150,11 +151,11 @@ init_tinker_client() from tinker import ServiceClient -base_model = "Qwen/Qwen2.5-0.5B-Instruct" +base_model = "Qwen/Qwen3.5-4B" # Use Twinkle's Dataset component to load and preprocess data dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', 
data_slice=range(500))) -dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256) +dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256) dataset.map(SelfCognitionProcessor('twinkle model', 'ModelScope Team'), load_from_cache_file=False) dataset.encode(batched=True, load_from_cache_file=False) dataloader = DataLoader(dataset=dataset, batch_size=8) @@ -223,7 +224,7 @@ init_tinker_client() from tinker import ServiceClient -base_model = "Qwen/Qwen2.5-0.5B-Instruct" +base_model = "Qwen/Qwen3.5-4B" service_client = ServiceClient( base_url='http://localhost:8000', diff --git a/docs/source_en/Usage Guide/Server and Client/Twinkle-Client.md b/docs/source_en/Usage Guide/Server and Client/Twinkle-Client.md index 66d98eec..85980986 100644 --- a/docs/source_en/Usage Guide/Server and Client/Twinkle-Client.md +++ b/docs/source_en/Usage Guide/Server and Client/Twinkle-Client.md @@ -93,7 +93,7 @@ for run in runs: dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition')) # Set chat template to match model's input format -dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-7B-Instruct', max_length=512) +dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) # Data preprocessing: Replace placeholders with custom names dataset.map('SelfCognitionProcessor', @@ -106,14 +106,14 @@ dataset.encode(batched=True) dataloader = DataLoader(dataset=dataset, batch_size=8) # Step 4: Configure model -model = MultiLoraTransformersModel(model_id='ms://Qwen/Qwen2.5-7B-Instruct') +model = MultiLoraTransformersModel(model_id='ms://Qwen/Qwen3.5-4B') # Configure LoRA lora_config = LoraConfig(target_modules='all-linear') model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) # Set template, processor, loss function -model.set_template('Template') +model.set_template('Qwen3_5Template') model.set_processor('InputProcessor', padding_side='right') 
model.set_loss('CrossEntropyLoss') diff --git a/docs/source_en/Usage Guide/Train-as-a-Service.md b/docs/source_en/Usage Guide/Train-as-a-Service.md index 38e46858..692ef3f4 100644 --- a/docs/source_en/Usage Guide/Train-as-a-Service.md +++ b/docs/source_en/Usage Guide/Train-as-a-Service.md @@ -36,7 +36,7 @@ api_key=os.environ.get('MODELSCOPE_TOKEN') # Use twinkle dataset to load the data dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) -dataset.set_template('Template', model_id=base_model, max_length=256) +dataset.set_template('Qwen3_5Template', model_id=base_model, max_length=256) dataset.map(SelfCognitionProcessor('Twinkle Model', 'ModelScope Team'), load_from_cache_file=False) dataset.encode(batched=True, load_from_cache_file=False) dataloader = DataLoader(dataset=dataset, batch_size=8) diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/Qwen3.5\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/Qwen3.5\346\234\200\344\275\263\345\256\236\350\267\265.md" index ad78e28d..bd29a651 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/Qwen3.5\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/Qwen3.5\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -48,7 +48,7 @@ logger = get_logger() def eval(model): # 验证集:100 条样本 dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) dataset.encode() dataloader = DataLoader(dataset=dataset, batch_size=8) @@ -63,7 +63,7 @@ def train(): # 训练集:1000 条样本 dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # 
设置模板,准备编码 - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') # 数据预处理:替换自我认知数据中的占位符 dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # 编码数据集 @@ -188,7 +188,7 @@ ADAPTER_NAME = 'default' def create_gsm8k_dataset(): dataset = Dataset(DatasetMeta('ms://modelscope/gsm8k', subset_name='main', split='train')) - dataset.set_template('Template', model_id=MODEL_ID, max_length=2048) + dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=2048) dataset.map(GSM8KProcessor()) dataset.encode(add_generation_prompt=True) return dataset @@ -222,7 +222,7 @@ def main(): model.set_lr_scheduler('CosineAnnealingLR', T_max=MAX_STEPS, eta_min=0) model.set_loss('GRPOLoss', epsilon=0.2) model.set_processor(InputProcessor) - model.set_template('Template', model_id=MODEL_ID) + model.set_template('Qwen3_5Template', model_id=MODEL_ID) # 采样器部署在 'sampler' 组 sampler = vLLMSampler( @@ -236,7 +236,7 @@ def main(): device_mesh=sampler_mesh, remote_group='sampler', ) - sampler.set_template(Template, model_id=MODEL_ID) + sampler.set_template('Qwen3_5Template', model_id=MODEL_ID) ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler) @@ -393,7 +393,7 @@ for run in runs: def train(): # 准备数据集 dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) dataset.map('SelfCognitionProcessor', init_args={'model_name': 'twinkle模型', 'model_author': 'ModelScope社区'}) dataset.encode(batched=True) dataloader = DataLoader(dataset=dataset, batch_size=4) @@ -403,7 +403,7 @@ def train(): lora_config = LoraConfig(target_modules='all-linear') model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) - model.set_template('Template') + 
model.set_template('Qwen3_5Template') model.set_processor('InputProcessor', padding_side='right') model.set_loss('CrossEntropyLoss') model.set_optimizer('AdamW', lr=1e-4) @@ -473,7 +473,7 @@ base_url = 'http://www.modelscope.cn/twinkle' def train(): # 准备数据集 dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) - dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256) + dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256) dataset.map(SelfCognitionProcessor('Twinkle模型', 'ModelScope团队'), load_from_cache_file=False) dataset.encode(batched=True, load_from_cache_file=False) dataloader = DataLoader(dataset=dataset, batch_size=8) diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" index db8b8f43..b8161c81 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" @@ -70,7 +70,7 @@ def train(): dataset = PackingDataset(dataset_meta) dataset.map(SelfCognitionProcessor(model_name='Twinkle模型', model_author='ModelScope社区')) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) dataset.encode() dataset.pack_dataset() @@ -114,7 +114,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard 
format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset @@ -183,7 +183,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset @@ -273,7 +273,7 @@ ADAPTER_NAME = 'default' def create_gsm8k_dataset(): dataset = Dataset(DatasetMeta('ms://modelscope/gsm8k', subset_name='main', split='train')) - dataset.set_template('Template', model_id=MODEL_ID, max_length=2048) + dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=2048) dataset.map(GSM8KProcessor()) dataset.encode(add_generation_prompt=True) return dataset @@ -305,7 +305,7 @@ def main(): model.set_lr_scheduler('default', lr_decay_steps=MAX_STEPS, max_lr=LEARNING_RATE) model.set_loss('GRPOLoss', epsilon=0.2) model.set_processor(InputProcessor) - model.set_template('Template', model_id=MODEL_ID) + model.set_template('Qwen3_5Template', model_id=MODEL_ID) sampler = vLLMSampler( model_id=MODEL_ID, @@ -318,7 +318,7 @@ def main(): device_mesh=sampler_mesh, remote_group='sampler', ) - sampler.set_template(Template, model_id=MODEL_ID) + sampler.set_template('Qwen3_5Template', model_id=MODEL_ID) ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler) dataloader = DataLoader( dataset=create_gsm8k_dataset, @@ -478,7 +478,7 @@ def create_countdown_dataset(): """Create Countdown Game dataset for GRPO training.""" dataset = Dataset(dataset_meta=DatasetMeta('ms://zouxuhong/Countdown-Tasks-3to4', data_slice=range(500))) - dataset.set_template('Template', model_id=MODEL_ID, max_length=8192) + dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=8192) 
dataset.map('CountdownProcessor') dataset.encode(add_generation_prompt=True, batched=True) return dataset @@ -572,11 +572,11 @@ def train(): # Set processor and template for encoding inputs model.set_processor('InputProcessor') - model.set_template('Template', model_id=MODEL_ID) + model.set_template('Qwen3_5Template', model_id=MODEL_ID) # Step 4: Configure the sampler sampler = vLLMSampler(model_id=MODEL_ID) - sampler.set_template('Template', model_id=MODEL_ID) + sampler.set_template('Qwen3_5Template', model_id=MODEL_ID) # Step 5: Setup metrics and advantage function advantage_fn = GRPOAdvantage() @@ -714,7 +714,7 @@ def train(): dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) # Apply the chat template matching the base model (max 256 tokens per sample) - dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256) + dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256) # Replace placeholder names with custom model/author identity dataset.map(SelfCognitionProcessor('twinkle模型', 'twinkle团队'), load_from_cache_file=False) @@ -927,7 +927,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') + dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Tinker\345\205\274\345\256\271\345\256\242\346\210\267\347\253\257.md" 
"b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Tinker\345\205\274\345\256\271\345\256\242\346\210\267\347\253\257.md" index 27db69b2..1340fc06 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Tinker\345\205\274\345\256\271\345\256\242\346\210\267\347\253\257.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Tinker\345\205\274\345\256\271\345\256\242\346\210\267\347\253\257.md" @@ -58,7 +58,7 @@ response = rest_client.list_training_runs(limit=50).result() print(f"Found {len(response.training_runs)} training runs") # Step 3: 创建训练客户端 -base_model = "Qwen/Qwen2.5-0.5B-Instruct" +base_model = "Qwen/Qwen3.5-4B" # 新建训练会话 training_client = service_client.create_lora_training_client( @@ -137,6 +137,7 @@ for epoch in range(2): Tinker 兼容模式也可以利用 Twinkle 的数据集组件来简化数据准备,而不是手动构建 `Datum`: ```python +import os from tqdm import tqdm from tinker import types from twinkle import init_tinker_client @@ -150,11 +151,11 @@ init_tinker_client() from tinker import ServiceClient -base_model = "Qwen/Qwen2.5-0.5B-Instruct" +base_model = "Qwen/Qwen3.5-4B" # 使用 Twinkle 的 Dataset 组件加载和预处理数据 dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) -dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256) +dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=256) dataset.map(SelfCognitionProcessor('twinkle模型', 'twinkle团队'), load_from_cache_file=False) dataset.encode(batched=True, load_from_cache_file=False) dataloader = DataLoader(dataset=dataset, batch_size=8) @@ -223,7 +224,7 @@ init_tinker_client() from tinker import ServiceClient -base_model = "Qwen/Qwen2.5-0.5B-Instruct" +base_model = 
"Qwen/Qwen3.5-4B" service_client = ServiceClient( base_url='http://localhost:8000', diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Twinkle\345\256\242\346\210\267\347\253\257.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Twinkle\345\256\242\346\210\267\347\253\257.md" index fd81ac1b..c9fded19 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Twinkle\345\256\242\346\210\267\347\253\257.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/Twinkle\345\256\242\346\210\267\347\253\257.md" @@ -93,7 +93,7 @@ for run in runs: dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition')) # 设置 chat 模板,使数据匹配模型的输入格式 -dataset.set_template('Template', model_id='ms://Qwen/Qwen2.5-7B-Instruct', max_length=512) +dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) # 数据预处理:替换占位符为自定义名称 dataset.map('SelfCognitionProcessor', @@ -106,14 +106,14 @@ dataset.encode(batched=True) dataloader = DataLoader(dataset=dataset, batch_size=8) # Step 4: 配置模型 -model = MultiLoraTransformersModel(model_id='ms://Qwen/Qwen2.5-7B-Instruct') +model = MultiLoraTransformersModel(model_id='ms://Qwen/Qwen3.5-4B') # 配置 LoRA lora_config = LoraConfig(target_modules='all-linear') model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) # 设置模板、处理器、损失函数 -model.set_template('Template') +model.set_template('Qwen3_5Template') model.set_processor('InputProcessor', padding_side='right') model.set_loss('CrossEntropyLoss') diff --git 
"a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/\346\234\215\345\212\241\347\253\257.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/\346\234\215\345\212\241\347\253\257.md" index 1528a439..3a791b21 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/\346\234\215\345\212\241\347\253\257.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\346\234\215\345\212\241\347\253\257\345\222\214\345\256\242\346\210\267\347\253\257/\346\234\215\345\212\241\347\253\257.md" @@ -67,8 +67,8 @@ export TWINKLE_TRUST_REMOTE_CODE=0 # 是否信任远程代码(安全考 ```yaml applications: # 模型服务占用 GPU 0-3(物理卡号) - - name: models-Qwen2.5-7B-Instruct - route_prefix: /models/Qwen/Qwen2.5-7B-Instruct + - name: models-Qwen3.5-4B + route_prefix: /models/Qwen/Qwen3.5-4B import_path: model args: nproc_per_node: 4 @@ -84,8 +84,8 @@ applications: # ep_size: 1 # 专家并行大小(可选) # Sampler 服务占用 GPU 4-5(物理卡号) - - name: sampler-Qwen2.5-7B-Instruct - route_prefix: /sampler/Qwen/Qwen2.5-7B-Instruct + - name: sampler-Qwen3.5-4B + route_prefix: /sampler/Qwen/Qwen3.5-4B import_path: sampler args: nproc_per_node: 2 @@ -291,12 +291,12 @@ applications: Transformers 后端与 Megatron 后端的区别仅在 Model 服务的 `use_megatron` 参数: ```yaml - - name: models-Qwen2.5-7B-Instruct - route_prefix: /api/v1/model/Qwen/Qwen2.5-7B-Instruct + - name: models-Qwen3.5-4B + route_prefix: /api/v1/model/Qwen/Qwen3.5-4B import_path: model args: use_megatron: false # 使用 Transformers 后端 - model_id: "ms://Qwen/Qwen2.5-7B-Instruct" + model_id: "ms://Qwen/Qwen3.5-4B" nproc_per_node: 2 device_group: name: model diff --git 
"a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" index 8dfc056e..0c7afc44 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" @@ -39,7 +39,7 @@ api_key=os.environ.get('MODELSCOPE_TOKEN') # Use twinkle dataset to load the data dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500))) -dataset.set_template('Template', model_id=base_model, max_length=256) +dataset.set_template('Qwen3_5Template', model_id=base_model, max_length=256) dataset.map(SelfCognitionProcessor('Twinkle Model', 'ModelScope Team'), load_from_cache_file=False) dataset.encode(batched=True, load_from_cache_file=False) dataloader = DataLoader(dataset=dataset, batch_size=8) diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" index 322c0e34..812a7e7f 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/Dataset.md" @@ -66,7 +66,7 @@ dataset = Dataset(DatasetMeta(dataset_id='my/custom/dataset.jsonl', data_slice=r Template 组件是负责将字符串/图片多模态原始数据转换为模型输入 token 的组件。数据集可以设置一个 Template 来完成 `encode` 过程。 ```python -dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) +dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) ``` set_template 方法支持传入 `kwargs`(例如例子中的 `max_length`),作为 `Template` 的构造参数使用。 diff --git 
"a/docs/source_zh/\347\273\204\344\273\266/\346\243\200\346\237\245\347\202\271\345\274\225\346\223\216/CheckpointEngine.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\243\200\346\237\245\347\202\271\345\274\225\346\223\216/CheckpointEngine.md" index b7acdef2..338be10d 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\346\243\200\346\237\245\347\202\271\345\274\225\346\223\216/CheckpointEngine.md" +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\243\200\346\237\245\347\202\271\345\274\225\346\223\216/CheckpointEngine.md" @@ -67,3 +67,5 @@ Twinkle 提供了两种检查点引擎实现: - **HCCLCheckpointEngine**: 适用于昇腾 NPU 环境 > 检查点引擎是 RLHF 训练基础设施的关键组件,确保训练器和采样器使用一致的模型权重。 +> 目前的同步分为merge_and_sync=True/False两种情况,为True时将lora合并进基模并同步,为False时仅同步lora权重。另外,多租户直接附加lora文件到vLLM上,在merge_and_sync=False或使用多租户时, +> vLLM的启动参数需要开启`enable_lora=True`,在merge_and_sync=True或全参时,该值设置为False。 diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/Template.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/Template.md" index e58abeb4..d9cdba97 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/Template.md" +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/Template.md" @@ -50,3 +50,10 @@ class Template: > Template 不支持使用函数来代替,因为其内部要支持的功能较多。如果需要编写新的 Template,请继承 `Template` 类。 > 一般来说,纯文本模型使用 Template 基类就足够了,在基类中我们使用了 tokenizer.apply_chat_template 来编码模型,对一般的纯文本模型是通用的。 + +# 模板对应关系 + +目前模板关系较为简单: + +- Template类:纯文本模型通用 +- Qwen3_5Template类:Qwen3.5多模态模型使用 diff --git a/src/twinkle/server/model/app.py b/src/twinkle/server/model/app.py index 8f0c6f77..41351811 100644 --- a/src/twinkle/server/model/app.py +++ b/src/twinkle/server/model/app.py @@ -143,7 +143,7 @@ def build_model_app(model_id: str, Supports both Tinker (polling-style) and Twinkle (synchronous) clients. 
Args: - model_id: Base model identifier (e.g., "Qwen/Qwen2.5-0.5B-Instruct") + model_id: Base model identifier (e.g., "Qwen/Qwen3.5-4B") nproc_per_node: Number of processes per node for distributed training device_group: Device group configuration dict device_mesh: Device mesh configuration dict for tensor parallelism diff --git a/src/twinkle/server/sampler/app.py b/src/twinkle/server/sampler/app.py index dc54e4f6..0443df94 100644 --- a/src/twinkle/server/sampler/app.py +++ b/src/twinkle/server/sampler/app.py @@ -111,7 +111,7 @@ def build_sampler_app(model_id: str, Twinkle (synchronous /twinkle/*) sampler clients. Args: - model_id: Model identifier (e.g., "Qwen/Qwen2.5-0.5B-Instruct") + model_id: Model identifier (e.g., "Qwen/Qwen3.5-4B") nproc_per_node: Number of processes per node device_group: Device group configuration dict device_mesh: Device mesh configuration dict for parallelism