@@ -36,11 +36,11 @@ applications:
 
   # 3. Sampler Service - Runs inference / sampling using vLLM engine
   # Used for generating text from the model (e.g., evaluating LoRA results).
-  - name: sampler-Qwen3-30B-A3B-Instruct-2507
-    route_prefix: /api/v1/sampler/Qwen/Qwen3-30B-A3B-Instruct-2507
+  - name: sampler-Qwen3.5-4B
+    route_prefix: /api/v1/sampler/Qwen/Qwen3.5-4B
     import_path: sampler
     args:
-      model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507"  # ModelScope model identifier
+      model_id: "ms://Qwen/Qwen3.5-4B"  # ModelScope model identifier
       nproc_per_node: 4  # Number of GPU processes per node
       sampler_type: vllm  # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
       engine_args:  # vLLM engine-specific settings
@@ -73,12 +73,12 @@ applications:
 
   # 2. Model Service (commented out) - Would host the base model for training.
   # Uncomment and configure if you need a training model worker.
-  - name: models-Qwen3-30B-A3B-Instruct-2507
-    route_prefix: /api/v1/model/Qwen/Qwen3-30B-A3B-Instruct-2507
+  - name: models-Qwen3.5-4B
+    route_prefix: /api/v1/model/Qwen/Qwen3.5-4B
     import_path: model
     args:
       use_megatron: true  # Use HuggingFace Transformers backend
-      model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507"  # ModelScope model identifier
+      model_id: "ms://Qwen/Qwen3.5-4B"  # ModelScope model identifier
       max_length: 16000  # model max length
       max_loras: 5  # model max loras
       nproc_per_node: 4  # Number of GPU processes per node
0 commit comments