@@ -36,29 +36,32 @@ applications:
 
   # 3. Sampler Service - Runs inference / sampling using vLLM engine
   # Used for generating text from the model (e.g., evaluating LoRA results).
-  - name: sampler-Qwen3.5-4B
-    route_prefix: /api/v1/sampler/Qwen/Qwen3.5-4B
+  # Config: TP=2 x DP=2 on 4 GPUs, ~27GB weights/GPU, ~37GB for KV cache + LoRA
+  - name: sampler-Qwen3.5-27B
+    route_prefix: /api/v1/sampler/Qwen/Qwen3.5-27B
     import_path: sampler
     args:
-      model_id: "ms://Qwen/Qwen3.5-4B"  # ModelScope model identifier
-      nproc_per_node: 4  # Number of GPU processes per node
+      model_id: "ms://Qwen/Qwen3.5-27B"  # ModelScope model identifier
+      nproc_per_node: 8  # Number of GPU processes per node
       sampler_type: vllm  # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
       engine_args:  # vLLM engine-specific settings
-        max_model_len: 16000  # Maximum sequence length the engine supports
-        gpu_memory_utilization: 0.85  # Fraction of GPU memory to use (0.0-1.0)
+        max_model_len: 32000  # Maximum sequence length the engine supports
+        gpu_memory_utilization: 0.80  # 80% utilization (~64GB/GPU) leaves a safety buffer
         enable_lora: true  # Allow loading LoRA adapters during inference
         max_loras: 5  # Max LoRA adapters active in vLLM at the same time
+        max_lora_rank: 32  # Support LoRA adapters up to rank 32
       device_group:  # Logical device group for the sampler
         name: sampler
-        gpus_per_worker: 1
+        gpus_per_worker: 2
         ranks: 4  # GPU rank indices to use
         device_type: cuda
         device_mesh:
           device_type: cuda
-          dp_size: 4
+          dp_size: 2
+          tp_size: 2  # 2-way tensor parallel; with DP=2, two replicas for multi-tenant throughput
       queue_config:
         rps_limit: 20  # Max requests per second
-        tps_limit: 16000  # Max tokens per second
+        tps_limit: 32000  # Max tokens per second
     deployments:
       - name: SamplerManagement
         autoscaling_config:
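
As a sanity check on the `# Config:` numbers in this hunk, here is a minimal sketch of the per-GPU memory budget. It assumes bf16 weights (~2 bytes per parameter) and 80GB cards; neither assumption is stated in the config, so treat the figures as approximate:

```python
# Rough per-GPU memory budget for the sampler (assumptions: bf16 weights, 80GB GPUs).
params_b = 27        # Qwen3.5-27B, billions of parameters
bytes_per_param = 2  # bf16 (assumption)
gpu_mem_gb = 80      # assumed A100/H100-class card
tp_size = 2          # device_mesh.tp_size: weights are sharded 2 ways

weights_per_gpu = params_b * bytes_per_param / tp_size  # 54 GB / 2 = 27 GB per GPU
budget_per_gpu = 0.80 * gpu_mem_gb                      # gpu_memory_utilization: 0.80 -> 64 GB
kv_cache_and_lora = budget_per_gpu - weights_per_gpu    # 64 - 27 = 37 GB

print(f"~{weights_per_gpu:.0f}GB weights/GPU, ~{kv_cache_and_lora:.0f}GB for KV cache + LoRA")
```

Those are the ~27GB and ~37GB figures from the added comment; the unallocated 20% of each card (~16GB here) is the safety buffer that `gpu_memory_utilization: 0.80` leaves for the runtime.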
@@ -71,29 +74,29 @@ applications:
         env_vars:
           TWINKLE_TRUST_REMOTE_CODE: "0"
 
-  # 2. Model Service (commented out) - Would host the base model for training.
-  # Uncomment and configure if you need a training model worker.
-  - name: models-Qwen3.5-4B
-    route_prefix: /api/v1/model/Qwen/Qwen3.5-4B
+  # 2. Model Service - Hosts the base model for training.
+  # Config: PP=2 x DP=2 on 4 GPUs, ~27GB weights/GPU, comfortable for LoRA training
+  - name: models-Qwen3.5-27B
+    route_prefix: /api/v1/model/Qwen/Qwen3.5-27B
     import_path: model
     args:
-      use_megatron: true  # Use HuggingFace Transformers backend
-      model_id: "ms://Qwen/Qwen3.5-4B"  # ModelScope model identifier
-      max_length: 16000  # Model max sequence length
+      use_megatron: true  # Use Megatron-LM backend
+      model_id: "ms://Qwen/Qwen3.5-27B"  # ModelScope model identifier
+      max_length: 32000  # Model max sequence length
       max_loras: 5  # Max LoRA adapters on the model
-      nproc_per_node: 4  # Number of GPU processes per node
+      nproc_per_node: 8  # Number of GPU processes per node
       device_group:
         name: model
         ranks: 4  # GPU rank indices
         device_type: cuda
         device_mesh:
           device_type: cuda
-          dp_size: 4
-          ep_size: 2
+          dp_size: 2  # 2-way data parallel
+          pp_size: 2  # 2-way pipeline parallel (~27GB/GPU)
 
       queue_config:
         rps_limit: 20  # Max requests per second
-        tps_limit: 16000  # Max tokens per second
+        tps_limit: 32000  # Max tokens per second
       adapter_config:
         adapter_timeout: 30  # Seconds before an idle adapter is unloaded
         adapter_max_lifetime: 36000  # Maximum lifetime of an adapter in seconds (i.e., 10 hours)
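
The model service's `# Config:` line can be checked the same way. A minimal sketch under the same bf16 assumption, where pipeline parallelism splits the layers across GPUs and data parallelism replicates each stage:

```python
# Per-GPU weight footprint for the training model service (assumption: bf16 weights).
params_b = 27
bytes_per_param = 2
pp_size = 2  # device_mesh.pp_size: each pipeline stage holds roughly half the layers
dp_size = 2  # device_mesh.dp_size: each data-parallel replica is a full copy of a stage

weights_per_gpu = params_b * bytes_per_param / pp_size  # 54 GB / 2 = 27 GB per GPU
gpus_used = pp_size * dp_size                           # 2 x 2 = 4 GPUs, as the comment says

print(f"~{weights_per_gpu:.0f}GB weights/GPU across {gpus_used} GPUs")
```

Since LoRA training freezes the base weights, the 27GB shard carries no optimizer state, which is presumably why the comment calls this layout comfortable for LoRA training.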