diff --git a/.gitignore b/.gitignore index 58f495d4..37f2f3a9 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ __pycache__/ *.py[cod] *$py.class test.py +test.sh +twinkle-web # C extensions *.so diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..97c35113 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,25 @@ +FROM modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.8.1-py311-torch2.9.1-1.35.0 + +# Install miniconda with Python 3.12 +RUN curl -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \ + rm Miniconda3-latest-Linux-x86_64.sh +ENV PATH="/opt/conda/bin:${PATH}" +RUN conda create -n twinkle python=3.12 -y --override-channels -c conda-forge +SHELL ["conda", "run", "-n", "twinkle", "/bin/bash", "-c"] + +# Clone and install twinkle, checkout to latest v-tag +RUN git clone https://github.com/modelscope/twinkle.git +WORKDIR /twinkle +RUN echo "Available v-tags:" && git tag -l 'v*' --sort=-v:refname && \ + LATEST_TAG=$(git tag -l 'v*' --sort=-v:refname | head -n 1) && \ + echo "Checking out: $LATEST_TAG" && \ + git checkout "$LATEST_TAG" + +RUN sh INSTALL_MEGATRON.sh + +RUN pip install --no-cache-dir tinker==0.14.0 "ray[serve]" transformers peft accelerate -U + +RUN pip install -e . --no-build-isolation + +CMD ["bash", "cookbook/client/server/megatron/run.sh"] diff --git a/INSTALL_MEGATRON.sh b/INSTALL_MEGATRON.sh index c85ec6e1..775f7588 100644 --- a/INSTALL_MEGATRON.sh +++ b/INSTALL_MEGATRON.sh @@ -81,6 +81,8 @@ MAX_JOBS=8 \ FLASH_ATTENTION_FORCE_BUILD=TRUE \ pip install flash-attn --no-build-isolation --no-cache-dir +pip install flash-linear-attention -U + # Install numpy echo "" echo "Installing numpy==2.2 and deep_gemm..." diff --git a/README.md b/README.md index cd7eccfd..35799b85 100644 --- a/README.md +++ b/README.md @@ -131,7 +131,7 @@ supported on Twinkle✨ framework. 
> For serverless training service accessed via `base_url=https://www.modelscope.cn/twinkle`, it > is currently provided via the Tinker-compatible APIs. We will be rolling out services that support > both Tinker APIs, as well as the full-fledged Twinkle✨ native APIs. The serverless endpoint is backed -> by one training base at a time, and currently it is [Qwen3.5-4B](https://modelscope.cn/models/Qwen/Qwen3.5-4B). +> by one training base at a time, and currently it is [Qwen3.5-27B](https://modelscope.cn/models/Qwen/Qwen3.5-27B). | Model Type | Model ID on [ModelScope](https://modelscope.cn) | Model Size | Requires | Support Megatron | HF Model ID | |---------------------|-----------------------------------------------------------------------------------------------------------------|:---------------------------------------:|----------------------|:----------------:|:---------------------------------------------------------------------------------------------------------:| @@ -180,7 +180,7 @@ twinkle.initialize(mode='ray', groups=device_group, global_device_mesh=device_me def train(): # to load model from Hugging Face, use 'hf://...' 
- base_model = 'ms://Qwen/Qwen3.5-4B' + base_model = 'ms://Qwen/Qwen3.5-27B' # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding @@ -236,7 +236,7 @@ from twinkle.dataset import Dataset, DatasetMeta from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum -base_model = 'ms://Qwen/Qwen3.5-4B' +base_model = 'ms://Qwen/Qwen3.5-27B' base_url='your-base-url' api_key='your-api-key' diff --git a/README_ZH.md b/README_ZH.md index e404508a..352ebde0 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -114,7 +114,7 @@ Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Cl 随着新模型的发布,我们将添加对更多模型的支持。下表列出了 Twinkle✨ 框架当前支持的模型。 >[!Note] -> 通过 `base_url=https://www.modelscope.cn/twinkle` 访问的无服务器训练服务,目前是通过兼容Tinker的API提供的。我们将陆续推出同时支持Tinker API和完整Twinkle✨原生 API的服务。无服务器端点每次由一个训练基座支持,目前使用的是[Qwen3.5-4B](https://modelscope.cn/models/Qwen/Qwen3.5-4B)。 +> 通过 `base_url=https://www.modelscope.cn/twinkle` 访问的无服务器训练服务,目前是通过兼容Tinker的API提供的。我们将陆续推出同时支持Tinker API和完整Twinkle✨原生 API的服务。无服务器端点每次由一个训练基座支持,目前使用的是[Qwen3.5-27B](https://modelscope.cn/models/Qwen/Qwen3.5-27B)。 | Model Type | Model ID 举例 | Model Size | Requires | Support Megatron | HF Model ID | |---------------------|-----------------------------------------------------------------------------------------------------------------|:---------------------------------------:|----------------------|:----------------:|:---------------------------------------------------------------------------------------------------------:| @@ -162,7 +162,7 @@ twinkle.initialize(mode='ray', groups=device_group, global_device_mesh=device_me def train(): # to load model from Hugging Face, use 'hf://...' 
- base_model = 'ms://Qwen/Qwen3.5-4B' + base_model = 'ms://Qwen/Qwen3.5-27B' # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding @@ -218,7 +218,7 @@ from twinkle.dataset import Dataset, DatasetMeta from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum -base_model = 'ms://Qwen/Qwen3.5-4B' +base_model = 'ms://Qwen/Qwen3.5-27B' base_url='your-base-url' api_key='your-api-key' diff --git a/cookbook/client/server/megatron/server.py b/cookbook/client/server/megatron/server.py index abce8cf6..e38f43a4 100644 --- a/cookbook/client/server/megatron/server.py +++ b/cookbook/client/server/megatron/server.py @@ -15,7 +15,7 @@ # Resolve the path to server_config.yaml relative to this script's location file_dir = os.path.abspath(os.path.dirname(__file__)) -config_path = os.path.join(file_dir, 'server_config_4b.yaml') +config_path = os.path.join(file_dir, 'server_config.yaml') # Launch the Twinkle server — this call blocks until the server is shut down launch_server(config_path=config_path) diff --git a/cookbook/client/server/megatron/server_config.yaml b/cookbook/client/server/megatron/server_config.yaml index 6b5cce0e..21d8a16b 100644 --- a/cookbook/client/server/megatron/server_config.yaml +++ b/cookbook/client/server/megatron/server_config.yaml @@ -36,29 +36,32 @@ applications: # 3. Sampler Service - Runs inference / sampling using vLLM engine # Used for generating text from the model (e.g., evaluating LoRA results). 
- - name: sampler-Qwen3.5-4B - route_prefix: /api/v1/sampler/Qwen/Qwen3.5-4B + # Config: TP=2 x DP=2 on 4 GPUs, ~27GB weights/GPU, ~37GB for KV cache + LoRA + - name: sampler-Qwen3.5-27B + route_prefix: /api/v1/sampler/Qwen/Qwen3.5-27B import_path: sampler args: - model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier - nproc_per_node: 4 # Number of GPU processes per node + model_id: "ms://Qwen/Qwen3.5-27B" # ModelScope model identifier + nproc_per_node: 8 # Number of GPU processes per node sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler) engine_args: # vLLM engine-specific settings - max_model_len: 16000 # Maximum sequence length the engine supports - gpu_memory_utilization: 0.85 # Fraction of GPU memory to use (0.0-1.0) + max_model_len: 32000 # Maximum sequence length the engine supports + gpu_memory_utilization: 0.80 # 80% utilization, ~64GB/GPU, leaves buffer for safety enable_lora: true # Allow loading LoRA adapters during inference max_loras: 5 # Max allowed loras working on vLLM at the same time + max_lora_rank: 32 # Support up to rank 32 LoRA adapters device_group: # Logical device group for the sampler name: sampler - gpus_per_worker: 1 + gpus_per_worker: 2 ranks: 4 # GPU rank indices to use device_type: cuda device_mesh: device_type: cuda - dp_size: 4 + dp_size: 2 + tp_size: 2 # 2-way tensor parallelism within each DP replica (for multi-tenant throughput) queue_config: rps_limit: 20 # Max requests per second - tps_limit: 16000 # Max tokens per second + tps_limit: 32000 # Max tokens per second deployments: - name: SamplerManagement autoscaling_config: @@ -71,29 +74,29 @@ applications: env_vars: TWINKLE_TRUST_REMOTE_CODE: "0" - # 2. Model Service (commented out) - Would host the base model for training. - # Uncomment and configure if you need a training model worker. - - name: models-Qwen3.5-4B - route_prefix: /api/v1/model/Qwen/Qwen3.5-4B + # 2. Model Service - Hosts the base model for training. 
+ # Config: PP=2 x DP=2 on 4 GPUs, ~27GB weights/GPU, comfortable for LoRA training + - name: models-Qwen3.5-27B + route_prefix: /api/v1/model/Qwen/Qwen3.5-27B import_path: model args: - use_megatron: true # Use HuggingFace Transformers backend - model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier - max_length: 16000 # model max length + use_megatron: true # Use Megatron-LM backend + model_id: "ms://Qwen/Qwen3.5-27B" # ModelScope model identifier + max_length: 32000 # model max length max_loras: 5 # model max loras - nproc_per_node: 4 # Number of GPU processes per node + nproc_per_node: 8 # Number of GPU processes per node device_group: name: model ranks: 4 # GPU rank indices device_type: cuda device_mesh: device_type: cuda - dp_size: 4 - ep_size: 2 + dp_size: 2 # 2-way data parallel + pp_size: 2 # 2-way pipeline parallel (~27GB/GPU) queue_config: rps_limit: 20 # Max requests per second - tps_limit: 16000 # Max tokens per second + tps_limit: 32000 # Max tokens per second adapter_config: adapter_timeout: 30 # Seconds before idle adapter unload adapter_max_lifetime: 36000 # Maximum lifetime of an adapter in seconds (e.g., 10 hours) diff --git a/cookbook/client/tinker/modelscope/sample.py b/cookbook/client/tinker/modelscope/sample.py index 137d1b2b..72bd9f24 100644 --- a/cookbook/client/tinker/modelscope/sample.py +++ b/cookbook/client/tinker/modelscope/sample.py @@ -16,7 +16,7 @@ from tinker import ServiceClient -base_model = 'Qwen/Qwen3.5-4B' +base_model = 'Qwen/Qwen3.5-27B' base_url = 'http://www.modelscope.cn/twinkle' # Step 2: Define the base model and connect to the server @@ -29,7 +29,7 @@ # The model_path is a twinkle:// URI pointing to a previously saved LoRA checkpoint. # The server will load the base model and apply the LoRA adapter weights. 
sampling_client = service_client.create_sampling_client( - model_path='twinkle://xxx-Qwen_Qwen3.5-4B-xxx/weights/twinkle-lora-1', + model_path='twinkle://xxx-Qwen_Qwen3.5-27B-xxx/weights/twinkle-lora-1', base_model=base_model ) diff --git a/cookbook/client/tinker/modelscope/self_cognition.py b/cookbook/client/tinker/modelscope/self_cognition.py index 1d653fb1..9f02ee40 100644 --- a/cookbook/client/tinker/modelscope/self_cognition.py +++ b/cookbook/client/tinker/modelscope/self_cognition.py @@ -23,7 +23,7 @@ from tinker import ServiceClient # The base model to fine-tune / evaluate -base_model = 'Qwen/Qwen3.5-4B' +base_model = 'Qwen/Qwen3.5-27B' base_url = 'http://www.modelscope.cn/twinkle' diff --git a/cookbook/client/tinker/modelscope/short_math_grpo.py b/cookbook/client/tinker/modelscope/short_math_grpo.py index 424d460d..6796b517 100644 --- a/cookbook/client/tinker/modelscope/short_math_grpo.py +++ b/cookbook/client/tinker/modelscope/short_math_grpo.py @@ -38,7 +38,7 @@ logger = get_logger() # ========== Configuration ========== -BASE_MODEL = 'Qwen/Qwen3.5-4B' +BASE_MODEL = 'Qwen/Qwen3.5-27B' NUM_GENERATIONS = 8 MAX_NEW_TOKENS = 4096 LEARNING_RATE = 1e-4 diff --git a/cookbook/client/twinkle/modelscope/multi_modal.py b/cookbook/client/twinkle/modelscope/multi_modal.py index f3b8cd24..f7a54ccf 100644 --- a/cookbook/client/twinkle/modelscope/multi_modal.py +++ b/cookbook/client/twinkle/modelscope/multi_modal.py @@ -24,7 +24,7 @@ logger = get_logger() -base_model = 'Qwen/Qwen3.5-4B' +base_model = 'Qwen/Qwen3.5-27B' base_url = 'http://www.modelscope.cn/twinkle' # Step 2: Initialize the Twinkle client to communicate with the remote server. 
diff --git a/cookbook/client/twinkle/modelscope/self_congnition.py b/cookbook/client/twinkle/modelscope/self_congnition.py index ed44b4b1..aafc5d14 100644 --- a/cookbook/client/twinkle/modelscope/self_congnition.py +++ b/cookbook/client/twinkle/modelscope/self_congnition.py @@ -21,7 +21,7 @@ logger = get_logger() -base_model = 'Qwen/Qwen3.5-4B' +base_model = 'Qwen/Qwen3.5-27B' base_url = 'http://www.modelscope.cn/twinkle' # Step 2: Initialize the Twinkle client to communicate with the remote server. diff --git a/docs/source_en/Usage Guide/Quick-Start.md b/docs/source_en/Usage Guide/Quick-Start.md index 6a05a53f..2a2f0f31 100644 --- a/docs/source_en/Usage Guide/Quick-Start.md +++ b/docs/source_en/Usage Guide/Quick-Start.md @@ -692,10 +692,6 @@ if __name__ == '__main__': Multiple developers can use a single base model from this service for parallel training and sampling. Furthermore, the training methods they use are allowed to differ. For example, User A can perform SFT, User B can perform RL, and User C can perform sampling. Similarly, Twinkle also supports Tinker-like APIs for remote training: ->[!Note] -> One important note: in the current Twinkle implementation, the client-side Twinkle API and Tinker API cannot be used simultaneously on the same server. When you need to provide the Tinker API, you need to start the service under cookbook/client/tinker. -> This issue will be addressed with high priority in upcoming iterations. - ```python from tinker import types from tqdm import tqdm diff --git a/docs/source_en/Usage Guide/Train-as-a-Service.md b/docs/source_en/Usage Guide/Train-as-a-Service.md index f5571773..38e46858 100644 --- a/docs/source_en/Usage Guide/Train-as-a-Service.md +++ b/docs/source_en/Usage Guide/Train-as-a-Service.md @@ -2,7 +2,7 @@ Alongside the open-source release of the Twinkle framework, we also provide a hosted model training service (Training as a Service) powered by ModelScope's backend infrastructure. 
Developers can use this service to experience Twinkle's training API for free. -The model currently running on the cluster is [Qwen/Qwen3.5-4B](https://www.modelscope.cn/models/Qwen/Qwen3.5-4B). Below are the detailed usage instructions: +The model currently running on the cluster is [Qwen/Qwen3.5-27B](https://www.modelscope.cn/models/Qwen/Qwen3.5-27B). Below are the detailed usage instructions: ## Step 1. Register a ModelScope Account and Apply to Join the twinkle-explorers Organization @@ -30,7 +30,7 @@ from twinkle.dataset import Dataset, DatasetMeta from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum -base_model = 'ms://Qwen/Qwen3.5-4B' +base_model = 'ms://Qwen/Qwen3.5-27B' base_url='http://www.modelscope.cn/twinkle' api_key=os.environ.get('MODELSCOPE_TOKEN') @@ -64,7 +64,7 @@ for epoch in range(2): print(f'Saved checkpoint for epoch {epoch} to {result.path}') ``` -With the code above, you can train a self-cognition LoRA based on `Qwen/Qwen3.5-4B`. This LoRA will change the model's name and creator to the names specified during training. To perform inference using this LoRA: +With the code above, you can train a self-cognition LoRA based on `Qwen/Qwen3.5-27B`. This LoRA will change the model's name and creator to the names specified during training. To perform inference using this LoRA: ```python import os @@ -79,7 +79,7 @@ init_tinker_client() from tinker import ServiceClient -base_model = 'Qwen/Qwen3.5-4B' +base_model = 'Qwen/Qwen3.5-27B' base_url = 'http://www.modelscope.cn/twinkle' # Step 2: Define the base model and connect to the server @@ -92,7 +92,7 @@ service_client = ServiceClient( # The model_path is a twinkle:// URI pointing to a previously saved LoRA checkpoint. # The server will load the base model and apply the LoRA adapter weights. 
sampling_client = service_client.create_sampling_client( - model_path='twinkle://xxx-Qwen_Qwen3.5-4B-xxx/weights/twinkle-lora-1', + model_path='twinkle://xxx-Qwen_Qwen3.5-27B-xxx/weights/twinkle-lora-1', base_model=base_model ) diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" index 0b8e386a..db8b8f43 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" @@ -694,10 +694,6 @@ if __name__ == '__main__': 多个开发者可以并行使用这个服务的单个基模并行训练和采样。并且,他们进行的训练方式允许不同。例如,A用户可以进行SFT,B用户可以进行RL,C用户可以进行采样。 同样,Twinkle也支持Tinker-like API进行远端训练: ->[!Note] -> 需要注意的一点,在当前Twinkle的实现中,客户端的Twinkle API和Tinker API是无法同时在一个服务端使用的。当你需要提供Tinker API时,你需要启动cookbook/client/tinker下的服务。 -> 这个问题会在接下来的迭代高优解决。 - ```python from tinker import types from tqdm import tqdm @@ -765,7 +761,7 @@ if __name__ == '__main__': ### 使用魔搭社区提供的TaaS化训练服务 -在 Twinkle 框架开源的同时,我们依托ModelScope的后台服务,也提供了托管的模型训练服务(Training as a Service),开发者可以通过这一服务, 免费体验Twinkle的训练API。 +在 Twinkle 框架开源的同时,我们依托ModelScope的后台服务,也提供了托管的模型训练服务(Training as a Service),开发者可以通过这一服务, 免费体验Twinkle的训练API。 该服务和上面叙述的Tinker API部分代码是相同的,唯一不同的是Endpoint和Token需要使用魔搭官方的对应信息。关于如何使用官方服务,请查看[训练服务](./训练服务.md)的详细描述。 Twinkle提供了采样API,该API可以用于更灵活地控制采样方式以验证结果,或者参与到RL算法的采样流程中。 diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" index cfff81e3..8dfc056e 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" +++ 
"b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" @@ -3,7 +3,7 @@ 在 Twinkle 框架开源的同时,我们依托ModelScope的后台服务,也提供了托管的模型训练服务(Training as a Service),开发者可以通过这一服务, 免费体验Twinkle的训练API。 -目前在集群中运行的模型是[Qwen/Qwen3.5-4B](https://www.modelscope.cn/models/Qwen/Qwen3.5-4B)。下面介绍具体的使用方法: +目前在集群中运行的模型是[Qwen/Qwen3.5-27B](https://www.modelscope.cn/models/Qwen/Qwen3.5-27B)。下面介绍具体的使用方法: ## Step 1. 注册ModelScope用户并申请加入 twinkle-explorers 组织 @@ -33,7 +33,7 @@ from twinkle.dataset import Dataset, DatasetMeta from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum -base_model = 'ms://Qwen/Qwen3.5-4B' +base_model = 'ms://Qwen/Qwen3.5-27B' base_url='http://www.modelscope.cn/twinkle' api_key=os.environ.get('MODELSCOPE_TOKEN') @@ -67,7 +67,7 @@ for epoch in range(2): print(f'Saved checkpoint for epoch {epoch} to {result.path}') ``` -通过上述代码,你可以训练一个原模型为`Qwen/Qwen3.5-4B`的自我认知lora。这个lora会改变模型的名称和制造者为训练时指定的名称。使用这个lora进行推理: +通过上述代码,你可以训练一个原模型为`Qwen/Qwen3.5-27B`的自我认知lora。这个lora会改变模型的名称和制造者为训练时指定的名称。使用这个lora进行推理: ```python import os @@ -82,7 +82,7 @@ init_tinker_client() from tinker import ServiceClient -base_model = 'Qwen/Qwen3.5-4B' +base_model = 'Qwen/Qwen3.5-27B' base_url = 'http://www.modelscope.cn/twinkle' # Step 2: Define the base model and connect to the server @@ -95,7 +95,7 @@ service_client = ServiceClient( # The model_path is a twinkle:// URI pointing to a previously saved LoRA checkpoint. # The server will load the base model and apply the LoRA adapter weights. 
sampling_client = service_client.create_sampling_client( - model_path='twinkle://xxx-Qwen_Qwen3.5-4B-xxx/weights/twinkle-lora-1', + model_path='twinkle://xxx-Qwen_Qwen3.5-27B-xxx/weights/twinkle-lora-1', base_model=base_model ) diff --git a/pyproject.toml b/pyproject.toml index 9e6a321e..584099cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,6 @@ dependencies = [ "safetensors", "peft>=0.11.0,<=0.19.0", "transformers", - "oss2", ] [project.optional-dependencies]