Commit 5071196

Author: Yingda Chen (committed)
Merge remote-tracking branch 'origin' into update
2 parents: 09216a7 + eb91bbb

File tree: 100 files changed, +3401 −1568 lines


.github/copilot-instructions.md

Lines changed: 4 additions & 3 deletions

```diff
@@ -40,10 +40,11 @@ These instructions help AI agents work productively in this repo. Focus on concr
 - Initialize infra: `twinkle.initialize(mode='local', seed=42)`
 - Inspect device placement: call `twinkle.infra.get_device_placement()`.
 - **Ray Serve demo (HTTP services):**
-  - Config and launcher: [cookbook/client/server.py](cookbook/client/server.py), [cookbook/client/server_config.yaml](cookbook/client/server_config.yaml)
+  - Config and launcher: [cookbook/client/tinker/megatron/server.py](https://github.com/modelscope/twinkle/blob/main/cookbook/client/tinker/megatron/server.py), [cookbook/client/tinker/megatron/server_config.yaml](https://github.com/modelscope/twinkle/blob/main/cookbook/client/tinker/megatron/server_config.yaml)
   - Start:
-    - `python cookbook/client/server.py`
-    - Endpoints print on startup (default `localhost:8000`).
+    - `cd cookbook/client/tinker/megatron`
+    - `python server.py`
+    - Endpoints print on startup (default `localhost:8000` or `https://www.modelscope.cn/twinkle`).
 - Model app binds `MultiLoraTransformersModel` and exposes routes like `/add_adapter_to_model`, `/forward`, `/calculate_loss`, etc. See [src/twinkle/server/twinkle/model.py](src/twinkle/server/twinkle/model.py).
 - **vLLM inference:** Use `VLLMEngine` with engine args; LoRA weight sync via `patch.vllm_lora_weights`. See [src/twinkle/sampler/vllm_engine.py](src/twinkle/sampler/vllm_engine.py).

```

INSTALL_MEGATRON.sh

Lines changed: 116 additions & 0 deletions (new file)

```bash
#!/bin/bash

# Installation script - installs the Megatron- and vLLM-related dependencies,
# which often fail to install cleanly on their own.

set -e  # Exit immediately on error

echo "=========================================="
echo "Starting deep learning dependencies installation..."
echo "=========================================="

# Detect GPU architecture from nvidia-smi
echo ""
echo "Detecting GPU architecture..."
GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -n 1)
echo "Detected GPU: $GPU_NAME"

# Map GPU name to CUDA architecture
get_cuda_arch() {
    local gpu_name="$1"
    case "$gpu_name" in
        *H100*|*H200*|*H20*|*H800*)
            echo "9.0"
            ;;
        *A100*|*A800*|*A30*)
            echo "8.0"
            ;;
        *A10*|*A40*|*A16*|*A2*)
            echo "8.6"
            ;;
        *L40*|*L4*|*Ada*|*RTX\ 40*|*RTX\ 50*)
            echo "8.9"
            ;;
        *V100*)
            echo "7.0"
            ;;
        *T4*)
            echo "7.5"
            ;;
        *RTX\ 30*|*A6000*|*A5000*)
            echo "8.6"
            ;;
        *RTX\ 20*)
            echo "7.5"
            ;;
        *)
            echo "8.0;9.0"  # Default fallback
            ;;
    esac
}

TORCH_CUDA_ARCH_LIST=$(get_cuda_arch "$GPU_NAME")
export TORCH_CUDA_ARCH_LIST
echo "Using CUDA architecture: $TORCH_CUDA_ARCH_LIST"

# Install latest base packages
echo ""
echo "Installing peft, accelerate, transformers, modelscope, oss2..."
pip install --upgrade peft accelerate transformers "modelscope[framework]" oss2

# Install latest vllm
echo ""
echo "Installing latest vllm..."
pip install --upgrade vllm

# Get site-packages path and install transformer_engine and megatron_core
echo ""
echo "Installing transformer_engine and megatron_core..."
SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
echo "Site-packages path: $SITE_PACKAGES"

CUDNN_PATH=$SITE_PACKAGES/nvidia/cudnn \
CPLUS_INCLUDE_PATH=$SITE_PACKAGES/nvidia/cudnn/include \
pip install --no-build-isolation "transformer_engine[pytorch]" megatron_core --no-cache-dir

# Install flash-attention (force local build)
echo ""
echo "Installing flash-attention (local build for $GPU_NAME)..."
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" \
MAX_JOBS=8 \
FLASH_ATTENTION_FORCE_BUILD=TRUE \
pip install flash-attn --no-build-isolation --no-cache-dir

# Install numpy and deep_gemm
echo ""
echo "Installing numpy==2.2 and deep_gemm..."
pip install numpy==2.2
pip uninstall deep_gemm -y
cd /tmp
git clone --recursive https://github.com/deepseek-ai/DeepGEMM.git
cd DeepGEMM
pip install . --no-build-isolation

# Verify installation
echo ""
echo "Verifying installation..."
echo ""
python -c "
import pkg_resources

packages = ['peft', 'accelerate', 'transformers', 'modelscope', 'oss2', 'vllm', 'transformer_engine', 'megatron_core', 'flash_attn', 'numpy']

print('Installed package versions:')
print('-' * 40)
for pkg in packages:
    try:
        version = pkg_resources.get_distribution(pkg).version
        print(f'{pkg}: {version}')
    except pkg_resources.DistributionNotFound:
        print(f'{pkg}: Not installed')
"

echo ""
echo "=========================================="
echo "Installation complete!"
echo "=========================================="
```
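The verification step above uses `pkg_resources`, which is deprecated in recent setuptools releases and emits a warning on import. A stdlib-only alternative sketch using `importlib.metadata` (available since Python 3.8), should that warning become a problem:

```python
# Version check without pkg_resources, using only the standard library.
from importlib.metadata import version, PackageNotFoundError


def report(pkg: str) -> str:
    """Return the installed version of pkg, or 'Not installed'."""
    try:
        return version(pkg)
    except PackageNotFoundError:
        return "Not installed"


packages = ['peft', 'accelerate', 'transformers', 'modelscope', 'oss2',
            'vllm', 'transformer_engine', 'megatron_core', 'flash_attn', 'numpy']
print('Installed package versions:')
print('-' * 40)
for pkg in packages:
    print(f'{pkg}: {report(pkg)}')
```

Note that `importlib.metadata` looks up distribution names, so packages whose distribution and import names differ (e.g. `flash-attn` vs `flash_attn`) may need the distribution spelling.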

README.md

Lines changed: 8 additions & 5 deletions

```diff
@@ -90,7 +90,7 @@ Tinker-compatible APIs.
 We are rolling out training service built atop Twinkle✨ on ModelScope. It is currently in _Beta_. You may
 sign up for free access by joining the [Twinkle-Explorers](https://modelscope.cn/organization/twinkle-explorers) organization, and
 train via API endpoint `base_url=https://www.modelscope.cn/twinkle`. For more details, please refer to
-our [documentation](docs/source_en/Usage%20Guide/ModelScope-Official-Resources.md).
+our [documentation](docs/source_en/Usage%20Guide/Train-as-a-Service.md).

 ## Supported Hardware

@@ -134,7 +134,7 @@ supported on Twinkle✨ framework.
 | | [deepseek-ai/DeepSeek-R1](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1) | transformers>=4.39.3 || [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) |
 | deepSeek-r1-distill | [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) ~32B | transformers>=4.37 || [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) |

-For a more detailed model support list 👉 [Quick Start.md](https://github.com/modelscope/twinkle/blob/dev/docs/source/%E4%BD%BF%E7%94%A8%E6%8C%87%E5%BC%95/%E5%BF%AB%E9%80%9F%E5%BC%80%E5%A7%8B.md)
+For a more detailed model support list 👉 [Quick Start](docs/source_en/Usage%20Guide/Quick-Start.md)

 ## Sample Code

@@ -207,7 +207,7 @@ if __name__ == '__main__':
 import os
 from tqdm import tqdm
 from tinker import types
-from twinkle_client import init_tinker_compat_client
+from twinkle_client import init_tinker_client
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.preprocessor import SelfCognitionProcessor
@@ -224,8 +224,11 @@ dataset.map(SelfCognitionProcessor('twinkle Model', 'twinkle Team'), load_from_c
 dataset.encode(batched=True, load_from_cache_file=False)
 dataloader = DataLoader(dataset=dataset, batch_size=8)

-# Initialize tinker client
-service_client = init_tinker_compat_client(base_url, api_key)
+# Initialize Tinker client before importing ServiceClient
+init_tinker_client()
+from tinker import ServiceClient
+
+service_client = ServiceClient(base_url=base_url, api_key=api_key)
 training_client = service_client.create_lora_training_client(base_model=base_model[len('ms://'):], rank=16)

 # Training loop: use input_feature_to_datum to transfer the input format
```
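The new README snippet calls `init_tinker_client()` before `from tinker import ServiceClient`, i.e. the initialization must run before the consumer class is imported, because init installs state that the client resolves later. A self-contained sketch of this "initialize before import" pattern (the `fake_tinker` module, `init_client`, and `_BACKEND` below are fabricated for illustration and are not the real `twinkle_client` API):

```python
# Demonstrates why an init call must precede the import that consumes it.
import sys
import types

# Global state that init installs and the client reads, mimicking the
# twinkle_client -> tinker relationship (names here are hypothetical).
_BACKEND = {"base_url": None}


def init_client(base_url: str) -> None:
    """Must run before constructing clients from the consumer module."""
    _BACKEND["base_url"] = base_url


# Build a fake "tinker"-like module and register it so it can be imported.
mod = types.ModuleType("fake_tinker")


class ServiceClient:
    def __init__(self):
        # Resolved from state installed by init_client().
        self.base_url = _BACKEND["base_url"]


mod.ServiceClient = ServiceClient
sys.modules["fake_tinker"] = mod

init_client("https://www.modelscope.cn/twinkle")  # init first ...
from fake_tinker import ServiceClient             # ... then import

client = ServiceClient()
print(client.base_url)
```

If `init_client` were skipped (or called after construction), `client.base_url` would be `None`, which is the failure mode the README's ordering guards against.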

README_ZH.md

Lines changed: 7 additions & 4 deletions

```diff
@@ -113,7 +113,7 @@ pip install -e .
 | | [deepseek-ai/DeepSeek-R1](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1) | transformers>=4.39.3 || [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) |
 | deepSeek-r1-distill | [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) ~32B | transformers>=4.37 || [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) |

-For a more detailed model support list 👉 [快速开始.md](https://github.com/modelscope/twinkle/blob/dev/docs/source/%E4%BD%BF%E7%94%A8%E6%8C%87%E5%BC%95/%E5%BF%AB%E9%80%9F%E5%BC%80%E5%A7%8B.md)
+For a more detailed model support list 👉 [快速开始.md](docs/source_zh/使用指引/快速开始.md)

 ## Sample Code

@@ -186,7 +186,7 @@ if __name__ == '__main__':
 import os
 from tqdm import tqdm
 from tinker import types
-from twinkle_client import init_tinker_compat_client
+from twinkle_client import init_tinker_client
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.preprocessor import SelfCognitionProcessor
@@ -203,8 +203,11 @@ dataset.map(SelfCognitionProcessor('twinkle Model', 'twinkle Team'), load_from_c
 dataset.encode(batched=True, load_from_cache_file=False)
 dataloader = DataLoader(dataset=dataset, batch_size=8)

-# Initialize tinker client
-service_client = init_tinker_compat_client(base_url, api_key)
+# Initialize Tinker client before importing ServiceClient
+init_tinker_client()
+from tinker import ServiceClient
+
+service_client = ServiceClient(base_url=base_url, api_key=api_key)
 training_client = service_client.create_lora_training_client(base_model=base_model[len('ms://'):], rank=16)

 # Training loop: use input_feature_to_datum to transfer the input format
```

ROADMAP.md

Lines changed: 2 additions & 0 deletions

```diff
@@ -65,6 +65,7 @@
 - [ ] Support DPO alignment training
 - [ ] Support colocate RL training
 - [ ] Support batched preprocessing
+- [ ] Support for multiple replicas and sticky routing

 ### Networking Capabilities

@@ -84,5 +85,6 @@
 - [ ] Support for DPO alignment training
 - [ ] Support for colocate RL training
 - [ ] Support for batched preprocessing
+- [ ] Support for multiple replicas and sticky routing

 ### Networking Capabilities
```

(The first hunk is the Chinese-language roadmap section, translated here; the second is its English counterpart.)

cookbook/client/tinker/lora.py

Lines changed: 17 additions & 12 deletions

```diff
@@ -13,20 +13,25 @@

 import os

-from twinkle_client import init_tinker_compat_client
+# Step 2: Initialize Tinker client before importing ServiceClient
+from twinkle_client import init_tinker_client

-# Step 2: Initialize the Tinker-compatible client to communicate with the server.
-# - base_url: the address of the running server
-# - api_key: authentication token (loaded from environment variable)
-service_client = init_tinker_compat_client(
-    base_url='http://www.modelscope.cn/twinkle', api_key=os.environ.get('MODELSCOPE_TOKEN'))
+init_tinker_client()

-# Step 3: List models available on the server to verify the connection
+# Step 3: Use ServiceClient directly from tinker
+from tinker import ServiceClient
+
+service_client = ServiceClient(
+    base_url='http://www.modelscope.cn/twinkle',
+    api_key=os.environ.get('MODELSCOPE_TOKEN')
+)
+
+# Step 4: List models available on the server to verify the connection
 print('Available models:')
 for item in service_client.get_server_capabilities().supported_models:
     print('- ' + item.model_name)

-# Step 4: Create a REST client for querying training runs and checkpoints.
+# Step 5: Create a REST client for querying training runs and checkpoints.
 # This is useful for inspecting previous training sessions or resuming training.
 rest_client = service_client.create_rest_client()
@@ -51,7 +56,7 @@
 # Uncomment the line below to resume from the last checkpoint:
 # resume_path = chpt.tinker_path

-# Step 5: Create or resume a training client.
+# Step 6: Create or resume a training client.
 # If resume_path is set, it restores both model weights and optimizer state.
 base_model = 'Qwen/Qwen2.5-7B-Instruct'
 if not resume_path:
@@ -60,7 +65,7 @@
     print('Resuming from ' + resume_path)
     training_client = service_client.create_training_client_from_state_with_optimizer(path=resume_path)

-# Step 6: Prepare training data manually
+# Step 7: Prepare training data manually
 #
 # This example teaches the model to translate English into Pig Latin.
 # Each example has an "input" (English phrase) and "output" (Pig Latin).
@@ -146,7 +151,7 @@ def process_example(example: dict, tokenizer) -> types.Datum:
                            datum0.loss_fn_inputs['weights'].tolist())):
     print(f'{repr(tokenizer.decode([inp])):<20} {repr(tokenizer.decode([tgt])):<20} {wgt:<10}')

-# Step 7: Run the training loop
+# Step 8: Run the training loop
 #
 # For each epoch, iterate over multiple batches:
 # - forward_backward: sends data to the server, computes loss & gradients
@@ -174,7 +179,7 @@ def process_example(example: dict, tokenizer) -> types.Datum:
     save_result = save_future.result()
     print(f'Saved checkpoint for epoch {epoch} to {save_result.path}')

-# Step 8: Publish the final checkpoint to ModelScope Hub.
+# Step 9: Publish the final checkpoint to ModelScope Hub.
 # NOTE: Requires a valid ModelScope token set as api_key when initializing the client.
 # The published model name will be: {run_id}_{checkpoint_name}
 rest_client.publish_checkpoint_from_tinker_path(save_result.path).result()
```
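The data-preparation step in this cookbook trains on English → Pig Latin pairs. For readers unfamiliar with the task, here is a minimal pair generator under one common Pig Latin convention (move the leading consonant cluster to the end and add "ay"; vowel-initial words get "way"). The cookbook's actual dataset may use different rules, so treat this as an illustrative sketch only:

```python
# Generate (input, output) training pairs for the Pig Latin task.
def to_pig_latin(word: str) -> str:
    """Translate a single lowercase word to Pig Latin (one common convention)."""
    vowels = "aeiou"
    if word[0] in vowels:
        return word + "way"          # vowel-initial: append "way"
    for i, ch in enumerate(word):
        if ch in vowels:
            # Move the leading consonant cluster to the end, add "ay".
            return word[i:] + word[:i] + "ay"
    return word + "ay"               # no vowels at all


phrases = ["hello world", "eat apples"]
examples = [
    {"input": p, "output": " ".join(to_pig_latin(w) for w in p.split())}
    for p in phrases
]
print(examples)
```

Each dict mirrors the "input"/"output" shape the script's `process_example` expects before tokenization.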

cookbook/client/tinker/megatron/server_config.yaml

Lines changed: 4 additions & 6 deletions

```diff
@@ -21,6 +21,8 @@ applications:
     route_prefix: /api/v1  # API endpoint prefix (Tinker-compatible)
     import_path: server  # Python module to import
     args:
+      server_config:
+        per_token_model_limit: 3  # Maximum number of models (adapters) per token (server-globally enforced)

     deployments:
       - name: TinkerCompatServer
@@ -33,7 +35,6 @@
         runtime_env:
           env_vars:
             TWINKLE_TRUST_REMOTE_CODE: "0"
-            DEVICE_COUNT_PER_PHYSICAL_NODE: "8"

   # 3. Sampler Service - Runs inference / sampling using vLLM engine
   # Used for generating text from the model (e.g., evaluating LoRA results).
@@ -52,7 +53,7 @@
       device_group:  # Logical device group for the sampler
         name: sampler
         gpus_per_worker: 1
-        ranks: [0,1,2,3]  # GPU rank indices to use
+        ranks: 4  # GPU rank indices to use
         device_type: cuda
       device_mesh:
         device_type: cuda
@@ -71,7 +72,6 @@
         runtime_env:
           env_vars:
             TWINKLE_TRUST_REMOTE_CODE: "0"
-            DEVICE_COUNT_PER_PHYSICAL_NODE: "8"

   # 2. Model Service (commented out) - Would host the base model for training.
   # Uncomment and configure if you need a training model worker.
@@ -86,7 +86,7 @@
       nproc_per_node: 4  # Number of GPU processes per node
       device_group:
         name: model
-        ranks: [4,5,6,7]  # GPU rank indices
+        ranks: 4  # GPU rank indices
         device_type: cuda
       device_mesh:
         device_type: cuda
@@ -97,7 +97,6 @@
       rps_limit: 20  # Max requests per second
       tps_limit: 16000  # Max tokens per second
      adapter_config:
-        per_token_adapter_limit: 3  # Max concurrent LoRA adapters
        adapter_timeout: 30  # Seconds before idle adapter unload
        adapter_max_lifetime: 36000  # Maximum lifetime of an adapter in seconds (e.g., 10 hours)
     deployments:
@@ -111,4 +110,3 @@
       runtime_env:
         env_vars:
           TWINKLE_TRUST_REMOTE_CODE: "0"
-          DEVICE_COUNT_PER_PHYSICAL_NODE: "8"
```
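Two changes recur across these hunks: the per-token adapter cap moves from the sampler's `adapter_config` (`per_token_adapter_limit`) up to the server app's `args.server_config` (`per_token_model_limit`), and `ranks` now takes a rank count rather than an explicit index list. A consolidated sketch of the resulting shape, keeping only the fields this commit touches (the `name` values and surrounding structure are assumptions, not the full file):

```yaml
applications:
  - name: tinker-compat-server      # assumed name; see server_config.yaml for the full entry
    route_prefix: /api/v1
    import_path: server
    args:
      server_config:
        per_token_model_limit: 3    # adapter cap, now enforced server-wide per token

  - name: sampler                   # assumed name
    args:
      device_group:
        name: sampler
        gpus_per_worker: 1
        ranks: 4                    # number of GPU ranks (previously a list, e.g. [0,1,2,3])
        device_type: cuda
```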

0 commit comments