Skip to content

Commit 58cca7c

Browse files
authored
Fix compat tinker and update doc (#73)
* update * update * update doc * update doc * update
1 parent 317f9b8 commit 58cca7c

File tree

20 files changed

+271
-189
lines changed

20 files changed

+271
-189
lines changed

README.md

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ if __name__ == '__main__':
203203
import os
204204
from tqdm import tqdm
205205
from tinker import types
206-
from twinkle_client import init_tinker_compat_client
206+
from twinkle_client import init_tinker_client
207207
from twinkle.dataloader import DataLoader
208208
from twinkle.dataset import Dataset, DatasetMeta
209209
from twinkle.preprocessor import SelfCognitionProcessor
@@ -220,8 +220,11 @@ dataset.map(SelfCognitionProcessor('twinkle Model', 'twinkle Team'), load_from_c
220220
dataset.encode(batched=True, load_from_cache_file=False)
221221
dataloader = DataLoader(dataset=dataset, batch_size=8)
222222

223-
# Initialize tinker client
224-
service_client = init_tinker_compat_client(base_url, api_key)
223+
# Initialize Tinker client before importing ServiceClient
224+
init_tinker_client()
225+
from tinker import ServiceClient
226+
227+
service_client = ServiceClient(base_url=base_url, api_key=api_key)
225228
training_client = service_client.create_lora_training_client(base_model=base_model[len('ms://'):], rank=16)
226229

227230
# Training loop: use input_feature_to_datum to transfer the input format

README_ZH.md

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ if __name__ == '__main__':
186186
import os
187187
from tqdm import tqdm
188188
from tinker import types
189-
from twinkle_client import init_tinker_compat_client
189+
from twinkle_client import init_tinker_client
190190
from twinkle.dataloader import DataLoader
191191
from twinkle.dataset import Dataset, DatasetMeta
192192
from twinkle.preprocessor import SelfCognitionProcessor
@@ -203,8 +203,11 @@ dataset.map(SelfCognitionProcessor('twinkle Model', 'twinkle Team'), load_from_c
203203
dataset.encode(batched=True, load_from_cache_file=False)
204204
dataloader = DataLoader(dataset=dataset, batch_size=8)
205205

206-
# Initialize tinker client
207-
service_client = init_tinker_compat_client(base_url, api_key)
206+
# Initialize Tinker client before importing ServiceClient
207+
init_tinker_client()
208+
from tinker import ServiceClient
209+
210+
service_client = ServiceClient(base_url=base_url, api_key=api_key)
208211
training_client = service_client.create_lora_training_client(base_model=base_model[len('ms://'):], rank=16)
209212

210213
# Training loop: use input_feature_to_datum to transfer the input format

cookbook/client/tinker/lora.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,20 +13,25 @@
1313

1414
import os
1515

16-
from twinkle_client import init_tinker_compat_client
16+
# Step 2: Initialize Tinker client before importing ServiceClient
17+
from twinkle_client import init_tinker_client
1718

18-
# Step 2: Initialize the Tinker-compatible client to communicate with the server.
19-
# - base_url: the address of the running server
20-
# - api_key: authentication token (loaded from environment variable)
21-
service_client = init_tinker_compat_client(
22-
base_url='http://www.modelscope.cn/twinkle', api_key=os.environ.get('MODELSCOPE_TOKEN'))
19+
init_tinker_client()
2320

24-
# Step 3: List models available on the server to verify the connection
21+
# Step 3: Use ServiceClient directly from tinker
22+
from tinker import ServiceClient
23+
24+
service_client = ServiceClient(
25+
base_url='http://www.modelscope.cn/twinkle',
26+
api_key=os.environ.get('MODELSCOPE_TOKEN')
27+
)
28+
29+
# Step 4: List models available on the server to verify the connection
2530
print('Available models:')
2631
for item in service_client.get_server_capabilities().supported_models:
2732
print('- ' + item.model_name)
2833

29-
# Step 4: Create a REST client for querying training runs and checkpoints.
34+
# Step 5: Create a REST client for querying training runs and checkpoints.
3035
# This is useful for inspecting previous training sessions or resuming training.
3136
rest_client = service_client.create_rest_client()
3237

@@ -51,7 +56,7 @@
5156
# Uncomment the line below to resume from the last checkpoint:
5257
# resume_path = chpt.tinker_path
5358

54-
# Step 5: Create or resume a training client.
59+
# Step 6: Create or resume a training client.
5560
# If resume_path is set, it restores both model weights and optimizer state.
5661
base_model = 'Qwen/Qwen2.5-7B-Instruct'
5762
if not resume_path:
@@ -60,7 +65,7 @@
6065
print('Resuming from ' + resume_path)
6166
training_client = service_client.create_training_client_from_state_with_optimizer(path=resume_path)
6267

63-
# Step 6: Prepare training data manually
68+
# Step 7: Prepare training data manually
6469
#
6570
# This example teaches the model to translate English into Pig Latin.
6671
# Each example has an "input" (English phrase) and "output" (Pig Latin).
@@ -146,7 +151,7 @@ def process_example(example: dict, tokenizer) -> types.Datum:
146151
datum0.loss_fn_inputs['weights'].tolist())):
147152
print(f'{repr(tokenizer.decode([inp])):<20} {repr(tokenizer.decode([tgt])):<20} {wgt:<10}')
148153

149-
# Step 7: Run the training loop
154+
# Step 8: Run the training loop
150155
#
151156
# For each epoch, iterate over multiple batches:
152157
# - forward_backward: sends data to the server, computes loss & gradients
@@ -174,7 +179,7 @@ def process_example(example: dict, tokenizer) -> types.Datum:
174179
save_result = save_future.result()
175180
print(f'Saved checkpoint for epoch {epoch} to {save_result.path}')
176181

177-
# Step 8: Publish the final checkpoint to ModelScope Hub.
182+
# Step 9: Publish the final checkpoint to ModelScope Hub.
178183
# NOTE: Requires a valid ModelScope token set as api_key when initializing the client.
179184
# The published model name will be: {run_id}_{checkpoint_name}
180185
rest_client.publish_checkpoint_from_tinker_path(save_result.path).result()

cookbook/client/tinker/megatron/server_config_7b.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ applications:
2121
route_prefix: /api/v1 # API endpoint prefix (Tinker-compatible)
2222
import_path: server # Python module to import
2323
args:
24-
24+
supported_models:
25+
- Qwen/Qwen2.5-7B-Instruct
2526
deployments:
2627
- name: TinkerCompatServer
2728
autoscaling_config:

cookbook/client/tinker/sample.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,27 +4,34 @@
44
# for text generation (sampling) via the Tinker-compatible client API.
55
# The server must be running first (see server.py and server_config.yaml).
66

7+
import os
78
from tinker import types
89

910
from twinkle.data_format import Message, Trajectory
1011
from twinkle.template import Template
11-
from twinkle_client import init_tinker_compat_client
12+
from twinkle_client import init_tinker_client
1213

13-
# Step 1: Define the base model and connect to the server
14+
# Step 1: Initialize Tinker client
15+
init_tinker_client()
16+
17+
from tinker import ServiceClient
18+
19+
# Step 2: Define the base model and connect to the server
1420
base_model = 'Qwen/Qwen3-30B-A3B-Instruct-2507'
15-
service_client = init_tinker_compat_client(
21+
service_client = ServiceClient(
1622
base_url='http://www.modelscope.cn/twinkle',
1723
api_key=os.environ.get('MODELSCOPE_TOKEN')
1824
)
19-
# Step 2: Create a sampling client by loading weights from a saved checkpoint.
25+
26+
# Step 3: Create a sampling client by loading weights from a saved checkpoint.
2027
# The model_path is a twinkle:// URI pointing to a previously saved LoRA checkpoint.
2128
# The server will load the base model and apply the LoRA adapter weights.
22-
service_client.create_sampling_client(
29+
sampling_client = service_client.create_sampling_client(
2330
model_path='twinkle://xxx-Qwen_Qwen3-30B-A3B-Instruct-2507-xxx/weights/twinkle-lora-1',
2431
base_model=base_model
2532
)
2633

27-
# Step 3: Load the tokenizer locally to encode the prompt and decode the results
34+
# Step 4: Load the tokenizer locally to encode the prompt and decode the results
2835
print(f'Using model {base_model}')
2936

3037
template = Template(model_id=f'ms://{base_model}')
@@ -40,21 +47,21 @@
4047

4148
input_ids = input_feature['input_ids'].tolist()
4249

43-
# Step 4: Prepare the prompt and sampling parameters
50+
# Step 5: Prepare the prompt and sampling parameters
4451
prompt = types.ModelInput.from_ints(input_ids)
4552
params = types.SamplingParams(
4653
max_tokens=128, # Maximum number of tokens to generate
4754
temperature=0.7,
4855
stop=['\n'] # Stop generation when a newline character is produced
4956
)
5057

51-
# Step 5: Send the sampling request to the server.
58+
# Step 6: Send the sampling request to the server.
5259
# num_samples=8 generates 8 independent completions for the same prompt.
5360
print('Sampling...')
5461
future = sampling_client.sample(prompt=prompt, sampling_params=params, num_samples=1)
5562
result = future.result()
5663

57-
# Step 6: Decode and print the generated responses
64+
# Step 7: Decode and print the generated responses
5865
print('Responses:')
5966
for i, seq in enumerate(result.sequences):
6067
print(f'{i}: {repr(template.decode(seq.tokens))}')

cookbook/client/tinker/self_congnition.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,23 @@
1010
import os
1111
from tqdm import tqdm
1212
from tinker import types
13-
from twinkle_client import init_tinker_compat_client
13+
from twinkle_client import init_tinker_client
1414
from twinkle.data_format import Message, Trajectory
1515
from twinkle.template import Template
1616
from twinkle.dataloader import DataLoader
1717
from twinkle.dataset import Dataset, DatasetMeta
1818
from twinkle.preprocessor import SelfCognitionProcessor
1919
from twinkle.server.tinker.common import input_feature_to_datum
2020

21+
# Initialize the Tinker client before importing ServiceClient
22+
init_tinker_client()
23+
24+
from tinker import ServiceClient
25+
2126
# The base model to fine-tune / evaluate
22-
base_model = 'Qwen/Qwen3-30B-A3B-Instruct-2507'
27+
# base_model = 'Qwen/Qwen3-30B-A3B-Instruct-2507'
28+
base_model = 'Qwen/Qwen2.5-7B-Instruct'
29+
base_url = 'http://localhost:8000'
2330

2431

2532
def train():
@@ -42,9 +49,11 @@ def train():
4249

4350
# Step 2: Initialize the training client
4451

45-
# Connect to the Twinkle server running locally
46-
service_client = init_tinker_compat_client(
47-
base_url='localhost:9000', api_key=os.environ.get('MODELSCOPE_TOKEN'))
52+
53+
service_client = ServiceClient(
54+
base_url=base_url,
55+
api_key=os.environ.get('MODELSCOPE_TOKEN')
56+
)
4857

4958
# Create a LoRA training client for the base model (rank=16 for the LoRA adapter)
5059
training_client = service_client.create_lora_training_client(base_model=base_model, rank=16)
@@ -85,8 +94,7 @@ def eval():
8594
# Path to a previously saved LoRA checkpoint (twinkle:// URI)
8695
weight_path = 'twinkle://20260212_174205-Qwen_Qwen2_5-7B-Instruct-51edc9ed/weights/twinkle-lora-2'
8796

88-
# Connect to the server and create a sampling client with the trained weights
89-
service_client = init_tinker_compat_client(base_url='http://localhost:9000')
97+
service_client = ServiceClient(base_url=base_url, api_key=os.environ.get('MODELSCOPE_TOKEN'))
9098
sampling_client = service_client.create_sampling_client(model_path=weight_path, base_model=base_model)
9199

92100
# Step 2: Prepare the chat prompt

cookbook/client/tinker/short_math_grpo.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from tinker import types
2525
from typing import List, Tuple
2626

27-
from twinkle_client import init_tinker_compat_client
27+
from twinkle_client import init_tinker_client
2828
from twinkle import get_logger
2929
from twinkle.advantage import GRPOAdvantage
3030
from twinkle.data_format import Message, Trajectory
@@ -206,8 +206,13 @@ def main():
206206

207207
# Step 2: Initialize the Tinker-compatible client
208208
logger.info('Connecting to Tinker server...')
209-
service_client = init_tinker_compat_client(
210-
base_url='http://www.modelscope.cn/twinkle', api_key=os.environ.get('MODELSCOPE_TOKEN'))
209+
init_tinker_client()
210+
211+
from tinker import ServiceClient
212+
service_client = ServiceClient(
213+
base_url='http://www.modelscope.cn/twinkle',
214+
api_key=os.environ.get('MODELSCOPE_TOKEN')
215+
)
211216

212217
logger.info('Creating LoRA training client...')
213218
# Create a LoRA training client for GRPO

docs/source_en/Usage Guide/Server and Client/Overview.md

Lines changed: 23 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Server and Client
1+
# Overview
22

33
Twinkle provides a complete HTTP Server/Client architecture that supports deploying models as services and remotely calling them through clients to complete training, inference, and other tasks. This architecture decouples **model hosting (Server side)** and **training logic (Client side)**, allowing multiple users to share the same base model for training.
44

@@ -14,7 +14,7 @@ Twinkle Server supports two protocol modes:
1414
| Mode | server_type | Description |
1515
|------|------------|------|
1616
| **Twinkle Server** | `twinkle` | Native Twinkle protocol, used with `twinkle_client`, simpler API |
17-
| **Tinker Compatible Server** | `tinker` | Compatible with Tinker protocol, used with `init_tinker_compat_client`, can reuse existing Tinker training code |
17+
| **Tinker Compatible Server** | `tinker` | Compatible with Tinker protocol, can reuse existing Tinker training code |
1818

1919
### Two Model Backends
2020

@@ -30,7 +30,7 @@ Regardless of Server mode, model loading supports two backends:
3030
| Client | Initialization Method | Description |
3131
|--------|---------|------|
3232
| **Twinkle Client** | `init_twinkle_client` | Native client, simply change `from twinkle import` to `from twinkle_client import` to migrate local training code to remote calls |
33-
| **Tinker Compatible Client** | `init_tinker_compat_client` | Patches Tinker SDK, allowing existing Tinker training code to be directly reused |
33+
| **Tinker Client** | `init_tinker_client` | Patches Tinker SDK, allowing existing Tinker training code to be directly reused |
3434

3535
## How to Choose
3636

@@ -47,7 +47,7 @@ Regardless of Server mode, model loading supports two backends:
4747
| Scenario | Recommendation |
4848
|------|------|
4949
| Existing Twinkle local training code, want to switch to remote | Twinkle Client — only need to change import paths |
50-
| Existing Tinker training code, want to reuse | Tinker Compatible Client — only need to initialize patch |
50+
| Existing Tinker training code, want to reuse | Tinker Client — only need to initialize patch |
5151
| New project | Twinkle Client — simpler API |
5252

5353
### Model Backend Selection
@@ -65,33 +65,35 @@ Complete runnable examples are located in the `cookbook/client/` directory:
6565
```
6666
cookbook/client/
6767
├── twinkle/ # Twinkle native protocol examples
68-
│ ├── transformer/ # Transformers backend
68+
│ ├── transformer/ # Transformers backend server config
6969
│ │ ├── server.py # Startup script
70-
│ │ ├── server_config.yaml # Configuration file
71-
│ │ └── lora.py # LoRA training client
72-
│ └── megatron/ # Megatron backend
73-
│ ├── server.py
74-
│ ├── server_config.yaml
75-
│ └── lora.py
70+
│ │ └── server_config.yaml # Configuration file
71+
│ ├── megatron/ # Megatron backend server config
72+
│ │ ├── server.py
73+
│ │ └── server_config.yaml
74+
│ ├── grpo.py # GRPO training client
75+
│ ├── sample.py # Inference sampling client
76+
│ └── self_congnition.py # Self-cognition training client
7677
└── tinker/ # Tinker compatible protocol examples
77-
├── transformer/ # Transformers backend
78+
├── transformer/ # Transformers backend server config
79+
│ ├── server.py
80+
│ └── server_config.yaml
81+
├── megatron/ # Megatron backend server config
7882
│ ├── server.py
7983
│ ├── server_config.yaml
80-
│ ├── lora.py # LoRA training
81-
│ ├── sample.py # Inference sampling
82-
│ └── self_congnition.py # Self-cognition training+evaluation
83-
└── megatron/ # Megatron backend
84-
├── server.py
85-
├── server_config.yaml
86-
└── lora.py
84+
│ └── server_config_7b.yaml
85+
├── lora.py # LoRA training client
86+
├── sample.py # Inference sampling client
87+
├── self_congnition.py # Self-cognition training+evaluation
88+
└── short_math_grpo.py # GRPO math training client
8789
```
8890

8991
Running steps:
9092

9193
```bash
9294
# 1. Start Server first
93-
python cookbook/client/twinkle/transformer/server.py
95+
python cookbook/client/tinker/transformer/server.py
9496

9597
# 2. Run Client in another terminal
96-
python cookbook/client/twinkle/transformer/lora.py
98+
python cookbook/client/tinker/lora.py
9799
```

docs/source_en/Usage Guide/Server and Client/Server.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,6 @@ CLI supported parameters:
210210
| `-c, --config` | YAML configuration file path (required) | — |
211211
| `-t, --server-type` | Server mode: `twinkle` or `tinker` | `twinkle` |
212212
| `--namespace` | Ray namespace | tinker mode defaults to `twinkle_cluster` |
213-
| `--no-wait` | Do not block and wait (daemon mode) | `False` |
214213
| `--log-level` | Log level | `INFO` |
215214

216215
## YAML Configuration Details

0 commit comments

Comments (0)