
Commit 8a2e681 (parent: 66df30b)

fix processor

9 files changed: +78 −87 lines

cookbook/client/server/megatron/server_config.yaml (1 addition, 0 deletions)

@@ -23,6 +23,7 @@ applications:
 
   deployments:
     - name: TinkerCompatServer
+      max_ongoing_requests: 50
       autoscaling_config:
         min_replicas: 1 # Minimum number of replicas
         max_replicas: 1 # Maximum number of replicas
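The new max_ongoing_requests key caps how many requests each replica handles concurrently. For context, a minimal sketch of the same cap expressed through Ray Serve's Python deployment API, assuming a Ray version where max_ongoing_requests is the option name; the handler body is illustrative, not the twinkle server:

from ray import serve

@serve.deployment(
    max_ongoing_requests=50,  # cap on concurrent requests per replica
    autoscaling_config={'min_replicas': 1, 'max_replicas': 1},
)
class TinkerCompatServer:
    async def __call__(self, request) -> str:
        return 'ok'  # illustrative handler; the real server does far more

app = TinkerCompatServer.bind()
# serve.run(app)  # would deploy the app onto a running Ray cluster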

cookbook/client/server/megatron/server_config_4b.yaml (34 additions, 33 deletions)

@@ -24,6 +24,7 @@ applications:
     - Qwen/Qwen3.5-4B
   deployments:
     - name: TinkerCompatServer
+      max_ongoing_requests: 50
       autoscaling_config:
         min_replicas: 1 # Minimum number of replicas
         max_replicas: 1 # Maximum number of replicas
@@ -70,39 +71,39 @@ applications:
 
   # 3. Sampler Service - Runs inference / sampling using vLLM engine
   # Used for generating text from the model (e.g., evaluating LoRA results).
-  # - name: sampler-Qwen3.5-4B
-  #   route_prefix: /api/v1/sampler/Qwen/Qwen3.5-4B
-  #   import_path: sampler
-  #   args:
-  #     model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier
-  #     nproc_per_node: 2 # Number of GPU processes per node
-  #     sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
-  #     engine_args: # vLLM engine-specific settings
-  #       max_model_len: 4096 # Maximum sequence length the engine supports
-  #       gpu_memory_utilization: 0.5 # Fraction of GPU memory to use (0.0-1.0)
-  #       enable_lora: true # Allow loading LoRA adapters during inference
-  #       logprobs_mode: processed_logprobs # Logprobs mode for sampling results
-  #     device_group: # Logical device group for the sampler
-  #       name: sampler
-  #       ranks: 1 # Number of GPUs to use
-  #       device_type: cuda
-  #     device_mesh:
-  #       device_type: cuda
-  #       dp_size: 1
-  #     queue_config:
-  #       rps_limit: 100 # Max requests per second
-  #       tps_limit: 100000 # Max tokens per second
-  #   deployments:
-  #     - name: SamplerManagement
-  #       autoscaling_config:
-  #         min_replicas: 1
-  #         max_replicas: 1
-  #         target_ongoing_requests: 16
-  #       ray_actor_options:
-  #         num_cpus: 0.1
-  #       runtime_env:
-  #         env_vars:
-  #           TWINKLE_TRUST_REMOTE_CODE: "0"
+  - name: sampler-Qwen3.5-4B
+    route_prefix: /api/v1/sampler/Qwen/Qwen3.5-4B
+    import_path: sampler
+    args:
+      model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier
+      nproc_per_node: 2 # Number of GPU processes per node
+      sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
+      engine_args: # vLLM engine-specific settings
+        max_model_len: 4096 # Maximum sequence length the engine supports
+        gpu_memory_utilization: 0.5 # Fraction of GPU memory to use (0.0-1.0)
+        enable_lora: true # Allow loading LoRA adapters during inference
+        logprobs_mode: processed_logprobs # Logprobs mode for sampling results
+      device_group: # Logical device group for the sampler
+        name: sampler
+        ranks: 1 # Number of GPUs to use
+        device_type: cuda
+      device_mesh:
+        device_type: cuda
+        dp_size: 1
+      queue_config:
+        rps_limit: 100 # Max requests per second
+        tps_limit: 100000 # Max tokens per second
+    deployments:
+      - name: SamplerManagement
+        autoscaling_config:
+          min_replicas: 1
+          max_replicas: 1
+          target_ongoing_requests: 16
+        ray_actor_options:
+          num_cpus: 0.1
+        runtime_env:
+          env_vars:
+            TWINKLE_TRUST_REMOTE_CODE: "0"
 
   # 4. Processor Service
   - name: processor
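The newly enabled sampler's engine_args map onto vLLM's own engine options. A minimal standalone sketch with plain vLLM, not the twinkle sampler wrapper; the Hugging Face-style model path standing in for the ms:// id is an assumption:

from vllm import LLM, SamplingParams

llm = LLM(
    model='Qwen/Qwen3.5-4B',      # assumed local resolution of ms://Qwen/Qwen3.5-4B
    max_model_len=4096,           # maximum sequence length the engine supports
    gpu_memory_utilization=0.5,   # fraction of GPU memory to use (0.0-1.0)
    enable_lora=True,             # allow loading LoRA adapters during inference
)
outputs = llm.generate(['Hello!'], SamplingParams(max_tokens=64))
print(outputs[0].outputs[0].text)

The queue_config limits (rps_limit, tps_limit) and the deployments block appear to be enforced by the serving layer, not by the vLLM engine itself.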

cookbook/client/server/transformer/server_config.yaml (33 additions, 33 deletions)

@@ -68,39 +68,39 @@ applications:
 
   # 3. Sampler Service - Runs inference / sampling using vLLM engine
   # Used for generating text from the model (e.g., evaluating LoRA results).
-  # - name: sampler-Qwen3.5-4B
-  #   route_prefix: /api/v1/sampler/Qwen/Qwen3.5-4B
-  #   import_path: sampler
-  #   args:
-  #     model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier
-  #     nproc_per_node: 2 # Number of GPU processes per node
-  #     sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
-  #     engine_args: # vLLM engine-specific settings
-  #       max_model_len: 4096 # Maximum sequence length the engine supports
-  #       gpu_memory_utilization: 0.5 # Fraction of GPU memory to use (0.0-1.0)
-  #       enable_lora: true # Allow loading LoRA adapters during inference
-  #       logprobs_mode: processed_logprobs # Logprobs mode for sampling results
-  #     device_group: # Logical device group for the sampler
-  #       name: sampler
-  #       ranks: 1 # Number of GPUs to use
-  #       device_type: cuda
-  #     device_mesh:
-  #       device_type: cuda
-  #       dp_size: 1
-  #     queue_config:
-  #       rps_limit: 100 # Max requests per second
-  #       tps_limit: 100000 # Max tokens per second
-  #   deployments:
-  #     - name: SamplerManagement
-  #       autoscaling_config:
-  #         min_replicas: 1
-  #         max_replicas: 1
-  #         target_ongoing_requests: 16
-  #       ray_actor_options:
-  #         num_cpus: 0.1
-  #       runtime_env:
-  #         env_vars:
-  #           TWINKLE_TRUST_REMOTE_CODE: "0"
+  - name: sampler-Qwen3.5-4B
+    route_prefix: /api/v1/sampler/Qwen/Qwen3.5-4B
+    import_path: sampler
+    args:
+      model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier
+      nproc_per_node: 2 # Number of GPU processes per node
+      sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
+      engine_args: # vLLM engine-specific settings
+        max_model_len: 4096 # Maximum sequence length the engine supports
+        gpu_memory_utilization: 0.5 # Fraction of GPU memory to use (0.0-1.0)
+        enable_lora: true # Allow loading LoRA adapters during inference
+        logprobs_mode: processed_logprobs # Logprobs mode for sampling results
+      device_group: # Logical device group for the sampler
+        name: sampler
+        ranks: 1 # Number of GPUs to use
+        device_type: cuda
+      device_mesh:
+        device_type: cuda
+        dp_size: 1
+      queue_config:
+        rps_limit: 100 # Max requests per second
+        tps_limit: 100000 # Max tokens per second
+    deployments:
+      - name: SamplerManagement
+        autoscaling_config:
+          min_replicas: 1
+          max_replicas: 1
+          target_ongoing_requests: 16
+        ray_actor_options:
+          num_cpus: 0.1
+        runtime_env:
+          env_vars:
+            TWINKLE_TRUST_REMOTE_CODE: "0"
 
   # 4. Processor Service
   - name: processor

cookbook/client/tinker/self_host/sample.py (1 addition, 1 deletion)

@@ -17,7 +17,7 @@
 from tinker import ServiceClient
 
 # Step 2: Define the base model and connect to the server
-base_model = 'Qwen/Qwen3-4B'
+base_model = 'Qwen/Qwen3.5-4B'
 service_client = ServiceClient(
     base_url='http://localhost:8000',
     api_key='EMPTY-TOKEN'

cookbook/client/tinker/self_host/self_cognition.py (2 additions, 2 deletions)

@@ -133,5 +133,5 @@ def eval():
 
 
 if __name__ == '__main__':
-    # train() # Uncomment to run training
-    eval() # Run evaluation / inference
+    train() # Uncomment to run training
+    # eval() # Run evaluation / inference

cookbook/client/twinkle/self_host/self_congnition.py (3 additions, 3 deletions)

@@ -50,7 +50,7 @@ def train():
     dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500)))
 
     # Apply a chat template so the data matches the model's expected input format
-    dataset.set_template('Template', model_id='ms://Qwen/Qwen3-4B', max_length=512)
+    dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512)
 
     # Replace placeholder names in the dataset with custom model/author names
     dataset.map('SelfCognitionProcessor', init_args={'model_name': 'twinkle模型', 'model_author': 'ModelScope社区'})
@@ -64,7 +64,7 @@ def train():
     # Step 5: Configure the model
 
     # Create a multi-LoRA Transformers model pointing to the base model on ModelScope
-    model = MultiLoraTransformersModel(model_id='ms://Qwen/Qwen3-4B')
+    model = MultiLoraTransformersModel(model_id='ms://Qwen/Qwen3.5-4B')
 
     # Define LoRA configuration: apply low-rank adapters to all linear layers
     lora_config = LoraConfig(target_modules='all-linear')
@@ -87,7 +87,7 @@ def train():
     model.set_optimizer('Adam', lr=1e-4)
 
     # Use a linear learning rate scheduler (Do not support LR scheduler if server use megatron)
-    model.set_lr_scheduler('LinearLR')
+    # model.set_lr_scheduler('LinearLR')
 
     # Step 6: Optionally resume from a previous checkpoint
     if resume_path:

src/twinkle/server/model/app.py (1 addition, 0 deletions)

@@ -61,6 +61,7 @@ def __init__(self,
         # Choose model backend
         if use_megatron:
             from ..model.backends.megatron_model import TwinkleCompatMegatronModel
+
             self.model = TwinkleCompatMegatronModel(
                 model_id=model_id,
                 device_mesh=self.device_mesh,

src/twinkle/server/model/backends/megatron_model.py (2 additions, 15 deletions)

@@ -1,32 +1,19 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 """
 Megatron backend model for the unified model deployment.
-Moved from tinker/common/megatron_model.py — imports updated.
 """
 import torch
 from tinker import types
 from typing import TYPE_CHECKING, Any, List, Optional, Tuple
 
 from twinkle import remote_class, remote_function
+from twinkle.model.megatron import MultiLoraMegatronModel
 from twinkle.server.common.datum import datum_to_input_feature, extract_rl_feature
 from twinkle.server.model.backends.common import TwinkleCompatModelBase, clean_metrics, collect_forward_backward_results
-from twinkle.utils import exists, requires
-
-if TYPE_CHECKING:
-    from twinkle.model.megatron import MultiLoraMegatronModel as _MegatronBase
-elif exists('megatron_core'):
-    import twinkle.model.megatron as megatron_module
-    _MegatronBase = megatron_module.MultiLoraMegatronModel
-else:
-
-    class _MegatronBase:
-
-        def __init__(self, *args, **kwargs):
-            requires('megatron_core')
 
 
 @remote_class(execute='all')
-class TwinkleCompatMegatronModel(_MegatronBase, TwinkleCompatModelBase):
+class TwinkleCompatMegatronModel(MultiLoraMegatronModel, TwinkleCompatModelBase):
     """Compatibility wrapper around MultiLoraMegatronModel for Twinkle/Tinker.
 
     Moved from tinker/common/megatron_model.py — logic unchanged.
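The deleted block was a lazy-import guard for the optional megatron_core dependency: type checkers always saw the real base class, and at runtime a stub raised an error only when the package was missing. A minimal sketch of that general pattern, with hypothetical names (heavy_pkg and the exists helper stand in for megatron_core and the removed twinkle.utils helpers):

import importlib.util
from typing import TYPE_CHECKING

def exists(name: str) -> bool:
    # True when the optional dependency is importable (hypothetical helper).
    return importlib.util.find_spec(name) is not None

if TYPE_CHECKING:
    from heavy_pkg import RealBase as _Base  # static analysis sees the real class
elif exists('heavy_pkg'):
    from heavy_pkg import RealBase as _Base  # runtime import only when installed
else:
    class _Base:  # fallback stub; raises only when actually instantiated
        def __init__(self, *args, **kwargs):
            raise ImportError('heavy_pkg is required to use this backend')

With the commit's direct import of MultiLoraMegatronModel, importing the module now fails eagerly when megatron support is unavailable.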

src/twinkle/server/model/tinker_handlers.py (1 addition, 0 deletions)

@@ -40,6 +40,7 @@ async def create_model(
         async def _create_adapter():
             _model_id = None
             try:
+
                 _model_id = self.state.register_model(body.model_dump(), token=token, replica_id=self.replica_id)
                 if body.lora_config:
                     lora_cfg = LoraConfig(r=body.lora_config.rank, target_modules='all-linear')
