Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cookbook/megatron/tp_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
from twinkle.model import MegatronModel
from twinkle.preprocessor import SelfCognitionProcessor

# Construct a device_mesh, tp=pp=cp=ep=2, dp=1
device_mesh = DeviceMesh.from_sizes(dp_size=1, tp_size=2, pp_size=2, cp_size=2, ep_size=2)
# Construct a device_mesh, tp=pp=ep=dp=2
device_mesh = DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2, ep_size=2, sequence_parallel=True)
# use torchrun mode
twinkle.initialize(mode='local', global_device_mesh=device_mesh)

Expand Down
2 changes: 1 addition & 1 deletion cookbook/mm/fsdp2.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def train():
# Print metric
metric = model.calculate_metric(is_training=True)
logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}')
if step > 0 and step % 40 == 0:
if step > 0 and step % 200 == 0:
metrics = eval(model)
logger.info(f'Eval metric: {metrics}')
metrics['step'] = step
Expand Down
1 change: 0 additions & 1 deletion cookbook/ray/run.sh

This file was deleted.

91 changes: 0 additions & 91 deletions cookbook/ray/single_controller.py

This file was deleted.

4 changes: 2 additions & 2 deletions cookbook/rl/gkd_off_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@
STUDENT_MODEL_ID = os.environ.get('STUDENT_MODEL_ID', 'ms://Qwen/Qwen3-0.6B')
TEACHER_MODEL_ID = os.environ.get('TEACHER_MODEL_ID', 'ms://Qwen/Qwen3-8B')

MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 8))
SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 8))
MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 4))
NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS

BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 16))
Expand Down
2 changes: 1 addition & 1 deletion cookbook/rl/gkd_on_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@
TEACHER_MODEL_ID = os.environ.get('TEACHER_MODEL_ID', 'ms://Qwen/Qwen3-8B')

MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 4))
SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 2))
NUM_GPUS = MODEL_GPUS + 2*SAMPLER_GPUS

MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 2048))
Expand Down
2 changes: 1 addition & 1 deletion cookbook/transformers/fsdp2.sh
Original file line number Diff line number Diff line change
@@ -1 +1 @@
CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 fsdp2.py
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 fsdp2.py
6 changes: 3 additions & 3 deletions cookbook/transformers/fsdp2_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
def eval(model):
# 100 Samples
dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100)))
dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
dataset.set_template('Template', model_id='ms://Qwen/Qwen3-30B-A3B-Instruct-2507')
dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
dataset.encode()
dataloader = DataLoader(dataset=dataset, batch_size=4)
Expand All @@ -35,15 +35,15 @@ def train():
# 1000 samples
dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000)))
# Set template to prepare encoding
dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
dataset.set_template('Template', model_id='ms://Qwen/Qwen3-30B-A3B-Instruct-2507')
# Preprocess the dataset to standard format
dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
# Encode dataset
dataset.encode()
# Global batch size = 8, for 8 GPUs, so 1 sample per GPU
dataloader = DataLoader(dataset=dataset, batch_size=8)
# Use a TransformersModel with transformer_cls_names_to_wrap=Qwen3MoeSparseMoeBlock to avoid an FSDP2 hang
model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B', fsdp_config={'transformer_cls_names_to_wrap':['Qwen3MoeSparseMoeBlock']})
model = TransformersModel(model_id='ms://Qwen/Qwen3-30B-A3B-Instruct-2507', fsdp_config={'transformer_cls_names_to_wrap':['Qwen3MoeSparseMoeBlock']})
# Patch the MoE model to fix the hang bug; supports transformers==4.*
model.apply_patch('ms://twinkle-kit/qwen3_moe_transformers4_patch')
lora_config = LoraConfig(
Expand Down
16 changes: 10 additions & 6 deletions src/twinkle/model/transformers/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,8 +476,12 @@ def calculate_loss(self, **kwargs):
optimizer_config = self.optimizer_group[adapter_name]
loss_instance: Loss = optimizer_config.loss_instance
assert isinstance(loss_instance, Loss), 'Set a loss_instance before calculating loss'
inputs = optimizer_config.train_status.inputs
outputs = optimizer_config.train_status.outputs
if self.model.training:
status = optimizer_config.train_status
else:
status = optimizer_config.eval_status
Comment on lines +479 to +482
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The introduction of the status variable to handle both training and evaluation metrics is a good improvement. However, the implementation appears incomplete as the accumulation of num_tokens (at line 506) still hardcodes optimizer_config.train_status.num_tokens. This inconsistency will cause evaluation tokens to be incorrectly added to training metrics when the model is in evaluation mode. Please ensure that all metric updates in this function use the status variable.

inputs = status.inputs
outputs = status.outputs
assert inputs is not None and outputs is not None, 'Cannot calculate loss of empty inputs and outputs'
result = loss_instance(inputs, outputs, **kwargs)
loss_value = result['loss']
Expand All @@ -499,15 +503,15 @@ def calculate_loss(self, **kwargs):
# = global_per_token_grad / dp_world_size = avg_per_token_grad
counts = counts / self.device_mesh.data_world_size
optimizer_config = self.optimizer_group[adapter_name]
optimizer_config.train_status.num_tokens += counts.item()
status.num_tokens += counts.item()
if self.sp_strategy is not None and 'labels' in inputs:
reduction = getattr(loss_instance, 'reduction', None)
if reduction is not None:
self.sp_strategy.sp_config['loss_reduction'] = str(reduction)
loss_value = self.sp_strategy.reduce_loss(loss_value, inputs['labels'])
optimizer_config.train_status.loss_value += loss_value
outputs['loss'] = optimizer_config.train_status.loss_value
return optimizer_config.train_status.loss_value.item()
status.loss_value += loss_value
outputs['loss'] = status.loss_value
return status.loss_value.item()

@remote_function()
def backward(self, **kwargs):
Expand Down
2 changes: 1 addition & 1 deletion src/twinkle/processor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def pad_cp_inputs(input_tensor: torch.Tensor, padding_value: int) -> torch.Tenso
if input_tensor is None:
return input_tensor

seq_len = input_tensor.shape[1]
seq_len = input_tensor.shape[-1]

# Calculate required divisor based on parallelism settings
if cp_size > 1:
Expand Down
4 changes: 2 additions & 2 deletions src/twinkle/sampler/vllm_sampler/vllm_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,8 +291,8 @@ async def sample(self,
continue

# Get logprob for the actual token
if i < len(prompt_token_ids):
token_id = prompt_token_ids[i]
if i < len(result.prompt_token_ids):
token_id = result.prompt_token_ids[i]
if token_id in lp_dict:
lp_obj = lp_dict[token_id]
result_prompt_logprobs.append(lp_obj.logprob)
Expand Down
22 changes: 17 additions & 5 deletions src/twinkle/sampler/vllm_sampler/vllm_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,9 @@ def encode_trajectory_for_vllm(self,
add_generation_prompt=add_generation_prompt,
)[0]
encoded['prompt'] = prompt['prompt']
for key in encoded:
if isinstance(encoded[key], np.ndarray):
encoded[key] = encoded[key].tolist()
return encoded

def apply_patch(self, patch_cls: Union[Patch, Type[Patch], str], **kwargs) -> None:
Expand Down Expand Up @@ -218,6 +221,7 @@ async def _sample_single(
sampling_params: SamplingParams,
lora_request: Optional[Any] = None,
*,
multi_modal_data: Optional[Dict[str, Any]] = None,
logprobs_only: bool = False,
) -> SampleResponse:
"""Sample a single input asynchronously.
Expand All @@ -228,20 +232,23 @@ async def _sample_single(
adapter_path: Optional LoRA adapter path (legacy, prefer lora_request).
lora_request: Pre-built LoRARequest to attach to the sampling request.
Avoids repeated ``_get_or_load_lora`` calls per input.
multi_modal_data: The multi-modal data dict.
logprobs_only: Only return logprobs (no generated tokens).

Returns:
A SampleResponse object
"""
multi_modal_data = self._extract_multi_modal_data(feat)
response = await self.engine.sample(
prompt=feat['prompt'] if 'prompt' in feat else feat['input_ids'],
# Pick input_ids first because prompt may not contain response
# if vLLM is used sequentially
# multi-modal does not support input_ids
prompt=feat['input_ids'] if 'input_ids' in feat and multi_modal_data else feat['prompt'],
sampling_params=sampling_params,
lora_request=lora_request,
multi_modal_data=multi_modal_data,
mm_processor_kwargs=feat.get('mm_processor_kwargs'),
)
if 'input_ids' not in feat:
if 'input_ids' not in feat or multi_modal_data:
feat['input_ids'] = response.prompt_token_ids
feat['labels'] = [-100] * len(response.prompt_token_ids)
if not logprobs_only:
Expand Down Expand Up @@ -325,7 +332,11 @@ def sample(
sampling_params.max_tokens = 1
logprobs_only = True

if is_trajectory:
multi_modal_data_list = []
for feat in inputs_list:
multi_modal_data_list.append(self._extract_multi_modal_data(feat))

if is_trajectory or any(multi_modal_data_list):
template = self.template
assert template is not None, \
'Use set_template to add a template when trying to input Trajectory'
Expand All @@ -349,8 +360,9 @@ async def _sample_all():
feat,
sampling_params,
lora_request=lora_request,
multi_modal_data=multi_modal_data,
logprobs_only=logprobs_only,
) for feat in encoded_inputs
) for feat, multi_modal_data in zip(encoded_inputs, multi_modal_data_list)
]
return await asyncio.gather(*tasks)

Expand Down
6 changes: 4 additions & 2 deletions src/twinkle/template/qwen3_5_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,11 @@ def __init__(self, *args, **kwargs):
self._patch_size: Optional[int] = None
self._merge_size: Optional[int] = None
self._init_vision_config()
from transformers.models.qwen3_vl import Qwen3VLModel
with torch.device('meta'):
self.dummy_model = Qwen3VLModel(self.config)
import transformers
model_cls = self.config.architectures[0]
model_cls = getattr(transformers, model_cls)
self.dummy_model = model_cls(self.config)
self.rope_index_func = self.get_rope_index()

def get_rope_index(self):
Expand Down
Loading