Skip to content

Commit 2547659

Browse files
committed
feat(cookbook): add single controller SP example and reorganize transformers cookbook
- Add new single_controller_sp.py example demonstrating FSDP + SP validation over 4 GPUs
- Move legacy single_controller_sp.py to transformers/sp_fsdp_dense.py
- Add shell script sp_fsdp_dense.sh for running the example
- Update imports and structure to use twinkle framework components
1 parent 2086e87 commit 2547659

File tree

3 files changed

+91
-0
lines changed

3 files changed

+91
-0
lines changed
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
from functools import partial
2+
import numpy as np
3+
import torch
4+
from peft import LoraConfig
5+
6+
import twinkle
7+
from twinkle import get_logger, DeviceGroup, Platform, DeviceMesh
8+
from twinkle.dataloader import DataLoader
9+
from twinkle.dataset import Dataset, DatasetMeta
10+
from twinkle.model import TransformersModel
11+
from twinkle.preprocessor import SelfCognitionProcessor
12+
13+
logger = get_logger()
# ModelScope model id for Qwen2.5-7B-Instruct (the `ms://` prefix selects ModelScope).
MODEL_ID = 'ms://Qwen/Qwen2.5-7B-Instruct'

# One device group named "default" covering all 4 local ranks.
# NOTE(review): `device_group` is defined but never passed to twinkle.initialize()
# below — confirm whether it should be supplied explicitly or is discovered
# implicitly by the framework.
device_group = [
    DeviceGroup(
        name="default",
        ranks=[0, 1, 2, 3],
        device_type=Platform.get_platform().device_prefix(),
    )
]

# FSDP + SP validation over 4 GPUs: dp=2, fsdp=2 (SP only affects input slicing)
# The 4 ranks are laid out as a 2x2 mesh: axis 0 = data parallel, axis 1 = FSDP
# sharding. ulysses_size=2 additionally enables sequence parallelism over 2 ranks.
device_mesh = DeviceMesh(
    device_type="cuda",
    mesh=np.arange(4).reshape(2, 2),
    mesh_dim_names=("dp", "fsdp"),
    ulysses_size=2,
)

# Bring up the twinkle runtime: local mode, 4 worker processes (one per GPU),
# all sharing the global mesh above. lazy_collect=False — presumably forces
# eager collection of remote results; confirm against twinkle docs.
twinkle.initialize(
    mode="local",
    nproc_per_node=4,
    global_device_mesh=device_mesh,
    lazy_collect=False,
)
38+
39+
40+
def create_dataset(data_slice=None):
    """Build, preprocess, and encode the self-cognition SFT dataset.

    Args:
        data_slice: Optional slice forwarded to ``DatasetMeta`` to restrict
            how much of the source dataset is loaded. ``None`` loads it all.

    Returns:
        The fully mapped and encoded ``Dataset`` instance.
    """
    meta = DatasetMeta("ms://swift/self-cognition", data_slice=data_slice)
    ds = Dataset(dataset_meta=meta)

    # Tokenizer/template configuration: left-truncate to at most 256 tokens.
    ds.set_template(
        "Template",
        model_id=MODEL_ID,
        max_length=256,
        truncation_strategy="left",
    )

    # Rewrite self-cognition fields to the twinkle identity, then tokenize.
    ds.map(SelfCognitionProcessor("twinkle模型", "twinkle团队"))
    ds.encode(batched=True)
    return ds
53+
54+
55+
56+
def train():
    """Run a short LoRA fine-tuning loop with FSDP + SP on the global mesh.

    Builds the dataloader and model, attaches a LoRA adapter with its own
    AdamW optimizer, then iterates forward/backward + optimizer steps,
    logging metrics each step and saving a final checkpoint.
    """
    dataloader = DataLoader(
        dataset=partial(create_dataset, data_slice=None),
        batch_size=4,
        device_mesh=device_mesh,
    )

    model = TransformersModel(
        model_id=MODEL_ID,
        device_mesh=device_mesh,
        strategy="native_fsdp",  # torch-native fully-sharded data parallel
    )

    # Attach a LoRA adapter on all linear layers and give it a dedicated optimizer.
    lora_config = LoraConfig(target_modules="all-linear")
    model.add_adapter_to_model("default", lora_config, gradient_accumulation_steps=1)
    model.set_optimizer("AdamW", lr=1e-4, adapter_name="default")

    for step, batch in enumerate(dataloader):
        model.forward_backward(inputs=batch, adapter_name='default')
        model.clip_grad_and_step(adapter_name='default')
        # Log metrics every step. (Fixes the original `if step % 1 == 0` gate,
        # which was always true, and drops the `_metrics` float-conversion dict
        # that was built under a bare `except: pass` but never used.)
        metric = model.calculate_metric(is_training=True, adapter_name='default')
        logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}')
    # Persist the final adapter state; `interval=1` — presumably save cadence,
    # confirm against twinkle's save() contract.
    model.save('last-checkpoint', interval=1)
87+
88+
89+
# Entry point: run the training loop when executed as a script
# (launched with one process per GPU, e.g. via `torchrun --nproc_per_node=4`).
if __name__ == "__main__":
    train()
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Launch the SP + FSDP dense example on 4 local GPUs, one torchrun worker per GPU.
CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4 sp_fsdp_dense.py

0 commit comments

Comments
 (0)