Skip to content

Commit 2af73f2

Browse files
committed
fix
1 parent 5d83e5c commit 2af73f2

File tree

11 files changed

+1520
-1232
lines changed

11 files changed

+1520
-1232
lines changed

cookbook/megatron/ddp.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from twinkle.model import MegatronModel
1212
from twinkle.preprocessor import SelfCognitionProcessor
1313

14-
if Platform.get_rank() == 0:
14+
if Platform.get_rank() == 0 and os.environ.get('SWANLAB_API_KEY'):
1515
# rank0 recording
1616
import swanlab
1717
swanlab.login(api_key=os.environ['SWANLAB_API_KEY'], save=True)
@@ -84,7 +84,7 @@ def train():
8484
if step % 5 == 0:
8585
# Print metric
8686
metric = model.calculate_metric(is_training=True)
87-
if Platform.get_rank() == 0:
87+
if Platform.get_rank() == 0 and os.environ.get('SWANLAB_API_KEY'):
8888
swanlab.log(metric)
8989
logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}')
9090
if step > 0 and step % 20 == 0:

cookbook/megatron/ddp.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 ddp.py

cookbook/megatron/ddp_moe.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from twinkle.model import MegatronModel
1212
from twinkle.preprocessor import SelfCognitionProcessor
1313

14-
if Platform.get_rank() == 0:
14+
if Platform.get_rank() == 0 and os.environ.get('SWANLAB_API_KEY'):
1515
# rank0 recording
1616
import swanlab
1717
swanlab.login(api_key=os.environ['SWANLAB_API_KEY'], save=True)
@@ -22,7 +22,7 @@
2222

2323

2424
# Construct a device_mesh, tp=pp=cp=ep=2, dp=1
25-
device_mesh = DeviceMesh.from_sizes(dp_size=1, tp_size=2, pp_size=2, cp_size=2, ep_size=2, sequence_parallel=True)
25+
device_mesh = DeviceMesh.from_sizes(dp_size=1, tp_size=2, pp_size=2, cp_size=2, ep_size=2)
2626
# use torchrun mode
2727
twinkle.initialize(mode='local', global_device_mesh=device_mesh)
2828

@@ -74,8 +74,7 @@ def train():
7474
logger.info(model.get_train_configs())
7575
logger.info(f'Total steps: {len(dataloader)}')
7676
loss_metric = 99.0
77-
# lora: 10G * 8
78-
# full: 40G * 8
77+
# lora: 23G * 8
7978
for step, batch in enumerate(dataloader):
8079
# Do forward and backward
8180
model.forward_backward(inputs=batch)
@@ -84,7 +83,7 @@ def train():
8483
if step % 5 == 0:
8584
# Print metric
8685
metric = model.calculate_metric(is_training=True)
87-
if Platform.get_rank() == 0:
86+
if Platform.get_rank() == 0 and os.environ.get('SWANLAB_API_KEY'):
8887
swanlab.log(metric)
8988
logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}')
9089
if step > 0 and step % 20 == 0:

cookbook/megatron/ddp_moe.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 ddp_moe.py

cookbook/transformers/fsdp2.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from twinkle.model import TransformersModel
1212
from twinkle.preprocessor import SelfCognitionProcessor
1313

14-
if Platform.get_rank() == 0:
14+
if Platform.get_rank() == 0 and os.environ.get('SWANLAB_API_KEY'):
1515
# rank0 recording
1616
import swanlab
1717
swanlab.login(api_key=os.environ['SWANLAB_API_KEY'], save=True)
@@ -21,8 +21,8 @@
2121
)
2222

2323

24-
# Construct a device_mesh, fsdp=2, dp=2
25-
device_mesh = DeviceMesh.from_sizes(fsdp_size=4)
24+
# Construct a device_mesh, fsdp=4, dp=2
25+
device_mesh = DeviceMesh.from_sizes(fsdp_size=4, dp_size=2)
2626
# use torchrun mode
2727
twinkle.initialize(mode='local', global_device_mesh=device_mesh)
2828

@@ -53,7 +53,7 @@ def train():
5353
# Encode dataset
5454
dataset.encode()
5555
# Global batch size = 8, for 8 GPUs, so 1 sample per GPU
56-
dataloader = DataLoader(dataset=dataset, batch_size=4)
56+
dataloader = DataLoader(dataset=dataset, batch_size=8)
5757
# Use a TransformersModel
5858
model = TransformersModel(model_id='ms://Qwen/Qwen2.5-7B-Instruct')
5959

@@ -64,7 +64,8 @@ def train():
6464
)
6565

6666
# Add a lora to model, with name `default`
67-
model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=4)
67+
# Comment this out to use full-parameter training
68+
model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2)
6869
# Add Optimizer for lora `default`
6970
model.set_optimizer(optimizer_cls='AdamW', lr=1e-4)
7071
# Add LRScheduler for lora `default`
@@ -84,7 +85,7 @@ def train():
8485
if step % 20 == 0:
8586
# Print metric
8687
metric = model.calculate_metric(is_training=True)
87-
if Platform.get_rank() == 0:
88+
if Platform.get_rank() == 0 and os.environ.get('SWANLAB_API_KEY'):
8889
swanlab.log(metric)
8990
logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}')
9091
if step > 0 and step % 40 == 0:

cookbook/transformers/fsdp2.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 fsdp2.py

cookbook/transformers/fsdp2_moe.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from twinkle.model import TransformersModel
1212
from twinkle.preprocessor import SelfCognitionProcessor
1313

14-
if Platform.get_rank() == 0:
14+
if Platform.get_rank() == 0 and os.environ.get('SWANLAB_API_KEY'):
1515
# rank0 recording
1616
import swanlab
1717
swanlab.login(api_key=os.environ['SWANLAB_API_KEY'], save=True)
@@ -65,6 +65,7 @@ def train():
6565
)
6666

6767
# Add a lora to model, with name `default`
68+
# Comment this out to use full-parameter training
6869
model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2)
6970
# Add Optimizer for lora `default`
7071
model.set_optimizer(optimizer_cls='AdamW', lr=1e-4)
@@ -75,8 +76,7 @@ def train():
7576
logger.info(model.get_train_configs())
7677
logger.info(f'Total steps: {len(dataloader)}')
7778
loss_metric = 99.0
78-
# lora: 18G * 4
79-
# full: 50G * 4
79+
# lora: 34G * 8
8080
for step, batch in enumerate(dataloader):
8181
# Do forward and backward
8282
model.forward_backward(inputs=batch)
@@ -85,7 +85,7 @@ def train():
8585
if step % 20 == 0:
8686
# Print metric
8787
metric = model.calculate_metric(is_training=True)
88-
if Platform.get_rank() == 0:
88+
if Platform.get_rank() == 0 and os.environ.get('SWANLAB_API_KEY'):
8989
swanlab.log(metric)
9090
logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}')
9191
if step > 0 and step % 40 == 0:

cookbook/transformers/fsdp2_moe.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 fsdp2_moe.py

0 commit comments

Comments
 (0)