-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMain.py
More file actions
119 lines (92 loc) · 5 KB
/
Main.py
File metadata and controls
119 lines (92 loc) · 5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Basics
import os
# Pytorch
import torch
from torch import nn
from torch.utils.data import Dataset, random_split
# Huggingface & co.
from transformers import Trainer, TrainingArguments, pipeline
from accelerate import notebook_launcher
# Typing
from typing import List, Dict, Tuple, Union, Optional, Any
# Own Files
from GermanModel import GermanModel
from GermanDataset import GermanDataset
from MyTraining import MyTraining
# My own training loop based on HuggingFace's accelerate library, supports multi GPU training
# Toy Example is RoBERTa on the German Leipzig Corpora Collection https://wortschatz.uni-leipzig.de/en/download/German
# https://huggingface.co/blog/how-to-train
def main():
    """Pretrain a German RoBERTa-style model on the Leipzig Corpora Collection.

    Builds a MyTraining harness (accelerate-based, multi-GPU capable), loads and
    splits the German dataset 80/20, constructs the model, runs the training
    loop, and finally prints a few fill-mask sanity checks.
    """
    # NOTE(review): "mixed_percision" is the keyword name declared by MyTraining
    # (misspelling of "precision") — must match the project API, do not rename here.
    training:MyTraining = MyTraining(
        "Pytorch_Training_Script",
        seed=23,
        allow_tf32=True,
        mixed_percision=True,
        wandb_logging=True,
        wandb_entity="max-kaiser",
        dispatch_batches=True)
    vocab_size: int = 32_000
    max_seq_length: int = 128 # in number of tokens
    variable_seq_length: bool = True
    # Log model/dataset hyperparameters to the experiment trackers (e.g. wandb).
    training.trackers_append_config('model_and_dataset', {
        "Vocab Size": vocab_size,
        "Max Seq Length": max_seq_length,
    })
    # Pick the dataset path depending on environment: WSL mounts Windows drives
    # under /mnt/d/, native Windows uses the D: drive directly.
    if os.path.exists("/mnt/d/"):
        dataset_path = "/mnt/d/Datasets/Leipzig Corpora Collection/German/"
    else:
        dataset_path = "D:/Datasets/Leipzig Corpora Collection/German/"
    with training.accelerator.main_process_first(): # intern preprocess the dataset in the main process first and buffer to disk so that second process do not need to preprocess it again
        dataset: Dataset = GermanDataset(training.isLocalMain, dataset_path, vocab_size, max_seq_length,
                                         variable_seq_length, limit=None)
    # 80/20 train/validation split (random_split accepts fractions since torch 1.13).
    dataset_train, dataset_val = random_split(dataset, [0.8, 0.2])
    model: nn.Module = GermanModel(vocab_size, max_seq_length, dataset.tokenizer)
    # TODO: Train using HuggingFace's Trainer as reference
    # https://huggingface.co/docs/transformers/v4.36.1/main_classes/trainer
    # Trainer()
    #model.load_state_dict(torch.load("./checkpoints_saved/grateful-cherry-135/checkpoint.pt")['model_state_dict']) # load pretrained model
    model = training.training_loop(model, dataset_train, dataset_val,
                                   learning_rate= 1e-4 * training.accelerator.num_processes, # learning rate should be scaled linearly with the number of GPUs https://huggingface.co/docs/accelerate/concept_guides/performance#learning-rates
                                   epochs= 4,
                                   all_device_batch_size= 200 * training.accelerator.num_processes,
                                   gradient_accumulation_steps= 1,
                                   weight_decay= 0.0003,
                                   weight_decay_grouping= True,
                                   warmup= 0.2,
                                   lr_scheduler_type= "linear",
                                   batch_collate_fn=dataset.collate_fn,
                                   num_workers= 4,
                                   eval_first= False,
                                   manual_evaluation_fn=manual_evaluation)
    # Qualitative fill-mask checks on the trained model (epoch=-1 marks "after training").
    # NOTE(review): "viel" in the second sentence is likely a typo for "fiel" (fell),
    # but it is a runtime test string — left untouched.
    manual_evaluation(model, -1, "<mask> Aktie von <mask> ist heute morgen erheblich stark <mask>.")
    manual_evaluation(model, -1, "Nach den schlechten Nachrichten viel die Aktie der <mask> um <mask> Prozentpunkte.")
    manual_evaluation(model, -1, "I walk with my <mask> in the <mask>.")
    manual_evaluation(model, -1, "In <mask> ist viel <mask><mask>")
def manual_evaluation(model: nn.Module, epoch: int, sentence: str | None = None) -> Tuple[Dict[str, Any], bool]:
    """Print fill-mask predictions of the model for a masked sentence.

    Used both as the `manual_evaluation_fn` callback of the training loop and
    standalone after training.

    Args:
        model: Trained model; must expose `.roberta` (the HF model) and `.tokenizer`.
        epoch: Current epoch (unused here; part of the callback interface, -1 = after training).
        sentence: Sentence containing one or more "<mask>" tokens; a default
            German test sentence is used when None.

    Returns:
        An empty metrics dict and False (no early stopping), matching the
        callback contract of the training loop.
    """
    if sentence is None:
        # Fixed: was a duplicated assignment (`sentence = sentence = ...`).
        sentence = "<mask> Aktie von <mask> ist heute morgen <mask>."
    fill_mask = pipeline(
        "fill-mask",
        model=model.roberta,
        tokenizer=model.tokenizer,
        device=model.roberta.device
    )
    print(sentence)
    result = fill_mask(sentence)
    if sentence.count("<mask>") == 1:
        # Single mask: `result` is a flat list of candidate fills.
        print("<mask> = ", [f"{token['token_str']} {token['score']:0.2f}" for token in result])
    else:
        # Multiple masks: `result` is a list of candidate lists, one per mask.
        for i, mask in enumerate(result):
            print(f"<mask{i}> = ", [f"{token['token_str']} {token['score']:0.2f}" for token in mask])
        # when we have multiple <mask> we could also select the most likely token for the first <mask> and
        # then use the result as input for the next <mask> and so on
    print()
    return {}, False
if __name__ == '__main__':
    # Toggle between single-process and multi-GPU launch.
    multiGPU = True
    if not multiGPU:
        # os.environ["CUDA_VISIBLE_DEVICES"] = "1" # select only second GPU (MSI RTX 3090), because first GPU is also used by Windows, has to be done before importing torch
        main()
    else:
        os.environ["ACCELERATE_DEBUG_MODE"] = "1"
        # does not work on windows, but 2x RTX 3090 PC runs on WSL so no problem
        notebook_launcher(main, num_processes=2)