-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMain.py
More file actions
119 lines (92 loc) · 5 KB
/
Main.py
File metadata and controls
119 lines (92 loc) · 5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Basics
import os
# Pytorch
import torch
from torch import nn
from torch.utils.data import Dataset, random_split
# Huggingface & co.
from transformers import Trainer, TrainingArguments, pipeline
from accelerate import notebook_launcher
# Typing
from typing import List, Dict, Tuple, Union, Optional, Any
# Own Files
from GermanModel import GermanModel
from GermanDataset import GermanDataset
from MyTraining import MyTraining
# My own training loop based on HuggingFace's accelerate library, supports multi GPU training
# Toy Example is RoBERTa on the German Leipzig Corpora Collection https://wortschatz.uni-leipzig.de/en/download/German
# https://huggingface.co/blog/how-to-train
def main():
    """Pretrain a German RoBERTa-style model on the Leipzig Corpora Collection.

    Builds a MyTraining harness (accelerate-based, multi-GPU capable), loads and
    splits the German dataset 80/20, constructs the model, runs the training
    loop, and finally prints a few fill-mask sanity checks.
    """
    # NOTE(review): "mixed_percision" is the keyword name declared by MyTraining
    # (misspelling of "precision") — must match the project API, do not rename here.
    training:MyTraining = MyTraining(
        "Pytorch_Training_Script",
        seed=23,
        allow_tf32=True,
        mixed_percision=True,
        wandb_logging=True,
        wandb_entity="max-kaiser",
        dispatch_batches=True)
    vocab_size: int = 32_000
    max_seq_length: int = 128 # in number of tokens
    variable_seq_length: bool = True
    # Log model/dataset hyperparameters to the experiment trackers (e.g. wandb).
    training.trackers_append_config('model_and_dataset', {
        "Vocab Size": vocab_size,
        "Max Seq Length": max_seq_length,
    })
    # Pick the dataset path depending on environment: WSL mounts Windows drives
    # under /mnt/d/, native Windows uses the D: drive directly.
    if os.path.exists("/mnt/d/"):
        dataset_path = "/mnt/d/Datasets/Leipzig Corpora Collection/German/"
    else:
        dataset_path = "D:/Datasets/Leipzig Corpora Collection/German/"
    with training.accelerator.main_process_first(): # intern preprocess the dataset in the main process first and buffer to disk so that second process do not need to preprocess it again
        dataset: Dataset = GermanDataset(training.isLocalMain, dataset_path, vocab_size, max_seq_length,
                                         variable_seq_length, limit=None)
    # 80/20 train/validation split (random_split accepts fractions since torch 1.13).
    dataset_train, dataset_val = random_split(dataset, [0.8, 0.2])
    model: nn.Module = GermanModel(vocab_size, max_seq_length, dataset.tokenizer)
    # TODO: Train using HuggingFace's Trainer as reference
    # https://huggingface.co/docs/transformers/v4.36.1/main_classes/trainer
    # Trainer()
    #model.load_state_dict(torch.load("./checkpoints_saved/grateful-cherry-135/checkpoint.pt")['model_state_dict']) # load pretrained model
    model = training.training_loop(model, dataset_train, dataset_val,
                                   learning_rate= 1e-4 * training.accelerator.num_processes, # learning rate should be scaled linearly with the number of GPUs https://huggingface.co/docs/accelerate/concept_guides/performance#learning-rates
                                   epochs= 4,
                                   all_device_batch_size= 200 * training.accelerator.num_processes,
                                   gradient_accumulation_steps= 1,
                                   weight_decay= 0.0003,
                                   weight_decay_grouping= True,
                                   warmup= 0.2,
                                   lr_scheduler_type= "linear",
                                   batch_collate_fn=dataset.collate_fn,
                                   num_workers= 4,
                                   eval_first= False,
                                   manual_evaluation_fn=manual_evaluation)
    # Qualitative fill-mask checks on the trained model (epoch=-1 marks "after training").
    # NOTE(review): "viel" in the second sentence is likely a typo for "fiel" (fell),
    # but it is a runtime test string — left untouched.
    manual_evaluation(model, -1, "<mask> Aktie von <mask> ist heute morgen erheblich stark <mask>.")
    manual_evaluation(model, -1, "Nach den schlechten Nachrichten viel die Aktie der <mask> um <mask> Prozentpunkte.")
    manual_evaluation(model, -1, "I walk with my <mask> in the <mask>.")
    manual_evaluation(model, -1, "In <mask> ist viel <mask><mask>")
def manual_evaluation(model: nn.Module, epoch: int, sentence: str | None = None) -> Tuple[Dict[str, Any], bool]:
    """Print fill-mask predictions of the model for a masked sentence.

    Used both as the `manual_evaluation_fn` callback of the training loop and
    standalone after training.

    Args:
        model: Trained model; must expose `.roberta` (the HF model) and `.tokenizer`.
        epoch: Current epoch (unused here; part of the callback interface, -1 = after training).
        sentence: Sentence containing one or more "<mask>" tokens; a default
            German test sentence is used when None.

    Returns:
        An empty metrics dict and False (no early stopping), matching the
        callback contract of the training loop.
    """
    if sentence is None:
        # Fixed: was a duplicated assignment (`sentence = sentence = ...`).
        sentence = "<mask> Aktie von <mask> ist heute morgen <mask>."
    fill_mask = pipeline(
        "fill-mask",
        model=model.roberta,
        tokenizer=model.tokenizer,
        device=model.roberta.device
    )
    print(sentence)
    result = fill_mask(sentence)
    if sentence.count("<mask>") == 1:
        # Single mask: `result` is a flat list of candidate fills.
        print("<mask> = ", [f"{token['token_str']} {token['score']:0.2f}" for token in result])
    else:
        # Multiple masks: `result` is a list of candidate lists, one per mask.
        for i, mask in enumerate(result):
            print(f"<mask{i}> = ", [f"{token['token_str']} {token['score']:0.2f}" for token in mask])
        # when we have multiple <mask> we could also select the most likely token for the first <mask> and
        # then use the result as input for the next <mask> and so on
    print()
    return {}, False
if __name__ == '__main__':
    # Toggle between single-process and multi-GPU launch.
    multiGPU = True
    if not multiGPU:
        # os.environ["CUDA_VISIBLE_DEVICES"] = "1" # select only second GPU (MSI RTX 3090), because first GPU is also used by Windows, has to be done before importing torch
        main()
    else:
        os.environ["ACCELERATE_DEBUG_MODE"] = "1"
        # does not work on windows, but 2x RTX 3090 PC runs on WSL so no problem
        notebook_launcher(main, num_processes=2)