prepare_dataset.py
import gc
import os
import pickle
import random

import numpy as np
import tiktoken
from tqdm import tqdm
from html2term import printc

# Dataset and tokenizer configuration.
dataset = "smol-sft"
input_file_path = f"data/{dataset}/train.txt"
output_dir = f"data/{dataset}"
val_split = 0.1  # fraction of documents held out for validation
EOT_TOKEN = "<|endoftext|>"
TOKENIZER_PATH = 'tokenizer/Hastings.pkl'
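
# Expected input layout (an assumption, inferred from main() below): train.txt is a
# single UTF-8 text file with documents delimited by the EOT token, e.g.
#
#   First document ...<|endoftext|>Second document ...<|endoftext|>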


def load_tokenizer(tokenizer_path=TOKENIZER_PATH):
    """Loads the tokenizer and returns the encoding instance."""
    with open(tokenizer_path, 'rb') as f:
        hastings = pickle.load(f)
    # The pickle holds the Encoding constructor kwargs; 'name' is passed positionally.
    enc = tiktoken.core.Encoding(hastings.pop('name'), **hastings)
    printc(f"Tokenizer loaded. Vocab size: <b>{enc.n_vocab}</b>")
    # Fail fast if the document separator is not a known special token.
    try:
        enc.encode_single_token(EOT_TOKEN)
    except KeyError:
        raise ValueError(f"The EOT token '{EOT_TOKEN}' is not in the tokenizer vocabulary!")
    return enc
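
# Illustrative note (an assumption, not guaranteed by this repo): Hastings.pkl is
# expected to unpickle into a dict holding the tiktoken.core.Encoding constructor
# arguments plus a 'name' key, roughly:
#
#   {
#       "name": "Hastings",
#       "pat_str": r"...",                    # regex used to pre-split text
#       "mergeable_ranks": {b"...": 0, ...},  # bytes -> rank BPE merge table
#       "special_tokens": {"<|endoftext|>": ...},
#   }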


def process_and_save(lines, enc, output_path, split_name):
    """Tokenizes lines and saves them in the efficient tokens/offsets format."""
    printc(f"<br/><b>Processing and tokenizing {len(lines):,} documents for the <i>{split_name}</i> split...</b>")
    all_tokenized = []
    for doc in tqdm(lines, desc=f"Tokenizing {split_name}"):
        if doc:
            tokens = enc.encode(doc, allowed_special='all')
            all_tokenized.append(tokens)

    # Pack every document into one flat token array; offsets_arr[i]:offsets_arr[i+1]
    # slices out document i. uint16 assumes the vocab size stays below 65,536.
    total_tokens = sum(len(doc) for doc in all_tokenized)
    num_docs = len(all_tokenized)
    tokens_arr = np.empty(total_tokens, dtype=np.uint16)
    offsets_arr = np.empty(num_docs + 1, dtype=np.uint64)
    offsets_arr[0] = 0
    token_pos = 0
    for i, doc in enumerate(all_tokenized):
        doc_len = len(doc)
        tokens_arr[token_pos : token_pos + doc_len] = doc
        token_pos += doc_len
        offsets_arr[i + 1] = token_pos

    # Free the Python-level token lists before writing to disk.
    del all_tokenized
    gc.collect()
    printc(f" Saving <i>{split_name}</i> data to <i>{output_path}</i>...")
    np.savez_compressed(output_path, tokens=tokens_arr, offsets=offsets_arr)
    printc(f" <green>Saved {num_docs:,} documents ({total_tokens:,} tokens).</green>")


def main():
    os.makedirs(output_dir, exist_ok=True)
    enc = load_tokenizer()

    # Read the raw corpus and split it into documents on the EOT token,
    # re-appending the token so each document ends with it.
    with open(input_file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    docs = content.split(EOT_TOKEN)
    documents = [doc.strip() + EOT_TOKEN for doc in docs if doc.strip()]
    printc(f"Found and processed <b>{len(documents):,}</b> documents.")

    random.seed(42)
    random.shuffle(documents)
    printc("<i>Shuffled documents randomly to ensure a representative validation split.</i>")

    # Hold out the last val_split fraction of the shuffled documents for validation.
    split_idx = int(len(documents) * (1 - val_split))
    train_docs = documents[:split_idx]
    val_docs = documents[split_idx:]

    train_output_path = os.path.join(output_dir, "train.npz")
    val_output_path = os.path.join(output_dir, "val.npz")
    process_and_save(train_docs, enc, train_output_path, "train")
    process_and_save(val_docs, enc, val_output_path, "validation")
    printc("<br/><b><green>✅ Processing complete.</green></b>")


if __name__ == "__main__":
    main()