-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_tokenizer_files.py
More file actions
31 lines (24 loc) · 1010 Bytes
/
generate_tokenizer_files.py
File metadata and controls
31 lines (24 loc) · 1010 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
"""Generate Hugging Face tokenizer files from a SentencePiece ``tokenizer.model``.

Writes ``tokenizer.json`` and ``tokenizer_config.json`` (plus
``special_tokens_map.json``) into the output directory.

Why the *fast* class: the slow, SentencePiece-backed ``LlamaTokenizer``
saves only ``tokenizer.model`` / ``tokenizer_config.json`` and never emits
a ``tokenizer.json``. ``LlamaTokenizerFast`` converts the SentencePiece
model on load, and its ``save_pretrained`` is what actually serializes
``tokenizer.json``.
"""
from transformers import LlamaTokenizerFast


def main(vocab_file: str = "tokenizer.model", output_dir: str = "./") -> None:
    """Load *vocab_file* and write the tokenizer files into *output_dir*.

    Args:
        vocab_file: Path to the SentencePiece model file.
        output_dir: Directory that receives the generated tokenizer files.
    """
    # Build the tokenizer directly from the SentencePiece model file.
    # NOTE: from_pretrained() expects a directory or a hub model id, not a
    # bare .model path — the constructor's vocab_file= is the documented way
    # to load a single SentencePiece file.
    tokenizer = LlamaTokenizerFast(vocab_file=vocab_file)

    # save_pretrained() on the fast tokenizer writes tokenizer.json and
    # tokenizer_config.json (and special_tokens_map.json) into output_dir.
    tokenizer.save_pretrained(output_dir)
    print("Generated tokenizer.json and tokenizer_config.json files in the current directory.")


if __name__ == "__main__":
    main()