-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_tokenizer_files.py
More file actions
31 lines (24 loc) · 1010 Bytes
/
generate_tokenizer_files.py
File metadata and controls
31 lines (24 loc) · 1010 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
"""Generate Hugging Face tokenizer files from a SentencePiece ``tokenizer.model``.

Writes ``tokenizer.json`` and ``tokenizer_config.json`` (plus
``special_tokens_map.json``) into the output directory.

Why the *fast* class: the slow, SentencePiece-backed ``LlamaTokenizer``
saves only ``tokenizer.model`` / ``tokenizer_config.json`` and never emits
a ``tokenizer.json``. ``LlamaTokenizerFast`` converts the SentencePiece
model on load, and its ``save_pretrained`` is what actually serializes
``tokenizer.json``.
"""
from transformers import LlamaTokenizerFast


def main(vocab_file: str = "tokenizer.model", output_dir: str = "./") -> None:
    """Load *vocab_file* and write the tokenizer files into *output_dir*.

    Args:
        vocab_file: Path to the SentencePiece model file.
        output_dir: Directory that receives the generated tokenizer files.
    """
    # Build the tokenizer directly from the SentencePiece model file.
    # NOTE: from_pretrained() expects a directory or a hub model id, not a
    # bare .model path — the constructor's vocab_file= is the documented way
    # to load a single SentencePiece file.
    tokenizer = LlamaTokenizerFast(vocab_file=vocab_file)

    # save_pretrained() on the fast tokenizer writes tokenizer.json and
    # tokenizer_config.json (and special_tokens_map.json) into output_dir.
    tokenizer.save_pretrained(output_dir)
    print("Generated tokenizer.json and tokenizer_config.json files in the current directory.")


if __name__ == "__main__":
    main()