-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWPM2.py
More file actions
37 lines (30 loc) · 1.33 KB
/
WPM2.py
File metadata and controls
37 lines (30 loc) · 1.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import argparse
import os
from tokenizers import BertWordPieceTokenizer # pip install tokenizers==0.7.0
def train(corpus: list, size: int, limit: int, output: str) -> None:
    """Train a cased BERT WordPiece vocabulary and save it.

    Args:
        corpus: Paths of the text files to train on.
        size: Target vocabulary size (passed as ``vocab_size``).
        limit: Value passed as ``limit_alphabet`` to the trainer.
        output: Destination in the form ``<directory>/<name>``. It is split
            into the directory and name handed to ``tokenizer.save``.
    """
    # Resolve the destination before training so that an unusable ``output``
    # fails immediately rather than after the whole training run.
    directory, name = os.path.split(output)
    # A bare file name splits into an empty directory part; make the
    # current directory explicit instead of passing "" to the saver.
    directory = directory or "."
    # Saving into a directory that does not exist yet would otherwise fail.
    os.makedirs(directory, exist_ok=True)

    tokenizer = BertWordPieceTokenizer(
        vocab_file=None,
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=False,  # Must be False if cased model
        lowercase=False,
        wordpieces_prefix="##",
    )
    tokenizer.train(
        files=corpus,
        limit_alphabet=limit,
        vocab_size=size,
    )
    # NOTE(review): with tokenizers==0.7.0, save(directory, name) presumably
    # writes "<name>-vocab.txt" inside ``directory`` -- confirm against the
    # installed version.
    tokenizer.save(directory, name)
def main(corpus, size, limit, output):
    """Entry point: forward the parsed CLI options straight to ``train``."""
    train(corpus=corpus, size=size, limit=limit, output=output)
if __name__ == '__main__':
    # Command-line interface: parse the options and hand them to main().
    parser = argparse.ArgumentParser(description="create WordPieceModel vocabulary")
    # --corpus and --output have no usable default; mark them required so a
    # missing option is reported by argparse instead of surfacing later as a
    # None-related failure inside training or saving.
    parser.add_argument("--corpus", type=str, nargs="+", required=True, help="corpus paths")
    parser.add_argument("--size", type=int, default=32000, help="vocab size")
    parser.add_argument("--limit_alphabet", type=int, default=6000, help="num of only one character")
    parser.add_argument("--output", type=str, required=True, help="output(vocab) file path/name")
    args = parser.parse_args()
    main(args.corpus, args.size, args.limit_alphabet, args.output)