-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprogram1.py
More file actions
48 lines (31 loc) · 1.33 KB
/
program1.py
File metadata and controls
48 lines (31 loc) · 1.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import pathlib
import pickle
from collections import Counter

import nltk
from nltk import word_tokenize
from nltk.util import ngrams
# Runs nltk.download for the 'punkt' resource at import time.
# NOTE(review): presumably this is the tokenizer data word_tokenize relies
# on — confirm against the installed NLTK version.
nltk.download('punkt')
# Folder (relative path) that gen_ngrams reads its input files from.
data_folder = pathlib.Path('data')
def gen_ngrams(filename):
    """Build bigram and unigram count dictionaries for one text file.

    The file is looked up inside ``data_folder``, read as UTF-8, stripped of
    newline characters, and tokenized with ``word_tokenize``.

    Args:
        filename: Name of the file inside ``data_folder``.

    Returns:
        A ``(bigram_dict, unigram_dict)`` tuple of plain dicts.
        ``bigram_dict`` maps each pair of adjacent tokens (a 2-tuple) to the
        number of times it occurs; ``unigram_dict`` maps each token to the
        number of times it occurs.
    """
    file_path = data_folder / filename
    with open(file_path, 'r', encoding="utf8") as f:
        text = f.read()

    # NOTE(review): newlines are removed, not replaced with a space, so the
    # last characters of one line and the first of the next become adjacent
    # before tokenization — confirm this is intended.
    text = text.replace("\n", "")

    unigrams = word_tokenize(text)

    # Counter tallies every item in a single pass, instead of rescanning the
    # whole token list with list.count() once per distinct item. Converting
    # back to dict keeps the return (and pickled) type a plain dict.
    bigram_dict = dict(Counter(ngrams(unigrams, 2)))
    unigram_dict = dict(Counter(unigrams))
    return bigram_dict, unigram_dict
if __name__ == '__main__':
    # Training file for each language, keyed by the suffix used in the
    # names of the output pickle files.
    training_files = {
        'en': 'LangId.train.English',
        'fr': 'LangId.train.French',
        'it': 'LangId.train.Italian',
    }

    # Count every language before writing anything, so that a failure while
    # reading any training file leaves no partial set of pickles behind.
    counts = {
        suffix: gen_ngrams(name) for suffix, name in training_files.items()
    }

    # Write each dictionary inside a `with` block so the file handle is
    # flushed and closed deterministically rather than left to the garbage
    # collector.
    for suffix, (bigram_dict, unigram_dict) in counts.items():
        with open('bigram_dict_' + suffix, 'wb') as out:
            pickle.dump(bigram_dict, out)
        with open('unigram_dict_' + suffix, 'wb') as out:
            pickle.dump(unigram_dict, out)