-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvocabulary.py
More file actions
26 lines (20 loc) · 807 Bytes
/
vocabulary.py
File metadata and controls
26 lines (20 loc) · 807 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import numpy as np
import string
import pickle
from pickle import load, dump
# Load the cleaned captions. From the way it is consumed below, this is a
# mapping whose values are collections of caption strings.
# NOTE(review): presumably the keys are image identifiers -- confirm against
# the script that writes clean_captions.pkl.
# NOTE(review): unpickling can execute arbitrary code; only run this on a file
# produced by this project's own preprocessing step.
with open('./Preprocessed Data/clean_captions.pkl', 'rb') as captions_file:
    # The context manager closes the handle, which the bare open() call leaked.
    captions = pickle.load(captions_file)

# Create the vocabulary: every word that occurs at least 10 times in the
# caption word corpus. While counting, also track the longest caption,
# measured in whitespace-separated words.
word_count = {}
max_len = 0
for sentences in captions.values():
    for sentence in sentences:
        words = sentence.split()
        max_len = max(max_len, len(words))
        for word in words:
            word_count[word] = word_count.get(word, 0) + 1

# Keep only the words that meet the count threshold; the list follows the
# order in which words were first seen.
vocabulary = [word for word, count in word_count.items() if count >= 10]
print("Words to vocabulary length:", len(word_count), "-->", len(vocabulary))
print("Maximum caption length is :", max_len)

# Persist the filtered vocabulary as a pickled list of words.
with open("Preprocessed Data/vocabulary.pkl", "wb") as encoded_pickle:
    pickle.dump(vocabulary, encoded_pickle)