-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtitle_generator.py
More file actions
146 lines (112 loc) · 5.07 KB
/
title_generator.py
File metadata and controls
146 lines (112 loc) · 5.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import nltk, random, math, mongo_config, re, itertools
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
class TitleGenerator:
    """Generate new titles from a bigram model built over existing titles.

    Each instance samples a contiguous window of titles from
    ``mongo_config.titles`` (and the matching windows of the pre-tokenized
    and pre-tagged title files), builds a conditional frequency
    distribution of word bigrams over that window, and uses it to chain
    words into candidate titles. Candidates are then filtered by a few
    heuristics and by similarity of their part-of-speech sequence to the
    POS sequences of real titles.
    """

    # Number of consecutive titles sampled from the corpus per instance.
    _SLICE_SIZE = 100

    # Starting value for both ends of the title length range; the observed
    # lengths can only widen the range around this value.
    _DEFAULT_TITLE_LENGTH = 5

    # POS tags a generated title is not allowed to end with.
    # "PP", "PPS" and "WPS" are kept from the original list; those spellings
    # are not tags produced by nltk.pos_tag's default Penn Treebank tagset
    # (presumably PRP / PRP$ / WP$ were intended), so the tagger's actual
    # names are listed as well.
    _DISALLOWED_FINAL_TAGS = frozenset([
        "JJ", "CC", "CD", "DT", "JJS", "JJR", "TO", "IN", "LS", "MD", "PDT",
        "POS", "PP", "PPS", "SYM", "UH", "VBD", "VBG", "VBN", "VBP", "VBZ",
        "WDT", "WP", "WPS", "WRB",
        "PRP", "PRP$", "WP$",
    ])

    # Load tokens and tags from file (once, when the class is defined).
    # SECURITY NOTE: eval() executes whatever expression the file contains.
    # This is only acceptable while these files are trusted, locally
    # generated artifacts; if they can ever come from elsewhere, switch to
    # ast.literal_eval or a real serialization format.
    with open('tokenized_titles.txt', 'r') as thefile:
        tokenized_titles = eval(thefile.read())
    with open('tagged_titles.txt', 'r') as thefile:
        tagged_titles = eval(thefile.read())

    def __init__(self):
        """Pick a random window of titles and build the generation model."""
        titles = mongo_config.titles
        # Choose a random slice of 100 titles from the dataset. The upper
        # bound is clamped at 0 so that a corpus with fewer than 100 titles
        # uses everything instead of making randint raise ValueError.
        self.rand = random.randint(0, max(0, len(titles) - self._SLICE_SIZE))
        end = self.rand + self._SLICE_SIZE
        self.titles = titles[self.rand:end]
        self.tokenized_titles_slice = self.tokenized_titles[self.rand:end]
        self.tagged_titles_slice = self.tagged_titles[self.rand:end]
        self.set_title_range()
        # Generate required stats for the generation model.
        self.bigrams = self.build_bigrams()
        self.freq_dist = self.build_freq_dist()
        # Generate required stats for the title heuristics.
        self.title_pos_structures = self.build_title_pos_structures()

    def set_title_range(self):
        """Set ``min_title_length`` / ``max_title_length`` from the slice.

        Lengths are whitespace-separated word counts of ``self.titles``.
        Both bounds start at 5, so the minimum is never above 5 and the
        maximum is never below 5, whatever the sampled titles look like.
        """
        lengths = [len(title.split()) for title in self.titles]
        self.min_title_length = min([self._DEFAULT_TITLE_LENGTH] + lengths)
        self.max_title_length = max([self._DEFAULT_TITLE_LENGTH] + lengths)

    def build_freq_dist(self):
        """Return a ConditionalFreqDist mapping each word to its successors."""
        all_bigrams = itertools.chain.from_iterable(self.bigrams)
        return nltk.ConditionalFreqDist(all_bigrams)

    def build_bigrams(self):
        """Return one list of (word, next_word) pairs per tokenized title."""
        # nltk.bigrams yields lazily in NLTK 3; materialize each result so
        # self.bigrams is still usable after build_freq_dist has read it.
        return [list(nltk.bigrams(title))
                for title in self.tokenized_titles_slice]

    def first_words_list(self):
        """Return the first token of every non-empty tokenized title."""
        # Titles that tokenized to nothing have no first word; skip them
        # rather than failing with IndexError.
        return [title[0] for title in self.tokenized_titles_slice if title]

    def build_title(self):
        """Build one lower-cased candidate title by walking the bigram model.

        Starts from a random first word of the sampled titles and repeatedly
        picks one of the three most common successors of the current word.
        Stops early when the current word has no recorded successor.

        Returns:
            The words joined by single spaces, lower-cased, containing at
            most the randomly chosen target number of words.

        Raises:
            IndexError: if the sampled slice provides no first words.
        """
        # Choose random seed word from all starting words.
        word = random.choice(self.first_words_list())
        # Choose random target length in the observed range.
        title_length = random.randint(self.min_title_length,
                                      self.max_title_length)
        words = [word]
        # The seed already counts as one word, so add at most
        # title_length - 1 successors.
        for _ in range(title_length - 1):
            successors = self.freq_dist[word]
            if not successors:
                break
            word = random.choice(successors.most_common(3))[0]
            words.append(word)
        return ' '.join(words).lower()

    def build_title_pos_structures(self):
        """Return the POS-tag sequence of every pre-tagged title.

        Note that this reads the full ``tagged_titles`` list, not the
        100-title ``tagged_titles_slice``.
        """
        return [[pair[1] for pair in tagged_title]
                for tagged_title in self.tagged_titles]

    def compare_readability(self, title_pos_structure,
                            generated_title_pos_structure):
        """Count positions at which the two POS-tag sequences agree.

        Only positions present in both sequences are compared, so the
        result is at most the length of the shorter sequence.
        """
        return sum(
            1
            for real_tag, generated_tag in zip(title_pos_structure,
                                               generated_title_pos_structure)
            if real_tag == generated_tag
        )

    def generate_title(self, max_attempts=None):
        """Return a generated title that passes all acceptance heuristics.

        A candidate from ``build_title`` is accepted when it
          * produces at least one token,
          * does not end in an English stopword,
          * does not occur verbatim (case-insensitively) in the existing
            titles,
          * has at least ``min_title_length`` words,
          * does not end in a disallowed part of speech, and
          * matches the POS sequence of some real title in at least 80% of
            its token positions.

        Args:
            max_attempts: optional cap on the number of candidates tried.
                ``None`` (the default) retries until a candidate is
                accepted, which may never happen if the model cannot
                produce an acceptable title.

        Returns:
            The accepted title as a lower-cased string.

        Raises:
            RuntimeError: if ``max_attempts`` candidates were all rejected.
        """
        # Everything below is identical for every candidate; build it once
        # instead of on each retry.
        tokenizer = RegexpTokenizer(r"\w+[^\w\s]?\w+")
        stop_words = set(stopwords.words('english'))
        # Candidates are lower-cased, so lower-case the corpus as well;
        # otherwise the duplicate check silently misses mixed-case titles.
        existing_titles = " ".join(mongo_config.titles).lower()

        attempts = 0
        while max_attempts is None or attempts < max_attempts:
            attempts += 1
            generated_title = self.build_title()

            # Tokenize the generated title. The pattern needs at least two
            # word characters per token, so a candidate can yield no tokens
            # at all; such a candidate is unusable.
            tokenized = tokenizer.tokenize(generated_title)
            if not tokenized:
                continue
            # Disallow title to end in a stopword.
            if tokenized[-1] in stop_words:
                continue
            # Disallow duplicates of existing titles.
            if generated_title in existing_titles:
                continue
            # Require a certain title length.
            if len(generated_title.split()) < self.min_title_length:
                continue

            # Tag only candidates that survived the cheap checks.
            generated_structure = [tag for _, tag in nltk.pos_tag(tokenized)]
            # Disallow ending on certain types of words.
            if generated_structure[-1] in self._DISALLOWED_FINAL_TAGS:
                continue

            # Require an 80% match in POS tags with some real title.
            match_cutoff = int(math.ceil(len(tokenized) * .80))
            for sentence_structure in self.title_pos_structures:
                match_count = self.compare_readability(sentence_structure,
                                                       generated_structure)
                if match_count >= match_cutoff:
                    return generated_title

        raise RuntimeError(
            "no acceptable title generated in %d attempts" % max_attempts)