# click_bait/title_generator.py (fowler446/click_bait, master branch)

import nltk, random, math, mongo_config, re, itertools
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

class TitleGenerator:
    """Generate new titles from a bigram model built over existing titles.

    At class-definition time the pre-tokenized and pre-tagged titles are
    loaded from ``tokenized_titles.txt`` and ``tagged_titles.txt``.  Each
    instance then picks a random window of up to 100 consecutive titles,
    builds a conditional frequency distribution of word bigrams from that
    window, and uses it in ``generate_title`` to produce candidates that are
    filtered by a handful of heuristics (last word, length, POS structure).
    """

    # Number of consecutive titles sampled per instance.
    _SLICE_SIZE = 100

    # POS tags a generated title is not allowed to end on.  The original list
    # spelled the pronoun tags as "PP", "PPS" and "WPS"; the Penn Treebank
    # tags are "PRP", "PRP$" and "WP$", so those are added here.  The old
    # spellings are kept so nothing that used to be rejected is now accepted.
    _DISALLOWED_FINAL_TAGS = frozenset([
        "JJ", "CC", "CD", "DT", "JJS", "JJR", "TO", "IN", "LS", "MD", "PDT",
        "POS", "PP", "PPS", "PRP", "PRP$", "SYM", "UH", "VBD", "VBG", "VBN",
        "VBP", "VBZ", "WDT", "WP", "WPS", "WP$", "WRB",
    ])

    #Load tokens and tags from file
    # SECURITY NOTE: eval() executes whatever these files contain.  Only load
    # files you generated yourself; if they hold nothing but literals,
    # ast.literal_eval would be a safer replacement.
    with open('tokenized_titles.txt', 'r') as thefile:
        tokenized_titles = eval(thefile.read())

    with open('tagged_titles.txt', 'r') as thefile:
        tagged_titles = eval(thefile.read())

    def __init__(self):
        """Sample a random window of titles and precompute the model stats."""
        titles = mongo_config.titles
        #choose random slice of 100 titles from dataset
        # max(0, ...) keeps the randint range valid when the dataset holds
        # fewer than 100 titles (the slice then simply covers all of them).
        self.rand = random.randint(0, max(0, len(titles) - self._SLICE_SIZE))
        end = self.rand + self._SLICE_SIZE
        self.titles = titles[self.rand:end]
        self.tokenized_titles_slice = self.tokenized_titles[self.rand:end]
        self.tagged_titles_slice = self.tagged_titles[self.rand:end]
        self.set_title_range()

        #generate required stats for generation model
        # build_freq_dist reads self.bigrams, so the order matters here.
        self.bigrams = self.build_bigrams()
        self.freq_dist = self.build_freq_dist()

        #generate required stats for title heuristics
        self.title_pos_structures = self.build_title_pos_structures()

    #compute range of title sizes from random 100 slice
    def set_title_range(self):
        """Set ``min_title_length`` / ``max_title_length`` from the sampled titles.

        Lengths are whitespace-separated word counts.  Both bounds start from
        a baseline of 5, so the minimum is never above 5 and the maximum is
        never below 5, whatever the sampled titles look like.
        """
        lengths = [len(title.split()) for title in self.titles]
        self.min_title_length = min([5] + lengths)
        self.max_title_length = max([5] + lengths)

    def build_freq_dist(self):
        """Return a ConditionalFreqDist over every bigram in ``self.bigrams``."""
        return nltk.ConditionalFreqDist(
            itertools.chain.from_iterable(self.bigrams))

    def build_bigrams(self):
        """Return one list of (word, next_word) pairs per sampled title.

        ``nltk.bigrams`` may return a one-shot generator; the pairs are
        materialized so ``self.bigrams`` is still usable after
        ``build_freq_dist`` has iterated over it.
        """
        return [list(nltk.bigrams(title))
                for title in self.tokenized_titles_slice]

    def first_words_list(self):
        """Return the first token of every non-empty sampled title."""
        # Titles with no tokens are skipped instead of raising IndexError.
        return [title[0] for title in self.tokenized_titles_slice if title]

    def build_title(self):
        """Build one candidate title by walking the bigram model.

        A seed word is drawn from the first words of the sampled titles, then
        up to ``title_length`` further words are appended, each chosen at
        random from the three most common followers of the previous word.
        The walk stops early when a word has no recorded follower.

        Returns the words lower-cased and space-separated, with a trailing
        space (kept for compatibility with existing callers).
        """
        word = random.choice(self.first_words_list()) #choose random seed word from all starting words
        title_length = random.randint(self.min_title_length, self.max_title_length)  #choose random length in range
        words = [word]

        for _ in range(title_length):
            followers = self.freq_dist[word]
            if not followers:
                break
            word = random.choice(followers.most_common(3))[0]
            words.append(word)

        return (' '.join(words) + ' ').lower()

    def build_title_pos_structures(self):
        """Return the POS-tag sequence of every tagged title.

        NOTE(review): this reads the full ``tagged_titles`` dataset, not
        ``tagged_titles_slice`` -- confirm that is intended.
        """
        return [[pair[1] for pair in tagged_title]
                for tagged_title in self.tagged_titles]

    def compare_readability(self,title_pos_structure, generated_title_pos_structure):
        """Count positions where the two tag sequences carry the same tag.

        Only the overlapping prefix (the length of the shorter sequence) is
        compared.
        """
        return sum(
            1
            for real_tag, generated_tag
            in zip(title_pos_structure, generated_title_pos_structure)
            if real_tag == generated_tag
        )

    def generate_title(self):
        """Return a generated title that passes all heuristics.

        Candidates from ``build_title`` are retried until one
          * tokenizes to at least one token,
          * does not end in an English stopword,
          * is not a substring of the space-joined existing titles,
          * has at least ``min_title_length`` whitespace-separated words,
          * does not end on a disallowed POS tag, and
          * matches some known title's POS structure in at least 80% of its
            token positions (rounded up).

        The loop has no attempt limit, so it runs until a candidate passes.
        """
        # Loop-invariant helpers are built once rather than per candidate.
        tokenizer = RegexpTokenizer(r"\w+[^\w\s]?\w+")
        stop_words = set(stopwords.words('english'))
        # NOTE(review): candidates are lower-cased, so this check only fires
        # if the stored titles are lower-case as well -- confirm.
        existing_titles_text = " ".join(mongo_config.titles)

        while True:
            generated_title = self.build_title()

            #tokenize generated title
            tokenized = tokenizer.tokenize(generated_title)
            if not tokenized:
                # Nothing to inspect; avoids IndexError on tokenized[-1].
                continue

            #disallow title to end in stopword
            if tokenized[-1] in stop_words:
                continue

            #disallow exact duplicates of existing titles
            if generated_title in existing_titles_text:
                continue

            #require a certain title length
            if len(generated_title.split()) < self.min_title_length:
                continue

            # Tagging is the expensive step, so it runs only after the cheap
            # rejections above.
            generated_structure = [pair[1] for pair in nltk.pos_tag(tokenized)]

            #disallow ending on certain types of words
            if generated_structure[-1] in self._DISALLOWED_FINAL_TAGS:
                continue

            #compute 80% match in pos tags
            match_cutoff = int(math.ceil(len(tokenized) *.80))

            #check if satisfies readability threshold
            if any(
                self.compare_readability(structure, generated_structure) >= match_cutoff
                for structure in self.title_pos_structures
            ):
                return generated_title