-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcandidate_list_gen.py
More file actions
47 lines (41 loc) · 1.4 KB
/
candidate_list_gen.py
File metadata and controls
47 lines (41 loc) · 1.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import nltk, collections
import numpy as np
from nltk.collocations import *
from nltk.corpus import stopwords
import pandas as pd
import string
def process_text(input_text, bigram_count=60):
    '''
    Generate a candidate keyword list from raw text.

    Removes English stop words, embedded spaces, punctuation marks and
    too-short tokens, then appends frequently co-occurring word pairs
    (bigrams) ranked by pointwise mutual information (PMI).

    Parameters
    ----------
    input_text : str
        Raw text to tokenize and filter.
    bigram_count : int, optional
        Maximum number of bigrams to append (default 60, matching the
        previous hard-coded value).

    Returns
    -------
    list of str
        Filtered single-word tokens followed by "word1 word2" bigram
        strings.
    '''
    # Removal of stop words
    words = nltk.word_tokenize(input_text)
    stop_words = set(stopwords.words("english"))
    filtered_words = [w for w in words if w not in stop_words]
    # Removal of space.
    # NOTE: strings are immutable, so replace() must be reassigned — the
    # original loop discarded the result and silently did nothing.
    filtered_words = [w.replace(" ", "") for w in filtered_words]
    # Removal of punctuation marks
    filtered_words = [''.join(c for c in s if c not in string.punctuation)
                      for s in filtered_words]
    # Drop tokens that became empty, or that have fewer than 3
    # non-digit characters (too short to be meaningful candidates).
    processed_word = [
        each for each in filtered_words
        if each and len(''.join(i for i in each if not i.isdigit())) >= 3
    ]
    # Find pairs of words that occur together at least twice and keep the
    # top `bigram_count` of them, scored by PMI.
    bigram = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(processed_word)
    finder.apply_freq_filter(2)
    bigram_words = finder.nbest(bigram.pmi, bigram_count)
    bigram_word = [first + " " + second for first, second in bigram_words]
    # Candidate list: single words first, then the bigram phrases.
    return processed_word + bigram_word