keywords.py
import nltk
import sys
import json
import os.path
import collections
import string
from nltk.tokenize import regexp_tokenize, word_tokenize, wordpunct_tokenize
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from collections import Counter
from collections import OrderedDict
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
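# Note: this module targets Python 2 (it relies on reload(sys)/sys.setdefaultencoding,
# unicode() and the print statement).
#
# Input layout assumed by the functions below (inferred from the code, not documented
# elsewhere): each recording <name> has two companion files,
#   <name>.txt  : transcript text; lines beginning with "SPEAKER:" are skipped
#   <name>.json : JSON object with a "words" list of {"name": <word>, "time": <timestamp>} entries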

def get_lemmatize_pos(tag):
    if tag.startswith('N'):
        return 'n'
    elif tag.startswith('V'):
        return 'v'
    elif tag.startswith('J'):
        return 'a'
    elif tag.startswith('R'):
        return 'r'
    else:
        return ''
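
# Illustrative example (hypothetical inputs): pos_tag() yields Penn Treebank tags such
# as 'NNS' or 'VBG', and get_lemmatize_pos() reduces them to the single-letter POS codes
# that WordNetLemmatizer.lemmatize() expects, e.g.
#   WordNetLemmatizer().lemmatize('meetings', get_lemmatize_pos('NNS'))  # -> 'meeting'
#   WordNetLemmatizer().lemmatize('running', get_lemmatize_pos('VBG'))   # -> 'run'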

def keywords_with_word_count(input_filename, output_filename):
    if os.path.isfile(input_filename + '.json') and os.path.isfile(input_filename + '.txt'):
        # To prevent unicode errors
        reload(sys)
        sys.setdefaultencoding("utf-8")
        # Extract the top keywords from the transcript text file
        keywords_wc = keywords_all(input_filename + '.txt')
        value = map(list, keywords_wc.items())
        value = sorted(value, key=lambda x: int(x[1]), reverse=True)
        wc_json = {}
        wc_json['data'] = value
        # Write to the output file
        with open(output_filename + '.wc.json', 'w') as output_file:
            json.dump(wc_json, output_file)

def keywords_with_timestamps(input_filename, output_filename):
    if os.path.isfile(input_filename + '.json') and os.path.isfile(input_filename + '.txt'):
        # To prevent unicode errors
        reload(sys)
        sys.setdefaultencoding("utf-8")
        # Read the JSON input file
        json_file = open(input_filename + '.json').read()
        json_data = json.loads(json_file)
        json_keywords = json_data['words']  # List of per-word JSON objects
        ts_json = {}
        # Extract the top keywords
        keywords_wc = keywords_all(input_filename + '.txt')  # Dictionary of keyword -> count
        # Build the required output
        result_json_value = []
        for ele in keywords_wc.items():
            ts_json = {}
            top_key = ele[0]
            ts_values = []
            for key in json_keywords:
                if top_key == key['name']:
                    #print key['name']
                    ts_values.append(key['time'])
            ts_json["keyword"] = top_key
            ts_json["timestamps"] = ts_values
            #print ts_json
            result_json_value.append(ts_json)
        #print result_json_value
        result_json = {}
        result_json['data'] = result_json_value
        # Write to the output file
        with open(output_filename + '.ts.json', 'w') as output_file:
            json.dump(result_json, output_file)

def keywords_all(filename):
    f = open(filename, 'r')
    result = []
    wlem = WordNetLemmatizer()
    stop = list(string.punctuation) + ['via', 'blah', 'rt', 'etc', 'eg', 'ex', 'btw', 'bn', 'omg', 'bfg', 'ftw', 'wtf', 'lol', 'bff', 'aka', 'hi', 'bye', 'thanks', 'hello', 'morning', 'night', 'day', 'tomorrow', 'meeting', 'email', 'recording', 'demo', 'thing', 'things']
    # Remove stop words, pronouns and articles
    result_tagged = []
    for line in f.readlines():
        if not line.startswith("SPEAKER:"):
            line = unicode(line, errors='ignore')
            #print stop
            line = ' '.join([wrd for wrd in line.lower().split() if wrd not in stop])
            line = ' '.join([wrd for wrd in wordpunct_tokenize(line) if len(wrd) > 1])
            line = line.lower()
            #line = ' '.join([wrd for wrd in word_tokenize(line) if len(wrd) > 1])
            line = regexp_tokenize(line, pattern=r'\w+')
            result_tagged.append(pos_tag(line))
    f.close()
    # Extract only nouns
    result_stoplist = []
    for line in result_tagged:
        for l in line:
            #if l[1]=='FW' or l[1].startswith('N'):
            if l[1] == 'NN':
                print l[0]
            if l[1].startswith('N'):
                result_stoplist.append(l)
    # Lemmatize the extracted words
    result_lemmatize = []
    for ele in result_stoplist:
        if len(ele[1]) != 1:
            if get_lemmatize_pos(ele[1]) != '':
                #print ele[0] + " " + wlem.lemmatize(ele[0], get_lemmatize_pos(ele[1]))
                result_lemmatize.append(wlem.lemmatize(ele[0], get_lemmatize_pos(ele[1])))
            else:
                result_lemmatize.append(ele[0])
    # Get the count of all words
    count = Counter(result_lemmatize)
    # Sort in descending order
    sorted_by_count = sorted(count, key=count.get, reverse=True)
    keyword_wc = {}
    for key in sorted_by_count:
        keyword_wc[key] = count[key]
    sorted_keyword = OrderedDict(sorted(keyword_wc.items(), key=lambda x: x[1], reverse=True))
    # Return only the top 15 words
    top_sorted_keyword = collections.Counter(sorted_keyword).most_common(15)
    return dict(top_sorted_keyword)
#keywords_with_word_count('sample2', 'sample2_output')
#keywords_with_timestamps('sample2', 'sample2_output')
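
# Expected outputs for the example calls above ('sample2' is a hypothetical recording name):
#   sample2_output.wc.json : {"data": [[keyword, count], ...]} sorted by count, highest first
#   sample2_output.ts.json : {"data": [{"keyword": ..., "timestamps": [t1, t2, ...]}, ...]}
# Running these functions assumes the NLTK 'averaged_perceptron_tagger' and 'wordnet'
# data packages have been downloaded (needed by pos_tag and WordNetLemmatizer).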