-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdataframe_generation.py
More file actions
55 lines (44 loc) · 1.65 KB
/
dataframe_generation.py
File metadata and controls
55 lines (44 loc) · 1.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import nltk, collections
import pandas as pd
import numpy as np
import math
# Number of source files in the corpus. add_tf_idf uses this value as the
# total document count in the numerator of its IDF ratio.
n_src = 88
def generate_dataframe(filename, feat_dict, feature_list_from_tex):
    """Build a per-word feature table for one source file.

    Args:
        filename: Value stored in the ``filename`` column of every row.
        feat_dict: Mapping that must contain ``"candidate_list"`` (an iterable
            of candidate words) and, for every name in
            ``feature_list_from_tex``, a collection of words having that
            feature.
        feature_list_from_tex: Names of the binary feature columns to add.

    Returns:
        A DataFrame with one row per distinct candidate word that survives
        POS filtering. Columns are ``word``, ``pos``, ``wordcount`` (number of
        occurrences in the candidate list), one 0/1 indicator column per POS
        tag, one 0/1 column per requested feature, and ``filename``.
    """
    # Words carrying any of these tags are dropped from the table.
    excluded_tags = frozenset([
        'VB', 'VBP', 'PRP', 'IN', 'RB', 'DT', 'WDT', 'WP', 'WRB', 'UH', 'TO',
        'RBR', 'RBS', 'POS', 'MD', 'EX', 'WP$', 'PRP$', '$', 'CC', 'LS',
        'PDT', 'RP', 'VBZ', 'CD', 'JJS', 'JJR',
    ])
    # Indicator columns that are always present in the result.
    # NOTE(review): 'VB' is also in excluded_tags, so its column stays all-zero.
    pos_columns = ['NN', 'NNP', 'NNS', 'VB', 'VBG', 'VBD', 'VBN', 'FW', 'NNPS', 'JJ']

    frequency = collections.Counter(feat_dict["candidate_list"])

    # The distinct words are tagged together as a single token sequence.
    # Pass a real list rather than a dict keys view.
    tagged = nltk.pos_tag(list(frequency))
    kept = [(word, tag) for word, tag in tagged if tag not in excluded_tags]

    df = pd.DataFrame({
        'word': [word for word, _ in kept],
        'pos': [tag for _, tag in kept],
        'wordcount': [frequency[word] for word, _ in kept],
    })

    indicators = pd.DataFrame(0, index=df.index, columns=pos_columns)
    df2 = pd.concat([df, indicators], axis=1)

    # DataFrame.set_value was removed in pandas 1.0; .at is the scalar setter.
    # NOTE(review): as before, a tag outside pos_columns gets its own new
    # column here instead of being ignored.
    for idx, tag in zip(df2.index, df2['pos']):
        df2.at[idx, tag] = 1

    for feat in feature_list_from_tex:
        df2[feat] = 0
        df2.loc[df2['word'].isin(feat_dict[feat]), feat] = 1

    df2["filename"] = filename
    return df2
def add_tf_idf(df, n_documents=None):
    """Return a copy of ``df`` with ``idf`` and ``tf-idf`` columns appended.

    The document frequency of a word is the number of rows in ``df`` holding
    that word. ``idf`` is ``log(n_documents / (document_frequency + 1))`` and
    ``tf-idf`` is ``wordcount * idf``.

    Args:
        df: DataFrame with at least ``word`` and ``wordcount`` columns.
        n_documents: Total number of documents in the corpus. Defaults to the
            module-level ``n_src`` when omitted.

    Returns:
        A new DataFrame; the input is not modified. Row order follows ``df``
        and the index is reset by the merge.

    Raises:
        ValueError: If ``n_documents`` is not positive.
    """
    if n_documents is None:
        n_documents = n_src
    if n_documents <= 0:
        raise ValueError("n_documents must be positive")

    # Drop results of an earlier call; otherwise merge() would emit
    # idf_x / idf_y and the tf-idf line below would raise KeyError.
    df = df.drop(columns=["idf", "tf-idf"], errors="ignore")

    doc_freq = df.groupby('word').size()
    # float() forces true division even where integer division would floor.
    idf_df = pd.DataFrame({
        'word': doc_freq.index,
        'idf': np.log(float(n_documents) / (doc_freq.values + 1)),
    })

    df = df.merge(idf_df, on='word', how='left')
    df["tf-idf"] = df["wordcount"] * df["idf"]
    return df