-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
141 lines (113 loc) · 4.2 KB
/
preprocessing.py
File metadata and controls
141 lines (113 loc) · 4.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import pickle
import subprocess
import re
import glob
# Path of the pickle file that dump_detex_data() writes and
# read_detex_data() reads back.
DUMP_FILE = 'detexedsources.dump'
def get_source_files():
    '''
    Collect the paths of the tex source files that make up the dataset.

    Every glob pattern below is expanded in the order listed and the
    matching paths are concatenated into a single list.
    '''
    patterns = (
        'dataset/calculus/*/*.tex',
        'dataset/discover_physics/*/*.tex',
        'dataset/fundamentals-of-calculus/*/*.rbtex',
        'dataset/general_relativity/*/*.rbtex',
        'dataset/special_relativity/*/*.rbtex',
        'dataset/ThinkCPP/book/ch*.tex',
        'dataset/ThinkJava/thinkjava.tex',
        'dataset/javajavajava/texfiles/*.tex',
        'dataset/lm/lm/qm/*.rbtex',
        'dataset/lm/lm/vw/*.rbtex',
        'dataset/*.tex',
    )
    return [match for pattern in patterns for match in glob.glob(pattern)]
def get_plain_text(filename):
    '''
    Convert a tex source file to plain text with the external detex tool.

    Input:
        filename - path of the tex source file passed to detex
    Returns:
        The standard output of detex decoded to str, with every newline
        replaced by a single space.
    Raises:
        FileNotFoundError - if the detex executable cannot be launched.
    '''
    try:
        process = subprocess.Popen(['detex', filename], stdout=subprocess.PIPE)
    except FileNotFoundError as exc:
        raise FileNotFoundError('Please verify that detex is installed on your system. https://github.com/pkubowicz/opendetex') from exc
    # stderr is not piped, so the second element of the pair is always None.
    raw_output, _ = process.communicate()
    # communicate() returns bytes. Calling str() on bytes yields the
    # "b'...'" repr (prefix, quotes and escape sequences included), so decode
    # instead. NOTE(review): UTF-8 is assumed; undecodable bytes are replaced
    # rather than raising - confirm the encoding of the dataset.
    text = raw_output.decode('utf-8', errors='replace')
    return text.replace('\n', ' ')
def dump_detex_data():
    '''
    Convert every dataset source file to plain text and pickle the result.

    Builds a dictionary of {filename: {'plaintext': <text>}} for each file
    returned by get_source_files(), writes it to DUMP_FILE and prints how
    many files were processed.
    '''
    properties_dict = {
        source: {'plaintext': get_plain_text(source)}
        for source in get_source_files()
    }
    # The context manager closes the dump file even if pickling fails.
    with open(DUMP_FILE, 'wb') as fd:
        pickle.dump(properties_dict, fd)
    print('Processed %d source files and dumped data' % len(properties_dict))
def read_detex_data():
    '''
    Load the pickled object stored in DUMP_FILE.

    Returns:
        The unpickled object; when the file was produced by
        dump_detex_data() this is a {filename: {'plaintext': <text>}} dict.
    '''
    # NOTE: pickle.load can execute arbitrary code while deserializing, so
    # DUMP_FILE must only ever come from a trusted source.
    # The context manager makes sure the file handle is closed after loading.
    with open(DUMP_FILE, 'rb') as fd:
        return pickle.load(fd)
def _clean_headings(raw_matches):
    '''
    Strip trailing index/label markup and sub-index parts from captured
    index or heading text, then return the unique results in sorted order.
    '''
    # Cut from the first "}...index", "}...label" or "!" to the end.
    # We will include sub indexing, formatting inside indexing, labels etc
    # for final model.
    cleaned = {
        re.sub(r'(}.*?index|}.*?label|!).*', '', text) for text in raw_matches
    }
    # De-duplicate AFTER cleaning (two raw captures can collapse to the same
    # text) and sort so the result does not depend on set iteration order.
    return sorted(cleaned)


def feature_extraction(tex_source):
    '''
    Input:
        tex_source - tex source (str)
    Returns:
        A dictionary where each key represents a feature and value is a list of
        words that exhibit that feature.
        Eg:
        {
            'indices' : ['Apple','Banana',.....],
            'sections' : ['Fruits', 'Veggies', ...],
            .
            .
            .
        }
        'indices', 'sections' and 'subsections' are de-duplicated and sorted;
        the remaining features keep one entry per occurrence, in source order.
    '''
    # These captures are deliberately greedy: they run to the last "}" on the
    # line and _clean_headings() trims the trailing markup afterwards.
    indices = _clean_headings(re.findall(r'\\index{(.*)}', tex_source))
    sections = _clean_headings(re.findall(r'\\section{(.*)}', tex_source))
    subsections = _clean_headings(re.findall(r'\\subsection{(.*)}', tex_source))

    # Formatting commands are matched non-greedily so several commands on one
    # line yield one entry each instead of a single capture spanning them all.
    # Limitation: an argument containing nested braces is cut at the first "}".
    italicized = re.findall(r'\\emph{(.*?)}', tex_source)
    italicized.extend(re.findall(r'\\textit{(.*?)}', tex_source))
    bold = re.findall(r'\\textbf{(.*?)}', tex_source)
    underline = re.findall(r'\\uline{(.*?)}', tex_source)
    underline.extend(re.findall(r'\\uwave{(.*?)}', tex_source))

    # For size switches only the command itself is recorded, not the text it
    # applies to.
    large = re.findall(r'(\\large|\\Large|\\LARGE|\\huge|\\Huge)', tex_source)

    return {
        'indices': indices,
        'sections': sections,
        'subsections': subsections,
        'italicized': italicized,
        'bold': bold,
        'underline': underline,
        'large': large,
    }
def tf_idf():
    """
    Placeholder for computing the tf-idf of each word as a feature.

    Not implemented yet: the function takes no arguments, does nothing and
    returns None.
    """
    return None
def populate_properties():
    '''
    Add the extracted features of every dumped file to its dictionary entry.

    Reads the dict that is dumped by dump_detex_data and, for each key
    (file), opens that file, runs feature_extraction on its contents and
    merges the resulting features into the value stored for that file.

    Returns:
        The loaded dictionary with every entry updated in place.
    '''
    # Read the dump that is prepopulated
    feature_dict = read_detex_data()
    for filename, properties in feature_dict.items():
        # The context manager closes each source file as soon as it is read,
        # so handles are not leaked while looping over the dataset.
        with open(filename) as fd:
            tex_source = fd.read()
        properties.update(feature_extraction(tex_source))
    return feature_dict
# def get_dataframe_from_properties():