-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCleanUpData.py
More file actions
269 lines (223 loc) · 9.08 KB
/
CleanUpData.py
File metadata and controls
269 lines (223 loc) · 9.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
"""Contains variables and functions to help clean the data set.
"""
__all__ = ['is_valid_conclusion_or_fw_heading', 'clean_up_heading',
'repair_sentence']
__version__ = '0.1'
__author__ = "Steven Smith"
from nltk import regexp_tokenize, trigrams as get_trigrams
from nltk.corpus import words, stopwords
from nltk.stem import SnowballStemmer
# when this file is loaded the all_words, missing_pdf_word_parts, stemmer,
# heading_stop_words and valid_conclusion_or_fw_headings variables are all
# initialised for use in the functions.
# all_words is a vocabulary of words to act as valid words in a document
all_words = set(words.words())
# fragments that are valid corpus entries but would stop broken words
# (e.g. "bene" + "fi" + "t") from being detected as needing repair
words_to_remove = ['bene', 'c', 'cation', 'eld', 'con', 'cult',
                   'dent', 'er', 'ers', 'es',
                   'o',  # to allow fixing o[ff]
                   'ort',  # to allow fixing [e]ff[ort]
                   'ne', 're',
                   't']  # to allow fixing [bene]fi[t]
all_words.difference_update(words_to_remove)
# domain vocabulary missing from the nltk corpus that the repair
# functions must be able to recognise as valid words
words_to_add = ['afford', 'affordance', 'affordances',
                'antiunification',
                'benefit', 'benefitted', 'benefits',
                'clarified', 'clarifies',
                'classifiers', 'classification', 'classifications',
                'certificate', 'certificates', 'certification',
                'coefficients', 'coefficient',
                'confident',
                'configuration', 'configurations',
                'conflict', 'conflicts',
                'convexification',
                'define', 'defines', 'defined', 'defining',
                'definition', 'definitions',
                'difficult', 'difficulties',
                'differs', 'differing', 'differences',
                'effort', 'efforts',
                'field', 'fields',
                'findings',
                'first', 'first-order', 'first order',
                'fits',
                'exemplified',
                'identified', 'identifies', 'identifiability', 'identifiers',
                'influenced', 'influencer', 'influences', 'influencing',
                'offline',
                'justified', 'justifier', 'justifies',
                'justification', 'justifications',
                'magnified',
                'misclassification', 'misclassifications',
                'modified', 'modifier', 'modifiers', 'modifies',
                'modifications',
                'profit', 'profits', 'profitable',
                'quantifies', 'quantifier', 'quantifiers', 'quantified',
                'reconfiguration', 'reconfigurations',
                'refine', 'refines', 'refined',
                'reflect', 'reflects', 'reflected',
                'reflection', 'reflections',
                'sacrificed',
                'simplifications', 'simplification', 'simplifies',
                'specified', 'specifies', 'specifics',
                'specification', 'specifications',
                'traffic',
                'unifies', 'unifier', 'unifiers', 'unified',
                'unidentifiability',
                'verified', 'verifier', 'verification', 'verifiability',
                'workflows']
all_words.update(words_to_add)
# missing_pdf_word_parts - parts of words which are sometimes missing after
# text extraction by CERMINE.  Order matters: longer ligatures are tried
# before their shorter prefixes.
missing_pdf_word_parts = ['ffl', 'ffi', 'ff', 'fl', 'fi']
# stemmer and stop words to help parse the headings
stemmer = SnowballStemmer('english')
heading_stop_words = set(stopwords.words("english"))
# roman numerals frequently appear as section numbers in headings
heading_stop_words.update(['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii',
                           'ix', 'x', 'xi', 'xii', 'xiii'])
# 'further' must survive so "further work" headings remain detectable
heading_stop_words.discard('further')
# valid_conclusion_or_fw_headings - the stemmed headings to look for that are
# conclusion/further work sections.
valid_conclusion_or_fw_headings = {
    'conclus',
    'conclus futur',
    'conclus further',
    'conclus relat',
    'discuss conclus',
    'conclus open',
    'chapter conclus',
    'futur work',
    'conclus futur work',
    'conclusion',
    'conclus perspect',
    'conclus futur research'
}
def need_to_fix_word(word):
    """Determines if a word is missing word parts from the
    missing_pdf_word_parts at the beginning or end of the word.

    :param word: word to determine if it needs repairing
    :return: tuple, either (True, corrected_word), if the word needed to be
        repaired or (False, None) otherwise
    """
    # a word already in the vocabulary never needs repairing
    if word.lower() in all_words:
        return False, None
    # try every prefix candidate before any suffix candidate, keeping
    # the fixed ligature order of missing_pdf_word_parts
    prefixed = [part + word for part in missing_pdf_word_parts]
    suffixed = [word + part for part in missing_pdf_word_parts]
    for candidate in prefixed + suffixed:
        if candidate.lower() in all_words:
            return True, candidate
    return False, None
def need_to_fix_words(bigram):
    """Determines if a pair of words are missing word parts from the
    missing_pdf_word_parts between them.

    :param bigram: a tuple word pair
    :return: tuple, either (True, corrected_word), if the word needed to be
        repaired or (False, None) otherwise
    """
    left, right = bigram
    both_known = left.lower() in all_words and right.lower() in all_words
    # only attempt a join when at least one half is not a known word
    if not both_known:
        for part in missing_pdf_word_parts:
            candidate = left + part + right
            if candidate.lower() in all_words:
                return True, candidate
    return False, None
def tokenize(sentence):
    """break a sentence into a word list keeping letters, apostrophes and
    dashes, as well as the remaining individual characters.

    :param sentence: sentence to tokenize
    :return: a list of words
    """
    tokens = []
    word_chars = []
    for ch in sentence:
        if ch.isalpha() or ch in "'-":
            # still inside a word; keep accumulating
            word_chars.append(ch)
        else:
            # a separator ends any word in progress and is kept as its
            # own single-character token
            if word_chars:
                tokens.append(''.join(word_chars))
                word_chars = []
            tokens.append(ch)
    # flush a word that runs to the end of the sentence
    if word_chars:
        tokens.append(''.join(word_chars))
    return tokens
def repair_sentence(sentence):
    """Repairs the sentence if it is found to have known missing letters.

    Two passes: first, rejoin token pairs split by a missing ligature
    (e.g. "di" + "erent" -> "different") using trigrams of the form
    (word, separator, word); second, repair single words missing a
    ligature at either end via need_to_fix_word.

    :param sentence: sentence to repair
    :return: a repaired sentence including its missing letters
    """
    tokens = tokenize(sentence)
    len_tokens = len(tokens)
    # fewer than three tokens cannot form a trigram; nothing to rejoin
    if len_tokens < 3:
        return ''.join(tokens)
    new_tokens = []
    # counts trigrams still to skip after a join consumed their tokens
    skip_next_trigram = 0
    trigrams = list(get_trigrams(tokens))
    last_trigram = trigrams[-1]
    for trigram in trigrams:
        if skip_next_trigram > 0:
            skip_next_trigram -= 1
            continue
        # if the first and last part of the trigram are words determine if it
        # is missing letters between the two. If it is put them back in to
        # repair the word.
        word1, word2, word3 = trigram
        if word1[0].isalpha() \
                and not word2[0].isalpha() \
                and word3[0].isalpha():
            need_fix, new_word = need_to_fix_words((word1, word3))
            if need_fix:
                # joined word consumed word2 and word3, so skip the two
                # trigrams that start on those tokens.
                # NOTE(review): if the join happens on the second-to-last
                # trigram, the final token falls inside the skipped last
                # trigram and is silently dropped — confirm intended.
                new_tokens.append(new_word)
                skip_next_trigram = 2
            else:
                new_tokens.append(word1)
        else:
            if trigram == last_trigram:
                # the final trigram contributes all three tokens since no
                # later trigram will emit them.
                # NOTE(review): this is a value comparison, so an earlier
                # trigram equal to the last one would also be fully joined
                # here, duplicating tokens — verify against real inputs.
                new_tokens.append(''.join(trigram))
            else:
                new_tokens.append(word1)
    # fix single word issues such as [fl]oat and o[ff]
    new_tokens2 = []
    for word in tokenize(''.join(new_tokens)):
        need_fix, new_word = need_to_fix_word(word)
        if need_fix:
            new_tokens2.append(new_word)
        else:
            new_tokens2.append(word)
    return ''.join(new_tokens2)
def clean_up_heading(heading):
    """Repairs the heading and then only returns words containing at least two
    characters that are not in the stop word list.

    :param heading: the heading to clean up
    :return: a cleaned up heading
    """
    repaired = repair_sentence(heading)
    # keep only runs of two or more letters, drop stop words, then stem
    kept = [stemmer.stem(token)
            for token in regexp_tokenize(repaired, '[a-zA-Z]{2,}')
            if token not in heading_stop_words]
    return ' '.join(kept).lower()
def clean_up_sentence(sentence):
    """Returns a sentence only consisting of words containing at least two
    characters.

    :param sentence: sentence to clean up
    :return: a lower case sentence only consisting of words containing at
        least two characters.
    """
    # runs of two or more letters only; everything else is discarded
    letter_runs = regexp_tokenize(sentence, '[a-zA-Z]{2,}')
    return ' '.join(letter_runs).lower()
def is_valid_conclusion_or_fw_heading(heading):
    """Determine if a heading contains text identifying it as a conclusion or
    further work.

    :param heading: heading to check
    :return: True, if the heading is a conclusion or further work, otherwise
        False.
    """
    cleaned = clean_up_heading(heading)
    return cleaned in valid_conclusion_or_fw_headings