-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPostAnnotationCleanup.py
More file actions
84 lines (69 loc) · 3.02 KB
/
PostAnnotationCleanup.py
File metadata and controls
84 lines (69 loc) · 3.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""After the manual annotation, cleans up the xml data
to fix an issue found whilst annotating the data: in places a
single sentence had been split into multiple sentences because
abbreviations or numbered lists contain full stops. These
fragments are joined back into a single sentence.
"""
#__all__ = []
__version__ = '0.1'
__author__ = "Steven Smith"
from AcquireData import get_xml
from Logging import LoggingWrapper
from lxml import etree
import re
def main():
    """Rejoin sentences that were wrongly split and write the cleaned XML.

    This is the root function called by the script. It loads
    ``conclusions_with_fw_15_lines_or_less.xml`` through ``get_xml`` and,
    for every ``doc`` element, walks its ``sentence`` elements in order,
    collecting their text. A fragment is held back (to be joined with the
    text that follows it) when it

    * is shorter than 8 characters,
    * begins with a match of the digit / roman-numeral pattern, or
    * ends with one of the listed abbreviations (``e.g.``, ``et al.``, ...).

    Otherwise, a fragment ending in ``.``, ``?`` or ``!`` closes the
    sentence: everything collected so far is joined with single spaces
    and emitted as one new ``sentence`` element whose ``is-fw`` attribute
    is true if any of the joined fragments had ``is-fw`` set to true.

    The sentence whose ``id`` equals the doc's ``num-sentences`` always
    closes the sentence; a full stop is appended if it lacks a sentence
    ending. It is dropped instead of emitted when it begins with a match
    of the numeral pattern or when the joined text is shorter than 8
    characters.

    The result is written to
    ``./conclusions_with_fw_15_lines_or_less_post_cleanup.xml``.
    """
    sentence_endings = ('.', '?', '!')
    abbreviations = ('i.e.', 'e.g.', 'et al.', 'w.r.t.',
                     'cf.', 'pp.', 'etc.', 'i.i.d.')
    with LoggingWrapper('PostAnnotationCleanup'):
        xml_conclusions = get_xml('conclusions_with_fw_15_lines_or_less.xml')
        # Raw strings keep ``\.`` a regex escape rather than an invalid
        # string escape (which newer Python versions warn about).
        # NOTE(review): the numeral list skips 'ix' and 'x', and because
        # match() only anchors the start, any text beginning with a digit,
        # 'i', 'v', 'xi', 'xv' or 'xx' matches -- confirm this is intended.
        re_digit_or_roman_numeral = re.compile(
            r'^([0-9]+|i|ii|iii|iv|v|vi|vii|viii|xi|xii|xiii|xiv|'
            r'xv|xvi|xvii|xviii|xix|xx)(\.)?')
        new_docs = etree.Element('docs')
        for doc in xml_conclusions.find_all("doc"):
            phrase_list, is_fw, sentence_count = [], False, 0
            new_doc = etree.SubElement(
                new_docs, 'doc', attrib=dict(id=doc['id'], key=doc['key']))
            for sentence in doc.find_all("sentence"):
                # A joined sentence is "fw" if any of its fragments is.
                is_fw |= sentence['is-fw'].lower() == 'true'
                text = sentence.text
                phrase_list.append(text)
                has_sentence_ending = text.endswith(sentence_endings)
                last_sentence = doc['num-sentences'] == sentence['id']
                numeral_prefix = re_digit_or_roman_numeral.match(text)

                # Fragments that look like the result of a wrong split are
                # held back so they get joined with the following text.
                hold_back = (len(text) < 8
                             or numeral_prefix
                             or text.endswith(abbreviations))
                should_add_sentence = last_sentence or (
                    has_sentence_ending and not hold_back)
                if not should_add_sentence:
                    continue

                # A final fragment that starts with a numeral match is
                # dropped together with anything still held back.
                if not (last_sentence and numeral_prefix):
                    new_sentence_text = ' '.join(phrase_list)
                    if not has_sentence_ending:
                        new_sentence_text += '.'
                    # Very short leftovers at the end of a doc are dropped.
                    if not (last_sentence and len(new_sentence_text) < 8):
                        sentence_count += 1
                        new_sentence = etree.SubElement(
                            new_doc, 'sentence',
                            attrib={'id': str(sentence_count),
                                    'is-fw': str(is_fw)})
                        new_sentence.text = new_sentence_text
                # Start collecting the next sentence from scratch.
                phrase_list, is_fw = [], False
            new_doc.attrib['num-sentences'] = str(sentence_count)
        with open('./conclusions_with_fw_15_lines_or_less_post_cleanup.xml',
                  'wb') as fp:
            fp.write(etree.tostring(new_docs, pretty_print=True))
# Script entry point: run the cleanup only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()