forked from ThiagoCF05/webnlg
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparser_original.py
More file actions
101 lines (80 loc) · 3.47 KB
/
parser_original.py
File metadata and controls
101 lines (80 loc) · 3.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
__author__ = 'thiagocastroferreira'
"""
Author: Thiago Castro Ferreira
Date: 18/10/2018
Description:
Parser for the WebNLG corpus in the INLG hackathon
PYTHON VERSION: 2.7
"""
import os
import json
import xml.etree.ElementTree as ET
from entry import *
def _xml_triples(tripleset):
    """Build Triple objects from the children of a tripleset XML element.

    The text of every child is split on ' | ' into exactly three parts
    (subject, predicate, object); any other number of parts raises
    ValueError from the unpacking. Single quotes are stripped from the
    subject and the object, the predicate is kept untouched.

    :param tripleset: XML element whose children each hold one triple
    :return: list of Triple objects, in document order
    """
    triples = []
    for element in tripleset:
        subject, predicate, obj = element.text.split(' | ')
        triples.append(Triple(subject=subject.replace('\'', ''),
                              predicate=predicate,
                              object=obj.replace('\'', '')))
    return triples


def parse(xml_file):
    """Parse a WebNLG XML file into a list of Entry objects.

    Triple, Lex and Entry are not defined in this module; they are
    presumably supplied by ``from entry import *`` -- confirm against
    entry.py.

    :param xml_file: path (or file object) accepted by ElementTree.parse
    :return: list with one Entry per child of the root's 'entries' element;
        templates are left empty and the entity map is an empty dict
    :raises KeyError: when an entry lacks 'eid', 'size' or 'category', or a
        lex element lacks 'comment' or 'lid'
    """
    root = ET.parse(xml_file).getroot()

    dataset = []
    for entry in root.find('entries'):
        # Attributes are read first so a malformed entry fails before any
        # triple is processed, as in the original implementation.
        eid = entry.attrib['eid']
        size = entry.attrib['size']
        category = entry.attrib['category']

        # Both triplesets share the same textual layout, hence one helper.
        originaltripleset = _xml_triples(entry.find('originaltripleset'))
        modifiedtripleset = _xml_triples(entry.find('modifiedtripleset'))

        lexList = [
            Lex(comment=lex.attrib['comment'], lid=lex.attrib['lid'],
                text=lex.text, template='')
            for lex in entry.findall('lex')
        ]

        dataset.append(Entry(eid=eid, size=size, category=category,
                             originaltripleset=originaltripleset,
                             modifiedtripleset=modifiedtripleset,
                             entitymap={}, lexEntries=lexList))
    return dataset
def _json_triples(tripleset):
    """Build Triple objects from a list of JSON triple dictionaries.

    Every dictionary must provide the keys 'subject', 'property' and
    'object'. Single quotes are stripped from the subject and the object,
    the property is used unchanged as the predicate.

    :param tripleset: iterable of dictionaries, one per triple
    :return: list of Triple objects, in input order
    """
    triples = []
    for triple in tripleset:
        subject = triple['subject']
        predicate = triple['property']
        obj = triple['object']
        triples.append(Triple(subject=subject.replace('\'', ''),
                              predicate=predicate,
                              object=obj.replace('\'', '')))
    return triples


def parse_json(json_file):
    """Parse a JSON corpus file into a list of Entry objects.

    The file must hold an object with an 'entries' list. Every item of that
    list is a dictionary whose first key maps to the entry content
    ('xml_id', 'size', 'category', 'originaltriplesets',
    'modifiedtripleset' and 'lexicalisations'). Only the first element of
    ['originaltriplesets']['originaltripleset'] is used.

    Triple, Lex and Entry are not defined in this module; they are
    presumably supplied by ``from entry import *`` -- confirm against
    entry.py.

    :param json_file: path of the JSON file to read
    :return: list with one Entry per item of 'entries'; templates are left
        empty and the entity map is an empty dict
    :raises KeyError: when one of the expected keys is missing
    """
    # Close the file deterministically instead of leaking the handle.
    with open(json_file) as handle:
        entries = json.load(handle)['entries']

    dataset = []
    for entry in entries:
        # The entry content sits under the first key of the dictionary.
        key = list(entry.keys())[0]
        content = entry[key]

        eid = content['xml_id']
        size = content['size']
        category = content['category']

        originaltripleset = _json_triples(
            content['originaltriplesets']['originaltripleset'][0])
        modifiedtripleset = _json_triples(content['modifiedtripleset'])

        lexList = []
        for lex in content['lexicalisations']:
            comment = lex['comment']
            lid = lex['xml_id']
            text = lex['lex']
            lexList.append(Lex(comment=comment, lid=lid, text=text, template=''))

        dataset.append(Entry(eid=eid, size=size, category=category,
                             originaltripleset=originaltripleset,
                             modifiedtripleset=modifiedtripleset,
                             entitymap={}, lexEntries=lexList))
    return dataset
def run_parser(set_path):
    """Collect the entries of every file found two levels below set_path.

    Each non-hidden item of set_path is listed as a directory, and each
    non-hidden item inside it is handed to parse(). Names starting with
    '.' are skipped at both levels.

    :param set_path: root directory of the set to read
    :return: flat list with the entries of all parsed files, in the order
        os.listdir reports them
    """
    def visible(path):
        # Directory listing without names that start with a dot.
        return [name for name in os.listdir(path) if not str(name).startswith('.')]

    entryset = []
    for triple_dir in visible(set_path):
        triple_path = os.path.join(set_path, triple_dir)
        for category_file in visible(triple_path):
            parsed = parse(os.path.join(triple_path, category_file))
            entryset.extend(list(parsed))
    return entryset