forked from ThiagoCF05/webnlg
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtemplate.py
More file actions
120 lines (94 loc) · 3.69 KB
/
template.py
File metadata and controls
120 lines (94 loc) · 3.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
__author__='thiagocastroferreira'
"""
Author: Thiago Castro Ferreira
Date: 03/11/2018
Description:
Extract templates based on sentence segmentation and the ordering script
PYTHON VERSION: 2.7
"""
import cPickle as p
import collections
import copy
import json
import re

from stanfordcorenlp import StanfordCoreNLP

import order
# Filesystem location handed to the StanfordCoreNLP constructor in run().
# NOTE(review): hard-coded to one developer's machine -- adjust per environment.
STANFORD_PATH=r'/home/tcastrof/workspace/stanford/stanford-corenlp-full-2018-02-27'
# Properties passed to corenlp.annotate() in extract(): only the tokenize and
# ssplit annotators are requested, and the output is asked for as JSON because
# extract() parses it with json.loads.
props = {'annotators': 'tokenize,ssplit','pipelineLanguage':'en','outputFormat':'json'}
def format(tripleset, template, tagentity):
    """Rewrite a template and its triples with ENTITY-n placeholders.

    Entities are numbered from 1 in order of first appearance while walking
    the triples (subject before object), and the tag of each entity is
    replaced in the template text by the matching placeholder.

    Args:
        tripleset: iterable of objects exposing ``subject``, ``predicate``
            and ``object`` attributes.
        template: text in which the tags (the keys of ``tagentity``) occur.
        tagentity: dict mapping each tag to the entity it stands for.

    Returns:
        A ``(triples, template)`` pair: ``triples`` is a list of
        ``'ENTITY-i | predicate | ENTITY-j'`` strings, one per input triple,
        and ``template`` is the rewritten text.

    Raises:
        KeyError: if a triple's subject or object is not a value of
            ``tagentity``.
    """
    # tagentity is tag -> entity; invert it so a tag can be found by entity.
    entitytag = dict((entity, tag) for tag, entity in tagentity.items())

    triples = []
    entitymap = {}

    def placeholder_for(entity):
        # First sighting gets the next free number; later ones reuse it.
        if entity not in entitymap:
            entitymap[entity] = 'ENTITY-' + str(len(entitymap) + 1)
        return entitymap[entity]

    def swap(text, tag, placeholder):
        pattern = re.escape(tag)
        if tag[-1:].isdigit():
            # A plain str.replace of 'PATIENT-1' would also rewrite the
            # prefix of 'PATIENT-10'; refuse matches followed by a digit.
            pattern += r'(?!\d)'
        # Callable replacement so the placeholder is inserted literally.
        return re.sub(pattern, lambda match: placeholder, text)

    for triple in tripleset:
        subject_tag = entitytag[triple.subject]
        new_subject = placeholder_for(triple.subject)

        object_tag = entitytag[triple.object]
        new_object = placeholder_for(triple.object)

        triples.append(new_subject + ' | ' + triple.predicate + ' | ' + new_object)
        template = swap(template, subject_tag, new_subject)
        template = swap(template, object_tag, new_object)
    return triples, template
def extract(tripleset, template, entitymap, corenlp):
    """Build one template per contiguous run of sentences in ``template``.

    The text is split into sentences with CoreNLP; every contiguous window
    of sentences (all windows of one sentence first, then two, and so on)
    is joined with spaces, its triples are ordered with ``order.order`` and
    the pair is normalised with ``format``.

    Args:
        tripleset: triples describing the text; a deep copy is handed to
            ``order.order`` for each window.
        template: the text to segment.
        entitymap: mapping passed on to ``order.order`` and ``format``.
        corenlp: object whose ``annotate(text, properties=...)`` returns a
            JSON string with a ``'sentences'`` list of ``'tokens'``.

    Returns:
        List of ``(ordered_triples, template_text)`` tuples. Empty when no
        sentence could be obtained.
    """
    sentences = []
    try:
        out = json.loads(corenlp.annotate(template.strip(), properties=props))
        for snt in out['sentences']:
            sentences.append(' '.join(w['originalText'] for w in snt['tokens']))
    except Exception:
        # Best effort: a text that cannot be parsed contributes whatever
        # sentences were collected so far (usually none). Only Exception is
        # caught so that Ctrl-C / SystemExit still stop a long corpus run
        # instead of being reported as a parsing error.
        print('Parsing error...')

    templates = []
    total = len(sentences)
    for size in range(1, total + 1):
        # Only start positions that leave room for a full window of `size`.
        for start in range(total - size + 1):
            template_ = ' '.join(sentences[start:start + size])
            orderedset = order.order(copy.deepcopy(tripleset), template_, entitymap)
            orderedset_, template_ = format(orderedset, template_, entitymap)
            templates.append((orderedset_, template_))
    return templates
def group(templates):
    """Aggregate extracted templates by triple-set size and triple set.

    Args:
        templates: iterable of ``(orderedset, text)`` pairs, where
            ``orderedset`` is a sequence of triple strings.

    Returns:
        A dict of the form::

            {'sizes': {<n_triples>: {'entries': [
                {'orderedset': [...],
                 'templates': [{'template': <text>, 'frequency': <count>}, ...]},
                ...]}}}

        ``'templates'`` is a real list (not a lazy ``map`` object), so the
        result is JSON-serializable on Python 3 as well as Python 2.
    """
    # size -> ordered triple set (as a hashable tuple) -> list of texts
    result = {}
    for template in templates:
        tripleset = tuple(template[0])
        texts = result.setdefault(len(tripleset), {}).setdefault(tripleset, [])
        texts.append(template[1])

    fjson = {'sizes': {}}
    for size, by_tripleset in result.items():
        entries = []
        for tripleset, texts in by_tripleset.items():
            # Collapse duplicate texts into (text, count) pairs.
            frequencies = collections.Counter(texts)
            entries.append({
                'orderedset': list(tripleset),
                'templates': [
                    {'template': text, 'frequency': count}
                    for text, count in frequencies.items()
                ],
            })
        fjson['sizes'][size] = {'entries': entries}
    return fjson
def run(entryset, lng='en'):
    """Extract and group templates for every lexicalisation in ``entryset``.

    Args:
        entryset: sized iterable of entries exposing ``lexEntries``,
            ``modifiedtripleset`` and ``entitymap_to_dict()``.
        lng: ``'en'`` reads each lexical entry's ``template`` attribute;
            any other value reads ``template_de``.

    Returns:
        The grouped structure produced by ``group``.
    """
    corenlp = StanfordCoreNLP(STANFORD_PATH)
    templates = []
    try:
        for i, entry in enumerate(entryset):
            if i % 100 == 0:
                print('Progress: ', round((float(i) / len(entryset)) * 100, 2))
            for lex in entry.lexEntries:
                text = lex.template if lng == 'en' else lex.template_de
                templates.extend(extract(
                    copy.deepcopy(entry.modifiedtripleset),
                    text,
                    entry.entitymap_to_dict(),
                    corenlp,
                ))
    finally:
        # Always release the CoreNLP handle, even when an entry fails,
        # so a crashed run does not leave it open.
        corenlp.close()
    return group(templates)
if __name__ == '__main__':
    # Script entry point: process the pickled development set.
    # NOTE: pickle must only be loaded from trusted files; this is the
    # project's own corpus dump.
    with open('corpus/dev.cPickle', 'rb') as corpus_file:
        # The context manager closes the handle the original left open.
        entryset = p.load(corpus_file)
    run(entryset)