-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path3_prepare_training.py
More file actions
114 lines (96 loc) · 4.41 KB
/
3_prepare_training.py
File metadata and controls
114 lines (96 loc) · 4.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import re
import json
import sys
import random
# The data folder is the only (required) command-line argument. An empty
# string is rejected together with a missing argument, so the trailing-slash
# normalization below never has to deal with an empty path.
if len(sys.argv) < 2 or not sys.argv[1]:
    print("Data folder is required as an argument")
    sys.exit(1)

# Normalize to a trailing slash so that file names can be appended directly
# (e.g. folder + "facts.jsonl").
folder = sys.argv[1]
if not folder.endswith("/"):
    folder = folder + "/"
def force_https(url):
    """Return *url* with a leading ``http://`` scheme upgraded to ``https://``.

    Only the scheme prefix is rewritten: an ``http://`` that occurs later in
    the string (for example inside a query parameter) is left untouched.
    Strings that do not start with ``http://`` are returned unchanged.
    """
    prefix = "http://"
    if url.startswith(prefix):
        return "https://" + url[len(prefix):]
    return url
# load ground truth
# ground_truth maps a canonical entity URI
# ("https://diffbot.com/entity/<id>") to
# {"name": <2nd column>, "label": True when the 3rd column is exactly "TRUE"}.
ground_truth = {}  # diffbotUri -> {name, label (positive or negative example)}
with open(folder + "ground_truth.tsv") as gt:
    for line in gt:
        # Remove only the line terminator before splitting. Stripping all
        # surrounding whitespace would also eat leading tabs, shifting the
        # columns of a row whose URI cell is empty.
        fields = line.rstrip("\r\n").split("\t")
        if len(fields) < 3:
            continue
        uri = fields[0].strip()
        raw_label = fields[2].strip()
        if not uri:
            continue  # missing uri
        if not raw_label:
            continue  # missing label
        # Keep only the last path segment and rebuild the URI so that every
        # key has the same scheme and host.
        diffbotId = uri[uri.rfind("/") + 1:]
        uri = "https://diffbot.com/entity/" + diffbotId
        ground_truth[uri] = {"name": fields[1], "label": raw_label == "TRUE"}
# generate training data

# Words/phrases that must appear close to the entity mention for a sentence
# of a positively-labelled organization to be kept as a positive example.
keywords = ["disconnect", "disconnects", "disconnected", "disconnecting",
            "pause", "pauses", "paused", "pausing",
            "block", "blocks", "blocked", "blocking",
            "sanction", "sanctions", "sanctioned", "sanctioning",
            "halt", "halts", "halting", "halted",
            "suspend", "suspends", "suspended", "suspending",
            "to stop", "stops", "stopped", "stopping",
            "prohibit", "prohibits", "prohibited", "prohibiting",
            "remove", "removed", "removing",
            "no-fly", "no fly",
            "to leave", "leaving", "left", "leaves",
            "to cancel", "cancels", "cancelling", "cancelled",
            "to close", "closes", "closed", "closing",
            "shut down", "shuts down", "shutting down", "shutted down",
            "restrict", "restricts", "restricted", "restricting",
            "pulling out", "pulled out", "pulls out", "pulls out",
            "withdrew", "withdraw", "withdraws", "withdrawing",
            "cease", "ceasing", "ceased", "ceases",
            "barred", "no longer",
            "exclude", "excluded", "excluding", "excludes",
            "blacklisting", "blacklist", "blacklists", "blacklisted"]

# A single pre-compiled alternation replaces building and matching one regex
# per keyword for every candidate sentence. Keywords are matched as whole
# words/phrases (\b on both sides).
keyword_pattern = re.compile(
    r"\b(?:" + "|".join(re.escape(k) for k in keywords) + r")\b"
)

sentences = {}  # sentence -> {label, validation, diffbotUri, name}

# diffbotUri -> bool. The train/validation decision is drawn once per
# organization so that an organization mentioned in several documents never
# ends up in both sets.
validation_split = {}

random.seed(77777)  # fixed seed keeps the split reproducible between runs
with open(folder + "facts.jsonl", "r") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines in the JSONL file
        doc = json.loads(line)
        # Offsets below index into this string (title, blank line, text).
        content = doc["title"] + "\n\n" + doc["text"]
        nl = doc.get("naturalLanguage", None)
        if not nl or "entities" not in nl:
            continue
        for entity in nl["entities"]:
            if "diffbotUri" not in entity:
                continue
            entity["diffbotUri"] = force_https(entity["diffbotUri"])
            uri = entity["diffbotUri"]
            if uri not in ground_truth:
                continue
            label = ground_truth[uri]["label"]
            if uri not in validation_split:
                # random() > 0.8 is true about 20% of the time: roughly 20%
                # of organizations are used for validation only.
                validation_split[uri] = random.random() > 0.8
            validation = validation_split[uri]
            for mention in entity["mentions"]:
                mentionBegin = mention["beginOffset"]
                mentionEnd = mention["endOffset"]
                for sent in nl["sentences"]:
                    sentBegin = sent["beginOffset"]
                    sentEnd = sent["endOffset"]
                    # Ensure organization is mentioned in the sentence
                    if not (sentBegin <= mentionBegin < sentEnd):
                        continue
                    sent_text = content[sentBegin:sentEnd]
                    if "\n" in sent_text or "\t" in sent_text or len(sent_text) < 50:
                        continue
                    # Keyword window: up to 10 characters before and 30
                    # characters after the mention, both clamped to the
                    # sentence so neighbouring sentences cannot contribute.
                    after = content[mentionEnd:min(sentEnd, mentionEnd + 30)].lower()
                    after = " " + after + " "
                    before = content[max(sentBegin, mentionBegin - 10):mentionBegin].lower()
                    before = " " + before + " "
                    foundKeyword = keyword_pattern.search(before + " " + after) is not None
                    # Positive examples need a nearby keyword; negative
                    # examples are always kept.
                    if label and not foundKeyword:
                        continue
                    # Replace the mention with a placeholder token.
                    normalized_text = (content[sentBegin:mentionBegin]
                                       + " _entity_ "
                                       + content[mentionEnd:sentEnd])
                    sentences[normalized_text] = {
                        "label": label,
                        "validation": validation,
                        "diffbotUri": uri,
                        "name": ground_truth[uri]["name"],
                    }
# Write each collected sentence as "__label__<True|False> <sentence>" to
# training.txt or validation.txt, depending on its "validation" flag.
# NOTE(review): the "__label__" prefix looks like the fastText supervised
# input format - confirm against the training step.
# The context manager guarantees both files are flushed and closed even if
# a write fails part-way through.
with open(folder + "training.txt", "w") as train, \
        open(folder + "validation.txt", "w") as val:
    for sentence, obj in sentences.items():
        label = "__label__" + str(obj["label"])
        line = label + " " + sentence
        target = val if obj["validation"] else train
        target.write(line + "\n")