forked from MurtyShikhar/robustqa
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconstructNewDatasets.py
More file actions
144 lines (124 loc) · 5.72 KB
/
constructNewDatasets.py
File metadata and controls
144 lines (124 loc) · 5.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import json
import os
import collections
import uuid
import argparse
# Dataset groups to augment, keyed by domain type. The in-domain list is
# currently commented out, so only out-of-domain datasets are processed.
domains = {
# 'indomain': ['nat_questions', 'newsqa', 'squad'],
'oodomain': ['duorc', 'race', 'relation_extraction']
}
# Dataset splits processed for every dataset listed above.
classifications = ['train', 'val']
def construct(originalToParaphrases, dataset_path, useBleu):
    """Write an augmented copy of the SQuAD-style dataset at ``dataset_path``.

    Every original qa entry is kept; one extra qa entry is appended for each
    paraphrase found in ``originalToParaphrases``. The mapping is keyed by the
    question text plus a trailing newline (matching the line-oriented
    paraphrase files), and each paraphrase value also carries a trailing
    newline that is stripped before it is stored.

    Args:
        originalToParaphrases: dict mapping original question (with '\\n')
            to a list of paraphrases (each with '\\n').
        dataset_path: path of the input JSON dataset file.
        useBleu: if True, suffix the output path with '-bleu'.

    Output is written to ``dataset_path + '-augmented'`` (plus '-bleu' when
    ``useBleu`` is set).
    """
    bleu_suffix = '-bleu' if useBleu else ''
    output_path = dataset_path + "-augmented" + bleu_suffix
    out_dir = os.path.dirname(output_path)
    # Guard against dirname('') and narrow the former bare except to OSError
    # so unrelated errors are not silently swallowed.
    if out_dir and not os.path.exists(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError:
            print("Failed making directory for the augmented dataset files")

    # Read the whole input dataset before opening the output file, so a parse
    # failure does not leave behind an empty/partial output file.
    with open(dataset_path, "r") as input_file:
        text_dict = json.load(input_file)

    newDataset = {'data': []}
    for dataDict in text_dict['data']:
        dataEntry = {'title': dataDict['title'], 'paragraphs': []}
        newDataset['data'].append(dataEntry)
        for paragraph in dataDict['paragraphs']:
            paragraphDict = {'context': paragraph['context'], 'qas': []}
            dataEntry['paragraphs'].append(paragraphDict)
            for qa in paragraph['qas']:
                # Keep the original question/answers/id untouched.
                paragraphDict['qas'].append({
                    'question': qa['question'],
                    'answers': qa['answers'],
                    'id': qa['id']
                })
                # Paraphrase files are line-oriented, so lookup keys carry a
                # trailing newline.
                key = qa['question'] + '\n'
                for pp in originalToParaphrases.get(key, []):
                    paragraphDict['qas'].append({
                        # Drop the trailing newline kept by readlines().
                        'question': pp[:-1],
                        'answers': qa['answers'],
                        # Each paraphrase needs a fresh unique id.
                        'id': uuid.uuid4().hex
                    })

    newDataset['version'] = text_dict['version']
    with open(output_path, 'w') as output_file:
        json.dump(newDataset, output_file)
def getOriginalToParaDict(aug_dataset_path, saveToFile, useBleu):
    """Parse the selected-paraphrases file into {original -> [paraphrases]}.

    The input file (``aug_dataset_path + '-questions-pps-selected'`` plus
    '-bleu' when ``useBleu``) is laid out in records of:

        original question line
        count line (number of paraphrases N)
        N pairs of (paraphrase line, score line) -- score lines are skipped

    All stored strings keep their trailing newline so they line up with the
    newline-suffixed keys used by ``construct``.

    Args:
        aug_dataset_path: base path of the augmentation files.
        saveToFile: if True, also dump the mapping as indented JSON to
            ``aug_dataset_path + '-pps-selected-json'`` (plus '-bleu').
        useBleu: selects the '-bleu' variant of the file names.

    Returns:
        collections.defaultdict(list) mapping original question (with '\\n')
        to its list of paraphrases (each with '\\n').
    """
    bleu_suffix = '-bleu' if useBleu else ''
    input_path = aug_dataset_path + "-questions-pps-selected" + bleu_suffix
    with open(input_path, 'r') as input_file:
        lines = input_file.readlines()

    originalToParaphrases = collections.defaultdict(list)
    print("Beginning converting the file '{}' into a dictionary that maps from the original questions to a list of paraphrases".format(input_path))
    lineIndex = 0
    while lineIndex < len(lines):
        original = lines[lineIndex]
        numPp = int(lines[lineIndex + 1])
        for cnt in range(numPp):
            # Paraphrases sit at even offsets past the count line; the odd
            # offsets hold the selection scores, which are skipped on purpose.
            originalToParaphrases[original].append(lines[lineIndex + (cnt + 1) * 2])
        lineIndex += (numPp + 1) * 2

    if saveToFile:
        output_path = aug_dataset_path + "-pps-selected-json" + bleu_suffix
        out_dir = os.path.dirname(output_path)
        # Guard against dirname('') and narrow the former bare except.
        if out_dir and not os.path.exists(out_dir):
            try:
                os.makedirs(out_dir)
            except OSError:
                print("Failed making directory for the -pps-selected-json files")
        with open(output_path, 'w') as output_file:
            json.dump(originalToParaphrases, output_file, indent=4)
    return originalToParaphrases
def main(useBleu, saveIntermediate):
    """Augment every configured dataset split with its selected paraphrases.

    For each domain group, dataset, and split, the paraphrase mapping is
    loaded from the augmentation files and used to write an augmented copy
    of the corresponding dataset file.
    """
    for domain_group, dataset_names in domains.items():
        for dataset_name in dataset_names:
            for split in classifications:
                relative = domain_group + '_' + split + '/' + dataset_name
                aug_dataset_path = 'augmentation/datasets/' + relative
                dataset_path = 'datasets/' + relative
                mapping = getOriginalToParaDict(aug_dataset_path, saveIntermediate, useBleu)
                construct(mapping, dataset_path, useBleu)
    print("Finished")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=("Construct a new dataset."),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # BUGFIX: argparse's type=bool is broken for flags -- bool("False") is
    # True, so any supplied value (even "False") enabled the option. These
    # are now store_true flags; the default of False is unchanged.
    parser.add_argument(
        "--saveIntermediate", action="store_true", help=("Save the intermediate output file that store the dictionary mapping from the original questions to its paraphrases in the output_file"),
    )
    parser.add_argument(
        "--useBleu", action="store_true", help=("Indicates if the input files were generated using BLEU scoring. If it was generated using cosine similarity, set this to False"),
    )
    args = parser.parse_args()
    print("This script is used to add the paraphrase questions to dataset text files.")
    main(args.useBleu, args.saveIntermediate)