forked from MurtyShikhar/robustqa
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconstructNewDatasets.py
More file actions
144 lines (124 loc) · 5.72 KB
/
constructNewDatasets.py
File metadata and controls
144 lines (124 loc) · 5.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import json
import os
import collections
import uuid
import argparse
# Dataset groups to augment, keyed by domain type. The in-domain list is
# currently commented out, so only out-of-domain datasets are processed.
domains = {
# 'indomain': ['nat_questions', 'newsqa', 'squad'],
'oodomain': ['duorc', 'race', 'relation_extraction']
}
# Dataset splits processed for every dataset listed above.
classifications = ['train', 'val']
def construct(originalToParaphrases, dataset_path, useBleu):
    """Write an augmented copy of the SQuAD-style dataset at ``dataset_path``.

    Every original qa entry is kept; one extra qa entry is appended for each
    paraphrase found in ``originalToParaphrases``. The mapping is keyed by the
    question text plus a trailing newline (matching the line-oriented
    paraphrase files), and each paraphrase value also carries a trailing
    newline that is stripped before it is stored.

    Args:
        originalToParaphrases: dict mapping original question (with '\\n')
            to a list of paraphrases (each with '\\n').
        dataset_path: path of the input JSON dataset file.
        useBleu: if True, suffix the output path with '-bleu'.

    Output is written to ``dataset_path + '-augmented'`` (plus '-bleu' when
    ``useBleu`` is set).
    """
    bleu_suffix = '-bleu' if useBleu else ''
    output_path = dataset_path + "-augmented" + bleu_suffix
    out_dir = os.path.dirname(output_path)
    # Guard against dirname('') and narrow the former bare except to OSError
    # so unrelated errors are not silently swallowed.
    if out_dir and not os.path.exists(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError:
            print("Failed making directory for the augmented dataset files")

    # Read the whole input dataset before opening the output file, so a parse
    # failure does not leave behind an empty/partial output file.
    with open(dataset_path, "r") as input_file:
        text_dict = json.load(input_file)

    newDataset = {'data': []}
    for dataDict in text_dict['data']:
        dataEntry = {'title': dataDict['title'], 'paragraphs': []}
        newDataset['data'].append(dataEntry)
        for paragraph in dataDict['paragraphs']:
            paragraphDict = {'context': paragraph['context'], 'qas': []}
            dataEntry['paragraphs'].append(paragraphDict)
            for qa in paragraph['qas']:
                # Keep the original question/answers/id untouched.
                paragraphDict['qas'].append({
                    'question': qa['question'],
                    'answers': qa['answers'],
                    'id': qa['id']
                })
                # Paraphrase files are line-oriented, so lookup keys carry a
                # trailing newline.
                key = qa['question'] + '\n'
                for pp in originalToParaphrases.get(key, []):
                    paragraphDict['qas'].append({
                        # Drop the trailing newline kept by readlines().
                        'question': pp[:-1],
                        'answers': qa['answers'],
                        # Each paraphrase needs a fresh unique id.
                        'id': uuid.uuid4().hex
                    })

    newDataset['version'] = text_dict['version']
    with open(output_path, 'w') as output_file:
        json.dump(newDataset, output_file)
def getOriginalToParaDict(aug_dataset_path, saveToFile, useBleu):
    """Parse the selected-paraphrases file into {original -> [paraphrases]}.

    The input file (``aug_dataset_path + '-questions-pps-selected'`` plus
    '-bleu' when ``useBleu``) is laid out in records of:

        original question line
        count line (number of paraphrases N)
        N pairs of (paraphrase line, score line) -- score lines are skipped

    All stored strings keep their trailing newline so they line up with the
    newline-suffixed keys used by ``construct``.

    Args:
        aug_dataset_path: base path of the augmentation files.
        saveToFile: if True, also dump the mapping as indented JSON to
            ``aug_dataset_path + '-pps-selected-json'`` (plus '-bleu').
        useBleu: selects the '-bleu' variant of the file names.

    Returns:
        collections.defaultdict(list) mapping original question (with '\\n')
        to its list of paraphrases (each with '\\n').
    """
    bleu_suffix = '-bleu' if useBleu else ''
    input_path = aug_dataset_path + "-questions-pps-selected" + bleu_suffix
    with open(input_path, 'r') as input_file:
        lines = input_file.readlines()

    originalToParaphrases = collections.defaultdict(list)
    print("Beginning converting the file '{}' into a dictionary that maps from the original questions to a list of paraphrases".format(input_path))
    lineIndex = 0
    while lineIndex < len(lines):
        original = lines[lineIndex]
        numPp = int(lines[lineIndex + 1])
        for cnt in range(numPp):
            # Paraphrases sit at even offsets past the count line; the odd
            # offsets hold the selection scores, which are skipped on purpose.
            originalToParaphrases[original].append(lines[lineIndex + (cnt + 1) * 2])
        lineIndex += (numPp + 1) * 2

    if saveToFile:
        output_path = aug_dataset_path + "-pps-selected-json" + bleu_suffix
        out_dir = os.path.dirname(output_path)
        # Guard against dirname('') and narrow the former bare except.
        if out_dir and not os.path.exists(out_dir):
            try:
                os.makedirs(out_dir)
            except OSError:
                print("Failed making directory for the -pps-selected-json files")
        with open(output_path, 'w') as output_file:
            json.dump(originalToParaphrases, output_file, indent=4)
    return originalToParaphrases
def main(useBleu, saveIntermediate):
    """Augment every configured dataset split with its selected paraphrases.

    For each domain group, dataset, and split, the paraphrase mapping is
    loaded from the augmentation files and used to write an augmented copy
    of the corresponding dataset file.
    """
    for domain_group, dataset_names in domains.items():
        for dataset_name in dataset_names:
            for split in classifications:
                relative = domain_group + '_' + split + '/' + dataset_name
                aug_dataset_path = 'augmentation/datasets/' + relative
                dataset_path = 'datasets/' + relative
                mapping = getOriginalToParaDict(aug_dataset_path, saveIntermediate, useBleu)
                construct(mapping, dataset_path, useBleu)
    print("Finished")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=("Construct a new dataset."),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # BUGFIX: argparse's type=bool is broken for flags -- bool("False") is
    # True, so any supplied value (even "False") enabled the option. These
    # are now store_true flags; the default of False is unchanged.
    parser.add_argument(
        "--saveIntermediate", action="store_true", help=("Save the intermediate output file that store the dictionary mapping from the original questions to its paraphrases in the output_file"),
    )
    parser.add_argument(
        "--useBleu", action="store_true", help=("Indicates if the input files were generated using BLEU scoring. If it was generated using cosine similarity, set this to False"),
    )
    args = parser.parse_args()
    print("This script is used to add the paraphrase questions to dataset text files.")
    main(args.useBleu, args.saveIntermediate)