robustqa/extractQuestions.py at main · shellydeng/robustqa · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import json
import os

domains = {
  'indomain': ['nat_questions', 'newsqa', 'squad'],
  'oodomain': ['duorc', 'race', 'relation_extraction']
}

classifications = ['train', 'val']

def main():
  for inOrOut in domains:
    for dataset in domains[inOrOut]:
      for classification in classifications:
        input_path = 'datasets/' + inOrOut + '_' + classification + '/' + dataset
        extract(input_path)
  print("Finished")


def extract(input_path):
  output_path = "augmentation/" + input_path + "-questions"
  if not os.path.exists(os.path.dirname(output_path)):
    try:
        os.makedirs(os.path.dirname(output_path))
    except:
      print ("Failed extracting questions")

  output_file = open(output_path, 'w')

  with open(input_path, "r") as input_file:
    text = input_file.read()
    text_dict = json.loads(text)
    for dataDict in text_dict['data']:
      title = dataDict['title']
      paragraphs = dataDict['paragraphs']
      for paragraph in paragraphs:
        context = paragraph['context']
        qas = paragraph['qas']
        for qa in qas:
          question = qa['question'] + '\n'
          answer = qa['answers']
          output_file.write(question)
  output_file.close()
  input_file.close()


if __name__ == "__main__":
  print("This script is used to extract the questions from dataset text files.")
  main()