-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata.py
More file actions
executable file
·173 lines (152 loc) · 7.63 KB
/
data.py
File metadata and controls
executable file
·173 lines (152 loc) · 7.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import os
import re
import glob
import json
from datasets import load_dataset
from tqdm import tqdm
from models import GPT_MODEL_ID, code_datasets
from generate_summaries import process_dataset
def save_to_json(dictionary, file_name, force_overwrite=True):
    """Save a dictionary to a JSON file, creating directories if needed.

    Args:
        dictionary: JSON-serialisable object to write.
        file_name: Destination path. Its parent directory is created when
            the path has one and it does not exist yet.
        force_overwrite: When False and ``file_name`` already exists, the
            existing file is left untouched and nothing is written.
    """
    directory = os.path.dirname(file_name)
    if directory:
        # exist_ok avoids the check-then-create race: a concurrent writer
        # creating the same directory no longer makes this call fail.
        os.makedirs(directory, exist_ok=True)
    if not force_overwrite and os.path.exists(file_name):
        return
    with open(file_name, "w") as f:
        json.dump(dictionary, f)
def load_from_json(file_name) -> dict:
    """Read the JSON file at ``file_name`` and return its parsed content."""
    with open(file_name, "r") as handle:
        parsed = json.load(handle)
    return parsed
def load_data(dataset, sources, target_model, num_samples, load_responses=True, extras=False, logger=None):
    """
    Load responses and source data for a given dataset and set of sources.

    Args:
        dataset: Dataset name used to build the ``responses/...`` and
            ``sources/...`` (or ``code/...``) file paths.
        sources: Iterable of source-model names whose responses are loaded.
        target_model: Model whose response keys define the returned ``keys``.
        num_samples: Minimum number of responses wanted per source model and
            the maximum number of keys returned.
        load_responses: When False, no response files are read and ``keys``
            are simply all keys of the source-data file.
        extras: When True, the ``_extra`` variants of the files are used.
        logger: Optional logger for the "generating now" warning; falls back
            to ``print`` when None.

    Returns:
        (responses, sources, keys): a dict mapping each source model to its
        loaded responses, the loaded source-data dict, and the list of keys.

    Raises:
        FileNotFoundError: No response file with enough samples exists for a
            source model, even after calling ``process_dataset``.
        Exception: ``load_responses`` is True but ``target_model`` is not
            among the loaded sources.
    """
    all_responses = {}
    print(num_samples)
    data_type = "sources" if dataset not in code_datasets else "code"  # in case we want to add other types of sources in the future
    if load_responses:
        for source_model in sources:
            print(f"[DEBUG] Loading {dataset} data for model: {source_model}")
            merged_file = f"responses/{dataset}/{dataset}_train_{source_model}_responses_merged.json"
            print(f"[DEBUG] Checking merged file: {merged_file}")
            if os.path.exists(merged_file):
                print("[DEBUG] Merged file exists, loading...")
                responses = load_from_json(merged_file)
                print(f"[DEBUG] Loaded {len(responses)} samples from merged file.")
                if len(responses) < num_samples:
                    print(f"[DEBUG] Not enough samples: {len(responses)} < {num_samples}. Generating {num_samples - len(responses)} more now.")
                    process_dataset(dataset, source_model, num_samples)
                    # Re-read the file so freshly generated samples are not
                    # ignored in favour of the stale in-memory copy.
                    # NOTE(review): assumes process_dataset updates the merged
                    # file; if it writes elsewhere this reload is a no-op.
                    responses = load_from_json(merged_file)
                print(f"[DEBUG] Using merged file for {source_model}")
                all_responses[source_model] = responses
            else:
                # Fallback: find the best available non-merged file with at least num_samples
                responses = _load_sufficient_responses(dataset, source_model, num_samples, extras)
                if responses is None:
                    message = f"No suitable {data_type} file found for {source_model} with at least {num_samples} samples. Generating now."
                    if logger is not None:
                        logger.warning(message)
                    else:
                        print(message)
                    process_dataset(dataset, source_model, num_samples)
                    # After generation, check for the merged file first, then fallback files
                    if os.path.exists(merged_file):
                        responses = load_from_json(merged_file)
                    else:
                        responses = _load_sufficient_responses(dataset, source_model, num_samples, extras)
                if responses is None:
                    raise FileNotFoundError(f"Failed to generate or find suitable {data_type} file for {source_model} with at least {num_samples} samples.")
                all_responses[source_model] = responses
    articles = load_from_json(f"{data_type}/{dataset}_train_{data_type}{'_extra' if extras else ''}.json")
    if not load_responses:
        # Nothing was loaded into all_responses, so indexing it by
        # target_model would raise KeyError; use the source-data keys instead.
        keys = list(articles.keys())
    elif target_model in all_responses:
        print(all_responses.keys())
        keys = list(all_responses[target_model].keys())[:num_samples]
    else:
        raise Exception("Model not found!", target_model)
    return all_responses, articles, keys


def _load_sufficient_responses(dataset, source_model, num_samples, extras=False):
    """Return the contents of the first matching response file holding at least ``num_samples`` entries, or None."""
    pattern = f"responses/{dataset}/{dataset}_train_{source_model}_responses*{'_extra' if extras else ''}.json"
    for candidate in glob.glob(pattern):
        with open(candidate, "r") as f:
            data = json.load(f)
        if len(data) >= num_samples:
            return data
    return None
def load_cnn_dailymail_data():
    """
    Fetch the "3.0.0" configuration of the CNN/DailyMail dataset.
    Returns the splits as a tuple: (train_data, test_data, validation_data).
    """
    splits = load_dataset("cnn_dailymail", "3.0.0")
    return splits["train"], splits["test"], splits["validation"]
def load_xsum_data():
    """
    Fetch the XSum dataset ("EdinburghNLP/xsum").
    Returns the splits as a tuple: (train_data, test_data, validation_data).
    """
    splits = load_dataset("EdinburghNLP/xsum")
    return splits["train"], splits["test"], splits["validation"]
def load_arena_data():
    """
    Build self-vs-other comparison records from the LMArena human-preference data.

    For each of the three Llama 3.1 instruct models, the train split of
    "lmarena-ai/arena-human-preference-100k" is filtered to battles the model
    took part in, and every single-turn battle is converted into a record with
    the keys ``id``, ``self``, ``other``, ``prompt``, ``self_response``,
    ``other_response``, ``won`` and ``language``. Battles whose conversations
    are not exactly two messages long on both sides are skipped.

    Returns:
        The list of records built for the LAST model in the list
        ("llama-3.1-405b-instruct").

    Raises:
        AssertionError: The two sides of a battle do not share the same
            prompt content at the compared positions.
    """
    models = ["llama-3.1-8b-instruct", "llama-3.1-70b-instruct", "llama-3.1-405b-instruct"]
    data = {}
    arena = load_dataset("lmarena-ai/arena-human-preference-100k", split="train")
    for model in models:
        data[model] = arena.filter(lambda a: a['model_a'] == model or a['model_b'] == model)
    for model in models:
        model_data = []
        for entry in data[model]:
            conv_a, conv_b = entry['conversation_a'], entry['conversation_b']
            assert conv_a[0]['content'] == conv_b[0]['content']
            if not len(conv_a) == len(conv_b) == 2:
                # Multi-turn (or otherwise irregular) battle: sanity-check the
                # messages at even positions, then skip it. The original code
                # incremented an uninitialised counter here, which raised
                # UnboundLocalError on the first such entry. zip() also stops
                # at the shorter side instead of indexing past its end.
                for a, b in zip(conv_a[::2], conv_b[::2]):
                    assert a['content'] == b['content']
                continue
            own = 'a' if entry['model_a'] == model else 'b'
            model_data.append(_arena_record(entry, own))
    # NOTE(review): only the last model's records are returned; the lists for
    # the first two models are discarded. Preserved as-is — confirm whether a
    # per-model mapping was intended before changing the return shape.
    return model_data


def _arena_record(entry, own):
    """Convert one single-turn battle into a record from the viewpoint of side ``own`` ('a' or 'b')."""
    other = 'b' if own == 'a' else 'a'
    return {
        'id': entry['question_id'],
        'self': entry[f"model_{own}"],
        'other': entry[f'model_{other}'],
        'prompt': entry[f'conversation_{own}'][0]['content'],
        'self_response': entry[f'conversation_{own}'][-1]['content'],
        'other_response': entry[f'conversation_{other}'][-1]['content'],
        'won': 1 if entry['winner'] == f"model_{own}" else 0,
        'language': entry['language'],
    }
def load_medmcqa_data():
    """
    Export the MedMCQA train split as prompt and reference-answer JSON files.

    For every train example a prompt is built from the question and the text
    of its correct option (selected through the ``cop`` index), and the
    example's ``exp`` field is stored as the reference answer. Both mappings
    are keyed by the example ``id`` and written to
    ``../sources/medmcqa_train_sources.json`` and
    ``../responses/medmcqa/medmcqa_train_human_responses_merged.json``.
    Each prompt is also printed as it is built.
    """
    medmcqa = load_dataset("openlifescienceai/medmcqa")
    medmcqa_json = {}
    medmcqa_answers = {}
    ref = ['a', 'b', 'c', 'd']
    for example in tqdm(medmcqa['train']):
        # 'cop' indexes into ref, selecting one of the 'opa'..'opd' fields.
        correct_answer = 'op' + ref[example['cop']]
        medmcqa_json[example['id']] = \
f"""{example['question']}
Correct Answer: {example[correct_answer]}
Can you explain why this is the case?
"""
        print(medmcqa_json[example['id']])
        medmcqa_answers[example['id']] = example['exp']
    # save_to_json creates missing parent directories, so a fresh checkout
    # does not fail with FileNotFoundError after the whole split was processed.
    save_to_json(medmcqa_json, "../sources/medmcqa_train_sources.json")
    save_to_json(medmcqa_answers, "../responses/medmcqa/medmcqa_train_human_responses_merged.json")