-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_dataset.py
More file actions
61 lines (51 loc) · 2.39 KB
/
create_dataset.py
File metadata and controls
61 lines (51 loc) · 2.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import pandas as pd
from unidecode import unidecode
from sklearn.model_selection import KFold, train_test_split
from data_utils import clean_data
import utils
def convert_to_ascii(df):
new_df = []
for essay in df:
new_df.append(unidecode(essay))
return new_df
def create_dataset(fold=True):
'''Run this function once to create train,val,test files for K folds'''
data_all = pd.read_csv('asap/training_set_rel3.fixed.tsv.zip',
sep='\t', encoding='latin1')
data_all['essay'] = convert_to_ascii(data_all['essay'])
data_all['essay'] = clean_data(data_all['essay'])
for p in range(1, 9):
data_prompt = data_all[data_all['essay_set']
== p].reset_index(drop=True)
print(data_prompt.head())
if fold:
kf = KFold(n_splits=5, shuffle=True, random_state=420)
n = 1
for train_index, test_index in kf.split(data_prompt):
# print("TRAIN:", train_index[:10], "TEST:", test_index[:10])
val_index = test_index[:len(test_index)//2]
test_index = test_index[len(test_index)//2:]
print(len(train_index), len(val_index), len(test_index))
fold_path = utils.mkpath('asap/fold_{}/'.format(n))
data_prompt.loc[train_index].to_csv(
fold_path + 'prompt_{}_train.tsv'.format(p), sep='\t', index=False)
data_prompt.loc[val_index].to_csv(
fold_path + 'prompt_{}_val.tsv'.format(p), sep='\t', index=False)
data_prompt.loc[test_index].to_csv(
fold_path + 'prompt_{}_test.tsv'.format(p), sep='\t', index=False)
n += 1
else:
train, test = train_test_split(
data_prompt, test_size=0.2, random_state=420, shuffle=False)
val = test[:len(test)//2]
test = test[len(test)//2:]
path = utils.mkpath('asap/')
print(len(train), len(val), len(test))
train.to_csv(path + 'prompt_{}_train.tsv'.format(p),
sep='\t', index=False)
val.to_csv(path + 'prompt_{}_val.tsv'.format(p),
sep='\t', index=False)
test.to_csv(path + 'prompt_{}_test.tsv'.format(p),
sep='\t', index=False)
if __name__ == "__main__":
create_dataset(fold=True)