-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcombine_predictions.py
More file actions
92 lines (81 loc) · 4.43 KB
/
combine_predictions.py
File metadata and controls
92 lines (81 loc) · 4.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# Combine predictions from one or more models and put them in the correct submission format
# Expected base rate, if it matters: 0.6038961038961039
import re
import argparse
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import jensenshannon
from sklearn import metrics
import numpy as np
from tqdm import tqdm
# Submission format: exactly 1232 comma-separated probabilities in [0, 1]
# (accepts forms like ".5", "0.5", "1.", "1.0", with optional surrounding whitespace)
VALIDITY_REGEX = r'^(\s*0*((0?\.[0-9]+)|(0\.?)|(1\.?)|(1\.0*))\s*,){1231}\s*0*((0?\.[0-9]+)|(0\.?)|(1\.?)|(1\.0*))\s*$'
plt.style.use('nigel.mplstyle')
parser = argparse.ArgumentParser(
    description=('Combine one or more predictions files (decision-level fusion) and adjust '
                 'thresholds to maximize kappa'))
parser.add_argument(
    'prediction_files', nargs='+', type=str,
    help=('One or more filenames of holdout set predictions files; training '
          'prediction filename will be inferred by adding "-train"'))
args = parser.parse_args()
# Load holdout and training predictions for every model: column pred_i holds model i's
# predictions; each model's training-set predictions come from a parallel "-train" file
# (used later to pick decision thresholds without touching holdout labels).
df = pd.read_csv(args.prediction_files[0]).rename(columns={'pred': 'pred_0'})
train_df = pd.read_csv(
    args.prediction_files[0].replace('.csv', '-train.csv')).rename(columns={'pred': 'pred_0'})
for i, fname in enumerate(args.prediction_files[1:], start=1):
    df['pred_' + str(i)] = pd.read_csv(fname).pred
    train_df['pred_' + str(i)] = pd.read_csv(fname.replace('.csv', '-train.csv')).pred
# Equal voting: fuse models by averaging their predicted probabilities. mean(axis=1)
# replaces the previous sum-then-divide and, unlike it, does not count a missing (NaN)
# prediction from one model as a vote of 0.
df['pred'] = df[[c for c in df if c.startswith('pred_')]].mean(axis=1)
train_df['pred'] = train_df[[c for c in train_df if c.startswith('pred_')]].mean(axis=1)
# Compare how the predictions are distributed across the three data-length splits;
# a large Jensen-Shannon distance would suggest the model treats the splits inconsistently.
pred_10 = df[df.data_length == '10m'].pred
pred_20 = df[df.data_length == '20m'].pred
pred_30 = df[df.data_length == '30m'].pred
print('Jensen-Shannon distances between prediction sets (square root of divergence):')
print('JSD 10 <=> 20:', jensenshannon(pred_10, pred_20))
print('JSD 10 <=> 30:', jensenshannon(pred_10.iloc[:-1], pred_30))  # 30m is 1 short
print('JSD 20 <=> 30:', jensenshannon(pred_20.iloc[:-1], pred_30))
# Plot to show if distributions across the three combined datasets are notably different
for split_preds, minutes in [(pred_10, '10'), (pred_20, '20'), (pred_30, '30')]:
    plt.hist(split_preds, bins=50, alpha=.5, label=minutes + ' minutes')
plt.xlim(0, 1)
plt.axvline(.5, linestyle='--', linewidth=1, color='black')  # default decision threshold
plt.legend(loc='upper left')
plt.title('10m pred. rate: %.3f, 20m pred. rate: %.3f, 30m pred. rate: %.3f' %
          ((pred_10 > .5).mean(), (pred_20 > .5).mean(), (pred_30 > .5).mean()))
plt.xlabel('Predicted probability')
plt.ylabel('Count')
plt.show()
# Rescale predictions to try to improve kappa: find each split's best decision threshold
# on the TRAINING predictions, then linearly remap the HOLDOUT predictions so that
# threshold lands at .5 (the threshold the submission is presumably scored at — confirm).
print('Overall kappa at .5 threshold =',
      metrics.cohen_kappa_score(train_df.label, train_df.pred > .5))
print('Overall AUC =', metrics.roc_auc_score(train_df.label, train_df.pred))
plt.figure()
for datalen, dldf in train_df.groupby('data_length'):
    # Plot kappa over decision thresholds (101 evenly spaced thresholds in [0, 1])
    kappas = [metrics.cohen_kappa_score(dldf.label, dldf.pred > t)
              for t in tqdm(np.linspace(0, 1, 101), desc='Calculating kappas')]
    plt.plot(np.linspace(0, 1, len(kappas)), kappas, label=datalen + ' max = %.3f' % max(kappas))
    print(datalen, 'AUC =', metrics.roc_auc_score(dldf.label, dldf.pred))
    # "Truncated AUC": predictions cut to their first 6 characters (~4 decimal places),
    # to check whether limited precision in the submission format would hurt AUC
    print(datalen, 'Truncated AUC =',
          metrics.roc_auc_score(dldf.label, dldf.pred.astype(str).str.slice(stop=6).astype(float)))
    # Adjust predictions in holdout data to match ideal training data threshold.
    # np.argmax returns the first (lowest) threshold achieving the maximum kappa.
    thresh = np.argmax(kappas) / (len(kappas) - 1)
    print(datalen, 'ideal threshold =', thresh)
    dfpreds = df[df.data_length == datalen].pred
    print('Holdout predicted rate at that threshold =', (dfpreds > thresh).mean())
    print(((dfpreds - thresh).abs() < .0001).sum(), datalen, 'predictions at exactly threshold')
    # Monotone linear remap that sends `thresh` to .5 while keeping results in [0, 1]:
    #   thresh >= .5: scale down so [0, thresh] -> [0, .5] (and 1 -> 1/(2*thresh) <= 1)
    #   thresh <  .5: scale up from the top so [thresh, 1] -> [.5, 1]
    if thresh >= .5:
        df.loc[dfpreds.index, 'pred'] = dfpreds / thresh / 2
    else:
        df.loc[dfpreds.index, 'pred'] = 1 - (1 - dfpreds) / (1 - thresh) / 2
plt.legend(loc='upper left')
plt.show()
# Save fused predictions as the comma-separated submission string, verifying first that
# every value matches the expected format (1232 probabilities in [0, 1]).
preds = ','.join(df.pred.astype(str))
# Explicit check instead of `assert`, which is silently stripped when running with -O
if not re.match(VALIDITY_REGEX, preds):
    raise ValueError('Combined predictions do not match the required submission format')
with open('combine_predictions.txt', 'w') as outfile:
    outfile.write(preds)