-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprob_voting.py
More file actions
148 lines (126 loc) · 5.13 KB
/
prob_voting.py
File metadata and controls
148 lines (126 loc) · 5.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import learner_functions as lf
import load_data as ld
import feature_selection as fs
import pandas as pd
# Load the project data container; the code below reads its `clinical`,
# `test_clinical`, `proteomic` and `test_proteomic` attributes.
data = ld.LoadData()

# Create train and test labels.
gender_labels = data.clinical['gender'].tolist()
MSI_labels = data.clinical['msi'].tolist()
test_gender_labels = data.test_clinical['gender'].tolist()
test_MSI_labels = data.test_clinical['msi'].tolist()

# Use feature selection on the training data, then take the same subset of
# feature columns from the test data.
# NOTE(review): features are selected against the gender labels only, yet the
# resulting subset is also used to train the MSI models -- confirm intended.
protein_sub_set = fs.univariate(data.proteomic, gender_labels)
selected_protein_columns = list(protein_sub_set.columns.values)
# `.ix` was deprecated in pandas 0.20 and removed in 1.0. The selection here
# is by column label, so `.loc` is the direct replacement.
test_protein_sub_set = data.test_proteomic.loc[:, selected_protein_columns]
# Train one learner per algorithm for each target (gender and MSI), all on the
# same selected protein features. Each `lf.train_*` call returns a
# (model, score) pair.

# k-nearest neighbours
print('knn')
# n_neighbors found by parameter optimization in knn-optimization.py
knn_params = dict(n_neighbors=11)
knn_gender, knn_gender_score = lf.train_knn(
    protein_sub_set, gender_labels, **knn_params
)
knn_msi, knn_msi_score = lf.train_knn(
    protein_sub_set, MSI_labels, **knn_params
)

# logistic regression
print('lr')
lr_gender, lr_gender_score = lf.train_lr(protein_sub_set, gender_labels)
lr_msi, lr_msi_score = lf.train_lr(protein_sub_set, MSI_labels)

# random forest
print('rf')
# Values found by parameter optimization in randomforest.py
rf_params = dict(
    criterion='gini',
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=100,
)
rf_gender, rf_gender_score = lf.train_rf(
    protein_sub_set, gender_labels, **rf_params
)
rf_msi, rf_msi_score = lf.train_rf(
    protein_sub_set, MSI_labels, **rf_params
)

# nearest centroid (Euclidean metric)
print('NC Euclid')
nc_param = dict(metric='euclidean')
nc_gender, nc_gender_score = lf.train_nc(
    protein_sub_set, gender_labels, **nc_param
)
nc_msi, nc_msi_score = lf.train_nc(
    protein_sub_set, MSI_labels, **nc_param
)

# support vector machine, linear kernel, probability estimates requested
print('SVM linear')
svm_param = dict(kernel='linear', probability=True)
svm_gender, svm_gender_score = lf.train_svm(
    protein_sub_set, gender_labels, **svm_param
)
svm_msi, svm_msi_score = lf.train_svm(
    protein_sub_set, MSI_labels, **svm_param
)

# multi-layer perceptron; per the original author's note the optimized
# settings are hard-coded, so no parameters are passed here
print('mlp')
mlp_gender, mlp_gender_score = lf.train_mlp(protein_sub_set, gender_labels)
mlp_msi, mlp_msi_score = lf.train_mlp(protein_sub_set, MSI_labels)

# stochastic gradient descent
print('SGD')
sgd_gender, sgd_gender_score = lf.train_sgd(protein_sub_set, gender_labels)
sgd_msi, sgd_msi_score = lf.train_sgd(protein_sub_set, MSI_labels)

# Models that take part in the vote. The nearest-centroid and SGD models are
# trained above but are not included in either list.
# NOTE(review): presumably because they do not expose class probabilities --
# confirm against lf.get_prediction_and_prob.
modelArrayGen = [knn_gender, lr_gender, rf_gender, svm_gender, mlp_gender]
modelArrayMSI = [knn_msi, lr_msi, rf_msi, svm_msi, mlp_msi]
"""
give a group of trained models, test data and test labels
returns array of 0s and 1s, 1 indicating a mismatch
"""
def prob_based_mismatches(models, data, labels, threshold=0.8):
    """Flag samples that a majority of models confidently call mislabeled.

    Each model casts a vote of 1 for a sample when its predicted label
    differs from the given label AND at least one of its class probabilities
    for that sample is strictly above ``threshold``; otherwise it votes 0.
    A sample is flagged (1) when strictly more than half of the models
    voted 1 for it.

    The consensus list and the number of flagged samples are printed before
    returning.

    Args:
        models: Trained models accepted by ``lf.get_prediction_and_prob``.
        data: Test samples, passed unchanged to that helper.
        labels: Given label for each sample, indexed by position in the same
            order as the predictions.
        threshold: Minimum class probability (exclusive) for a disagreeing
            prediction to count as a confident mismatch. Defaults to 0.8.

    Returns:
        A list of 0/1 ints, one per sample, where 1 marks a consensus
        mismatch. The list is empty when ``models`` is empty.
    """
    # One list of 0/1 votes per model, each vote aligned with a sample.
    votes_per_model = []
    for model in models:
        pred, prob = lf.get_prediction_and_prob(model, data)
        votes_per_model.append([
            # Only a disagreement the model is confident about counts.
            1 if (predicted != labels[i]
                  and any(p > threshold for p in prob[i])) else 0
            for i, predicted in enumerate(pred)
        ])

    # A sample is mismatched when more than half of the models voted for it.
    majority = len(votes_per_model) / 2.0
    consensus = [
        1 if sum(sample_votes) > majority else 0
        for sample_votes in zip(*votes_per_model)
    ]

    print(consensus)
    print(sum(consensus))
    return consensus
# Get the consensus mismatch flags for each clinical label on the test set.
gender_mismatch_predictions = prob_based_mismatches(
    modelArrayGen, test_protein_sub_set, test_gender_labels
)
msi_mismatch_prediction = prob_based_mismatches(
    modelArrayMSI, test_protein_sub_set, test_MSI_labels
)

sample_names = list(data.test_proteomic.index)
# count is the total number of samples that are being labeled as mislabeled
count = 0
# The context manager closes the file even if a write or lookup fails.
with open('subchallenge_1.csv', 'w') as outfile:
    outfile.write('sample,mismatch\n')
    for i, gender_flag in enumerate(gender_mismatch_predictions):
        # A sample is mismatched if either the gender or the MSI vote says so.
        mismatch = 1 if gender_flag == 1 or msi_mismatch_prediction[i] == 1 else 0
        outfile.write('{},{}\n'.format(sample_names[i], mismatch))
        count += mismatch
print(count)