-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparser.py
More file actions
185 lines (160 loc) · 6.36 KB
/
parser.py
File metadata and controls
185 lines (160 loc) · 6.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#!/usr/bin/env python
"""
AAD - Titanic Dataset Paretodominance Demo
Data Parser Driver
== Team 4 ==
Aaron McDaniel
Jeffrey Minowa
Joshua Reno
Joel Ye
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder
import csv
# Relative directory prefix that is prepended to every CSV file name on load
data_dir = 'data/'
# File names (inside data_dir) of the training set, the test set and the
# file that is loaded as the test labels
train_fn = 'train.csv'
test_fn = 'test.csv'
test_label_fn = 'gender_submission.csv'
# Intended number of cross-validation folds
folds = 5
def load_data(filename):
    """
    Reads one comma-separated file from the data directory
    :param filename: name of the CSV file, appended to the module-level data_dir
    :return: the file contents as an array (the ``values`` of the parsed DataFrame)
    """
    csv_path = data_dir + filename
    contents = pd.read_csv(csv_path, sep=',').values
    print("Loaded " + filename)
    return contents
def load_split_all():
    """
    Loads the train, test and test-label CSV files and cleans them up
    The sex column is label-encoded, several columns are dropped and every
    remaining missing value is replaced by -1
    :return: a nested tuple ((x_train, y_train), (x_test, y_test)) where
             x_train is a float array, y_train is an int array taken from the
             first remaining train column, x_test is the cleaned test array and
             y_test is the unmodified content of the test label file
    """
    raw_train = load_data(train_fn)
    raw_test = load_data(test_fn)
    y_test = load_data(test_label_fn)

    # Encode the sex column as integers. The test file has a different column
    # order than the train file, so the same feature sits one index lower.
    sex_encoder = LabelEncoder()
    sex_encoder.fit(["male", "female"])
    raw_train[:, 4] = sex_encoder.transform(raw_train[:, 4])
    raw_test[:, 3] = sex_encoder.transform(raw_test[:, 3])

    # Feature selection:
    # Train: drop passenger_id (c0), name (c3), ticket number (c8),
    # cabin number (c10) and embark (c11). Cabin is dropped because its effect
    # is unclear; embark because it is not expected to help and contains NaN.
    # Test: drop the matching columns, but keep column 0
    # (presumably the passenger id -- verify against the test file).
    train_drop = [0, 3, 8, 10, 11]
    test_drop = [2, 7, 9, 10]
    kept_train = np.delete(raw_train, train_drop, axis=1)
    kept_test = np.delete(raw_test, test_drop, axis=1)

    # Replace the remaining missing values with the sentinel -1
    kept_train = np.where(pd.isnull(kept_train), -1, kept_train)
    x_test = np.where(pd.isnull(kept_test), -1, kept_test)

    # The first remaining train column is the label, the rest are features
    y_train = kept_train[:, 0].astype('int')
    x_train = kept_train[:, 1:].astype('float')
    return ((x_train, y_train), (x_test, y_test))
def score(clf, data, labels):
    """
    calculates the cross-validated precision and recall for the given classifier
    on the given set of data and labels
    Both metrics are evaluated on the same unshuffled K-fold split, using the
    module-level ``folds`` as the number of splits
    :param clf: untrained classifier to be evaluated
    :param data: the dataset used for cross validation
    :param labels: the correct labels that match with the given data
    :return: a tuple of the mean precision and mean recall over all folds
    """
    # One splitter shared by both metrics so they are measured on identical
    # folds. random_state is deliberately not passed: it has no effect without
    # shuffling and recent scikit-learn versions raise a ValueError for it.
    kf = KFold(n_splits=folds, shuffle=False)
    precision = cross_val_score(clf, data, labels, scoring='precision', cv=kf, n_jobs=-1)
    recall = cross_val_score(clf, data, labels, scoring='recall', cv=kf, n_jobs=-1)
    return (precision.mean(), recall.mean())
def pareto_dominance_max(ind1, ind2):
    """
    returns true if ind1 dominates ind2 by the metrics that should be maximized
    ind1 dominates ind2 when it is at least as good in every metric and
    strictly better in at least one
    :param ind1: tuple of precision and recall scores, or an object exposing
                 its scores as ``fitness.values``
    :param ind2: tuple of precision and recall scores, or an object exposing
                 its scores as ``fitness.values``
    :return: boolean representing if ind1 dominates ind2 using metrics that should be maximized
    """
    def _scores(ind):
        # Plain score sequences are used as-is; objects carrying a ``fitness``
        # attribute contribute ``fitness.values`` instead.
        fitness = getattr(ind, "fitness", None)
        return ind if fitness is None else fitness.values

    strictly_better = False
    for value_1, value_2 in zip(_scores(ind1), _scores(ind2)):
        if value_1 < value_2:
            # Worse in a single metric is enough to rule out dominance
            return False
        if value_1 > value_2:
            strictly_better = True
    # All-equal scores do not dominate each other
    return strictly_better
def pareto_dominance_min(ind1, ind2):
    """
    returns true if ind1 dominates ind2 by the metrics that should be minimized
    ind1 dominates ind2 when none of its values is larger than the matching
    value of ind2 and at least one of them is smaller
    :param ind1: tuple of FP and FN
    :param ind2: tuple of FP and FN
    :return: boolean representing if ind1 dominates ind2 using the metrics that should be minimized
    """
    # Materialise the pairs once because they are scanned twice
    pairs = list(zip(ind1, ind2))
    if any(first > second for first, second in pairs):
        return False
    return any(first < second for first, second in pairs)
def update_front(front, ind, comp):
    """
    Builds a new pareto front from an existing front and one new individual
    An individual is indexable: ind[0] holds its scores (for example a tuple of
    precision and recall) and ind[1] the hyper-parameters needed to recreate
    the classifier. Only ind[0] is inspected here.
    :param front: the old pareto front to be updated
    :param ind: the new individual that may or may not change the old pareto front
    :param comp: callable comp(a, b) that returns true when scores a dominate scores b
    :return: the new pareto front as a list
    """
    survivors = []
    for position, member in enumerate(front):
        if comp(member[0], ind[0]):
            # The newcomer is dominated, so it is rejected and cannot push out
            # any of the members that have not been checked yet: keep them all.
            survivors.extend(front[position:])
            return survivors
        # An old member stays unless the newcomer dominates it
        if not comp(ind[0], member[0]):
            survivors.append(member)
    # Nobody dominated the newcomer, so it joins the front
    survivors.append(ind)
    return survivors
def convert_to_FP_FN(labels, precision, recall):
    """
    converts from precision and recall to FP and FN.
    Since Recall = TP/(TP + FN), TP = Recall * Positives
    This means we can solve for FN & FP with
    FN = Positives - TP
    FP = TP/Precision - TP
    TP is rounded to the nearest integer so that floating point noise in the
    scores does not drop a true positive.
    When TP is 0 (which covers a precision or recall of 0) FP cannot be
    recovered from the scores and is reported as 0.
    :param labels: the list of numeric labels that the precision and recall metrics came from
    :param precision: the precision of some classifier on the given labels
    :param recall: the recall of some classifier on the given labels
    :return: a tuple containing FP and FN in that order
    """
    positives = sum(1 for label in labels if label == 1)
    tp = int(round(recall * positives))
    # Every positive that is not a true positive is a false negative; this
    # avoids dividing by a recall of zero.
    fn = positives - tp
    if tp == 0 or precision == 0:
        # TP/Precision - TP is 0 for TP == 0; a precision of 0 would divide by zero
        fp = 0
    else:
        fp = int(round(tp / precision)) - tp
    return (fp, fn)
def convert_to_csv(clf, train_data, train_label, test_data, clf_name):
    """
    Takes in a classifier and writes predictions to a csv file after
    being trained
    The first column of test_data is used as the id column of the output and
    is not passed to the classifier
    :param clf: classifier
    :param train_data: training data
    :param train_label: training label
    :param test_data: testing data, with the ids in column 0 and the features in the remaining columns
    :param clf_name: String of the name of the classifier that you want to make csv of; used as the output file path
    """
    clf.fit(train_data, train_label)
    # Split the id column off so that only the features are used for prediction
    id_column = test_data[:, 0]
    features = test_data[:, 1:]
    predictions = np.asarray(clf.predict(features))
    final = np.column_stack((id_column, predictions))
    df = pd.DataFrame(final)
    df.to_csv(clf_name, index=False, header=["PassengerId", "Survived"])