-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmisc_util.py
More file actions
369 lines (321 loc) · 17.7 KB
/
misc_util.py
File metadata and controls
369 lines (321 loc) · 17.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
from collections import Counter, OrderedDict
import json
import re
import tempfile
import os
import subprocess
from scipy import stats
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn import model_selection, metrics, tree
from sklearn.base import BaseEstimator, TransformerMixin
RANDOM_SEED = 11798
def uncorrelated_feature_sets(pandas_df, max_rho=.8, remove_perfect_corr=False, verbose=0,
priority_order=[]):
"""Given a dataset with some features, return a list of lists, where each sublist is a set of
feature names such that no pair of features are correlated more than `max_rho`. Ties will be
broken among pairs of highly-correlated features by removing the feature with higher mean
correlation with other features, unless tie-break preferences for either/both features are given
in `priority_order`.
Args:
pandas_df (pd.DataFrame): Dataset, where every column is assumed to be a numeric feature
max_rho (float): Maximum allowable pairwise absolute correlation between two features
remove_perfect_corr (bool): If True, when a pair of features correlate perfectly (rho = 1),
remove one of them completely from consideration
verbose (int): Verbosity level
priority_order (list of lists): Feature names to prefer keeping; each sublist will be given
preference over subsequent sublists, and over any features
not represented in any sublist
Returns:
list of lists: One or more sets of uncorrelated features
"""
assert max_rho <= 1 and max_rho > 0, 'Maximum allowable correlation should be in (0, 1]'
# Pairwise Spearman's rho correlation matrix with self-correlations set to 0
rho = pd.DataFrame(index=pandas_df.columns, columns=pandas_df.columns, dtype=float)
for i, a in enumerate(tqdm(pandas_df.columns, desc='Pairwise corr', disable=verbose < 1)):
for b in pandas_df.columns[i:]:
if a == b:
rho.at[a, b] = 0
else:
rho.at[a, b] = rho.at[b, a] = \
abs(pandas_df[a].corr(pandas_df[b], method='spearman'))
if verbose > 3:
print(rho)
if rho.isnull().sum().sum() > 0:
raise ValueError('Correlation matrix had NaN values; check that each feature has variance')
# Convert priority_order to a dict for faster/easier lookups
priority = {f: i for i, sublist in enumerate(priority_order) for f in sublist}
result = []
current_set = list(pandas_df.columns)
next_set = []
while True:
# Find maximum pairwise correlation to see if further splitting of feature set is needed
highest_corr = rho.loc[current_set, current_set].max().max()
if highest_corr > max_rho or highest_corr == 1:
a = rho.loc[current_set, current_set].max().idxmax()
b = rho.loc[a, current_set].idxmax()
if verbose > 2:
print(a, 'correlated with', b, 'rho =', rho.at[a, b])
# Break ties based on which has higher mean correlation unless priority order is given
to_remove = None
if a in priority:
if b not in priority or priority[a] < priority[b]:
to_remove = b
elif b in priority and priority[a] > priority[b]:
to_remove = a
elif b in priority:
to_remove = a
if not to_remove: # Priority order not specified or a tie; use higher mean correlation
if rho.loc[a, current_set].mean() < rho.loc[b, current_set].mean():
to_remove = b
else:
to_remove = a
if highest_corr < 1 or not remove_perfect_corr:
next_set.append(to_remove)
current_set.remove(to_remove)
elif len(next_set) > 0:
if verbose > 1:
print('Creating feature set of size', len(current_set))
result.append(current_set)
current_set = next_set
next_set = []
else:
if len(current_set) > 0:
if verbose > 1:
print('Creating feature set of size', len(current_set))
result.append(current_set)
break # No correlations larger than max allowed, no remaining features to check
return result
class UncorrelatedFeatureSelector(BaseEstimator, TransformerMixin):
def __init__(self, max_rho=.8, verbose=0):
self.max_rho = max_rho
self.verbose = verbose
self.uncorrelated_features = []
def fit(self, X, y=None):
assert isinstance(X, pd.DataFrame), 'Only pd.DataFrame inputs for X are supported'
if self.verbose > 0:
print('Finding features correlated at rho <=', self.max_rho)
fsets = uncorrelated_feature_sets(X, self.max_rho, remove_perfect_corr=True,
verbose=self.verbose - 1)
self.uncorrelated_features = fsets[0]
if self.verbose > 0:
print('Found', len(self.uncorrelated_features), 'uncorrelated features')
return self
def transform(self, X, y=None):
return X[self.uncorrelated_features]
def final_answers_from_df(df, verbose=0):
"""Extract the final answer given to each question/sub-question by a student, given a Pandas
DataFrame with sequences of actions from one or more students. This could be used to get the
answer to a specific attempt for a specific student as well, by inputting a DataFrame with only
data from that attempt.
Args:
df (pd.DataFrame): Sequence or subsequence of actions, e.g., from load_data.train_full()
verbose (int): Verbosity level
Returns:
dict: Mapping of STUDENTID -> question/sub-question ID -> answer
"""
answers = {}
for _, row in tqdm(df.iterrows(), desc='Getting answers', disable=not verbose, total=len(df)):
if row.STUDENTID not in answers:
answers[row.STUDENTID] = {}
if row.ItemType in ['MultipleFillInBlank', 'FillInBlank', 'CompositeCR']:
if row.Observable == 'Receive Focus':
subq = row.ExtendedInfo.replace('Part ', '').replace(', 1', ',1') # Fix VH139196
answer = ''
elif row.Observable == 'Math Keypress' or row.Observable == 'Equation Editor Button':
jdata = json.loads(row.ExtendedInfo)
answer = re.sub(r'(\$|\\| |mathrm\{\w*\}|overline\{\})', '', jdata['contentLaTeX'])
# Some actions don't show up immediately in contentLaTeX and must be appended
if '{' not in answer: # If there is a frac{} or anything going on, just give up
code = jdata['code'] if 'code' in jdata else ''
if code == 'Period':
answer += '.'
elif code.startswith('Digit'):
answer += code[5]
try: # Parse fractions (ambitious...)
answer = re.sub(r'frac\{(-?\d+)\}\{(-?\d+)\}',
lambda m: str(float(m.group(1)) /
float(m.group(2))).lstrip('0'), answer)
except:
pass # Cannot even begin to imagine the parsing errors
answer = answer.replace('^circ', '')
answer = re.sub(r'^0\.', '.', answer) # Leading 0 for decimals < 1
if '.' in answer:
answer = re.sub(r'0+$', '', answer) # Unnecessary trailing decimal zeros
if answer[-1] == '.':
answer = answer[:-1] # Remove .0's
elif row.Observable == 'Lose Focus':
try:
answers[row.STUDENTID][row.AccessionNumber + '_' + subq] = answer
except UnboundLocalError: # subq not defined
pass # Can only happen with incomplete data (e.g., for last 5 minutes of data)
elif row.ItemType == 'MCSS':
if row.Observable == 'Click Choice':
answers[row.STUDENTID][row.AccessionNumber] = \
row.ExtendedInfo[:row.ExtendedInfo.index(':')]
elif row.ItemType == 'MatchMS ':
if row.Observable == 'DropChoice': # e.g., [{'source': '3', 'target': 1}, ...]
for answer_pair in json.loads(row.ExtendedInfo):
subq_id, answer = answer_pair['source'], answer_pair['target']
answers[row.STUDENTID][row.AccessionNumber + '_' + subq_id] = answer
return answers
def answer_counts(answers):
"""Rank the most popular answers to a question, given a per-student mapping of answers to
questions from final_answers_from_df()
TODO: Make this into an expectation maximization problem instead to find source reliability, and
then also return source (student) reliability as a possible feature
Args:
answers (dict): Mapping of student answers to questions, from final_answers_from_df()
Returns:
dict: Mapping of question/sub-question ID -> collections.Counter of answers
"""
# Reformat as answers at the question level instead of student level
questions = {q: Counter() for q in set(qid for pid_map in answers.values() for qid in pid_map)}
for question in questions:
for pid_answers in answers.values():
if question in pid_answers:
questions[question][pid_answers[question]] += 1
return questions
def answer_ranks(question_answer_counts):
"""Rank the popularity of answers to a single question, given a collections.Counter of answers
to the question (e.g., one of the items obtained from answer_counts()). Ranking begins at 1
(the most popular answer), and ties are counted as the same rank.
Args:
question_answer_counts (collections.Counter): Counter of answers to a question
Returns:
dict: Mapping of answer -> rank
"""
assert type(question_answer_counts) == Counter
ranks = {}
unique_counts = 0
last_count = None
for ans, count in question_answer_counts.most_common():
if last_count is None or count < last_count:
unique_counts += 1
last_count = count
ranks[ans] = unique_counts
return ranks
def tree_error_analysis(X, y, cv, class_names, output_filename_prefix):
"""Train a simple decision tree model and graph it to help find cases where new/better features
are needed, or where current features may be behaving unexpectedly.
Args:
X (pd.DataFrame): Training data (column names are required)
y (array): Labels for training data
cv (int or sklearn cross-validator): Cross-validation method to apply
class_names (array): List of labels for class names in ascending order (y=0, y=1, etc.)
output_filename_prefix (str): Path + prefix for output graphs
Returns:
(dict, pd.DataFrame): Cross-validation results and predictions
"""
assert len(class_names) == len(np.unique(y)), 'There must be one class name per class'
scoring = {'AUC': metrics.make_scorer(metrics.roc_auc_score, needs_proba=True),
'MCC': metrics.make_scorer(metrics.cohen_kappa_score),
'Kappa': metrics.make_scorer(metrics.matthews_corrcoef)}
m = tree.DecisionTreeClassifier(min_samples_leaf=8, random_state=RANDOM_SEED)
res = model_selection.cross_validate(m, X, y, scoring=scoring, verbose=1, cv=cv,
return_estimator=True)
err_df = pd.DataFrame(index=X.index, data={'pred': '', 'truth': y, 'fold': '', 'leaf_size': ''})
for fold_i, (_, test_i) in enumerate(tqdm(cv.split(X, y), desc='Graphing trees')):
err_df.pred.iloc[test_i] = res['estimator'][fold_i].predict_proba(X.iloc[test_i]).T[1]
err_df.fold.iloc[test_i] = fold_i
# Graph trees, look for the most impure large leaves -- in the tree graphs or in a pivot
# table filtered by truth value looking for common wrong predicted probabilities
leaf_i = res['estimator'][fold_i].apply(X.iloc[test_i])
leaf_sizes = np.bincount(leaf_i) # Array with leaf index -> number of occurrences
err_df.leaf_size.iloc[test_i] = [leaf_sizes[i] for i in leaf_i]
with tempfile.TemporaryDirectory() as tmpdir:
dotfile = os.path.join(tmpdir, 'tree.dot')
tree.export_graphviz(res['estimator'][fold_i], out_file=dotfile,
class_names=class_names, feature_names=X.columns, filled=True)
subprocess.call(['dot', '-Tpng', dotfile, '-o',
output_filename_prefix + 'fold' + str(fold_i) + '.png', '-Gdpi=300'])
return res, err_df
def per_feature_analysis(X, y, cv):
"""Explore individual feature predictive accuracy for every feature in a dataset, via a simple
and fast CART model. This allows finding features that are especially effective and possible
inspirations for future features, as well as features that may be severely over-fit to the
training data and could need improvement
Args:
X (pd.DataFrame): Training data (columns are required)
y (array): Labels for training data
cv (int or sklearn cross-validator): Cross-validation method to apply
Returns:
pd.DataFrame: Results for each feature, probably for saving to a CSV file
"""
scoring = {'AUC': metrics.make_scorer(metrics.roc_auc_score, needs_proba=True),
'MCC': metrics.make_scorer(metrics.cohen_kappa_score),
'Kappa': metrics.make_scorer(metrics.matthews_corrcoef)}
m = tree.DecisionTreeClassifier(min_samples_leaf=8, random_state=RANDOM_SEED)
result = []
for feat in tqdm(X.columns, desc='Building 1-feature models'):
scores = model_selection.cross_validate(m, X[[feat]], y, scoring=scoring, cv=cv,
return_train_score=True)
result.append(OrderedDict({
'feature': feat,
'mean_test_auc': np.mean(scores['test_AUC']),
'min_test_auc': min(scores['test_AUC']),
'mean_test_kappa': np.mean(scores['test_Kappa']),
'min_test_kappa': min(scores['test_Kappa']),
'mean_train_auc': np.mean(scores['train_AUC']),
'mean_train_kappa': np.mean(scores['train_Kappa']),
}))
return pd.DataFrame.from_records(result)
def adjusted_thresh_kappa(y_true, y_pred, thresholds=100):
"""Cohen's kappa with the decision threshold adjusted to maximiize kappa. `thresholds` evenly-
spaced cutoffs in [0, 1] will be evaluated.
Args:
y_true (Numpy array): Ground truth labels (0 or 1)
y_pred (Numpy array): Predicted probabilities (must be continuous, probability-like)
thresholds (int, optional): Number of thresholds to explore. Defaults to 100.
Returns:
float: Adjusted-threshold kappa
"""
y_pred = np.array(y_pred)
return max(metrics.cohen_kappa_score(y_true, y_pred > t)
for t in np.linspace(0, 1, thresholds + 1))
def thresh_restricted_auk(y_true, y_pred, thresholds=100, auk_width=.1):
"""Area under the Cohen's kappa curve (AUK) restricted to a range centered around the ideal
(maximum kappa) threshold. For example, if the ideal threshold for maximizing kappa is 0.64 and
`auk_width` is set to 0.1, then the returned value will be AUK measured from 0.59 to 0.69 (i.e.,
0.65 +/- 0.1).
Args:
y_true (Numpy array): Ground truth labels (0 or 1)
y_pred (Numpy array): Predicted probabilities (must be continuous, probability-like)
thresholds (int, optional): Number of thresholds to explore. Defaults to 100.
auk_width (float, optional): Width of interval around ideal threshold for restricted-range
AUK calculation. Defaults to 0.1.
Returns:
float: Restricted-range AUK (value normalized to [-1, 1] based on `auk_width`)
"""
y_pred = np.array(y_pred)
cuts = np.linspace(0, 1, thresholds + 1)
kappas = np.array([metrics.cohen_kappa_score(y_true, y_pred > t) for t in cuts])
ideal = cuts[np.argmax(kappas)]
restricted_kappas = kappas[(cuts >= ideal - auk_width / 2) & (cuts <= ideal + auk_width / 2)]
return sum(restricted_kappas) / len(restricted_kappas)
def kappa_plus_auc(y_true, y_pred, threshold=.5):
"""Sum of Cohen's kappa and the area under the receiver operating characteristic curve (AUC)
Args:
y_true (Numpy array): Ground truth labels (0 or 1)
y_pred (Numpy array): Predicted probabilities (must be continuous, probability-like)
threshold (float, optional): Decision threshold for calculating kappa (>=). Defaults to .5.
Returns:
float: Sum of kappa and AUC (in the range [-1, 2])
"""
y_pred = np.array(y_pred)
return metrics.cohen_kappa_score(y_true, y_pred >= threshold) + \
metrics.roc_auc_score(y_true, y_pred)
if __name__ == '__main__':
df = pd.DataFrame({'w': [2, 2, 3, 4, 5], 'x': [1, -2, 1, 3, 3], 'y': [5, 1, 3, 0, 1],
'z': [1.1, -1, 1, 5, 5], 'w2': [2, 2, 3, 4, 5]})
print(uncorrelated_feature_sets(df, max_rho=.5, verbose=4, remove_perfect_corr=True))
print('\nWith prioritizing x over z and z over w2:')
print(uncorrelated_feature_sets(df, max_rho=.5, verbose=4, remove_perfect_corr=True,
priority_order=[['x'], ['z']]))
truth = [0, 1, 1, 1, 0, 0, 1, 1]
preds = [.1, .5, .4, .6, .2, .3, .2, .9]
print('Kappa:', metrics.cohen_kappa_score(truth, np.array(preds) >= .5))
print('Threshold-adjusted kappa:', adjusted_thresh_kappa(truth, preds))
print('Threshold-restricted AUK:', thresh_restricted_auk(truth, preds))
print('Kappa + AUC:', kappa_plus_auc(truth, preds))