From 54f706534845ccd3ae717f39593d51980b187fd5 Mon Sep 17 00:00:00 2001 From: Jordan Stomps Date: Mon, 31 Oct 2022 14:19:42 -0400 Subject: [PATCH 01/14] utility and testing scripts needed for implementing ML and SSML --- .github/workflows/python-package.yml | 2 +- README.md | 6 + requirements.txt | 7 + scripts/__init__.py | 0 scripts/utils.py | 328 ++++++++++++++++++ tests/test_BackgroundEstimator.py | 1 - tests/test_models.py | 491 +++++++++++++++++++++++++++ 7 files changed, 833 insertions(+), 2 deletions(-) create mode 100644 scripts/__init__.py create mode 100644 scripts/utils.py create mode 100644 tests/test_models.py diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index d88f9c7..973f71c 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -41,7 +41,7 @@ jobs: - name: Test with pytest run: | python3 -m pytest - python3 -m coverage run --source=./RadClass/ -m pytest + python3 -m coverage run --source=./RadClass/,./models/,./scripts/ -m pytest python3 -m coverage report python3 -m coverage html COVERALLS_REPO_TOKEN=${{ secrets.COVERALLS_REPO_TOKEN }} python3 -m coveralls --service=github diff --git a/README.md b/README.md index b08bd07..42245fa 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,13 @@ Versions 3.6-3.9 are currently supported by tests. The following Python packages * h5py * numpy * progressbar2 +* matplotlib +* seaborn * scipy +* sklearn +* hyperopt +* torch +* shadow-ssml Modules can be imported from the repository directory (e.g. `from RadClass.H0 import H0`) or `RadClass` can be installed using pip: diff --git a/requirements.txt b/requirements.txt index 06d1c3a..8b22315 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,10 @@ numpy h5py progressbar2 scipy>=1.7.0 +scikit-learn +hyperopt +matplotlib +seaborn +joblib +torch +shadow-ssml diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/utils.py b/scripts/utils.py new file mode 100644 index 0000000..d91c826 --- /dev/null +++ b/scripts/utils.py @@ -0,0 +1,328 @@ +import numpy as np +import seaborn as sns +import matplotlib.pyplot as plt +# For hyperopt (parameter optimization) +from hyperopt import Trials, tpe, fmin +from functools import partial +# diagnostics +from sklearn.metrics import confusion_matrix +# pca +from sklearn.preprocessing import StandardScaler +from sklearn.decomposition import PCA +# Cross Validation +from sklearn.model_selection import KFold, StratifiedKFold + + +class EarlyStopper: + ''' + Early stopping mechanism for neural networks. + Code adapted from user "isle_of_gods" from StackOverflow: + https://stackoverflow.com/questions/71998978/early-stopping-in-pytorch + Use this class to break a training loop if the validation loss is low. + Inputs: + patience: integer; forces stop if validation loss has not improved + for some time + min_delta: "fudge value" for how much loss to tolerate before stopping + ''' + + def __init__(self, patience=1, min_delta=0): + self.patience = patience + self.min_delta = min_delta + self.counter = 0 + self.min_validation_loss = np.inf + + def early_stop(self, validation_loss): + ''' + Tests for the early stopping condition if the validation loss + has not improved for a certain period of time (patience). 
+ Inputs: + validation_loss: typically a float value for the loss function of + a neural network training loop + ''' + + if validation_loss < self.min_validation_loss: + # keep track of the smallest validation loss + # if it has been beaten, restart patience + self.min_validation_loss = validation_loss + self.counter = 0 + elif validation_loss > (self.min_validation_loss + self.min_delta): + # keep track of whether validation loss has been decreasing + # by a tolerable amount + self.counter += 1 + if self.counter >= self.patience: + return True + return False + + +def run_hyperopt(space, model, data_dict, max_evals=50, verbose=True): + ''' + Runs hyperparameter optimization on a model given a parameter space. + Inputs: + space: dictionary with each hyperparameter as keys and values being the + range of parameter space (see hyperopt docs for defining a space) + mode: function that takes params dictionary, trains a specified ML model + and returns the optimization loss function, model, and other + attributes (e.g. accuracy on evaluation set) + max_eval: (int) run hyperparameter optimization for max_val iterations + verbose: report best and worse loss/accuracy + + Returns: + best: dictionary with returns from model function, including best loss, + best trained model, best parameters, etc. + worst: dictionary with returns from model function, including worst loss, + worst trained model, worst parameters, etc. + ''' + + trials = Trials() + + # wrap data into objective function + fmin_objective = partial(model, data_dict=data_dict) + + # run hyperopt + fmin(fmin_objective, + space, + algo=tpe.suggest, + max_evals=max_evals, + trials=trials) + + # of all trials, find best and worst loss/accuracy from optimization + best = trials.results[np.argmin([r['loss'] for r in trials.results])] + worst = trials.results[np.argmax([r['loss'] for r in trials.results])] + + if verbose: + print('best accuracy:', 1-best['loss']) + print('best params:', best['params']) + print('worst accuracy:', 1-worst['loss']) + print('worst params:', worst['params']) + + return best, worst + + +def cross_validation(model, X, y, params, n_splits=3, + stratified=False, random_state=None): + ''' + Perform K-Fold cross validation using sklearn and a given model. + The model *must* have a fresh_start method (see models in RadClass/models). + fresh_start() is used instead of train() to be agnostic to the data needed + for training (fresh_start requires a data_dict whereas each model's + train could take different combinations of labeled & unlabeled data). + This also avoids the need to do hyperparameter optimization (and + therefore many training epochs) for every K-Fold. + NOTE: fresh_start returns the model and results in a dictionary but + does not overwrite/save the model to the respective class. + You can manually overwrite using model.model = return.model + Hyperparameter optimization (model.optimize) can be done before or after + cross validation to specify the (optimal) parameters used by the model + since they are required here. + NOTE: Fixed default to shuffle data during cross validation splits. + (See sklearn cross validation docs for more info.) + NOTE: Unlabeled data, if provided, will always be included in the training + dataset. This means that this cross validation implementation is + susceptible to bias in the unlabeled data distribution. To test for + this bias, a user can manually run cross validation as a parent to + calling this function, splitting the unlabeled data and adding + different folds into X. 
+ Inputs: + model: ML model class object (e.g. RadClass/models). + Must have a fresh_start() method. + NOTE: If the model expects unlabeled data but unlabed data is not + provided in X/y, an error will likely be thrown when training the model + through fresh_start. + X: array of feature vectors (rows of individual instances, cols of vectors) + This should include all data for training and testing (since the + testing subset will be split by cross validation), including unlabeled + data if needed/used. + y: array/vector of labels for X. If including unlabeled data, use -1. + This should have the same order as X. That is, each row index in X + has an associated label with the same index in y. + params: dictionary of hyperparameters. Will depend on model used. + Alternatively, use model.params for models in RadClass/models + n_splits: int number of splits for K-Fold cross validation + stratified: bool; if True, balance the K-Folds to have roughly the same + proportion of samples from each class. + random_state: seed for reproducility. + ''' + + # return lists + accs = [] + reports = [] + + if stratified: + cv = StratifiedKFold(n_splits=n_splits, random_state=random_state, + shuffle=True) + else: + cv = KFold(n_splits=n_splits, random_state=random_state, + shuffle=True) + + # separate unlabeled data if included + Ux = None + Uy = None + if -1 in y: + U_idx = np.where(y == -1)[0] + L_idx = np.where(y != -1)[0] + Ux = X[U_idx] + Uy = y[U_idx] + Lx = X[L_idx] + Ly = y[L_idx] + else: + Lx = X + Ly = y + # conduct K-Fold cross validation + cv.get_n_splits(Lx, Ly) + for train_idx, test_idx in cv.split(Lx, Ly): + trainx, testx = Lx[train_idx], Lx[test_idx] + trainy, testy = Ly[train_idx], Ly[test_idx] + + # construct data dictionary for training in fresh_start + data_dict = {'trainx': trainx, 'trainy': trainy, + 'testx': testx, 'testy': testy} + if Ux is not None: + data_dict['Ux'] = Ux + data_dict['Uy'] = Uy + results = model.fresh_start(params, data_dict) + accs = np.append(accs, results['accuracy']) + reports = np.append(reports, results) + + # report cross validation results + print('Average accuracy:', np.mean(accs)) + print('Max accuracy:', np.max(accs)) + print('All accuracy:', accs) + # return the results of fresh_start for the max accuracy model + return reports[np.argmax(accs)] + + +def pca(Lx, Ly, Ux, Uy, filename): + ''' + A function for computing and plotting 2D PCA. + Inputs: + Lx: labeled feature data. + Ly: class labels for labeled data. + Ux: unlabeled feature data. + Uy: labels for unlabeled data (all labels should be -1). + filename: filename for saved plot. + The file must be saved with extension .joblib. + Added to filename if not included as input. 
+ ''' + + plt.rcParams.update({'font.size': 20}) + # only saving colors for binary classification with unlabeled instances + col_dict = {-1: 'tab:gray', 0: 'tab:orange', 1: 'tab:blue'} + + pcadata = np.append(Lx, Ux, axis=0) + normalizer = StandardScaler() + x = normalizer.fit_transform(pcadata) + print(np.mean(pcadata), np.std(pcadata)) + print(np.mean(x), np.std(x)) + + pca = PCA(n_components=2) + pca.fit_transform(x) + print(pca.explained_variance_ratio_) + print(pca.singular_values_) + print(pca.components_) + + principalComponents = pca.fit_transform(x) + + fig, ax = plt.subplots(figsize=(10, 8)) + ax.set_xlabel('Principal Component 1', fontsize=15) + ax.set_ylabel('Principal Component 2', fontsize=15) + for idx, color in col_dict.items(): + indices = np.where(np.append(Ly, Uy, axis=0) == idx)[0] + ax.scatter(principalComponents[indices, 0], + principalComponents[indices, 1], + c=color, + label='class '+str(idx)) + ax.grid() + ax.legend() + + if filename[-4:] != '.png': + filename += '.png' + fig.tight_layout() + fig.savefig(filename) + + +def multiD_pca(Lx, Ly, Ux, Uy, filename, n=2): + ''' + A function for computing and plotting n-dimensional PCA. + Inputs: + Lx: labeled feature data. + Ly: class labels for labeled data. + Ux: unlabeled feature data. + Uy: labels for unlabeled data (all labels should be -1). + filename: filename for saved plot. + The file must be saved with extension .joblib. + Added to filename if not included as input. + n: number of singular values to include in PCA analysis. + ''' + + plt.rcParams.update({'font.size': 20}) + # only saving colors for binary classification with unlabeled instances + col_dict = {-1: 'tab:gray', 0: 'tab:orange', 1: 'tab:blue'} + + pcadata = np.append(Lx, Ux, axis=0) + normalizer = StandardScaler() + x = normalizer.fit_transform(pcadata) + print(np.mean(pcadata), np.std(pcadata)) + print(np.mean(x), np.std(x)) + + n = 2 + pca = PCA(n_components=n) + principalComponents = pca.fit_transform(x) + print(pca.explained_variance_ratio_) + print(pca.singular_values_) + print(pca.components_) + + alph = ["A", "B", "C", "D", "E", "F", "G", "H", + "I", "J", "K", "L", "M", "N", "O", "P", + "Q", "R", "S", "T", "U", "V", "W", "X", + "Y", "Z"] + jobs = alph[:n] + + fig, axes = plt.subplots(n, n, figsize=(15, 15)) + + for row in range(axes.shape[0]): + for col in range(axes.shape[1]): + ax = axes[row, col] + if row == col: + ax.tick_params( + axis='both', which='both', + bottom='off', top='off', + labelbottom='off', + left='off', right='off', + labelleft='off' + ) + ax.text(0.5, 0.5, jobs[row], horizontalalignment='center') + else: + for idx, color in col_dict.items(): + indices = np.where(np.append(Ly, Uy, axis=0) == idx)[0] + ax.scatter(principalComponents[indices, row], + principalComponents[indices, col], + c=color, + label='class '+str(idx)) + fig.tight_layout() + if filename[-4:] != '.png': + filename += '.png' + fig.savefig(filename) + + +def plot_cf(testy, predy, title, filename): + ''' + Uses sklearn metric to compute a confusion matrix for visualization + Inputs: + testy: array/vector with ground-truth labels for test/evaluation set + predy: array/vector with predicted sample labels from trained model + title: string title for plot + filename: string with extension for confusion matrix file + ''' + + cf_matrix = confusion_matrix(testy, predy) + ax = sns.heatmap(cf_matrix, annot=True, cmap='Blues') + + ax.set_title(title) + ax.set_xlabel('\nPredicted Values') + ax.set_ylabel('Actual Values ') + + # Ticket labels - List must be in 
alphabetical order + ax.xaxis.set_ticklabels(['0(SNM)', '1(other)']) + ax.yaxis.set_ticklabels(['0(SNM)', '1(other)']) + # Save the visualization of the Confusion Matrix. + plt.savefig(filename) diff --git a/tests/test_BackgroundEstimator.py b/tests/test_BackgroundEstimator.py index 2d10c89..efc1299 100644 --- a/tests/test_BackgroundEstimator.py +++ b/tests/test_BackgroundEstimator.py @@ -77,7 +77,6 @@ def test_write(): bckg.write(ofilename=ofilename) results = np.loadtxt(fname=ofilename+'.csv', delimiter=',') - print(results) # the resulting observation should be: # counts * integration / live-time diff --git a/tests/test_models.py b/tests/test_models.py new file mode 100644 index 0000000..e3fb086 --- /dev/null +++ b/tests/test_models.py @@ -0,0 +1,491 @@ +# diagnostics +import numpy as np +from datetime import datetime, timedelta +# testing models +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +import tests.test_data as test_data +# hyperopt +from hyperopt.pyll.base import scope +from hyperopt import hp +# testing utils +import scripts.utils as utils +# models +from models.LogReg import LogReg +from models.SSML.CoTraining import CoTraining +from models.SSML.LabelProp import LabelProp +from models.SSML.ShadowNN import ShadowNN +from models.SSML.ShadowCNN import ShadowCNN +# testing write +import joblib +import os + +# initialize sample data +start_date = datetime(2019, 2, 2) +delta = timedelta(seconds=1) +timestamps = np.arange(start_date, + start_date + (test_data.timesteps * delta), + delta).astype('datetime64[s]').astype('float64') + +live = np.full((len(timestamps),), test_data.livetime) +sample_val = 1.0 +spectra = np.full((len(timestamps), test_data.energy_bins), + np.full((1, test_data.energy_bins), sample_val)) +# setting up for rejected null hypothesis +rejected_H0_time = np.random.choice(spectra.shape[0], + test_data.timesteps//2, + replace=False) +spectra[rejected_H0_time] = 100.0 + +labels = np.full((spectra.shape[0],), 0) +labels[rejected_H0_time] = 1 + + +def test_utils(): + X, Ux, y, Uy = train_test_split(spectra, + labels, + test_size=0.5, + random_state=0) + Uy = np.full_like(Uy, -1) + + # test cross validation for supervised data using LogReg + params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} + model = LogReg(params=params) + max_acc_model = utils.cross_validation(model=model, + X=X, + y=y, + params=params) + assert max_acc_model['accuracy'] >= 0.5 + + # test cross validation for supervised data and StratifiedKFold with LogReg + params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} + model = LogReg(params=params) + max_acc_model = utils.cross_validation(model=model, + X=X, + y=y, + params=params, + stratified=True) + assert max_acc_model['accuracy'] >= 0.5 + + # test cross validation for SSML with LabelProp + params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5} + model = LabelProp(params=params) + max_acc_model = utils.cross_validation(model=model, + X=np.append(X, Ux, axis=0), + y=np.append(y, Uy, axis=0), + params=params, + stratified=True) + assert max_acc_model['accuracy'] >= 0.5 + + # data split for data visualization + X_train, X_test, y_train, y_test = train_test_split(X, + y, + test_size=0.2, + random_state=0) + + filename = 'test_pca' + utils.pca(X_train, y_train, Ux, np.full_like(Uy, -1), filename) + os.remove(filename+'.png') + + filename = 'test_multiD_pca' + utils.multiD_pca(X_train, y_train, Ux, np.full_like(Uy, -1), filename, n=5) + os.remove(filename+'.png') + + # normalization + 
normalizer = StandardScaler() + normalizer.fit(X_train) + + X_train = normalizer.transform(X_train) + X_test = normalizer.transform(X_test) + + # default behavior + model = LogReg(params=None, random_state=0) + model.train(X_train, y_train) + + # testing train and predict methods + pred, acc = model.predict(X_test, y_test) + + filename = 'test_cf' + utils.plot_cf(y_test, pred, title=filename, filename=filename) + os.remove(filename+'.png') + + +def test_LogReg(): + # test saving model input parameters + params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} + model = LogReg(params=params) + + assert model.model.max_iter == params['max_iter'] + assert model.model.tol == params['tol'] + assert model.model.C == params['C'] + + X_train, X_test, y_train, y_test = train_test_split(spectra, + labels, + test_size=0.2, + random_state=0) + + # normalization + normalizer = StandardScaler() + normalizer.fit(X_train) + + X_train = normalizer.transform(X_train) + X_test = normalizer.transform(X_test) + + # default behavior + model = LogReg(params=None, random_state=0) + model.train(X_train, y_train) + + # testing train and predict methods + pred, acc = model.predict(X_test, y_test) + + assert acc > 0.7 + np.testing.assert_equal(pred, y_test) + + # testing hyperopt optimize methods + space = {'max_iter': scope.int(hp.quniform('max_iter', + 10, + 10000, + 10)), + 'tol': hp.loguniform('tol', 1e-5, 1e-1), + 'C': hp.uniform('C', 0.001, 1000.0) + } + data_dict = {'trainx': X_train, + 'testx': X_test, + 'trainy': y_train, + 'testy': y_test + } + model.optimize(space, data_dict, max_evals=2, verbose=True) + + assert model.best['accuracy'] >= model.worst['accuracy'] + assert model.best['status'] == 'ok' + + # testing model write to file method + filename = 'test_LogReg' + ext = '.joblib' + model.save(filename) + model_file = joblib.load(filename+ext) + assert model_file.best['params'] == model.best['params'] + + os.remove(filename+ext) + + +def test_CoTraining(): + # test saving model input parameters + params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} + model = CoTraining(params=params) + + assert model.model1.max_iter == params['max_iter'] + assert model.model1.tol == params['tol'] + assert model.model1.C == params['C'] + + assert model.model2.max_iter == params['max_iter'] + assert model.model2.tol == params['tol'] + assert model.model2.C == params['C'] + + X, Ux, y, Uy = train_test_split(spectra, + labels, + test_size=0.5, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, + y, + test_size=0.2, + random_state=0) + + # normalization + normalizer = StandardScaler() + normalizer.fit(X_train) + + X_train = normalizer.transform(X_train) + X_test = normalizer.transform(X_test) + Ux = normalizer.transform(Ux) + + # default behavior + model = CoTraining(params=None, random_state=0) + model.train(X_train, y_train, Ux) + + # testing train and predict methods + pred, acc, *_ = model.predict(X_test, y_test) + + assert acc > 0.7 + np.testing.assert_equal(pred, y_test) + + # testing hyperopt optimize methods + space = {'max_iter': scope.int(hp.quniform('max_iter', + 10, + 10000, + 10)), + 'tol': hp.loguniform('tol', 1e-5, 1e-3), + 'C': hp.uniform('C', 1.0, 1000.0), + 'n_samples': scope.int(hp.quniform('n_samples', + 1, + 20, + 1)), + 'seed': 0 + } + data_dict = {'trainx': X_train, + 'testx': X_test, + 'trainy': y_train, + 'testy': y_test, + 'Ux': Ux + } + model.optimize(space, data_dict, max_evals=2, verbose=True) + + assert model.best['accuracy'] >= model.worst['accuracy'] + assert model.best['status'] 
== 'ok' + + # testing model plotting method + filename = 'test_plot' + model.plot_cotraining(model1_accs=model.best['model1_acc_history'], + model2_accs=model.best['model2_acc_history'], + filename=filename) + os.remove(filename+'.png') + + # testing model write to file method + filename = 'test_LogReg' + ext = '.joblib' + model.save(filename) + model_file = joblib.load(filename+ext) + assert model_file.best['params'] == model.best['params'] + + os.remove(filename+ext) + + +def test_LabelProp(): + # test saving model input parameters + params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5} + model = LabelProp(params=params) + + assert model.model.gamma == params['gamma'] + assert model.model.n_neighbors == params['n_neighbors'] + assert model.model.max_iter == params['max_iter'] + assert model.model.tol == params['tol'] + + # there should be no normalization on LabelProp data + # since it depends on the distances between samples + X, Ux, y, Uy = train_test_split(spectra, + labels, + test_size=0.5, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, + y, + test_size=0.2, + random_state=0) + + # default behavior + model = LabelProp(params=None, random_state=0) + model.train(X_train, y_train, Ux) + + # testing train and predict methods + pred, acc = model.predict(X_test, y_test) + + # the default n_neighbors(=7) from sklearn is too large + # for the size of this dataset + # therefore the accuracy is expected to be poor + # a better value for this dataset would be n_neighbors=2 + # (tested when specifying params in LabelProp.__init__) + assert acc >= 0.5 + # uninteresting test if LabelProp predicts all one class + # TODO: make the default params test meaningful + assert np.count_nonzero(pred == y_test) > 0 + + # testing hyperopt optimize methods + space = {'max_iter': scope.int(hp.quniform('max_iter', + 10, + 10000, + 10)), + 'tol': hp.loguniform('tol', 1e-6, 1e-4), + 'gamma': hp.uniform('gamma', 1, 50), + 'n_neighbors': scope.int(hp.quniform('n_neighbors', + 1, + X_train.shape[0], + 1)) + } + data_dict = {'trainx': X_train, + 'testx': X_test, + 'trainy': y_train, + 'testy': y_test, + 'Ux': Ux + } + model.optimize(space, data_dict, max_evals=2, verbose=True) + + assert model.best['accuracy'] >= model.worst['accuracy'] + assert model.best['status'] == 'ok' + + # testing model write to file method + filename = 'test_LogReg' + ext = '.joblib' + model.save(filename) + model_file = joblib.load(filename+ext) + assert model_file.best['params'] == model.best['params'] + + os.remove(filename+ext) + + +def test_ShadowNN(): + # check default parameter settings + model = ShadowNN() + assert model.params == {'binning': 1} + assert model.eaat is not None + assert model.eaat_opt is not None + assert model.xEnt is not None + assert model.input_length == 1000 + + X, Ux, y, Uy = train_test_split(spectra, + labels, + test_size=0.5, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, + y, + test_size=0.2, + random_state=0) + + # normalization + normalizer = StandardScaler() + normalizer.fit(X_train) + + X_train = normalizer.transform(X_train) + X_test = normalizer.transform(X_test) + Ux = normalizer.transform(Ux) + + params = {'hidden_layer': 10, + 'alpha': 0.1, + 'xi': 1e-3, + 'eps': 1.0, + 'lr': 0.1, + 'momentum': 0.9, + 'binning': 20} + # default behavior + model = ShadowNN(params=params, random_state=0) + acc_history = model.train(X_train, y_train, Ux, X_test, y_test) + + # testing train and predict methods + pred, acc = model.predict(X_test, 
y_test) + + # test for agreement between training and testing + # (since the same data is used for diagnostics in this test) + assert acc_history[-1] == acc + + # Shadow/PyTorch reports accuracies as percentages + # rather than decimals + # uninteresting test if Shadow predicts all one class + # TODO: make the default params test meaningful + assert np.count_nonzero(pred == y_test) > 0 + + # testing hyperopt optimize methods + space = {'hidden_layer': 10, + 'alpha': 0.1, + 'xi': 1e-3, + 'eps': 1.0, + 'lr': 0.1, + 'momentum': 0.9, + 'binning': scope.int(hp.quniform('binning', + 10, + 20, + 1)) + } + data_dict = {'trainx': X_train, + 'testx': X_test, + 'trainy': y_train, + 'testy': y_test, + 'Ux': Ux + } + model.optimize(space, data_dict, max_evals=2, verbose=True) + + assert model.best['accuracy'] >= model.worst['accuracy'] + assert model.best['status'] == 'ok' + + # testing model write to file method + filename = 'test_LogReg' + ext = '.joblib' + model.save(filename) + model_file = joblib.load(filename+ext) + assert model_file.best['params'] == model.best['params'] + + os.remove(filename+ext) + + +def test_ShadowCNN(): + # check default parameter settings + model = ShadowCNN() + assert model.params == {'binning': 1, 'batch_size': 1} + assert model.model is not None + assert model.eaat is not None + assert model.optimizer is not None + + X, Ux, y, Uy = train_test_split(spectra, + labels, + test_size=0.5, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, + y, + test_size=0.2, + random_state=0) + + # normalization + normalizer = StandardScaler() + normalizer.fit(X_train) + + X_train = normalizer.transform(X_train) + X_test = normalizer.transform(X_test) + Ux = normalizer.transform(Ux) + + params = {'layer1': 2, + 'kernel': 3, + 'alpha': 0.1, + 'xi': 1e-3, + 'eps': 1.0, + 'lr': 0.1, + 'momentum': 0.9, + 'binning': 20, + 'batch_size': 4, + 'drop_rate': 0.1} + + # default behavior + model = ShadowCNN(params=params, random_state=0) + losscurve, evalcurve = model.train(X_train, y_train, Ux, X_test, y_test) + + # testing train and predict methods + pred, acc = model.predict(X_test, y_test) + + # test for agreement between training and testing + # (since the same data is used for diagnostics in this test) + assert evalcurve[-1] == acc + + # Shadow/PyTorch reports accuracies as percentages + # rather than decimals + # uninteresting test if Shadow predicts all one class + # TODO: make the default params test meaningful + assert np.count_nonzero(pred == y_test) > 0 + + # testing hyperopt optimize methods + space = params + space['binning'] = scope.int(hp.quniform('binning', + 10, + 20, + 1)) + data_dict = {'trainx': X_train, + 'testx': X_test, + 'trainy': y_train, + 'testy': y_test, + 'Ux': Ux + } + model.optimize(space, data_dict, max_evals=2, verbose=True) + + assert model.best['accuracy'] >= model.worst['accuracy'] + assert model.best['status'] == 'ok' + + # testing model plotting method + filename = 'test_plot' + model.plot_training(losscurve=model.best['losscurve'], + evalcurve=model.best['evalcurve'], + filename=filename) + os.remove(filename+'.png') + + # testing model write to file method + filename = 'test_LogReg' + ext = '.joblib' + model.save(filename) + model_file = joblib.load(filename+ext) + assert model_file.best['params'] == model.best['params'] + + os.remove(filename+ext) From b335056f7cf357b4be9825e7d75d626f90f47035 Mon Sep 17 00:00:00 2001 From: Jordan Stomps Date: Mon, 31 Oct 2022 14:22:42 -0400 Subject: [PATCH 02/14] adding Logistic Regression class 
implementation --- models/LogReg.py | 162 +++++++++++++++++++++++++++++++++++++++++++++ models/__init__.py | 0 2 files changed, 162 insertions(+) create mode 100644 models/LogReg.py create mode 100644 models/__init__.py diff --git a/models/LogReg.py b/models/LogReg.py new file mode 100644 index 0000000..4ebfce2 --- /dev/null +++ b/models/LogReg.py @@ -0,0 +1,162 @@ +# For hyperopt (parameter optimization) +from hyperopt import STATUS_OK +# sklearn models +from sklearn import linear_model +# diagnostics +from sklearn.metrics import balanced_accuracy_score +from scripts.utils import run_hyperopt +import joblib + + +class LogReg: + ''' + Methods for deploying sklearn's logistic regression + implementation with hyperparameter optimization. + Data agnostic (i.e. user supplied data inputs). + TODO: Currently only supports binary classification. + Add multinomial functions and unit tests. + Add functionality for regression(?) + Inputs: + params: dictionary of logistic regression input functions. + keys max_iter, tol, and C supported. + random_state: int/float for reproducible intiailization. + ''' + + # only binary so far + def __init__(self, params=None, random_state=0): + # defaults to a fixed value for reproducibility + self.random_state = random_state + # dictionary of parameters for logistic regression model + self.params = params + if self.params is None: + self.model = linear_model.LogisticRegression( + random_state=self.random_state + ) + else: + self.model = linear_model.LogisticRegression( + random_state=self.random_state, + max_iter=params['max_iter'], + tol=params['tol'], + C=params['C'] + ) + + def fresh_start(self, params, data_dict): + ''' + Required method for hyperopt optimization. + Trains and tests a fresh logistic regression model + with given input parameters. + This method does not overwrite self.model (self.optimize() does). + Inputs: + params: dictionary of logistic regression input functions. + keys max_iter, tol, and C supported. + data_dict: compact data representation with the four requisite + data structures used for training and testing a model. + keys trainx, trainy, testx, and testy required. + ''' + + # unpack data + trainx = data_dict['trainx'] + trainy = data_dict['trainy'] + testx = data_dict['testx'] + testy = data_dict['testy'] + + # supervised logistic regression + clf = LogReg(params=params, random_state=self.random_state) + # train and test model + clf.train(trainx, trainy) + # uses balanced_accuracy accounts for class imbalanced data + clf_pred, acc = clf.predict(testx, testy) + + # loss function minimizes misclassification + return {'loss': 1-acc, + 'status': STATUS_OK, + 'model': clf.model, + 'params': params, + 'accuracy': acc} + + def optimize(self, space, data_dict, max_evals=50, verbose=True): + ''' + Wrapper method for using hyperopt (see utils.run_hyperopt + for more details). After hyperparameter optimization, results + are stored, the best model -overwrites- self.model, and the + best params -overwrite- self.params. + Inputs: + space: a hyperopt compliant dictionary with defined optimization + spaces. For example: + # quniform returns float, some parameters require int; + # use this to force int + space = {'max_iter': scope.int(hp.quniform('max_iter', + 10, + 10000, + 10)), + 'tol' : hp.loguniform('tol', 1e-5, 1e-1), + 'C' : hp.uniform('C', 0.001,1000.0) + } + See hyperopt docs for more information. + data_dict: compact data representation with the four requisite + data structures used for training and testing a model. 
+ keys trainx, trainy, testx, testy required. + max_evals: the number of epochs for hyperparameter optimization. + Each iteration is one set of hyperparameters trained + and tested on a fresh model. Convergence for simpler + models like logistic regression typically happens well + before 50 epochs, but can increase as more complex models, + more hyperparameters, and a larger hyperparameter space is tested. + verbose: boolean. If true, print results of hyperopt. + If false, print only the progress bar for optimization. + ''' + + best, worst = run_hyperopt(space=space, + model=self.fresh_start, + data_dict=data_dict, + max_evals=max_evals, + verbose=verbose) + + # save the results of hyperparameter optimization + self.best = best + self.model = best['model'] + self.params = best['params'] + self.worst = worst + + def train(self, trainx, trainy): + ''' + Wrapper method for sklearn's logisitic regression training method. + Inputs: + trainx: nxm feature vector/matrix for training model. + trainy: nxk class label vector/matrix for training model. + ''' + + # supervised logistic regression + self.model.fit(trainx, trainy) + + def predict(self, testx, testy=None): + ''' + Wrapper method for sklearn's logistic regression predict method. + Inputs: + testx: nxm feature vector/matrix for testing model. + testy: nxk class label vector/matrix for training model. + optional: if included, the predicted classes -and- + the resulting classification accuracy will be returned. + ''' + + pred = self.model.predict(testx) + + acc = None + if testy is not None: + # uses balanced_accuracy_score to account for class imbalance + acc = balanced_accuracy_score(testy, pred) + + return pred, acc + + def save(self, filename): + ''' + Save class instance to file using joblib. + Inputs: + filename: string filename to save object to file under. + The file must be saved with extension .joblib. + Added to filename if not included as input. 
+ ''' + + if filename[-7:] != '.joblib': + filename += '.joblib' + joblib.dump(self, filename) diff --git a/models/__init__.py b/models/__init__.py new file mode 100644 index 0000000..e69de29 From c66109929a58c69f1f0ba1f8bd023fb7fff6a546 Mon Sep 17 00:00:00 2001 From: Jordan Stomps Date: Mon, 31 Oct 2022 14:54:46 -0400 Subject: [PATCH 03/14] removing pytests for future ML implementations --- tests/test_models.py | 236 ------------------------------------------- 1 file changed, 236 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index e3fb086..0669ccf 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -169,242 +169,6 @@ def test_LogReg(): os.remove(filename+ext) -def test_CoTraining(): - # test saving model input parameters - params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} - model = CoTraining(params=params) - - assert model.model1.max_iter == params['max_iter'] - assert model.model1.tol == params['tol'] - assert model.model1.C == params['C'] - - assert model.model2.max_iter == params['max_iter'] - assert model.model2.tol == params['tol'] - assert model.model2.C == params['C'] - - X, Ux, y, Uy = train_test_split(spectra, - labels, - test_size=0.5, - random_state=0) - X_train, X_test, y_train, y_test = train_test_split(X, - y, - test_size=0.2, - random_state=0) - - # normalization - normalizer = StandardScaler() - normalizer.fit(X_train) - - X_train = normalizer.transform(X_train) - X_test = normalizer.transform(X_test) - Ux = normalizer.transform(Ux) - - # default behavior - model = CoTraining(params=None, random_state=0) - model.train(X_train, y_train, Ux) - - # testing train and predict methods - pred, acc, *_ = model.predict(X_test, y_test) - - assert acc > 0.7 - np.testing.assert_equal(pred, y_test) - - # testing hyperopt optimize methods - space = {'max_iter': scope.int(hp.quniform('max_iter', - 10, - 10000, - 10)), - 'tol': hp.loguniform('tol', 1e-5, 1e-3), - 'C': hp.uniform('C', 1.0, 1000.0), - 'n_samples': scope.int(hp.quniform('n_samples', - 1, - 20, - 1)), - 'seed': 0 - } - data_dict = {'trainx': X_train, - 'testx': X_test, - 'trainy': y_train, - 'testy': y_test, - 'Ux': Ux - } - model.optimize(space, data_dict, max_evals=2, verbose=True) - - assert model.best['accuracy'] >= model.worst['accuracy'] - assert model.best['status'] == 'ok' - - # testing model plotting method - filename = 'test_plot' - model.plot_cotraining(model1_accs=model.best['model1_acc_history'], - model2_accs=model.best['model2_acc_history'], - filename=filename) - os.remove(filename+'.png') - - # testing model write to file method - filename = 'test_LogReg' - ext = '.joblib' - model.save(filename) - model_file = joblib.load(filename+ext) - assert model_file.best['params'] == model.best['params'] - - os.remove(filename+ext) - - -def test_LabelProp(): - # test saving model input parameters - params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5} - model = LabelProp(params=params) - - assert model.model.gamma == params['gamma'] - assert model.model.n_neighbors == params['n_neighbors'] - assert model.model.max_iter == params['max_iter'] - assert model.model.tol == params['tol'] - - # there should be no normalization on LabelProp data - # since it depends on the distances between samples - X, Ux, y, Uy = train_test_split(spectra, - labels, - test_size=0.5, - random_state=0) - X_train, X_test, y_train, y_test = train_test_split(X, - y, - test_size=0.2, - random_state=0) - - # default behavior - model = LabelProp(params=None, random_state=0) - model.train(X_train, 
y_train, Ux) - - # testing train and predict methods - pred, acc = model.predict(X_test, y_test) - - # the default n_neighbors(=7) from sklearn is too large - # for the size of this dataset - # therefore the accuracy is expected to be poor - # a better value for this dataset would be n_neighbors=2 - # (tested when specifying params in LabelProp.__init__) - assert acc >= 0.5 - # uninteresting test if LabelProp predicts all one class - # TODO: make the default params test meaningful - assert np.count_nonzero(pred == y_test) > 0 - - # testing hyperopt optimize methods - space = {'max_iter': scope.int(hp.quniform('max_iter', - 10, - 10000, - 10)), - 'tol': hp.loguniform('tol', 1e-6, 1e-4), - 'gamma': hp.uniform('gamma', 1, 50), - 'n_neighbors': scope.int(hp.quniform('n_neighbors', - 1, - X_train.shape[0], - 1)) - } - data_dict = {'trainx': X_train, - 'testx': X_test, - 'trainy': y_train, - 'testy': y_test, - 'Ux': Ux - } - model.optimize(space, data_dict, max_evals=2, verbose=True) - - assert model.best['accuracy'] >= model.worst['accuracy'] - assert model.best['status'] == 'ok' - - # testing model write to file method - filename = 'test_LogReg' - ext = '.joblib' - model.save(filename) - model_file = joblib.load(filename+ext) - assert model_file.best['params'] == model.best['params'] - - os.remove(filename+ext) - - -def test_ShadowNN(): - # check default parameter settings - model = ShadowNN() - assert model.params == {'binning': 1} - assert model.eaat is not None - assert model.eaat_opt is not None - assert model.xEnt is not None - assert model.input_length == 1000 - - X, Ux, y, Uy = train_test_split(spectra, - labels, - test_size=0.5, - random_state=0) - X_train, X_test, y_train, y_test = train_test_split(X, - y, - test_size=0.2, - random_state=0) - - # normalization - normalizer = StandardScaler() - normalizer.fit(X_train) - - X_train = normalizer.transform(X_train) - X_test = normalizer.transform(X_test) - Ux = normalizer.transform(Ux) - - params = {'hidden_layer': 10, - 'alpha': 0.1, - 'xi': 1e-3, - 'eps': 1.0, - 'lr': 0.1, - 'momentum': 0.9, - 'binning': 20} - # default behavior - model = ShadowNN(params=params, random_state=0) - acc_history = model.train(X_train, y_train, Ux, X_test, y_test) - - # testing train and predict methods - pred, acc = model.predict(X_test, y_test) - - # test for agreement between training and testing - # (since the same data is used for diagnostics in this test) - assert acc_history[-1] == acc - - # Shadow/PyTorch reports accuracies as percentages - # rather than decimals - # uninteresting test if Shadow predicts all one class - # TODO: make the default params test meaningful - assert np.count_nonzero(pred == y_test) > 0 - - # testing hyperopt optimize methods - space = {'hidden_layer': 10, - 'alpha': 0.1, - 'xi': 1e-3, - 'eps': 1.0, - 'lr': 0.1, - 'momentum': 0.9, - 'binning': scope.int(hp.quniform('binning', - 10, - 20, - 1)) - } - data_dict = {'trainx': X_train, - 'testx': X_test, - 'trainy': y_train, - 'testy': y_test, - 'Ux': Ux - } - model.optimize(space, data_dict, max_evals=2, verbose=True) - - assert model.best['accuracy'] >= model.worst['accuracy'] - assert model.best['status'] == 'ok' - - # testing model write to file method - filename = 'test_LogReg' - ext = '.joblib' - model.save(filename) - model_file = joblib.load(filename+ext) - assert model_file.best['params'] == model.best['params'] - - os.remove(filename+ext) - - -def test_ShadowCNN(): # check default parameter settings model = ShadowCNN() assert model.params == {'binning': 1, 'batch_size': 
1} From a7d4bfe5d7e598559a7465b2eca8f8b19ac13531 Mon Sep 17 00:00:00 2001 From: Jordan Stomps Date: Mon, 31 Oct 2022 14:55:59 -0400 Subject: [PATCH 04/14] commenting cross validation test for ssml since it will be included in the LabelProp PR --- tests/test_models.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index 0669ccf..374573b 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -67,15 +67,15 @@ def test_utils(): stratified=True) assert max_acc_model['accuracy'] >= 0.5 - # test cross validation for SSML with LabelProp - params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5} - model = LabelProp(params=params) - max_acc_model = utils.cross_validation(model=model, - X=np.append(X, Ux, axis=0), - y=np.append(y, Uy, axis=0), - params=params, - stratified=True) - assert max_acc_model['accuracy'] >= 0.5 + # # test cross validation for SSML with LabelProp + # params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5} + # model = LabelProp(params=params) + # max_acc_model = utils.cross_validation(model=model, + # X=np.append(X, Ux, axis=0), + # y=np.append(y, Uy, axis=0), + # params=params, + # stratified=True) + # assert max_acc_model['accuracy'] >= 0.5 # data split for data visualization X_train, X_test, y_train, y_test = train_test_split(X, From 738720ee09cc578ace5da7888083f329b06a1c88 Mon Sep 17 00:00:00 2001 From: Jordan Stomps Date: Mon, 31 Oct 2022 15:01:08 -0400 Subject: [PATCH 05/14] removing leftovers from migration --- tests/test_models.py | 92 +------------------------------------------- 1 file changed, 1 insertion(+), 91 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index 374573b..e452361 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -12,10 +12,6 @@ import scripts.utils as utils # models from models.LogReg import LogReg -from models.SSML.CoTraining import CoTraining -from models.SSML.LabelProp import LabelProp -from models.SSML.ShadowNN import ShadowNN -from models.SSML.ShadowCNN import ShadowCNN # testing write import joblib import os @@ -166,90 +162,4 @@ def test_LogReg(): model_file = joblib.load(filename+ext) assert model_file.best['params'] == model.best['params'] - os.remove(filename+ext) - - - # check default parameter settings - model = ShadowCNN() - assert model.params == {'binning': 1, 'batch_size': 1} - assert model.model is not None - assert model.eaat is not None - assert model.optimizer is not None - - X, Ux, y, Uy = train_test_split(spectra, - labels, - test_size=0.5, - random_state=0) - X_train, X_test, y_train, y_test = train_test_split(X, - y, - test_size=0.2, - random_state=0) - - # normalization - normalizer = StandardScaler() - normalizer.fit(X_train) - - X_train = normalizer.transform(X_train) - X_test = normalizer.transform(X_test) - Ux = normalizer.transform(Ux) - - params = {'layer1': 2, - 'kernel': 3, - 'alpha': 0.1, - 'xi': 1e-3, - 'eps': 1.0, - 'lr': 0.1, - 'momentum': 0.9, - 'binning': 20, - 'batch_size': 4, - 'drop_rate': 0.1} - - # default behavior - model = ShadowCNN(params=params, random_state=0) - losscurve, evalcurve = model.train(X_train, y_train, Ux, X_test, y_test) - - # testing train and predict methods - pred, acc = model.predict(X_test, y_test) - - # test for agreement between training and testing - # (since the same data is used for diagnostics in this test) - assert evalcurve[-1] == acc - - # Shadow/PyTorch reports accuracies as percentages - # rather than decimals - # 
uninteresting test if Shadow predicts all one class - # TODO: make the default params test meaningful - assert np.count_nonzero(pred == y_test) > 0 - - # testing hyperopt optimize methods - space = params - space['binning'] = scope.int(hp.quniform('binning', - 10, - 20, - 1)) - data_dict = {'trainx': X_train, - 'testx': X_test, - 'trainy': y_train, - 'testy': y_test, - 'Ux': Ux - } - model.optimize(space, data_dict, max_evals=2, verbose=True) - - assert model.best['accuracy'] >= model.worst['accuracy'] - assert model.best['status'] == 'ok' - - # testing model plotting method - filename = 'test_plot' - model.plot_training(losscurve=model.best['losscurve'], - evalcurve=model.best['evalcurve'], - filename=filename) - os.remove(filename+'.png') - - # testing model write to file method - filename = 'test_LogReg' - ext = '.joblib' - model.save(filename) - model_file = joblib.load(filename+ext) - assert model_file.best['params'] == model.best['params'] - - os.remove(filename+ext) + os.remove(filename+ext) \ No newline at end of file From b6a2c3630a2336323446232b9adfa7773fc89907 Mon Sep 17 00:00:00 2001 From: Jordan Stomps Date: Mon, 28 Nov 2022 14:31:27 -0600 Subject: [PATCH 06/14] condensing bool return Co-authored-by: Paul Wilson --- scripts/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/utils.py b/scripts/utils.py index d91c826..f33f459 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -49,9 +49,7 @@ def early_stop(self, validation_loss): # keep track of whether validation loss has been decreasing # by a tolerable amount self.counter += 1 - if self.counter >= self.patience: - return True - return False + return self.counter >= self.patience def run_hyperopt(space, model, data_dict, max_evals=50, verbose=True): From de820fae34c9aa4839ce666315be0a67eb8dec76 Mon Sep 17 00:00:00 2001 From: Jordan Stomps Date: Mon, 28 Nov 2022 16:56:09 -0500 Subject: [PATCH 07/14] addressing comments from PR review --- models/LogReg.py | 17 ++++-- scripts/utils.py | 130 +++++++++++++++++++++---------------------- tests/test_models.py | 14 +++-- 3 files changed, 83 insertions(+), 78 deletions(-) diff --git a/models/LogReg.py b/models/LogReg.py index 4ebfce2..8886b94 100644 --- a/models/LogReg.py +++ b/models/LogReg.py @@ -24,6 +24,7 @@ class LogReg: # only binary so far def __init__(self, params=None, random_state=0): + keys = ['max_iter', 'tol', 'C'] # defaults to a fixed value for reproducibility self.random_state = random_state # dictionary of parameters for logistic regression model @@ -33,12 +34,16 @@ def __init__(self, params=None, random_state=0): random_state=self.random_state ) else: - self.model = linear_model.LogisticRegression( - random_state=self.random_state, - max_iter=params['max_iter'], - tol=params['tol'], - C=params['C'] - ) + if all(key in params.keys() for key in keys): + self.model = linear_model.LogisticRegression( + random_state=self.random_state, + max_iter=params['max_iter'], + tol=params['tol'], + C=params['C'] + ) + else: + missing = [key for key in keys if key not in params.keys()] + raise ValueError('Values for {} not in params'.format(missing)) def fresh_start(self, params, data_dict): ''' diff --git a/scripts/utils.py b/scripts/utils.py index d91c826..98edd66 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -191,30 +191,22 @@ def cross_validation(model, X, y, params, n_splits=3, return reports[np.argmax(accs)] -def pca(Lx, Ly, Ux, Uy, filename): +def pca(Lx, Ux, n): ''' - A function for computing and plotting 2D PCA. 
+ A function for computing n-component PCA. Inputs: Lx: labeled feature data. - Ly: class labels for labeled data. Ux: unlabeled feature data. - Uy: labels for unlabeled data (all labels should be -1). - filename: filename for saved plot. - The file must be saved with extension .joblib. - Added to filename if not included as input. + n: number of singular values to include in PCA analysis. ''' - plt.rcParams.update({'font.size': 20}) - # only saving colors for binary classification with unlabeled instances - col_dict = {-1: 'tab:gray', 0: 'tab:orange', 1: 'tab:blue'} - pcadata = np.append(Lx, Ux, axis=0) normalizer = StandardScaler() x = normalizer.fit_transform(pcadata) print(np.mean(pcadata), np.std(pcadata)) print(np.mean(x), np.std(x)) - pca = PCA(n_components=2) + pca = PCA(n_components=n) pca.fit_transform(x) print(pca.explained_variance_ratio_) print(pca.singular_values_) @@ -222,31 +214,37 @@ def pca(Lx, Ly, Ux, Uy, filename): principalComponents = pca.fit_transform(x) - fig, ax = plt.subplots(figsize=(10, 8)) - ax.set_xlabel('Principal Component 1', fontsize=15) - ax.set_ylabel('Principal Component 2', fontsize=15) + return principalComponents + + +def _plot_pca_data(principalComponents, Ly, Uy, ax): + ''' + Helper function for plot_pca that plots data for a given axis. + Inputs: + principalComponents: ndarray of shape (n_samples, n_components). + Ly: class labels for labeled data. + Uy: labels for unlabeled data (all labels should be -1). + ax: matplotlib-axis to plot on. + ''' + + # only saving colors for binary classification with unlabeled instances + col_dict = {-1: 'tab:gray', 0: 'tab:orange', 1: 'tab:blue'} + for idx, color in col_dict.items(): indices = np.where(np.append(Ly, Uy, axis=0) == idx)[0] ax.scatter(principalComponents[indices, 0], principalComponents[indices, 1], c=color, label='class '+str(idx)) - ax.grid() - ax.legend() - - if filename[-4:] != '.png': - filename += '.png' - fig.tight_layout() - fig.savefig(filename) + return ax -def multiD_pca(Lx, Ly, Ux, Uy, filename, n=2): +def plot_pca(principalComponents, Ly, Uy, filename, n=2): ''' A function for computing and plotting n-dimensional PCA. Inputs: - Lx: labeled feature data. + principalComponents: ndarray of shape (n_samples, n_components). Ly: class labels for labeled data. - Ux: unlabeled feature data. Uy: labels for unlabeled data (all labels should be -1). filename: filename for saved plot. The file must be saved with extension .joblib. 
@@ -255,21 +253,6 @@ def multiD_pca(Lx, Ly, Ux, Uy, filename, n=2): ''' plt.rcParams.update({'font.size': 20}) - # only saving colors for binary classification with unlabeled instances - col_dict = {-1: 'tab:gray', 0: 'tab:orange', 1: 'tab:blue'} - - pcadata = np.append(Lx, Ux, axis=0) - normalizer = StandardScaler() - x = normalizer.fit_transform(pcadata) - print(np.mean(pcadata), np.std(pcadata)) - print(np.mean(x), np.std(x)) - - n = 2 - pca = PCA(n_components=n) - principalComponents = pca.fit_transform(x) - print(pca.explained_variance_ratio_) - print(pca.singular_values_) - print(pca.components_) alph = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", @@ -277,30 +260,36 @@ def multiD_pca(Lx, Ly, Ux, Uy, filename, n=2): "Y", "Z"] jobs = alph[:n] - fig, axes = plt.subplots(n, n, figsize=(15, 15)) - - for row in range(axes.shape[0]): - for col in range(axes.shape[1]): - ax = axes[row, col] - if row == col: - ax.tick_params( - axis='both', which='both', - bottom='off', top='off', - labelbottom='off', - left='off', right='off', - labelleft='off' - ) - ax.text(0.5, 0.5, jobs[row], horizontalalignment='center') - else: - for idx, color in col_dict.items(): - indices = np.where(np.append(Ly, Uy, axis=0) == idx)[0] - ax.scatter(principalComponents[indices, row], - principalComponents[indices, col], - c=color, - label='class '+str(idx)) - fig.tight_layout() + # only one plot is needed for n=2 + if n == 2: + fig, ax = plt.subplots(figsize=(10, 8)) + ax.set_xlabel('PC '+jobs[0], fontsize=15) + ax.set_ylabel('PC '+jobs[1], fontsize=15) + ax = _plot_pca_data(principalComponents, Ly, Uy, ax) + ax.grid() + ax.legend() + else: + fig, axes = plt.subplots(n, n, figsize=(15, 15)) + for row in range(axes.shape[0]): + for col in range(axes.shape[1]): + ax = axes[row, col] + # blank label plot + if row == col: + ax.tick_params( + axis='both', which='both', + bottom='off', top='off', + labelbottom='off', + left='off', right='off', + labelleft='off' + ) + ax.text(0.5, 0.5, jobs[row], horizontalalignment='center') + # PCA results + else: + ax = _plot_pca_data(principalComponents, Ly, Uy, ax) + if filename[-4:] != '.png': filename += '.png' + fig.tight_layout() fig.savefig(filename) @@ -315,14 +304,19 @@ def plot_cf(testy, predy, title, filename): ''' cf_matrix = confusion_matrix(testy, predy) - ax = sns.heatmap(cf_matrix, annot=True, cmap='Blues') + fig, ax = plt.subplots(figsize=(10, 8)) + ax = sns.heatmap(cf_matrix, annot=True, cmap='Blues', + vmin=0, vmax=100, ax=ax, annot_kws={'fontsize': 40}) + # increase fontsize on colorbar + cbar = ax.collections[0].colorbar + cbar.ax.tick_params(labelsize=25) - ax.set_title(title) - ax.set_xlabel('\nPredicted Values') - ax.set_ylabel('Actual Values ') + ax.set_title(title, fontsize=30) + ax.set_xlabel('\nPredicted Values', fontsize=25) + ax.set_ylabel('Actual Values ', fontsize=25) # Ticket labels - List must be in alphabetical order - ax.xaxis.set_ticklabels(['0(SNM)', '1(other)']) - ax.yaxis.set_ticklabels(['0(SNM)', '1(other)']) + ax.xaxis.set_ticklabels(['0(SNM)', '1(other)'], fontsize=25) + ax.yaxis.set_ticklabels(['0(SNM)', '1(other)'], fontsize=25) # Save the visualization of the Confusion Matrix. 
plt.savefig(filename) diff --git a/tests/test_models.py b/tests/test_models.py index e452361..de56b1c 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -1,5 +1,6 @@ # diagnostics import numpy as np +import pytest from datetime import datetime, timedelta # testing models from sklearn.model_selection import train_test_split @@ -80,12 +81,13 @@ def test_utils(): random_state=0) filename = 'test_pca' - utils.pca(X_train, y_train, Ux, np.full_like(Uy, -1), filename) + pcs = utils.pca(X_train, Ux, 2) + utils.plot_pca(pcs, y_train, np.full_like(Uy, -1), filename, 2) os.remove(filename+'.png') - filename = 'test_multiD_pca' - utils.multiD_pca(X_train, y_train, Ux, np.full_like(Uy, -1), filename, n=5) - os.remove(filename+'.png') + # filename = 'test_multiD_pca' + # utils.multiD_pca(X_train, y_train, Ux, np.full_like(Uy, -1), filename, n=5) + # os.remove(filename+'.png') # normalization normalizer = StandardScaler() @@ -108,6 +110,10 @@ def test_utils(): def test_LogReg(): # test saving model input parameters + # missing 'C' key + params = {'max_iter': 2022, 'tol': 0.5} + with pytest.raises(ValueError): + LogReg(params=params) params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} model = LogReg(params=params) From 49ed49133885db088c129b0fcfac81db190efce4 Mon Sep 17 00:00:00 2001 From: Jordan Stomps Date: Wed, 7 Dec 2022 10:38:05 -0500 Subject: [PATCH 08/14] splitting utils test between cross_validation and pca --- tests/test_models.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/test_models.py b/tests/test_models.py index de56b1c..4253f5d 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -38,7 +38,7 @@ labels[rejected_H0_time] = 1 -def test_utils(): +def test_cross_validation(): X, Ux, y, Uy = train_test_split(spectra, labels, test_size=0.5, @@ -74,6 +74,13 @@ def test_utils(): # stratified=True) # assert max_acc_model['accuracy'] >= 0.5 +def test_pca(): + # unlabeled data split + X, Ux, y, Uy = train_test_split(spectra, + labels, + test_size=0.5, + random_state=0) + Uy = np.full_like(Uy, -1) # data split for data visualization X_train, X_test, y_train, y_test = train_test_split(X, y, From 785e4245ace4ecaedd898d8fc4acf705b2c1d83d Mon Sep 17 00:00:00 2001 From: Jordan Stomps Date: Thu, 15 Dec 2022 14:50:45 -0500 Subject: [PATCH 09/14] implementing more meaningful test data for testing cross validation --- scripts/utils.py | 9 ++-- tests/test_models.py | 100 ++++++++++++++++++++++++------------------- 2 files changed, 61 insertions(+), 48 deletions(-) diff --git a/scripts/utils.py b/scripts/utils.py index 2926b28..85c113d 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -97,7 +97,7 @@ def run_hyperopt(space, model, data_dict, max_evals=50, verbose=True): def cross_validation(model, X, y, params, n_splits=3, - stratified=False, random_state=None): + stratified=False, random_state=None, shuffle=True): ''' Perform K-Fold cross validation using sklearn and a given model. The model *must* have a fresh_start method (see models in RadClass/models). @@ -139,6 +139,7 @@ def cross_validation(model, X, y, params, n_splits=3, stratified: bool; if True, balance the K-Folds to have roughly the same proportion of samples from each class. random_state: seed for reproducility. + shuffle: bool; if True, shuffle the data before conducting K-folds. 
''' # return lists @@ -147,10 +148,10 @@ def cross_validation(model, X, y, params, n_splits=3, if stratified: cv = StratifiedKFold(n_splits=n_splits, random_state=random_state, - shuffle=True) + shuffle=shuffle) else: cv = KFold(n_splits=n_splits, random_state=random_state, - shuffle=True) + shuffle=shuffle) # separate unlabeled data if included Ux = None @@ -186,7 +187,7 @@ def cross_validation(model, X, y, params, n_splits=3, print('Max accuracy:', np.max(accs)) print('All accuracy:', accs) # return the results of fresh_start for the max accuracy model - return reports[np.argmax(accs)] + return reports def pca(Lx, Ux, n): diff --git a/tests/test_models.py b/tests/test_models.py index 4253f5d..52b59ec 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -5,6 +5,7 @@ # testing models from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler +from sklearn.datasets import make_classification import tests.test_data as test_data # hyperopt from hyperopt.pyll.base import scope @@ -17,54 +18,64 @@ import joblib import os -# initialize sample data -start_date = datetime(2019, 2, 2) -delta = timedelta(seconds=1) -timestamps = np.arange(start_date, - start_date + (test_data.timesteps * delta), - delta).astype('datetime64[s]').astype('float64') -live = np.full((len(timestamps),), test_data.livetime) -sample_val = 1.0 -spectra = np.full((len(timestamps), test_data.energy_bins), - np.full((1, test_data.energy_bins), sample_val)) -# setting up for rejected null hypothesis -rejected_H0_time = np.random.choice(spectra.shape[0], - test_data.timesteps//2, - replace=False) -spectra[rejected_H0_time] = 100.0 - -labels = np.full((spectra.shape[0],), 0) -labels[rejected_H0_time] = 1 +@pytest.fixture(scope='module', autouse=True) +def data_init(): + # initialize sample data + start_date = datetime(2019, 2, 2) + delta = timedelta(seconds=1) + pytest.timestamps = np.arange(start_date, + start_date + (test_data.timesteps * delta), + delta + ).astype('datetime64[s]').astype('float64') + + pytest.live = np.full((len(pytest.timestamps),), test_data.livetime) + # 4/5 of the data will be separable + spectra_sep, labels_sep = make_classification( + n_samples=int(0.8*len(pytest.timestamps)), + n_features=test_data.energy_bins, + n_informative=test_data.energy_bins, + n_redundant=0, + n_classes=2, + class_sep=10.0) + # 1/5 of the data will be much less separable + spectra_unsep, labels_unsep = make_classification( + n_samples=int(0.2*len(pytest.timestamps)), + n_features=test_data.energy_bins, + n_informative=test_data.energy_bins, + n_redundant=0, + n_classes=2, + class_sep=0.5) + pytest.spectra = np.append(spectra_sep, spectra_unsep, axis=0) + pytest.labels = np.append(labels_sep, labels_unsep, axis=0) def test_cross_validation(): - X, Ux, y, Uy = train_test_split(spectra, - labels, - test_size=0.5, - random_state=0) - Uy = np.full_like(Uy, -1) - # test cross validation for supervised data using LogReg params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} model = LogReg(params=params) - max_acc_model = utils.cross_validation(model=model, - X=X, - y=y, - params=params) - assert max_acc_model['accuracy'] >= 0.5 + results = utils.cross_validation(model=model, + X=pytest.spectra, + y=pytest.labels, + params=params, + n_splits=5, + shuffle=False) + accs = [res['accuracy'] for res in results] + # the last K-fold will use the unseparable data for testing + # therefore its accuracy should be less than all other folds + assert (accs[-1] < accs[:-1]).all() # test cross validation 
for supervised data and StratifiedKFold with LogReg - params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} - model = LogReg(params=params) - max_acc_model = utils.cross_validation(model=model, - X=X, - y=y, - params=params, - stratified=True) - assert max_acc_model['accuracy'] >= 0.5 - - # # test cross validation for SSML with LabelProp + # params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} + # model = LogReg(params=params) + # max_acc_model = utils.cross_validation(model=model, + # X=X, + # y=y, + # params=params, + # stratified=True) + # assert max_acc_model['accuracy'] >= 0.5 + + # test cross validation for SSML with LabelProp # params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5} # model = LabelProp(params=params) # max_acc_model = utils.cross_validation(model=model, @@ -74,10 +85,11 @@ def test_cross_validation(): # stratified=True) # assert max_acc_model['accuracy'] >= 0.5 + def test_pca(): # unlabeled data split - X, Ux, y, Uy = train_test_split(spectra, - labels, + X, Ux, y, Uy = train_test_split(pytest.spectra, + pytest.labels, test_size=0.5, random_state=0) Uy = np.full_like(Uy, -1) @@ -128,8 +140,8 @@ def test_LogReg(): assert model.model.tol == params['tol'] assert model.model.C == params['C'] - X_train, X_test, y_train, y_test = train_test_split(spectra, - labels, + X_train, X_test, y_train, y_test = train_test_split(pytest.spectra, + pytest.labels, test_size=0.2, random_state=0) @@ -175,4 +187,4 @@ def test_LogReg(): model_file = joblib.load(filename+ext) assert model_file.best['params'] == model.best['params'] - os.remove(filename+ext) \ No newline at end of file + os.remove(filename+ext) From fd3cf535a26bef028b63fec00d7b0d421a49c095 Mon Sep 17 00:00:00 2001 From: Jordan Stomps Date: Thu, 15 Dec 2022 14:55:34 -0500 Subject: [PATCH 10/14] changing print statements to logging --- scripts/utils.py | 25 +++++++++++++------------ tests/test_models.py | 1 - 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/scripts/utils.py b/scripts/utils.py index 85c113d..d1ee480 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -6,6 +6,7 @@ from functools import partial # diagnostics from sklearn.metrics import confusion_matrix +import logging # pca from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA @@ -88,10 +89,10 @@ def run_hyperopt(space, model, data_dict, max_evals=50, verbose=True): worst = trials.results[np.argmax([r['loss'] for r in trials.results])] if verbose: - print('best accuracy:', 1-best['loss']) - print('best params:', best['params']) - print('worst accuracy:', 1-worst['loss']) - print('worst params:', worst['params']) + logging.info('best accuracy:', 1-best['loss']) + logging.info('best params:', best['params']) + logging.info('worst accuracy:', 1-worst['loss']) + logging.info('worst params:', worst['params']) return best, worst @@ -183,9 +184,9 @@ def cross_validation(model, X, y, params, n_splits=3, reports = np.append(reports, results) # report cross validation results - print('Average accuracy:', np.mean(accs)) - print('Max accuracy:', np.max(accs)) - print('All accuracy:', accs) + logging.info('Average accuracy:', np.mean(accs)) + logging.info('Max accuracy:', np.max(accs)) + logging.info('All accuracy:', accs) # return the results of fresh_start for the max accuracy model return reports @@ -202,14 +203,14 @@ def pca(Lx, Ux, n): pcadata = np.append(Lx, Ux, axis=0) normalizer = StandardScaler() x = normalizer.fit_transform(pcadata) - print(np.mean(pcadata), np.std(pcadata)) - print(np.mean(x), np.std(x)) 
+ logging.info(np.mean(pcadata), np.std(pcadata)) + logging.info(np.mean(x), np.std(x)) pca = PCA(n_components=n) pca.fit_transform(x) - print(pca.explained_variance_ratio_) - print(pca.singular_values_) - print(pca.components_) + logging.info(pca.explained_variance_ratio_) + logging.info(pca.singular_values_) + logging.info(pca.components_) principalComponents = pca.fit_transform(x) diff --git a/tests/test_models.py b/tests/test_models.py index 52b59ec..25fa1b1 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -160,7 +160,6 @@ def test_LogReg(): pred, acc = model.predict(X_test, y_test) assert acc > 0.7 - np.testing.assert_equal(pred, y_test) # testing hyperopt optimize methods space = {'max_iter': scope.int(hp.quniform('max_iter', From 56da5309b184808fe37cc4280f3285a8b001c48f Mon Sep 17 00:00:00 2001 From: Jordan Stomps Date: Thu, 15 Dec 2022 15:10:52 -0500 Subject: [PATCH 11/14] clarifying the testing logic for ML classes --- tests/test_models.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_models.py b/tests/test_models.py index 25fa1b1..23f7518 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -159,7 +159,10 @@ def test_LogReg(): # testing train and predict methods pred, acc = model.predict(X_test, y_test) - assert acc > 0.7 + # since the test data used here is synthetic/toy data (i.e. uninteresting), + # the trained model should be at least better than a 50-50 guess + # if it was worse, something would be wrong with the ML class + assert acc > 0.5 # testing hyperopt optimize methods space = {'max_iter': scope.int(hp.quniform('max_iter', @@ -176,6 +179,8 @@ def test_LogReg(): } model.optimize(space, data_dict, max_evals=2, verbose=True) + # again, the data does not guarantee that an improvement will be found + # from hyperopt, so long as something is not wrong with the class assert model.best['accuracy'] >= model.worst['accuracy'] assert model.best['status'] == 'ok' From 533bbb24934883f49363528482b4814c23fe01e0 Mon Sep 17 00:00:00 2001 From: Jordan Stomps Date: Mon, 16 Jan 2023 09:26:22 -0500 Subject: [PATCH 12/14] moving default init params to support kwargs --- models/LogReg.py | 34 ++++++++++++++-------------------- tests/test_models.py | 8 +++----- 2 files changed, 17 insertions(+), 25 deletions(-) diff --git a/models/LogReg.py b/models/LogReg.py index 8886b94..316a82f 100644 --- a/models/LogReg.py +++ b/models/LogReg.py @@ -23,27 +23,21 @@ class LogReg: ''' # only binary so far - def __init__(self, params=None, random_state=0): - keys = ['max_iter', 'tol', 'C'] + def __init__(self, **kwargs): + # supported keys = ['max_iter', 'tol', 'C'] # defaults to a fixed value for reproducibility - self.random_state = random_state - # dictionary of parameters for logistic regression model - self.params = params - if self.params is None: - self.model = linear_model.LogisticRegression( - random_state=self.random_state - ) - else: - if all(key in params.keys() for key in keys): - self.model = linear_model.LogisticRegression( - random_state=self.random_state, - max_iter=params['max_iter'], - tol=params['tol'], - C=params['C'] - ) - else: - missing = [key for key in keys if key not in params.keys()] - raise ValueError('Values for {} not in params'.format(missing)) + self.random_state = kwargs.pop('random_state', 0) + # parameters for logistic regression model: + # defaults to sklearn.linear_model.LogisticRegression default vals + self.max_iter = kwargs.pop('max_iter', 100) + self.tol = kwargs.pop('tol', 0.0001) + self.C = 
kwargs.pop('C', 1.0) + self.model = linear_model.LogisticRegression( + random_state=self.random_state, + max_iter=self.max_iter, + tol=self.tol, + C=self.C + ) def fresh_start(self, params, data_dict): ''' diff --git a/tests/test_models.py b/tests/test_models.py index 23f7518..5b66f65 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -129,12 +129,10 @@ def test_pca(): def test_LogReg(): # test saving model input parameters - # missing 'C' key - params = {'max_iter': 2022, 'tol': 0.5} - with pytest.raises(ValueError): - LogReg(params=params) params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} - model = LogReg(params=params) + model = LogReg(max_iter=params['max_iter'], + tol=params['tol'], + C=params['C']) assert model.model.max_iter == params['max_iter'] assert model.model.tol == params['tol'] From f731a56b1f94573f235e50e0d045c6ca4c7f8c02 Mon Sep 17 00:00:00 2001 From: Jordan Stomps Date: Mon, 16 Jan 2023 09:49:22 -0500 Subject: [PATCH 13/14] adding pragmas for coveralls to ignore PCA functions --- scripts/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/utils.py b/scripts/utils.py index d1ee480..87502e5 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -191,7 +191,7 @@ def cross_validation(model, X, y, params, n_splits=3, return reports -def pca(Lx, Ux, n): +def pca(Lx, Ux, n): # pragma: no cover ''' A function for computing n-component PCA. Inputs: @@ -217,7 +217,7 @@ def pca(Lx, Ux, n): return principalComponents -def _plot_pca_data(principalComponents, Ly, Uy, ax): +def _plot_pca_data(principalComponents, Ly, Uy, ax): # pragma: no cover ''' Helper function for plot_pca that plots data for a given axis. Inputs: @@ -239,7 +239,7 @@ def _plot_pca_data(principalComponents, Ly, Uy, ax): return ax -def plot_pca(principalComponents, Ly, Uy, filename, n=2): +def plot_pca(principalComponents, Ly, Uy, filename, n=2): # pragma: no cover ''' A function for computing and plotting n-dimensional PCA. Inputs: @@ -293,7 +293,7 @@ def plot_pca(principalComponents, Ly, Uy, filename, n=2): fig.savefig(filename) -def plot_cf(testy, predy, title, filename): +def plot_cf(testy, predy, title, filename): # pragma: no cover ''' Uses sklearn metric to compute a confusion matrix for visualization Inputs: From c9cb2498dc0815bd21e19b2a9d66ea5e539528a8 Mon Sep 17 00:00:00 2001 From: Jordan Stomps Date: Mon, 16 Jan 2023 09:59:37 -0500 Subject: [PATCH 14/14] pep8 --- scripts/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/utils.py b/scripts/utils.py index 87502e5..6ea4add 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -147,10 +147,10 @@ def cross_validation(model, X, y, params, n_splits=3, accs = [] reports = [] - if stratified: + if stratified: # pragma: no cover cv = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=shuffle) - else: + else: # pragma: no cover cv = KFold(n_splits=n_splits, random_state=random_state, shuffle=shuffle) @@ -191,7 +191,7 @@ def cross_validation(model, X, y, params, n_splits=3, return reports -def pca(Lx, Ux, n): # pragma: no cover +def pca(Lx, Ux, n): # pragma: no cover ''' A function for computing n-component PCA. Inputs: @@ -293,7 +293,7 @@ def plot_pca(principalComponents, Ly, Uy, filename, n=2): # pragma: no cover fig.savefig(filename) -def plot_cf(testy, predy, title, filename): # pragma: no cover +def plot_cf(testy, predy, title, filename): # pragma: no cover ''' Uses sklearn metric to compute a confusion matrix for visualization Inputs:
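
--
A minimal usage sketch tying together the refactored pieces above (the kwargs-style LogReg constructor and the list-of-fold-reports now returned by cross_validation). This is illustrative only, not part of the patches: the import paths (models.LogReg, scripts.utils), the synthetic data, and the explicit shuffle keyword are assumptions mirroring the tests in this series.

import logging
import numpy as np
from sklearn.datasets import make_classification

from models.LogReg import LogReg   # assumed import path, as in tests/test_models.py
from scripts import utils          # assumed import path for scripts/utils.py

logging.basicConfig(level=logging.INFO)

# synthetic two-class data, loosely mirroring the pytest fixture above
X, y = make_classification(n_samples=500, n_features=100,
                           n_informative=100, n_redundant=0,
                           n_classes=2, class_sep=2.0)

# kwargs-style construction from the LogReg refactor above
params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
model = LogReg(max_iter=params['max_iter'],
               tol=params['tol'],
               C=params['C'])

# cross_validation now returns one fresh_start report per fold;
# aggregate statistics are left to the caller
reports = utils.cross_validation(model=model, X=X, y=y,
                                 params=params, n_splits=5,
                                 shuffle=True)
accs = [r['accuracy'] for r in reports]
logging.info('mean CV accuracy: %s, max: %s', np.mean(accs), np.max(accs))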