diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index d88f9c7..973f71c 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -41,7 +41,7 @@ jobs: - name: Test with pytest run: | python3 -m pytest - python3 -m coverage run --source=./RadClass/ -m pytest + python3 -m coverage run --source=./RadClass/,./models/,./scripts/ -m pytest python3 -m coverage report python3 -m coverage html COVERALLS_REPO_TOKEN=${{ secrets.COVERALLS_REPO_TOKEN }} python3 -m coveralls --service=github diff --git a/README.md b/README.md index b08bd07..42245fa 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,13 @@ Versions 3.6-3.9 are currently supported by tests. The following Python packages * h5py * numpy * progressbar2 +* matplotlib +* seaborn * scipy +* sklearn +* hyperopt +* torch +* shadow-ssml Modules can be imported from the repository directory (e.g. `from RadClass.H0 import H0`) or `RadClass` can be installed using pip: diff --git a/models/LogReg.py b/models/LogReg.py new file mode 100644 index 0000000..4ebfce2 --- /dev/null +++ b/models/LogReg.py @@ -0,0 +1,162 @@ +# For hyperopt (parameter optimization) +from hyperopt import STATUS_OK +# sklearn models +from sklearn import linear_model +# diagnostics +from sklearn.metrics import balanced_accuracy_score +from scripts.utils import run_hyperopt +import joblib + + +class LogReg: + ''' + Methods for deploying sklearn's logistic regression + implementation with hyperparameter optimization. + Data agnostic (i.e. user supplied data inputs). + TODO: Currently only supports binary classification. + Add multinomial functions and unit tests. + Add functionality for regression(?) + Inputs: + params: dictionary of logistic regression input functions. + keys max_iter, tol, and C supported. + random_state: int/float for reproducible intiailization. + ''' + + # only binary so far + def __init__(self, params=None, random_state=0): + # defaults to a fixed value for reproducibility + self.random_state = random_state + # dictionary of parameters for logistic regression model + self.params = params + if self.params is None: + self.model = linear_model.LogisticRegression( + random_state=self.random_state + ) + else: + self.model = linear_model.LogisticRegression( + random_state=self.random_state, + max_iter=params['max_iter'], + tol=params['tol'], + C=params['C'] + ) + + def fresh_start(self, params, data_dict): + ''' + Required method for hyperopt optimization. + Trains and tests a fresh logistic regression model + with given input parameters. + This method does not overwrite self.model (self.optimize() does). + Inputs: + params: dictionary of logistic regression input functions. + keys max_iter, tol, and C supported. + data_dict: compact data representation with the four requisite + data structures used for training and testing a model. + keys trainx, trainy, testx, and testy required. + ''' + + # unpack data + trainx = data_dict['trainx'] + trainy = data_dict['trainy'] + testx = data_dict['testx'] + testy = data_dict['testy'] + + # supervised logistic regression + clf = LogReg(params=params, random_state=self.random_state) + # train and test model + clf.train(trainx, trainy) + # uses balanced_accuracy accounts for class imbalanced data + clf_pred, acc = clf.predict(testx, testy) + + # loss function minimizes misclassification + return {'loss': 1-acc, + 'status': STATUS_OK, + 'model': clf.model, + 'params': params, + 'accuracy': acc} + + def optimize(self, space, data_dict, max_evals=50, verbose=True): + ''' + Wrapper method for using hyperopt (see utils.run_hyperopt + for more details). After hyperparameter optimization, results + are stored, the best model -overwrites- self.model, and the + best params -overwrite- self.params. + Inputs: + space: a hyperopt compliant dictionary with defined optimization + spaces. For example: + # quniform returns float, some parameters require int; + # use this to force int + space = {'max_iter': scope.int(hp.quniform('max_iter', + 10, + 10000, + 10)), + 'tol' : hp.loguniform('tol', 1e-5, 1e-1), + 'C' : hp.uniform('C', 0.001,1000.0) + } + See hyperopt docs for more information. + data_dict: compact data representation with the four requisite + data structures used for training and testing a model. + keys trainx, trainy, testx, testy required. + max_evals: the number of epochs for hyperparameter optimization. + Each iteration is one set of hyperparameters trained + and tested on a fresh model. Convergence for simpler + models like logistic regression typically happens well + before 50 epochs, but can increase as more complex models, + more hyperparameters, and a larger hyperparameter space is tested. + verbose: boolean. If true, print results of hyperopt. + If false, print only the progress bar for optimization. + ''' + + best, worst = run_hyperopt(space=space, + model=self.fresh_start, + data_dict=data_dict, + max_evals=max_evals, + verbose=verbose) + + # save the results of hyperparameter optimization + self.best = best + self.model = best['model'] + self.params = best['params'] + self.worst = worst + + def train(self, trainx, trainy): + ''' + Wrapper method for sklearn's logisitic regression training method. + Inputs: + trainx: nxm feature vector/matrix for training model. + trainy: nxk class label vector/matrix for training model. + ''' + + # supervised logistic regression + self.model.fit(trainx, trainy) + + def predict(self, testx, testy=None): + ''' + Wrapper method for sklearn's logistic regression predict method. + Inputs: + testx: nxm feature vector/matrix for testing model. + testy: nxk class label vector/matrix for training model. + optional: if included, the predicted classes -and- + the resulting classification accuracy will be returned. + ''' + + pred = self.model.predict(testx) + + acc = None + if testy is not None: + # uses balanced_accuracy_score to account for class imbalance + acc = balanced_accuracy_score(testy, pred) + + return pred, acc + + def save(self, filename): + ''' + Save class instance to file using joblib. + Inputs: + filename: string filename to save object to file under. + The file must be saved with extension .joblib. + Added to filename if not included as input. + ''' + + if filename[-7:] != '.joblib': + filename += '.joblib' + joblib.dump(self, filename) diff --git a/models/SSML/CoTraining.py b/models/SSML/CoTraining.py new file mode 100644 index 0000000..e6757bd --- /dev/null +++ b/models/SSML/CoTraining.py @@ -0,0 +1,335 @@ +import numpy as np +import matplotlib.pyplot as plt +# For hyperopt (parameter optimization) +from hyperopt import STATUS_OK +# sklearn models +from sklearn import linear_model +# diagnostics +from sklearn.metrics import balanced_accuracy_score +from scripts.utils import run_hyperopt +import joblib + + +class CoTraining: + ''' + Methods for deploying a basic co-training with logistic + regression implementation with hyperparameter optimization. + Data agnostic (i.e. user supplied data inputs). + TODO: Currently only supports binary classification. + Add multinomial functions and unit tests. + Add functionality for regression(?) + Inputs: + params: dictionary of logistic regression input functions. + keys max_iter, tol, and C supported. + random_state: int/float for reproducible intiailization. + ''' + + # only binary so far + def __init__(self, params=None, random_state=0): + # defaults to a fixed value for reproducibility + self.random_state = random_state + # dictionary of parameters for logistic regression model + self.params = params + if self.params is None: + self.model1 = linear_model.LogisticRegression( + random_state=self.random_state) + self.model2 = linear_model.LogisticRegression( + random_state=self.random_state) + # default needed for training + self.params = {'n_samples': 1} + else: + self.model1 = linear_model.LogisticRegression( + random_state=self.random_state, + max_iter=params['max_iter'], + tol=params['tol'], + C=params['C'] + ) + self.model2 = linear_model.LogisticRegression( + random_state=self.random_state, + max_iter=params['max_iter'], + tol=params['tol'], + C=params['C'] + ) + + def training_loop(self, slr1, slr2, L_lr1, L_lr2, + Ly_lr1, Ly_lr2, U_lr, n_samples, + testx=None, testy=None): + ''' + Main training iteration for co-training. + Given two models, labeled training data, and unlabeled training data: + - Train both models using their respective labeled datasets + - Randomly sample n_samples number of unlabeled + instances for model 1 and 2 each. + - Label the sampled unlabeled instances using + model 1 (u1) and model 2 (u2). + - Remove u1 and u2 from the unlabeled dataset and + include in each model's respective labeled dataset + with their associated labels for future training. + Inputs: + slr1: logistic regression co-training model #1 + slr2: logistic regression co-training model #2 + L_lr1: feature training data for co-training model #1 + L_lr2: feature training data for co-training model #2 + Ly_lr1: labels for input data for co-training model #1 + Ly_lr2: labels for input data for co-training model #2 + U_lr: unlabeled feature training data used by both models + n_samples: the number of instances to sample and + predict from Ux at one time + testx: feature vector/matrix used for testing the performance + of each model at every iteration. + testy: label vector used for testing the performance + of each model at every iteration. + ''' + + model1_accs, model2_accs = np.array([]), np.array([]) + # should stay false but if true, + # the same unalbeled instance could be sampled multiple times + rep = False + while U_lr.shape[0] > 1: + slr1.fit(L_lr1, Ly_lr1) + slr2.fit(L_lr2, Ly_lr2) + + # pull u1 + # ensuring there is enough instances to sample for each model + if U_lr.shape[0] < n_samples*2: + n_samples = int(U_lr.shape[0]/2) + uidx1 = np.random.choice(range(U_lr.shape[0]), + n_samples, + replace=rep) + u1 = U_lr[uidx1].copy() + # remove instances that will be labeled + U_lr = np.delete(U_lr, uidx1, axis=0) + + # pull u2 + uidx2 = np.random.choice(range(U_lr.shape[0]), + n_samples, + replace=rep) + u2 = U_lr[uidx2].copy() + # remove instances that will be labeled + U_lr = np.delete(U_lr, uidx2, axis=0) + + # predict unlabeled samples + u1y = slr1.predict(u1) + u2y = slr2.predict(u2) + + if testx is not None and testy is not None: + # test and save model(s) accuracy over all training iterations + model1_accs = np.append(model1_accs, + balanced_accuracy_score(testy, + slr1.predict( + testx))) + model2_accs = np.append(model2_accs, + balanced_accuracy_score(testy, + slr2.predict( + testx))) + + # add predictions to cotrained model(s) labeled samples + L_lr1 = np.append(L_lr1, u2, axis=0) + L_lr2 = np.append(L_lr2, u1, axis=0) + Ly_lr1 = np.append(Ly_lr1, u2y, axis=0) + Ly_lr2 = np.append(Ly_lr2, u1y, axis=0) + + return slr1, slr2, model1_accs, model2_accs + + def fresh_start(self, params, data_dict): + ''' + Required method for hyperopt optimization. + Trains and tests a fresh co-training model + with given input parameters. + This method does not overwrite self.model (self.optimize() does). + Inputs: + params: dictionary of logistic regression input functions. + keys n_samples, max_iter, tol, and C supported. + data_dict: compact data representation with the four requisite + data structures used for training and testing a model. + keys trainx, trainy, testx, testy, and Ux required. + NOTE: Uy is not needed since labels for unlabeled data + instances is not used. + ''' + + # unpack data + trainx = data_dict['trainx'] + trainy = data_dict['trainy'] + testx = data_dict['testx'] + testy = data_dict['testy'] + # unlabeled co-training data + Ux = data_dict['Ux'] + + clf = CoTraining(params=params, random_state=self.random_state) + # training and testing + model1_accs, model2_accs = clf.train(trainx, trainy, Ux, testx, testy) + # uses balanced_accuracy accounts for class imbalanced data + pred1, acc, pred2, model1_acc, model2_acc = clf.predict(testx, testy) + + return {'loss': 1-acc, + 'status': STATUS_OK, + 'model': clf.model1, + 'model2': clf.model2, + 'model1_acc_history': model1_accs, + 'model2_acc_history': model2_accs, + 'params': params, + 'accuracy': acc} + + def optimize(self, space, data_dict, max_evals=50, verbose=True): + ''' + Wrapper method for using hyperopt (see utils.run_hyperopt + for more details). After hyperparameter optimization, results + are stored, the best model -overwrites- self.model, and the + best params -overwrite- self.params. + Inputs: + space: a hyperopt compliant dictionary with defined optimization + spaces. For example: + # quniform returns float, some parameters require int; + # use this to force int + space = {'max_iter' : scope.int(hp.quniform('max_iter', + 10, + 10000, + 10)), + 'tol' : hp.loguniform('tol', 1e-5, 1e-3), + 'C' : hp.uniform('C', 1.0, 1000.0), + 'n_samples' : scope.int(hp.quniform('n_samples', + 1, + 20, + 1)) + } + See hyperopt docs for more information. + data_dict: compact data representation with the five requisite + data structures used for training and testing an SSML model. + keys trainx, trainy, testx, testy, and Ux required. + NOTE: Uy is not needed since labels for unlabeled data + instances is not used. + max_evals: the number of epochs for hyperparameter optimization. + Each iteration is one set of hyperparameters trained + and tested on a fresh model. Convergence for simpler + models like logistic regression typically happens well + before 50 epochs, but can increase as more complex models, + more hyperparameters, and a larger hyperparameter space is tested. + verbose: boolean. If true, print results of hyperopt. + If false, print only the progress bar for optimization. + ''' + + best, worst = run_hyperopt(space=space, + model=self.fresh_start, + data_dict=data_dict, + max_evals=max_evals, + verbose=verbose) + + # save the results of hyperparameter optimization + self.best = best + self.model = best['model'] + self.params = best['params'] + self.worst = worst + + def train(self, trainx, trainy, Ux, + testx=None, testy=None): + ''' + Wrapper method for a basic co-training with logistic regression + implementation training method. + Inputs: + trainx: nxm feature vector/matrix for training model. + trainy: nxk class label vector/matrix for training model. + Ux: feature vector/matrix like labeled trainx but unlabeled data. + testx: feature vector/matrix used for testing the performance + of each model at every iteration. + testy: label vector used for testing the performance + of each model at every iteration. + ''' + + # avoid overwriting when deleting in co-training loop + U_lr = Ux.copy() + + # set the random seed of training splits for reproducibility + # This can be ignored by excluding params['seed'] + # in the hyperopt space dictionary + if 'seed' in self.params.keys(): + np.random.seed(self.params['seed']) + + # TODO: allow a user to specify uneven splits between the two models + split_frac = 0.5 + # labeled training data + idx = np.random.choice(range(trainy.shape[0]), + size=int(split_frac * trainy.shape[0]), + replace=False) + + # avoid overwriting when deleting in co-training loop + L_lr1 = trainx[idx].copy() + L_lr2 = trainx[~idx].copy() + Ly_lr1 = trainy[idx].copy() + Ly_lr2 = trainy[~idx].copy() + + self.model1, self.model2, model1_accs, model2_accs = \ + self.training_loop( + self.model1, self.model2, + L_lr1, L_lr2, + Ly_lr1, Ly_lr2, + U_lr, self.params['n_samples'], + testx, testy, + ) + + # optional returns if a user is interested in training diagnostics + return model1_accs, model2_accs + + def predict(self, testx, testy=None): + ''' + Wrapper method for sklearn's Label Propagation predict method. + Inputs: + testx: nxm feature vector/matrix for testing model. + testy: nxk class label vector/matrix for training model. + optional: if included, the predicted classes -and- + the resulting classification accuracy will be returned. + ''' + + pred1 = self.model1.predict(testx) + pred2 = self.model2.predict(testx) + + acc = None + if testy is not None: + # balanced_accuracy accounts for class imbalanced data + # could alternatively use pure accuracy + # for a more traditional hyperopt + model1_acc = balanced_accuracy_score(testy, pred1) + model2_acc = balanced_accuracy_score(testy, pred2) + # select best accuracy for hyperparameter optimization + acc = max(model1_acc, model2_acc) + + return pred1, acc, pred2, model1_acc, model2_acc + + def plot_cotraining(self, model1_accs=None, model2_accs=None, + filename='lr-cotraining-learningcurves.png'): + ''' + Plots the training error curves for two co-training models. + NOTE: The user must provide the curves to plot, but each curve is + saved by the class under self.best and self.worst models. + Inputs: + filename: name to store picture under. + Must end in .png (or will be added if missing). + model1_accs: the accuracy scores over training epochs for model 1 + model2_accs: the accuracy scores over training epochs for model 2 + ''' + + fig, ax = plt.subplots(figsize=(10, 8), dpi=300) + ax.plot(np.arange(len(model1_accs)), model1_accs, + color='tab:blue', label='Model 1') + ax.plot(np.arange(len(model2_accs)), model2_accs, + color='tab:orange', label='Model 2') + ax.legend() + ax.set_xlabel('Co-Training Iteration') + ax.set_ylabel('Test Accuracy') + ax.grid() + + if filename[-4:] != '.png': + filename += '.png' + fig.savefig(filename) + + def save(self, filename): + ''' + Save class instance to file using joblib. + Inputs: + filename: string filename to save object to file under. + The file must be saved with extension .joblib. + Added to filename if not included as input. + ''' + + if filename[-7:] != '.joblib': + filename += '.joblib' + joblib.dump(self, filename) diff --git a/models/SSML/LabelProp.py b/models/SSML/LabelProp.py new file mode 100644 index 0000000..cb9ff05 --- /dev/null +++ b/models/SSML/LabelProp.py @@ -0,0 +1,186 @@ +import numpy as np +# For hyperopt (parameter optimization) +from hyperopt import STATUS_OK +# sklearn models +from sklearn import semi_supervised +# diagnostics +from sklearn.metrics import balanced_accuracy_score +from scripts.utils import run_hyperopt +import joblib + + +class LabelProp: + ''' + Methods for deploying sklearn's Label Propagation + implementation with hyperparameter optimization. + Data agnostic (i.e. user supplied data inputs). + NOTE: Since LabelProp is guaranteed to converge given + enough iterations, there is no random_state defined. + TODO: Currently only supports binary classification. + Add multinomial functions and unit tests. + Add functionality for regression(?) + Inputs: + params: dictionary of logistic regression input functions. + keys gamma, n_neighbors, max_iter, and tol supported. + ''' + + # only binary so far + def __init__(self, params=None, random_state=0): + # defaults to a fixed value for reproducibility + self.random_state = random_state + # dictionary of parameters for logistic regression model + self.params = params + if self.params is None: + # defaults: + # knn kernel, although an rbf is equally valid + # TODO: allow rbf kernels + # n_jobs, use parallelization if available. + self.model = semi_supervised.LabelPropagation( + kernel='knn', + n_jobs=-1 + ) + else: + self.model = semi_supervised.LabelPropagation( + kernel='knn', + gamma=params['gamma'], + n_neighbors=params['n_neighbors'], + max_iter=params['max_iter'], + tol=params['tol'], + n_jobs=-1 + ) + + def fresh_start(self, params, data_dict): + ''' + Required method for hyperopt optimization. + Trains and tests a fresh Label Propagation model + with given input parameters. + This method does not overwrite self.model (self.optimize() does). + Inputs: + params: dictionary of logistic regression input functions. + keys max_iter, tol, and C supported. + data_dict: compact data representation with the five requisite + data structures used for training and testing an SSML model. + keys trainx, trainy, testx, testy, and Ux required. + NOTE: Uy is not needed since labels for unlabeled data + instances is not used. + ''' + + # unpack data + trainx = data_dict['trainx'] + trainy = data_dict['trainy'] + testx = data_dict['testx'] + testy = data_dict['testy'] + Ux = data_dict['Ux'] + + clf = LabelProp(params, random_state=self.random_state) + # training and testing + clf.train(trainx, trainy, Ux) + # uses balanced_accuracy accounts for class imbalanced data + pred, acc = clf.predict(testx, testy) + + # loss function minimizes misclassification + return {'loss': 1-acc, + 'status': STATUS_OK, + 'model': clf.model, + 'params': params, + 'accuracy': acc} + + def optimize(self, space, data_dict, max_evals=50, verbose=True): + ''' + Wrapper method for using hyperopt (see utils.run_hyperopt + for more details). After hyperparameter optimization, results + are stored, the best model -overwrites- self.model, and the + best params -overwrite- self.params. + Inputs: + space: a hyperopt compliant dictionary with defined optimization + spaces. For example: + # quniform returns float, some parameters require int; + # use this to force int + space = {'max_iter' : scope.int(hp.quniform('max_iter', + 10, + 10000, + 10)), + 'tol' : hp.loguniform('tol', 1e-6, 1e-4), + 'gamma' : hp.uniform('gamma', 1, 50), + 'n_neighbors': scope.int(hp.quniform('n_neighbors', + 1, + 200, + 1)) + } + See hyperopt docs for more information. + data_dict: compact data representation with the five requisite + data structures used for training and testing an SSML model. + keys trainx, trainy, testx, testy, and Ux required. + NOTE: Uy is not needed since labels for unlabeled data + instances is not used. + max_evals: the number of epochs for hyperparameter optimization. + Each iteration is one set of hyperparameters trained + and tested on a fresh model. Convergence for simpler + models like logistic regression typically happens well + before 50 epochs, but can increase as more complex models, + more hyperparameters, and a larger hyperparameter space is tested. + verbose: boolean. If true, print results of hyperopt. + If false, print only the progress bar for optimization. + ''' + + best, worst = run_hyperopt(space=space, + model=self.fresh_start, + data_dict=data_dict, + max_evals=max_evals, + verbose=verbose) + + # save the results of hyperparameter optimization + self.best = best + self.model = best['model'] + self.params = best['params'] + self.worst = worst + + def train(self, trainx, trainy, Ux): + ''' + Wrapper method for sklearn's Label Propagation training method. + Inputs: + trainx: nxm feature vector/matrix for training model. + trainy: nxk class label vector/matrix for training model. + Ux: feature vector/matrix like labeled trainx but unlabeled data. + ''' + + # combine labeled and unlabeled instances for training + lp_trainx = np.append(trainx, Ux, axis=0) + lp_trainy = np.append(trainy, + np.full(shape=(Ux.shape[0],), fill_value=-1), + axis=0) + + # semi-supervised Label Propagation + self.model.fit(lp_trainx, lp_trainy) + + def predict(self, testx, testy=None): + ''' + Wrapper method for sklearn's Label Propagation predict method. + Inputs: + testx: nxm feature vector/matrix for testing model. + testy: nxk class label vector/matrix for training model. + optional: if included, the predicted classes -and- + the resulting classification accuracy will be returned. + ''' + + pred = self.model.predict(testx) + + acc = None + if testy is not None: + # uses balanced_accuracy_score to account for class imbalance + acc = balanced_accuracy_score(testy, pred) + + return pred, acc + + def save(self, filename): + ''' + Save class instance to file using joblib. + Inputs: + filename: string filename to save object to file under. + The file must be saved with extension .joblib. + Added to filename if not included as input. + ''' + + if filename[-7:] != '.joblib': + filename += '.joblib' + joblib.dump(self, filename) diff --git a/models/SSML/ShadowCNN.py b/models/SSML/ShadowCNN.py new file mode 100644 index 0000000..a633283 --- /dev/null +++ b/models/SSML/ShadowCNN.py @@ -0,0 +1,436 @@ +import numpy as np +import matplotlib.pyplot as plt +# For hyperopt (parameter optimization) +from hyperopt import STATUS_OK +# torch imports +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F +# shadow imports +import shadow.eaat +import shadow.losses +import shadow.utils +from shadow.utils import set_seed +# diagnostics +from scripts.utils import EarlyStopper, run_hyperopt +import joblib + + +class Net(nn.Module): + ''' + Neural Network constructor. + Also includes method for forward pass. + nn.Module: PyTorch object for neural networks. + Inputs: + layer1: int length for first layer. + layer2: int length for second layer. + Ideally a multiple of layer1. + layer3: int length for third layer. + Ideally a multiple of layer2. + kernel: convolutional kernel size. + NOTE: An optimal value is unclear for spectral data. + drop_rate: float (<1.) probability for reset/dropout layer. + length: single instance data length. + NOTE: Assumed to be 1000 for spectral data. + TODO: Allow hyperopt to optimize on arbitrary sized networks. + ''' + + def __init__(self, layer1=32, layer2=64, layer3=128, + kernel=3, drop_rate=0.1, length=1000): + ''' + Defines the structure for each type of layer. + The resulting network has fixed length but the + user can input arbitrary widths. + ''' + + # default max_pool1d kernel set by Shadow MNIST example + # NOTE: max_pool1d sets mp_kernel = mp_stride + self.mp_kernel = 2 + super(Net, self).__init__() + self.conv1 = nn.Conv1d(1, layer1, kernel, 1) + self.conv2 = nn.Conv1d(layer1, layer2, kernel, 1) + self.dropout = nn.Dropout(drop_rate) + # calculating the number of parameters/weights before the flattened + # fully-connected layer: + # first, there are two convolution layers, so the output length is + # the input length (feature_vector.shape[0] - 2_layers*(kernel-1)) + # if, in the future, more layers are desired, 2 must be adjusted + # next, calculate the output of the max_pool1d layer, which is + # round((conv_out - (kernel=stride - 1) - 1)/2 + 1) + # finally, multiply this by the number of channels in the last + # convolutional layer = layer2 + conv_out = length-2*(kernel-1) + parameters = layer2*( + ((conv_out - (self.mp_kernel - 1) - 1)//self.mp_kernel) + + 1) + self.fc1 = nn.Linear(int(parameters), layer3) + self.fc2 = nn.Linear(layer3, 2) + + def forward(self, x): + ''' + The resulting network has a fixed length with + two convolutional layers divided by relu activation, + a max pooling layer, a dropout layer, and two + fully-connected layers separated by a relu and + dropout layers. + ''' + + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.max_pool1d(x, self.mp_kernel) + x = self.dropout(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = F.relu(x) + x = self.dropout(x) + x = self.fc2(x) + return x + + +class SpectralDataset(torch.utils.data.Dataset): + ''' + Dataset loader for use with PyTorch NN training. + torch.utils.data.Dataset: managing user input data for random sampling. + Inputs: + trainD: the nxm input vector/matrix of data. + labels: associated label vector for data. + ''' + + def __init__(self, trainD, labels): + self.labels = labels + self.trainD = trainD + + def __len__(self): + ''' + Define what length is for the Dataset + ''' + + return len(self.labels) + + def __getitem__(self, idx): + ''' + Define how to retrieve an instance from a dataset. + Inputs: + idx: the index to sample from. + ''' + + label = self.labels[idx] + data = self.trainD[idx] + # no need to bother with labels, unpacking both anyways + # sample = {"Spectrum": data, "Class": label} + # return sample + return data, label + + +class ShadowCNN: + ''' + Methods for deploying a Shadow CNN + implementation with hyperparameter optimization. + Data agnostic (i.e. user supplied data inputs). + TODO: Currently only supports binary classification. + Add multinomial functions and unit tests. + Add functionality for regression(?) + Inputs: + params: dictionary of logistic regression input functions. + keys binning, hidden_layer, alpha, xi, eps, lr, and momentum + are supported. + TODO: Include functionality for manipulating other + CNN architecture parameters in hyperparameter optimization + random_state: int/float for reproducible intiailization. + length: int input length (i.e. dimensions of feature vectors) + TODO: Add input parameter, loss_function, for the other + loss function options available in Shadow (besides EAAT). + ''' + + # only binary so far + def __init__(self, params=None, random_state=0, length=1000): + # defaults to a fixed value for reproducibility + self.random_state = random_state + # set seeds for reproducibility + set_seed(0) + # device used for computation + self.device = torch.device("cuda" if + torch.cuda.is_available() else "cpu") + # dictionary of parameters for logistic regression model + self.params = params + if self.params is not None: + # assumes the input dimensions are measurements of 1000 bins + # TODO: Abstract this for arbitrary input size + self.model = Net(layer1=params['layer1'], + layer2=2*params['layer1'], + layer3=3*params['layer1'], + kernel=params['kernel'], + drop_rate=params['drop_rate'], + length=np.ceil(length/params['binning'])) + self.eaat = shadow.eaat.EAAT(model=self.model, + alpha=params['alpha'], + xi=params['xi'], + eps=params['eps']) + self.optimizer = optim.SGD(self.eaat.parameters(), + lr=params['lr'], + momentum=params['momentum']) + else: + # fixed value defaults needed by training algorithm + self.params = {'binning': 1, 'batch_size': 1} + # assumes the input dimensions are measurements of 1000 bins + # TODO: Abstract this for arbitrary input size + self.model = Net() + self.eaat = shadow.eaat.EAAT(model=self.model) + self.optimizer = optim.SGD(self.eaat.parameters(), + lr=0.1, momentum=0.9) + + def fresh_start(self, params, data_dict): + ''' + Required method for hyperopt optimization. + Trains and tests a fresh Shadow NN model + with given input parameters. + This method does not overwrite self.model (self.optimize() does). + Inputs: + params: dictionary of logistic regression input functions. + keys binning, layer1, alpha, xi, eps, lr, momentum, + kernel, drop_rate, and batch_size are supported. + data_dict: compact data representation with the four requisite + data structures used for training and testing a model. + keys trainx, trainy, testx, testy, and Ux required. + NOTE: Uy is not needed since labels for unlabeled data + instances is not used. + ''' + + self.params = params + # unpack data + trainx = data_dict['trainx'] + trainy = data_dict['trainy'] + testx = data_dict['testx'] + testy = data_dict['testy'] + # unlabeled co-training data + Ux = data_dict['Ux'] + + clf = ShadowCNN(params=params, + random_state=self.random_state, + length=trainx.shape[1]) + # training and testing + losscurve, evalcurve = clf.train(trainx, trainy, Ux, testx, testy) + # not used; max acc in past few epochs used instead + y_pred, acc = clf.predict(testx, testy) + max_acc = np.max(evalcurve[-25:]) + + return {'loss': 1-(max_acc/100.0), + 'status': STATUS_OK, + 'model': clf.eaat, + 'params': params, + 'losscurve': losscurve, + 'evalcurve': evalcurve, + 'accuracy': (max_acc/100.0)} + + def optimize(self, space, data_dict, max_evals=50, verbose=True): + ''' + Wrapper method for using hyperopt (see utils.run_hyperopt + for more details). After hyperparameter optimization, results + are stored, the best model -overwrites- self.model, and the + best params -overwrite- self.params. + Inputs: + space: a hyperopt compliant dictionary with defined optimization + spaces. For example: + # quniform returns float, some parameters require int; + # use this to force int + space = {'layer1' : scope.int(hp.quniform('layer1', + 1000, + 10000, + 10)), + 'kernel' : scope.int(hp.quniform('kernel', + 1, + 9, + 1)), + 'alpha' : hp.uniform('alpha', 0.0001, 0.999), + 'xi' : hp.uniform('xi', 1e-2, 1e0), + 'eps' : hp.uniform('eps', 0.5, 1.5), + 'lr' : hp.uniform('lr', 1e-3, 1e-1), + 'momentum' : hp.uniform('momentum', 0.5, 0.99), + 'binning' : scope.int(hp.quniform('binning', + 1, + 10, + 1)), + 'batch_size' : scope.int(hp.quniform('batch_size', + 1, + 100, + 1)) + } + See hyperopt docs for more information. + data_dict: compact data representation with the five requisite + data structures used for training and testing an SSML model. + keys trainx, trainy, testx, testy, and Ux required. + NOTE: Uy is not needed since labels for unlabeled data + instances is not used. + max_evals: the number of epochs for hyperparameter optimization. + Each iteration is one set of hyperparameters trained + and tested on a fresh model. Convergence for simpler + models like logistic regression typically happens well + before 50 epochs, but can increase as more complex models, + more hyperparameters, and a larger hyperparameter space is tested. + verbose: boolean. If true, print results of hyperopt. + If false, print only the progress bar for optimization. + ''' + + best, worst = run_hyperopt(space=space, + model=self.fresh_start, + data_dict=data_dict, + max_evals=max_evals, + verbose=verbose) + + # save the results of hyperparameter optimization + self.best = best + self.model = best['model'] + self.params = best['params'] + self.worst = worst + + def train(self, trainx, trainy, Ux, testx=None, testy=None): + ''' + Wrapper method for Shadow NN training method. + Inputs: + trainx: nxm feature vector/matrix for training model. + trainy: nxk class label vector/matrix for training model. + Ux: feature vector/matrix like labeled trainx but unlabeled data. + testx: feature vector/matrix used for testing the performance + of each model at every iteration. + testy: label vector used for testing the performance + of each model at every iteration. + ''' + + # avoid float round-off by using DoubleTensor + xtens = torch.FloatTensor(np.append(trainx, + Ux, + axis=0))[:, + ::self.params['binning']] + # xtens[xtens == 0.0] = torch.unique(xtens)[1]/1e10 + ytens = torch.LongTensor(np.append(trainy, + np.full(shape=(Ux.shape[0],), + fill_value=-1), + axis=0)) + + # define data set object + dataset = SpectralDataset(xtens, ytens) + + # create DataLoader object of DataSet object + DL_DS = torch.utils.data.DataLoader(dataset, + batch_size=self.params[ + 'batch_size' + ], + shuffle=True) + + # labels for unlabeled data are always "-1" + xEnt = torch.nn.CrossEntropyLoss(ignore_index=-1) + + # generate early-stopping watchdog + # TODO: allow a user of ShadowCNN to specify EarlyStopper's params + stopper = EarlyStopper(patience=3, min_delta=0) + n_epochs = 100 + self.eaat.to(self.device) + losscurve = [] + evalcurve = [] + for epoch in range(n_epochs): + self.eaat.train() + lossavg = [] + for i, (data, targets) in enumerate(DL_DS): + x = data.reshape((data.shape[0], + 1, + data.shape[1])).to(self.device) + y = targets.to(self.device) + self.optimizer.zero_grad() + out = self.eaat(x) + loss = xEnt(out, y) + self.eaat.get_technique_cost(x) + loss.backward() + self.optimizer.step() + lossavg.append(loss.item()) + losscurve.append(np.nanmedian(lossavg)) + if testx is not None and testy is not None: + pred, acc = self.predict(testx, testy) + evalcurve.append(acc) + + self.eaat.train() + # test for early stopping + x_val = torch.FloatTensor( + testx.copy()[:, ::self.params['binning']]) + x_val = x_val.reshape((x_val.shape[0], + 1, + x_val.shape[1])).to(self.device) + y_val = torch.LongTensor(testy).to(self.device) + out = self.eaat(x_val) + val_loss = xEnt(out, y_val) + \ + self.eaat.get_technique_cost(x_val) + if stopper.early_stop(val_loss): + break + + # optionally return the training accuracy if test data was provided + return losscurve, evalcurve + + def predict(self, testx, testy=None): + ''' + Wrapper method for Shadow NN predict method. + Inputs: + testx: nxm feature vector/matrix for testing model. + testy: nxk class label vector/matrix for training model. + optional: if included, the predicted classes -and- + the resulting classification accuracy will be returned. + binning: int number of bins sampled in feature vector + ''' + + self.eaat.eval() + y_pred, y_true = [], [] + for i, data in enumerate(torch.FloatTensor( + testx.copy()[:, ::self.params['binning']]) + ): + x = data.reshape((1, 1, data.shape[0])).to(self.device) + out = self.eaat(x) + y_pred.extend(torch.argmax(out, 1).detach().cpu().tolist()) + acc = None + if testy is not None: + y_true = torch.LongTensor(testy.copy()) + acc = (np.array(y_true) == np.array(y_pred)).mean() * 100 + + return y_pred, acc + + def plot_training(self, losscurve=None, evalcurve=None, + filename='lr-cotraining-learningcurves.png'): + ''' + Plots the training error curves for two co-training models. + NOTE: The user must provide the curves to plot, but each curve is + saved by the class under self.best and self.worst models. + Inputs: + filename: name to store picture under. + Must end in .png (or will be added if missing). + losscurve: the loss value over training epochs + evalcurve: the accuracy scores over training epochs + ''' + + fig, (ax1, ax2) = plt.subplots(2, + 1, + sharex=True, + figsize=(10, 8), + dpi=300) + ax1.plot(losscurve) + ax2.plot(evalcurve) + ax1.set_xlabel('Epoch') + ax2.set_xlabel('Epoch') + ax1.set_ylabel('Loss Curve') + ax2.set_ylabel('Accuracy') + ax1.grid() + ax2.grid() + + if filename[-4:] != '.png': + filename += '.png' + fig.savefig(filename) + + def save(self, filename): + ''' + Save class instance to file using joblib. + Inputs: + filename: string filename to save object to file under. + The file must be saved with extension .joblib. + Added to filename if not included as input. + ''' + + if filename[-7:] != '.joblib': + filename += '.joblib' + joblib.dump(self, filename) diff --git a/models/SSML/ShadowNN.py b/models/SSML/ShadowNN.py new file mode 100644 index 0000000..4857ccf --- /dev/null +++ b/models/SSML/ShadowNN.py @@ -0,0 +1,282 @@ +import numpy as np +# For hyperopt (parameter optimization) +from hyperopt import STATUS_OK +# torch imports +import torch +# shadow imports +import shadow.eaat +import shadow.losses +import shadow.utils +from shadow.utils import set_seed +# diagnostics +from scripts.utils import EarlyStopper, run_hyperopt +import joblib + + +class ShadowNN: + ''' + Methods for deploying a Shadow fully-connected NN + implementation with hyperparameter optimization. + Data agnostic (i.e. user supplied data inputs). + TODO: Currently only supports binary classification. + Add multinomial functions and unit tests. + Add functionality for regression(?) + Inputs: + params: dictionary of logistic regression input functions. + keys binning, hidden_layer, alpha, xi, eps, lr, and momentum + are supported. + random_state: int/float for reproducible intiailization. + TODO: Add input parameter, loss_function, for the other + loss function options available in Shadow (besides EAAT). + ''' + + # only binary so far + def __init__(self, params=None, random_state=0, input_length=1000): + # defaults to a fixed value for reproducibility + self.random_state = random_state + self.input_length = input_length + # set seeds for reproducibility + set_seed(0) + # device used for computation + self.device = torch.device("cuda" if + torch.cuda.is_available() else "cpu") + # dictionary of parameters for logistic regression model + self.params = params + if self.params is not None: + # assumes the input dimensions are measurements of 1000 bins + # TODO: Abstract this for arbitrary input size + self.eaat = shadow.eaat.EAAT(model=self.model_factory( + int(np.ceil( + self.input_length / + params['binning'])), + params['hidden_layer']), + alpha=params['alpha'], + xi=params['xi'], + eps=params['eps']).to(self.device) + self.eaat_opt = torch.optim.SGD(self.eaat.parameters(), + lr=params['lr'], + momentum=params['momentum']) + # unlabeled instances always have a label of "-1" + self.xEnt = torch.nn.CrossEntropyLoss( + ignore_index=-1).to(self.device) + else: + self.params = {'binning': 1} + # assumes the input dimensions are measurements of 1000 bins + self.eaat = shadow.eaat.EAAT( + model=self.model_factory()).to(self.device) + self.eaat_opt = torch.optim.SGD(self.eaat.parameters(), + lr=0.1, momentum=0.9) + # unlabeled instances always have a label of "-1" + self.xEnt = torch.nn.CrossEntropyLoss( + ignore_index=-1).to(self.device) + + def model_factory(self, length=1000, hidden_layer=10000): + return torch.nn.Sequential( + torch.nn.Linear(length, hidden_layer), + torch.nn.ReLU(), + torch.nn.Linear(hidden_layer, length), + torch.nn.ReLU(), + torch.nn.Linear(length, 2) + ) + + def fresh_start(self, params, data_dict): + ''' + Required method for hyperopt optimization. + Trains and tests a fresh Shadow NN model + with given input parameters. + This method does not overwrite self.model (self.optimize() does). + Inputs: + params: dictionary of logistic regression input functions. + keys binning, hidden_layer, alpha, xi, eps, lr, and momentum + are supported. + data_dict: compact data representation with the four requisite + data structures used for training and testing a model. + keys trainx, trainy, testx, testy, and Ux required. + NOTE: Uy is not needed since labels for unlabeled data + instances is not used. + ''' + + # unpack data + trainx = data_dict['trainx'] + trainy = data_dict['trainy'] + testx = data_dict['testx'] + testy = data_dict['testy'] + # unlabeled co-training data + Ux = data_dict['Ux'] + + clf = ShadowNN(params=params, + random_state=self.random_state, + input_length=testx.shape[1]) + # training and testing + acc_history = clf.train(trainx, trainy, Ux, testx, testy) + # not used; max acc in past few epochs used instead + eaat_pred, acc = clf.predict(testx, testy) + max_acc = np.max(acc_history[-20:]) + + return {'loss': 1-(max_acc/100.0), + 'status': STATUS_OK, + 'model': clf.eaat, + 'params': params, + 'accuracy': (max_acc/100.0)} + + def optimize(self, space, data_dict, max_evals=50, verbose=True): + ''' + Wrapper method for using hyperopt (see utils.run_hyperopt + for more details). After hyperparameter optimization, results + are stored, the best model -overwrites- self.model, and the + best params -overwrite- self.params. + Inputs: + space: a hyperopt compliant dictionary with defined optimization + spaces. For example: + # quniform returns float, some parameters require int; + # use this to force int + space = {'hidden_layer' : scope.int(hp.quniform('hidden_layer', + 1000, + 10000, + 10)), + 'alpha' : hp.uniform('alpha', 0.0001, 0.999), + 'xi' : hp.uniform('xi', 1e-2, 1e0), + 'eps' : hp.uniform('eps', 0.5, 1.5), + 'lr' : hp.uniform('lr', 1e-3, 1e-1), + 'momentum' : hp.uniform('momentum', 0.5, 0.99), + 'binning' : scope.int(hp.quniform('binning', + 1, + 10, + 1)) + } + See hyperopt docs for more information. + data_dict: compact data representation with the five requisite + data structures used for training and testing an SSML model. + keys trainx, trainy, testx, testy, and Ux required. + NOTE: Uy is not needed since labels for unlabeled data + instances is not used. + max_evals: the number of epochs for hyperparameter optimization. + Each iteration is one set of hyperparameters trained + and tested on a fresh model. Convergence for simpler + models like logistic regression typically happens well + before 50 epochs, but can increase as more complex models, + more hyperparameters, and a larger hyperparameter space is tested. + verbose: boolean. If true, print results of hyperopt. + If false, print only the progress bar for optimization. + ''' + + best, worst = run_hyperopt(space=space, + model=self.fresh_start, + data_dict=data_dict, + max_evals=max_evals, + verbose=verbose) + + # save the results of hyperparameter optimization + self.best = best + self.model = best['model'] + self.params = best['params'] + self.worst = worst + + def train(self, trainx, trainy, Ux, testx=None, testy=None): + ''' + Wrapper method for Shadow NN training method. + Inputs: + trainx: nxm feature vector/matrix for training model. + trainy: nxk class label vector/matrix for training model. + Ux: feature vector/matrix like labeled trainx but unlabeled data. + testx: feature vector/matrix used for testing the performance + of each model at every iteration. + testy: label vector used for testing the performance + of each model at every iteration. + ''' + + # avoid float round-off by using DoubleTensor + xtens = torch.FloatTensor(np.append(trainx, + Ux, + axis=0)[:, + ::self.params['binning']]) + # xtens[xtens == 0.0] = torch.unique(xtens)[1]/1e10 + ytens = torch.LongTensor(np.append(trainy, + np.full(shape=(Ux.shape[0],), + fill_value=-1), + axis=0)) + + n_epochs = 100 + xt = torch.Tensor(xtens).to(self.device) + yt = torch.LongTensor(ytens).to(self.device) + # generate early-stopping watchdog + # TODO: allow a user of ShadowCNN to specify EarlyStopper's params + stopper = EarlyStopper(patience=3, min_delta=0) + # saves history for max accuracy + acc_history = [] + for epoch in range(n_epochs): + # set the model into training mode + # NOTE: change this to .eval() mode for testing and back again + self.eaat.train() + # Forward/backward pass for training semi-supervised model + out = self.eaat(xt) + # supervised + unsupervised loss + loss = self.xEnt(out, yt) + self.eaat.get_technique_cost(xt) + self.eaat_opt.zero_grad() + loss.backward() + self.eaat_opt.step() + + if testx is not None and testy is not None: + x_val = torch.FloatTensor( + testx.copy() + )[:, ::self.params['binning']].to(self.device) + y_val = torch.LongTensor(testy.copy()).to(self.device) + + self.eaat.eval() + eaat_pred = torch.max(self.eaat(x_val), 1)[-1] + acc = shadow.losses.accuracy(eaat_pred, + y_val + ).data.item() + acc_history.append(acc) + + self.eaat.train() + # test for early stopping + out = self.eaat(x_val) + val_loss = self.xEnt(out, y_val) + \ + self.eaat.get_technique_cost(x_val) + if stopper.early_stop(val_loss): + break + + # optionally return the training accuracy if test data was provided + return acc_history + + def predict(self, testx, testy=None): + ''' + Wrapper method for Shadow NN predict method. + Inputs: + testx: nxm feature vector/matrix for testing model. + testy: nxk class label vector/matrix for training model. + optional: if included, the predicted classes -and- + the resulting classification accuracy will be returned. + ''' + + self.eaat.eval() + eaat_pred = torch.max(self.eaat( + torch.FloatTensor( + testx.copy()[:, ::self.params['binning']] + ).to(self.device) + ), 1)[-1] + + acc = None + if testy is not None: + acc = shadow.losses.accuracy(eaat_pred, + torch.LongTensor( + testy.copy()).to(self.device) + ).data.item() + + # return tensor to cpu if on gpu and convert to numpy for return + eaat_pred = eaat_pred.cpu().numpy() + return eaat_pred, acc + + def save(self, filename): + ''' + Save class instance to file using joblib. + Inputs: + filename: string filename to save object to file under. + The file must be saved with extension .joblib. + Added to filename if not included as input. + ''' + + if filename[-7:] != '.joblib': + filename += '.joblib' + joblib.dump(self, filename) diff --git a/models/SSML/__init__.py b/models/SSML/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/__init__.py b/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt index 06d1c3a..8b22315 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,10 @@ numpy h5py progressbar2 scipy>=1.7.0 +scikit-learn +hyperopt +matplotlib +seaborn +joblib +torch +shadow-ssml diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/utils.py b/scripts/utils.py new file mode 100644 index 0000000..d91c826 --- /dev/null +++ b/scripts/utils.py @@ -0,0 +1,328 @@ +import numpy as np +import seaborn as sns +import matplotlib.pyplot as plt +# For hyperopt (parameter optimization) +from hyperopt import Trials, tpe, fmin +from functools import partial +# diagnostics +from sklearn.metrics import confusion_matrix +# pca +from sklearn.preprocessing import StandardScaler +from sklearn.decomposition import PCA +# Cross Validation +from sklearn.model_selection import KFold, StratifiedKFold + + +class EarlyStopper: + ''' + Early stopping mechanism for neural networks. + Code adapted from user "isle_of_gods" from StackOverflow: + https://stackoverflow.com/questions/71998978/early-stopping-in-pytorch + Use this class to break a training loop if the validation loss is low. + Inputs: + patience: integer; forces stop if validation loss has not improved + for some time + min_delta: "fudge value" for how much loss to tolerate before stopping + ''' + + def __init__(self, patience=1, min_delta=0): + self.patience = patience + self.min_delta = min_delta + self.counter = 0 + self.min_validation_loss = np.inf + + def early_stop(self, validation_loss): + ''' + Tests for the early stopping condition if the validation loss + has not improved for a certain period of time (patience). + Inputs: + validation_loss: typically a float value for the loss function of + a neural network training loop + ''' + + if validation_loss < self.min_validation_loss: + # keep track of the smallest validation loss + # if it has been beaten, restart patience + self.min_validation_loss = validation_loss + self.counter = 0 + elif validation_loss > (self.min_validation_loss + self.min_delta): + # keep track of whether validation loss has been decreasing + # by a tolerable amount + self.counter += 1 + if self.counter >= self.patience: + return True + return False + + +def run_hyperopt(space, model, data_dict, max_evals=50, verbose=True): + ''' + Runs hyperparameter optimization on a model given a parameter space. + Inputs: + space: dictionary with each hyperparameter as keys and values being the + range of parameter space (see hyperopt docs for defining a space) + mode: function that takes params dictionary, trains a specified ML model + and returns the optimization loss function, model, and other + attributes (e.g. accuracy on evaluation set) + max_eval: (int) run hyperparameter optimization for max_val iterations + verbose: report best and worse loss/accuracy + + Returns: + best: dictionary with returns from model function, including best loss, + best trained model, best parameters, etc. + worst: dictionary with returns from model function, including worst loss, + worst trained model, worst parameters, etc. + ''' + + trials = Trials() + + # wrap data into objective function + fmin_objective = partial(model, data_dict=data_dict) + + # run hyperopt + fmin(fmin_objective, + space, + algo=tpe.suggest, + max_evals=max_evals, + trials=trials) + + # of all trials, find best and worst loss/accuracy from optimization + best = trials.results[np.argmin([r['loss'] for r in trials.results])] + worst = trials.results[np.argmax([r['loss'] for r in trials.results])] + + if verbose: + print('best accuracy:', 1-best['loss']) + print('best params:', best['params']) + print('worst accuracy:', 1-worst['loss']) + print('worst params:', worst['params']) + + return best, worst + + +def cross_validation(model, X, y, params, n_splits=3, + stratified=False, random_state=None): + ''' + Perform K-Fold cross validation using sklearn and a given model. + The model *must* have a fresh_start method (see models in RadClass/models). + fresh_start() is used instead of train() to be agnostic to the data needed + for training (fresh_start requires a data_dict whereas each model's + train could take different combinations of labeled & unlabeled data). + This also avoids the need to do hyperparameter optimization (and + therefore many training epochs) for every K-Fold. + NOTE: fresh_start returns the model and results in a dictionary but + does not overwrite/save the model to the respective class. + You can manually overwrite using model.model = return.model + Hyperparameter optimization (model.optimize) can be done before or after + cross validation to specify the (optimal) parameters used by the model + since they are required here. + NOTE: Fixed default to shuffle data during cross validation splits. + (See sklearn cross validation docs for more info.) + NOTE: Unlabeled data, if provided, will always be included in the training + dataset. This means that this cross validation implementation is + susceptible to bias in the unlabeled data distribution. To test for + this bias, a user can manually run cross validation as a parent to + calling this function, splitting the unlabeled data and adding + different folds into X. + Inputs: + model: ML model class object (e.g. RadClass/models). + Must have a fresh_start() method. + NOTE: If the model expects unlabeled data but unlabed data is not + provided in X/y, an error will likely be thrown when training the model + through fresh_start. + X: array of feature vectors (rows of individual instances, cols of vectors) + This should include all data for training and testing (since the + testing subset will be split by cross validation), including unlabeled + data if needed/used. + y: array/vector of labels for X. If including unlabeled data, use -1. + This should have the same order as X. That is, each row index in X + has an associated label with the same index in y. + params: dictionary of hyperparameters. Will depend on model used. + Alternatively, use model.params for models in RadClass/models + n_splits: int number of splits for K-Fold cross validation + stratified: bool; if True, balance the K-Folds to have roughly the same + proportion of samples from each class. + random_state: seed for reproducility. + ''' + + # return lists + accs = [] + reports = [] + + if stratified: + cv = StratifiedKFold(n_splits=n_splits, random_state=random_state, + shuffle=True) + else: + cv = KFold(n_splits=n_splits, random_state=random_state, + shuffle=True) + + # separate unlabeled data if included + Ux = None + Uy = None + if -1 in y: + U_idx = np.where(y == -1)[0] + L_idx = np.where(y != -1)[0] + Ux = X[U_idx] + Uy = y[U_idx] + Lx = X[L_idx] + Ly = y[L_idx] + else: + Lx = X + Ly = y + # conduct K-Fold cross validation + cv.get_n_splits(Lx, Ly) + for train_idx, test_idx in cv.split(Lx, Ly): + trainx, testx = Lx[train_idx], Lx[test_idx] + trainy, testy = Ly[train_idx], Ly[test_idx] + + # construct data dictionary for training in fresh_start + data_dict = {'trainx': trainx, 'trainy': trainy, + 'testx': testx, 'testy': testy} + if Ux is not None: + data_dict['Ux'] = Ux + data_dict['Uy'] = Uy + results = model.fresh_start(params, data_dict) + accs = np.append(accs, results['accuracy']) + reports = np.append(reports, results) + + # report cross validation results + print('Average accuracy:', np.mean(accs)) + print('Max accuracy:', np.max(accs)) + print('All accuracy:', accs) + # return the results of fresh_start for the max accuracy model + return reports[np.argmax(accs)] + + +def pca(Lx, Ly, Ux, Uy, filename): + ''' + A function for computing and plotting 2D PCA. + Inputs: + Lx: labeled feature data. + Ly: class labels for labeled data. + Ux: unlabeled feature data. + Uy: labels for unlabeled data (all labels should be -1). + filename: filename for saved plot. + The file must be saved with extension .joblib. + Added to filename if not included as input. + ''' + + plt.rcParams.update({'font.size': 20}) + # only saving colors for binary classification with unlabeled instances + col_dict = {-1: 'tab:gray', 0: 'tab:orange', 1: 'tab:blue'} + + pcadata = np.append(Lx, Ux, axis=0) + normalizer = StandardScaler() + x = normalizer.fit_transform(pcadata) + print(np.mean(pcadata), np.std(pcadata)) + print(np.mean(x), np.std(x)) + + pca = PCA(n_components=2) + pca.fit_transform(x) + print(pca.explained_variance_ratio_) + print(pca.singular_values_) + print(pca.components_) + + principalComponents = pca.fit_transform(x) + + fig, ax = plt.subplots(figsize=(10, 8)) + ax.set_xlabel('Principal Component 1', fontsize=15) + ax.set_ylabel('Principal Component 2', fontsize=15) + for idx, color in col_dict.items(): + indices = np.where(np.append(Ly, Uy, axis=0) == idx)[0] + ax.scatter(principalComponents[indices, 0], + principalComponents[indices, 1], + c=color, + label='class '+str(idx)) + ax.grid() + ax.legend() + + if filename[-4:] != '.png': + filename += '.png' + fig.tight_layout() + fig.savefig(filename) + + +def multiD_pca(Lx, Ly, Ux, Uy, filename, n=2): + ''' + A function for computing and plotting n-dimensional PCA. + Inputs: + Lx: labeled feature data. + Ly: class labels for labeled data. + Ux: unlabeled feature data. + Uy: labels for unlabeled data (all labels should be -1). + filename: filename for saved plot. + The file must be saved with extension .joblib. + Added to filename if not included as input. + n: number of singular values to include in PCA analysis. + ''' + + plt.rcParams.update({'font.size': 20}) + # only saving colors for binary classification with unlabeled instances + col_dict = {-1: 'tab:gray', 0: 'tab:orange', 1: 'tab:blue'} + + pcadata = np.append(Lx, Ux, axis=0) + normalizer = StandardScaler() + x = normalizer.fit_transform(pcadata) + print(np.mean(pcadata), np.std(pcadata)) + print(np.mean(x), np.std(x)) + + n = 2 + pca = PCA(n_components=n) + principalComponents = pca.fit_transform(x) + print(pca.explained_variance_ratio_) + print(pca.singular_values_) + print(pca.components_) + + alph = ["A", "B", "C", "D", "E", "F", "G", "H", + "I", "J", "K", "L", "M", "N", "O", "P", + "Q", "R", "S", "T", "U", "V", "W", "X", + "Y", "Z"] + jobs = alph[:n] + + fig, axes = plt.subplots(n, n, figsize=(15, 15)) + + for row in range(axes.shape[0]): + for col in range(axes.shape[1]): + ax = axes[row, col] + if row == col: + ax.tick_params( + axis='both', which='both', + bottom='off', top='off', + labelbottom='off', + left='off', right='off', + labelleft='off' + ) + ax.text(0.5, 0.5, jobs[row], horizontalalignment='center') + else: + for idx, color in col_dict.items(): + indices = np.where(np.append(Ly, Uy, axis=0) == idx)[0] + ax.scatter(principalComponents[indices, row], + principalComponents[indices, col], + c=color, + label='class '+str(idx)) + fig.tight_layout() + if filename[-4:] != '.png': + filename += '.png' + fig.savefig(filename) + + +def plot_cf(testy, predy, title, filename): + ''' + Uses sklearn metric to compute a confusion matrix for visualization + Inputs: + testy: array/vector with ground-truth labels for test/evaluation set + predy: array/vector with predicted sample labels from trained model + title: string title for plot + filename: string with extension for confusion matrix file + ''' + + cf_matrix = confusion_matrix(testy, predy) + ax = sns.heatmap(cf_matrix, annot=True, cmap='Blues') + + ax.set_title(title) + ax.set_xlabel('\nPredicted Values') + ax.set_ylabel('Actual Values ') + + # Ticket labels - List must be in alphabetical order + ax.xaxis.set_ticklabels(['0(SNM)', '1(other)']) + ax.yaxis.set_ticklabels(['0(SNM)', '1(other)']) + # Save the visualization of the Confusion Matrix. + plt.savefig(filename) diff --git a/tests/test_BackgroundEstimator.py b/tests/test_BackgroundEstimator.py index 2d10c89..efc1299 100644 --- a/tests/test_BackgroundEstimator.py +++ b/tests/test_BackgroundEstimator.py @@ -77,7 +77,6 @@ def test_write(): bckg.write(ofilename=ofilename) results = np.loadtxt(fname=ofilename+'.csv', delimiter=',') - print(results) # the resulting observation should be: # counts * integration / live-time diff --git a/tests/test_models.py b/tests/test_models.py new file mode 100644 index 0000000..e3fb086 --- /dev/null +++ b/tests/test_models.py @@ -0,0 +1,491 @@ +# diagnostics +import numpy as np +from datetime import datetime, timedelta +# testing models +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +import tests.test_data as test_data +# hyperopt +from hyperopt.pyll.base import scope +from hyperopt import hp +# testing utils +import scripts.utils as utils +# models +from models.LogReg import LogReg +from models.SSML.CoTraining import CoTraining +from models.SSML.LabelProp import LabelProp +from models.SSML.ShadowNN import ShadowNN +from models.SSML.ShadowCNN import ShadowCNN +# testing write +import joblib +import os + +# initialize sample data +start_date = datetime(2019, 2, 2) +delta = timedelta(seconds=1) +timestamps = np.arange(start_date, + start_date + (test_data.timesteps * delta), + delta).astype('datetime64[s]').astype('float64') + +live = np.full((len(timestamps),), test_data.livetime) +sample_val = 1.0 +spectra = np.full((len(timestamps), test_data.energy_bins), + np.full((1, test_data.energy_bins), sample_val)) +# setting up for rejected null hypothesis +rejected_H0_time = np.random.choice(spectra.shape[0], + test_data.timesteps//2, + replace=False) +spectra[rejected_H0_time] = 100.0 + +labels = np.full((spectra.shape[0],), 0) +labels[rejected_H0_time] = 1 + + +def test_utils(): + X, Ux, y, Uy = train_test_split(spectra, + labels, + test_size=0.5, + random_state=0) + Uy = np.full_like(Uy, -1) + + # test cross validation for supervised data using LogReg + params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} + model = LogReg(params=params) + max_acc_model = utils.cross_validation(model=model, + X=X, + y=y, + params=params) + assert max_acc_model['accuracy'] >= 0.5 + + # test cross validation for supervised data and StratifiedKFold with LogReg + params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} + model = LogReg(params=params) + max_acc_model = utils.cross_validation(model=model, + X=X, + y=y, + params=params, + stratified=True) + assert max_acc_model['accuracy'] >= 0.5 + + # test cross validation for SSML with LabelProp + params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5} + model = LabelProp(params=params) + max_acc_model = utils.cross_validation(model=model, + X=np.append(X, Ux, axis=0), + y=np.append(y, Uy, axis=0), + params=params, + stratified=True) + assert max_acc_model['accuracy'] >= 0.5 + + # data split for data visualization + X_train, X_test, y_train, y_test = train_test_split(X, + y, + test_size=0.2, + random_state=0) + + filename = 'test_pca' + utils.pca(X_train, y_train, Ux, np.full_like(Uy, -1), filename) + os.remove(filename+'.png') + + filename = 'test_multiD_pca' + utils.multiD_pca(X_train, y_train, Ux, np.full_like(Uy, -1), filename, n=5) + os.remove(filename+'.png') + + # normalization + normalizer = StandardScaler() + normalizer.fit(X_train) + + X_train = normalizer.transform(X_train) + X_test = normalizer.transform(X_test) + + # default behavior + model = LogReg(params=None, random_state=0) + model.train(X_train, y_train) + + # testing train and predict methods + pred, acc = model.predict(X_test, y_test) + + filename = 'test_cf' + utils.plot_cf(y_test, pred, title=filename, filename=filename) + os.remove(filename+'.png') + + +def test_LogReg(): + # test saving model input parameters + params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} + model = LogReg(params=params) + + assert model.model.max_iter == params['max_iter'] + assert model.model.tol == params['tol'] + assert model.model.C == params['C'] + + X_train, X_test, y_train, y_test = train_test_split(spectra, + labels, + test_size=0.2, + random_state=0) + + # normalization + normalizer = StandardScaler() + normalizer.fit(X_train) + + X_train = normalizer.transform(X_train) + X_test = normalizer.transform(X_test) + + # default behavior + model = LogReg(params=None, random_state=0) + model.train(X_train, y_train) + + # testing train and predict methods + pred, acc = model.predict(X_test, y_test) + + assert acc > 0.7 + np.testing.assert_equal(pred, y_test) + + # testing hyperopt optimize methods + space = {'max_iter': scope.int(hp.quniform('max_iter', + 10, + 10000, + 10)), + 'tol': hp.loguniform('tol', 1e-5, 1e-1), + 'C': hp.uniform('C', 0.001, 1000.0) + } + data_dict = {'trainx': X_train, + 'testx': X_test, + 'trainy': y_train, + 'testy': y_test + } + model.optimize(space, data_dict, max_evals=2, verbose=True) + + assert model.best['accuracy'] >= model.worst['accuracy'] + assert model.best['status'] == 'ok' + + # testing model write to file method + filename = 'test_LogReg' + ext = '.joblib' + model.save(filename) + model_file = joblib.load(filename+ext) + assert model_file.best['params'] == model.best['params'] + + os.remove(filename+ext) + + +def test_CoTraining(): + # test saving model input parameters + params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0} + model = CoTraining(params=params) + + assert model.model1.max_iter == params['max_iter'] + assert model.model1.tol == params['tol'] + assert model.model1.C == params['C'] + + assert model.model2.max_iter == params['max_iter'] + assert model.model2.tol == params['tol'] + assert model.model2.C == params['C'] + + X, Ux, y, Uy = train_test_split(spectra, + labels, + test_size=0.5, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, + y, + test_size=0.2, + random_state=0) + + # normalization + normalizer = StandardScaler() + normalizer.fit(X_train) + + X_train = normalizer.transform(X_train) + X_test = normalizer.transform(X_test) + Ux = normalizer.transform(Ux) + + # default behavior + model = CoTraining(params=None, random_state=0) + model.train(X_train, y_train, Ux) + + # testing train and predict methods + pred, acc, *_ = model.predict(X_test, y_test) + + assert acc > 0.7 + np.testing.assert_equal(pred, y_test) + + # testing hyperopt optimize methods + space = {'max_iter': scope.int(hp.quniform('max_iter', + 10, + 10000, + 10)), + 'tol': hp.loguniform('tol', 1e-5, 1e-3), + 'C': hp.uniform('C', 1.0, 1000.0), + 'n_samples': scope.int(hp.quniform('n_samples', + 1, + 20, + 1)), + 'seed': 0 + } + data_dict = {'trainx': X_train, + 'testx': X_test, + 'trainy': y_train, + 'testy': y_test, + 'Ux': Ux + } + model.optimize(space, data_dict, max_evals=2, verbose=True) + + assert model.best['accuracy'] >= model.worst['accuracy'] + assert model.best['status'] == 'ok' + + # testing model plotting method + filename = 'test_plot' + model.plot_cotraining(model1_accs=model.best['model1_acc_history'], + model2_accs=model.best['model2_acc_history'], + filename=filename) + os.remove(filename+'.png') + + # testing model write to file method + filename = 'test_LogReg' + ext = '.joblib' + model.save(filename) + model_file = joblib.load(filename+ext) + assert model_file.best['params'] == model.best['params'] + + os.remove(filename+ext) + + +def test_LabelProp(): + # test saving model input parameters + params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5} + model = LabelProp(params=params) + + assert model.model.gamma == params['gamma'] + assert model.model.n_neighbors == params['n_neighbors'] + assert model.model.max_iter == params['max_iter'] + assert model.model.tol == params['tol'] + + # there should be no normalization on LabelProp data + # since it depends on the distances between samples + X, Ux, y, Uy = train_test_split(spectra, + labels, + test_size=0.5, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, + y, + test_size=0.2, + random_state=0) + + # default behavior + model = LabelProp(params=None, random_state=0) + model.train(X_train, y_train, Ux) + + # testing train and predict methods + pred, acc = model.predict(X_test, y_test) + + # the default n_neighbors(=7) from sklearn is too large + # for the size of this dataset + # therefore the accuracy is expected to be poor + # a better value for this dataset would be n_neighbors=2 + # (tested when specifying params in LabelProp.__init__) + assert acc >= 0.5 + # uninteresting test if LabelProp predicts all one class + # TODO: make the default params test meaningful + assert np.count_nonzero(pred == y_test) > 0 + + # testing hyperopt optimize methods + space = {'max_iter': scope.int(hp.quniform('max_iter', + 10, + 10000, + 10)), + 'tol': hp.loguniform('tol', 1e-6, 1e-4), + 'gamma': hp.uniform('gamma', 1, 50), + 'n_neighbors': scope.int(hp.quniform('n_neighbors', + 1, + X_train.shape[0], + 1)) + } + data_dict = {'trainx': X_train, + 'testx': X_test, + 'trainy': y_train, + 'testy': y_test, + 'Ux': Ux + } + model.optimize(space, data_dict, max_evals=2, verbose=True) + + assert model.best['accuracy'] >= model.worst['accuracy'] + assert model.best['status'] == 'ok' + + # testing model write to file method + filename = 'test_LogReg' + ext = '.joblib' + model.save(filename) + model_file = joblib.load(filename+ext) + assert model_file.best['params'] == model.best['params'] + + os.remove(filename+ext) + + +def test_ShadowNN(): + # check default parameter settings + model = ShadowNN() + assert model.params == {'binning': 1} + assert model.eaat is not None + assert model.eaat_opt is not None + assert model.xEnt is not None + assert model.input_length == 1000 + + X, Ux, y, Uy = train_test_split(spectra, + labels, + test_size=0.5, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, + y, + test_size=0.2, + random_state=0) + + # normalization + normalizer = StandardScaler() + normalizer.fit(X_train) + + X_train = normalizer.transform(X_train) + X_test = normalizer.transform(X_test) + Ux = normalizer.transform(Ux) + + params = {'hidden_layer': 10, + 'alpha': 0.1, + 'xi': 1e-3, + 'eps': 1.0, + 'lr': 0.1, + 'momentum': 0.9, + 'binning': 20} + # default behavior + model = ShadowNN(params=params, random_state=0) + acc_history = model.train(X_train, y_train, Ux, X_test, y_test) + + # testing train and predict methods + pred, acc = model.predict(X_test, y_test) + + # test for agreement between training and testing + # (since the same data is used for diagnostics in this test) + assert acc_history[-1] == acc + + # Shadow/PyTorch reports accuracies as percentages + # rather than decimals + # uninteresting test if Shadow predicts all one class + # TODO: make the default params test meaningful + assert np.count_nonzero(pred == y_test) > 0 + + # testing hyperopt optimize methods + space = {'hidden_layer': 10, + 'alpha': 0.1, + 'xi': 1e-3, + 'eps': 1.0, + 'lr': 0.1, + 'momentum': 0.9, + 'binning': scope.int(hp.quniform('binning', + 10, + 20, + 1)) + } + data_dict = {'trainx': X_train, + 'testx': X_test, + 'trainy': y_train, + 'testy': y_test, + 'Ux': Ux + } + model.optimize(space, data_dict, max_evals=2, verbose=True) + + assert model.best['accuracy'] >= model.worst['accuracy'] + assert model.best['status'] == 'ok' + + # testing model write to file method + filename = 'test_LogReg' + ext = '.joblib' + model.save(filename) + model_file = joblib.load(filename+ext) + assert model_file.best['params'] == model.best['params'] + + os.remove(filename+ext) + + +def test_ShadowCNN(): + # check default parameter settings + model = ShadowCNN() + assert model.params == {'binning': 1, 'batch_size': 1} + assert model.model is not None + assert model.eaat is not None + assert model.optimizer is not None + + X, Ux, y, Uy = train_test_split(spectra, + labels, + test_size=0.5, + random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, + y, + test_size=0.2, + random_state=0) + + # normalization + normalizer = StandardScaler() + normalizer.fit(X_train) + + X_train = normalizer.transform(X_train) + X_test = normalizer.transform(X_test) + Ux = normalizer.transform(Ux) + + params = {'layer1': 2, + 'kernel': 3, + 'alpha': 0.1, + 'xi': 1e-3, + 'eps': 1.0, + 'lr': 0.1, + 'momentum': 0.9, + 'binning': 20, + 'batch_size': 4, + 'drop_rate': 0.1} + + # default behavior + model = ShadowCNN(params=params, random_state=0) + losscurve, evalcurve = model.train(X_train, y_train, Ux, X_test, y_test) + + # testing train and predict methods + pred, acc = model.predict(X_test, y_test) + + # test for agreement between training and testing + # (since the same data is used for diagnostics in this test) + assert evalcurve[-1] == acc + + # Shadow/PyTorch reports accuracies as percentages + # rather than decimals + # uninteresting test if Shadow predicts all one class + # TODO: make the default params test meaningful + assert np.count_nonzero(pred == y_test) > 0 + + # testing hyperopt optimize methods + space = params + space['binning'] = scope.int(hp.quniform('binning', + 10, + 20, + 1)) + data_dict = {'trainx': X_train, + 'testx': X_test, + 'trainy': y_train, + 'testy': y_test, + 'Ux': Ux + } + model.optimize(space, data_dict, max_evals=2, verbose=True) + + assert model.best['accuracy'] >= model.worst['accuracy'] + assert model.best['status'] == 'ok' + + # testing model plotting method + filename = 'test_plot' + model.plot_training(losscurve=model.best['losscurve'], + evalcurve=model.best['evalcurve'], + filename=filename) + os.remove(filename+'.png') + + # testing model write to file method + filename = 'test_LogReg' + ext = '.joblib' + model.save(filename) + model_file = joblib.load(filename+ext) + assert model_file.best['params'] == model.best['params'] + + os.remove(filename+ext)