From e10e6322045d981bd51641b27d6f8d4c8e3d2d1d Mon Sep 17 00:00:00 2001
From: Jordan Stomps <stomps@wisc.edu>
Date: Fri, 22 Apr 2022 10:52:50 -0500
Subject: [PATCH 01/35] adding hyperopt functions

---
 scripts/hyperopt.py | 75 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 scripts/hyperopt.py

diff --git a/scripts/hyperopt.py b/scripts/hyperopt.py
new file mode 100644
index 0000000..1417ebf
--- /dev/null
+++ b/scripts/hyperopt.py
@@ -0,0 +1,75 @@
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+# For hyperopt (parameter optimization)
+from hyperopt import Trials, tpe, fmin
+
+# diagnostics
+from sklearn.metrics import confusion_matrix
+
+
+def run_hyperopt(space, model, max_evals=50, verbose=True):
+    '''
+    Runs hyperparameter optimization on a model given a parameter space.
+    Inputs:
+    space: dictionary with each hyperparameter as keys and values being the
+        range of parameter space (see hyperopt docs for defining a space)
+    mode: function that takes params dictionary, trains a specified ML model
+        and returns the optimization loss function, model, and other
+        attributes (e.g. accuracy on evaluation set)
+    max_eval: (int) run hyperparameter optimization for max_val iterations
+    verbose: report best and worse loss/accuracy
+
+    Returns:
+    best: dictionary with returns from model function, including best loss,
+        best trained model, best parameters, etc.
+    worst: dictionary with returns from model function, including worst loss,
+        worst trained model, worst parameters, etc.
+    '''
+
+    trials = Trials()
+    # run hyperopt
+    optimizer = fmin(model, 
+                     space, 
+                     algo=tpe.suggest,
+                     max_evals=max_evals,
+                     trials=trials)
+
+    # of all trials, find best and worst loss/accuracy from optimization
+    best = trials.results[np.argmin([r['loss'] for r in 
+        trials.results])]
+    worst = trials.results[np.argmax([r['loss'] for r in 
+        trials.results])]
+    
+    if verbose:
+        print('best accuracy:', 1-best['loss'])
+        print('best params:', best['params'])
+        print('worst accuracy:', 1-worst['loss'])
+        print('worst params:', worst['params'])
+    
+    return best, worst
+
+
+def plot_cf(testy, predy, title, filename):
+    '''
+    Uses sklearn metric to compute a confusion matrix for visualization
+    Inputs:
+    testy: array/vector with ground-truth labels for test/evaluation set
+    predy: array/vector with predicted sample labels from trained model
+    title: string title for plot
+    filename: string with extension for confusion matrix file
+    '''
+
+    cf_matrix = confusion_matrix(testy, predy)
+    ax = sns.heatmap(cf_matrix, annot=True, cmap='Blues')
+
+    ax.set_title(title)
+    ax.set_xlabel('\nPredicted Values')
+    ax.set_ylabel('Actual Values ')
+
+    ## Ticket labels - List must be in alphabetical order
+    ax.xaxis.set_ticklabels(['0(SNM)','1(other)'])
+    ax.yaxis.set_ticklabels(['0(SNM)','1(other)'])
+    ## Save the visualization of the Confusion Matrix.
+    plt.savefig(filename)

From bd0ab96122ad5bc2f3b50273300d6496b2ac0a9e Mon Sep 17 00:00:00 2001
From: Jordan Stomps <stomps@wisc.edu>
Date: Fri, 22 Apr 2022 10:55:04 -0500
Subject: [PATCH 02/35] add supervised logistic regression model function

---
 scripts/logreg.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 scripts/logreg.py

diff --git a/scripts/logreg.py b/scripts/logreg.py
new file mode 100644
index 0000000..e7e44bb
--- /dev/null
+++ b/scripts/logreg.py
@@ -0,0 +1,23 @@
+# For hyperopt (parameter optimization)
+# ! pip install hyperopt
+from hyperopt import STATUS_OK
+
+# sklearn models
+from sklearn import linear_model
+
+# diagnostics
+from sklearn.metrics import balanced_accuracy_score
+
+
+def f_lr(params):
+    # supervised logistic regression
+    slr = linear_model.LogisticRegression(random_state=0, max_iter=params['max_iter'], tol=params['tol'], C=params['C'])#, multi_class='multinomial')
+    slr.fit(trainx, trainy)
+    slr_pred = slr.predict(testx)
+    acc = balanced_accuracy_score(testy, slr_pred)
+
+    return {'loss': 1-acc,
+            'status': STATUS_OK,
+            'model': slr,
+            'params': params,
+            'accuracy': acc}

From 1afbcd61b05582e910ac1596a6f2bb784342bbf5 Mon Sep 17 00:00:00 2001
From: Jordan Stomps <stomps@wisc.edu>
Date: Fri, 22 Apr 2022 10:57:13 -0500
Subject: [PATCH 03/35] adding cotraining model function

---
 scripts/hyperopt.py       |  2 -
 scripts/logreg.py         |  3 --
 scripts/ssl/cotraining.py | 84 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 84 insertions(+), 5 deletions(-)
 create mode 100644 scripts/ssl/cotraining.py

diff --git a/scripts/hyperopt.py b/scripts/hyperopt.py
index 1417ebf..00a987a 100644
--- a/scripts/hyperopt.py
+++ b/scripts/hyperopt.py
@@ -1,10 +1,8 @@
 import numpy as np
 import seaborn as sns
 import matplotlib.pyplot as plt
-
 # For hyperopt (parameter optimization)
 from hyperopt import Trials, tpe, fmin
-
 # diagnostics
 from sklearn.metrics import confusion_matrix
 
diff --git a/scripts/logreg.py b/scripts/logreg.py
index e7e44bb..c799418 100644
--- a/scripts/logreg.py
+++ b/scripts/logreg.py
@@ -1,10 +1,7 @@
 # For hyperopt (parameter optimization)
-# ! pip install hyperopt
 from hyperopt import STATUS_OK
-
 # sklearn models
 from sklearn import linear_model
-
 # diagnostics
 from sklearn.metrics import balanced_accuracy_score
 
diff --git a/scripts/ssl/cotraining.py b/scripts/ssl/cotraining.py
new file mode 100644
index 0000000..1b86eee
--- /dev/null
+++ b/scripts/ssl/cotraining.py
@@ -0,0 +1,84 @@
+import numpy as np
+import matplotlib.pyplot as plt
+# For hyperopt (parameter optimization)
+from hyperopt import STATUS_OK
+# sklearn models
+from sklearn import linear_model
+# diagnostics
+from sklearn.metrics import balanced_accuracy_score
+
+split_frac = 0.5
+# labeled training data
+idx = np.random.choice(range(trainy.shape[0]),
+                        size=int(split_frac * trainy.shape[0]),
+                        replace = False)
+
+
+def f_ct(params):
+    slr1 = linear_model.LogisticRegression(random_state=0, max_iter=params['max_iter'], tol=params['tol'], C=params['C'])#, multi_class='multinomial')
+    slr2 = linear_model.LogisticRegression(random_state=0, max_iter=params['max_iter'], tol=params['tol'], C=params['C'])#, multi_class='multinomial')
+
+    L_lr1 = trainx[idx].copy()
+    L_lr2 = trainx[~idx].copy()
+    Ly_lr1 = trainy[idx].copy()
+    Ly_lr2 = trainy[~idx].copy()
+    # unlabeled cotraining data
+    U_lr = U[:,1:].copy()
+
+    model1_accs, model2_accs = np.array([]), np.array([])
+    n_samples = params['n_samples']
+    rep = False
+
+    while U_lr.shape[0] > 1:
+        #print(U_lr.shape[0])
+        slr1.fit(L_lr1, Ly_lr1)
+        slr2.fit(L_lr2, Ly_lr2)
+
+        # pull u1
+        if U_lr.shape[0] < n_samples*2:
+            n_samples = int(U_lr.shape[0]/2)
+        uidx1 = np.random.choice(range(U_lr.shape[0]), n_samples, replace=rep)
+        #u1 = U_lr[uidx1].copy().reshape((1, U_lr[uidx1].shape[0]))
+        u1 = U_lr[uidx1].copy()
+        U_lr = np.delete(U_lr, uidx1, axis=0)
+
+        # pull u2
+        uidx2 = np.random.choice(range(U_lr.shape[0]), n_samples, replace=rep)
+        #u2 = U_lr[uidx2].copy().reshape((1, U_lr[uidx2].shape[0]))
+        u2 = U_lr[uidx2].copy()
+        U_lr = np.delete(U_lr, uidx2, axis=0)
+
+        # predict unlabeled samples
+        u1y = slr1.predict(u1)
+        u2y = slr2.predict(u2)
+
+        model1_accs = np.append(model1_accs, balanced_accuracy_score(testy, slr1.predict(testx)))
+        model2_accs = np.append(model2_accs, balanced_accuracy_score(testy, slr2.predict(testx)))
+
+        # send predictions to cotrained function samples
+        L_lr1 = np.append(L_lr1, u2, axis=0)
+        L_lr2 = np.append(L_lr2, u1, axis=0)
+        Ly_lr1 = np.append(Ly_lr1, u2y, axis=0)
+        Ly_lr2 = np.append(Ly_lr2, u1y, axis=0)
+
+    model1_acc = balanced_accuracy_score(testy, slr1.predict(testx))
+    model2_acc = balanced_accuracy_score(testy, slr2.predict(testx))
+    acc = max(model1_acc, model2_acc)
+    return {'loss': 1-acc,
+            'status': STATUS_OK,
+            'model': slr1,
+            'model2': slr2,
+            'model1_acc_history': model1_accs,
+            'model2_acc_history': model2_accs,
+            'params': params,
+            'accuracy': acc}
+
+
+def plot_cotraining():
+    plt.plot(np.arange(len(best_ct['model1_acc_history'])), best_ct['model1_acc_history'], label='Model 1')
+    plt.plot(np.arange(len(best_ct['model2_acc_history'])), best_ct['model2_acc_history'], label='Model 2')
+    plt.legend()
+    plt.xlabel('Co-Training Iteration')
+    plt.ylabel('Test Accuracy')
+    plt.grid()
+    plt.savefig('lr-cotraining-learningcurves.png')
\ No newline at end of file

From e3a5e62ed69a884dfc953cab229e0d2ea085cc5c Mon Sep 17 00:00:00 2001
From: Jordan Stomps <stomps@wisc.edu>
Date: Fri, 22 Apr 2022 11:01:26 -0500
Subject: [PATCH 04/35] adding code for Label Prop model function

---
 scripts/ssl/LabelProp.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 scripts/ssl/LabelProp.py

diff --git a/scripts/ssl/LabelProp.py b/scripts/ssl/LabelProp.py
new file mode 100644
index 0000000..9b09257
--- /dev/null
+++ b/scripts/ssl/LabelProp.py
@@ -0,0 +1,22 @@
+import numpy as np
+# For hyperopt (parameter optimization)
+from hyperopt import STATUS_OK
+# sklearn models
+from sklearn.semi_supervised import LabelPropagation
+# diagnostics
+from sklearn.metrics import balanced_accuracy_score
+
+lp_trainx = np.append(trainx, U[:,1:], axis=0)
+lp_trainy = np.append(trainy, U[:,0], axis=0)
+
+
+def f_lp(params):
+    lp = LabelPropagation(kernel='knn', gamma=params['gamma'], n_neighbors=params['n_neighbors'], max_iter=params['max_iter'], tol=params['tol'], n_jobs=-1)
+    lp.fit(lp_trainx, lp_trainy)
+    acc = balanced_accuracy_score(testy, lp.predict(testx))
+
+    return {'loss': 1-acc,
+            'status': STATUS_OK,
+            'model': lp,
+            'params': params,
+            'accuracy': acc}
\ No newline at end of file

From 12c46deb5c165dcc374616c7d599d4df9692d980 Mon Sep 17 00:00:00 2001
From: Jordan Stomps <stomps@wisc.edu>
Date: Fri, 22 Apr 2022 11:03:32 -0500
Subject: [PATCH 05/35] adding shadow fully connected NN model function

---
 scripts/ssl/shadow_nn.py | 55 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 scripts/ssl/shadow_nn.py

diff --git a/scripts/ssl/shadow_nn.py b/scripts/ssl/shadow_nn.py
new file mode 100644
index 0000000..99e2159
--- /dev/null
+++ b/scripts/ssl/shadow_nn.py
@@ -0,0 +1,55 @@
+import numpy as np
+# For hyperopt (parameter optimization)
+from hyperopt import STATUS_OK
+# torch imports
+import torch
+# shadow imports
+import shadow
+
+shadow.utils.set_seed(0)  # set seeds for reproducibility
+
+
+def model_factory(length=1000, hidden_layer=10000):
+    return torch.nn.Sequential(
+        torch.nn.Linear(length, hidden_layer),
+        torch.nn.ReLU(),
+        torch.nn.Linear(hidden_layer, length),
+        torch.nn.ReLU(),
+        torch.nn.Linear(length, 2)
+    )
+
+
+def f_nn(params):
+    device = torch.device('cpu')  # run on cpu, since model and data are very small
+    eaat = shadow.eaat.EAAT(model=model_factory(testx[:,::params['binning']].shape[1], params['hidden_layer']), alpha=params['alpha'], xi=params['xi'], eps=params['eps']).to(device)
+    eaat_opt = torch.optim.SGD(eaat.parameters(), lr=params['lr'], momentum=params['momentum'])
+    xEnt = torch.nn.CrossEntropyLoss(ignore_index=-1).to(device)
+
+    # avoid float round-off by using DoubleTensor
+    xtens = torch.FloatTensor(np.append(trainx, U[:,1:], axis=0)[:,::params['binning']])
+    # xtens[xtens == 0.0] = torch.unique(xtens)[1]/1e10
+    ytens = torch.LongTensor(np.append(trainy, U[:,0], axis=0))
+    #n_epochs = params['n_epochs']
+    n_epochs = 100
+    xt, yt = torch.Tensor(xtens).to(device), torch.LongTensor(ytens).to(device)
+    acc_history = []    # saves history for max accuracy
+    eaat.train()
+    for epoch in range(n_epochs):
+        # Forward/backward pass for training semi-supervised model
+        out = eaat(xt)
+        loss = xEnt(out, yt) + eaat.get_technique_cost(xt)  # supervised + unsupervised loss
+        eaat_opt.zero_grad()
+        loss.backward()
+        eaat_opt.step()
+    
+        eaat.eval()
+        eaat_pred = torch.max(eaat(torch.FloatTensor(testx.copy()[:,::params['binning']])), 1)[-1]
+        acc = shadow.losses.accuracy(eaat_pred, torch.LongTensor(testy.copy())).data.item()
+        acc_history.append(acc)
+    max_acc = np.max(acc_history[-50:])
+
+    return {'loss': 1-(max_acc/100.0),
+            'status': STATUS_OK,
+            'model': eaat,
+            'params': params,
+            'accuracy': (max_acc/100.0)}
\ No newline at end of file

From 3cc5e950f1fec6e20248b37d1b059698be2cac18 Mon Sep 17 00:00:00 2001
From: Jordan Stomps <stomps@wisc.edu>
Date: Fri, 22 Apr 2022 11:06:24 -0500
Subject: [PATCH 06/35] adding shadow eaat cnn function model

---
 scripts/ssl/shadow_eaat_cnn.py | 116 +++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 scripts/ssl/shadow_eaat_cnn.py

diff --git a/scripts/ssl/shadow_eaat_cnn.py b/scripts/ssl/shadow_eaat_cnn.py
new file mode 100644
index 0000000..61d3e56
--- /dev/null
+++ b/scripts/ssl/shadow_eaat_cnn.py
@@ -0,0 +1,116 @@
+import numpy as np
+# For hyperopt (parameter optimization)
+from hyperopt import STATUS_OK
+# torch imports
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+# shadow imports
+import shadow.eaat
+import shadow.losses
+import shadow.utils
+from shadow.utils import set_seed
+
+set_seed(0)
+device = torch.device('cpu')  # run on cpu, since model and data are very small
+
+class Net(nn.Module):
+    def __init__(self, layer1=32, layer2=64, layer3=128, kernel=3, drop_rate=0.1, length=1000):
+        super(Net, self).__init__()
+        self.conv1 = nn.Conv1d(1, layer1, kernel, 1)
+        self.conv2 = nn.Conv1d(layer1, layer2, kernel, 1)
+        self.dropout = nn.Dropout2d(drop_rate)
+        self.fc1 = nn.Linear(int(layer2*(length-2*(kernel-1))/2), layer3)
+        #self.fc1 = nn.Linear(31744, 128)
+        self.fc2 = nn.Linear(layer3, 2)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = F.relu(x)
+        x = self.conv2(x)
+        x = F.max_pool1d(x, 2)
+        x = self.dropout(x)
+        x = torch.flatten(x, 1)
+        x = self.fc1(x)
+        x = F.relu(x)
+        x = self.dropout(x)
+        x = self.fc2(x)
+        return x
+
+class MINOSDataset(torch.utils.data.Dataset):
+    def __init__(self, trainD, labels):
+        self.labels = labels
+        self.trainD = trainD
+
+    def __len__(self):
+        return len(self.labels)
+
+    def __getitem__(self, idx):
+        label = self.labels[idx]
+        data = self.trainD[idx]
+        # no need to bother with labels, unpacking both anyways
+        #sample = {"Spectrum": data, "Class": label}
+        #return sample
+        return data, label
+
+def eval(eaat, binning):
+    eaat.eval()
+    y_pred, y_true = [], []
+    for i, (data, targets) in enumerate(zip(torch.FloatTensor(testx.copy()[:,::binning]), torch.LongTensor(testy.copy()))):
+        x = data.reshape((1, 1, data.shape[0])).to(device)
+        y = targets.reshape((1,)).to(device)
+        out = eaat(x)
+        y_true.extend(y.detach().cpu().tolist())
+        y_pred.extend(torch.argmax(out, 1).detach().cpu().tolist())
+    test_acc = (np.array(y_true) == np.array(y_pred)).mean() * 100
+    #print('test accuracy: {}'.format(test_acc))
+    return test_acc
+
+def f_eaat(params):
+    #print(params)
+    # avoid float round-off by using DoubleTensor
+    xtens = torch.FloatTensor(np.append(trainx, U[:,1:], axis=0))[:,::params['binning']]
+    # xtens[xtens == 0.0] = torch.unique(xtens)[1]/1e10
+    ytens = torch.LongTensor(np.append(trainy, U[:,0], axis=0))
+    
+    #print(xtens.shape)
+    device = torch.device('cpu')  # run on cpu, since model and data are very small
+    model = Net(layer1=params['layer1'], layer2=2*params['layer1'], layer3=3*params['layer1'], kernel=params['kernel'], drop_rate=params['drop_rate'], length=xtens.shape[1])
+    eaat = shadow.eaat.EAAT(model=model, alpha=params['alpha'], xi=params['xi'], eps=params['eps'])
+    optimizer = optim.SGD(eaat.parameters(), lr=params['lr'], momentum=params['momentum'])
+
+    # define data set object
+    MINOS_train = MINOSDataset(xtens, ytens)
+
+    # create DataLoader object of DataSet object
+    DL_DS = torch.utils.data.DataLoader(MINOS_train, batch_size=params['batch_size'], shuffle=True)
+
+    xEnt = torch.nn.CrossEntropyLoss(ignore_index=-1)
+
+    n_epochs = 50
+    eaat.to(device)
+    losscurve = []
+    evalcurve = []
+    for epoch in range(n_epochs):
+        eaat.train()
+        lossavg = []
+        for i, (data, targets) in enumerate(DL_DS):
+            x = data.reshape((data.shape[0], 1, data.shape[1])).to(device)
+            y = targets.to(device)
+            optimizer.zero_grad()
+            out = eaat(x)
+            loss = xEnt(out, y) + eaat.get_technique_cost(x)
+            loss.backward()
+            optimizer.step()
+            lossavg.append(loss.item())
+        losscurve.append(np.nanmedian(lossavg))
+        evalcurve.append(eval(eaat, params['binning']))
+    
+    max_acc = np.max(evalcurve[-25:])
+
+    return {'loss': 1-(max_acc/100.0),
+            'status': STATUS_OK,
+            'model': eaat,
+            'params': params,
+            'accuracy': (max_acc/100.0)}
\ No newline at end of file

From 15fede0b4742b70d74495acbaab6dea4e9f46b92 Mon Sep 17 00:00:00 2001
From: Jordan Stomps <stomps@wisc.edu>
Date: Fri, 22 Apr 2022 11:07:06 -0500
Subject: [PATCH 07/35] abstracting MINOS to Spectra

---
 scripts/ssl/shadow_eaat_cnn.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/ssl/shadow_eaat_cnn.py b/scripts/ssl/shadow_eaat_cnn.py
index 61d3e56..e8cc477 100644
--- a/scripts/ssl/shadow_eaat_cnn.py
+++ b/scripts/ssl/shadow_eaat_cnn.py
@@ -38,7 +38,7 @@ def forward(self, x):
         x = self.fc2(x)
         return x
 
-class MINOSDataset(torch.utils.data.Dataset):
+class SpectralDataset(torch.utils.data.Dataset):
     def __init__(self, trainD, labels):
         self.labels = labels
         self.trainD = trainD
@@ -81,7 +81,7 @@ def f_eaat(params):
     optimizer = optim.SGD(eaat.parameters(), lr=params['lr'], momentum=params['momentum'])
 
     # define data set object
-    MINOS_train = MINOSDataset(xtens, ytens)
+    MINOS_train = SpectralDataset(xtens, ytens)
 
     # create DataLoader object of DataSet object
     DL_DS = torch.utils.data.DataLoader(MINOS_train, batch_size=params['batch_size'], shuffle=True)

From a9410dae3e4a230f2df10466d4a75d40ca0dd9cd Mon Sep 17 00:00:00 2001
From: Jordan Stomps <stomps@wisc.edu>
Date: Fri, 22 Apr 2022 12:21:51 -0500
Subject: [PATCH 08/35] removing duplicate device in eaat-cnn

---
 scripts/ssl/shadow_eaat_cnn.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scripts/ssl/shadow_eaat_cnn.py b/scripts/ssl/shadow_eaat_cnn.py
index e8cc477..e7eac82 100644
--- a/scripts/ssl/shadow_eaat_cnn.py
+++ b/scripts/ssl/shadow_eaat_cnn.py
@@ -75,7 +75,6 @@ def f_eaat(params):
     ytens = torch.LongTensor(np.append(trainy, U[:,0], axis=0))
     
     #print(xtens.shape)
-    device = torch.device('cpu')  # run on cpu, since model and data are very small
     model = Net(layer1=params['layer1'], layer2=2*params['layer1'], layer3=3*params['layer1'], kernel=params['kernel'], drop_rate=params['drop_rate'], length=xtens.shape[1])
     eaat = shadow.eaat.EAAT(model=model, alpha=params['alpha'], xi=params['xi'], eps=params['eps'])
     optimizer = optim.SGD(eaat.parameters(), lr=params['lr'], momentum=params['momentum'])

From d3e5068bc8937176eab71bb2bf994dd6c314c1e2 Mon Sep 17 00:00:00 2001
From: Jordan Stomps <stomps@wisc.edu>
Date: Fri, 29 Jul 2022 15:25:49 -0400
Subject: [PATCH 09/35] revamping design of ssl models, starting with logreg

---
 scripts/hyperopt.py |  9 +++++--
 scripts/logreg.py   | 60 ++++++++++++++++++++++++++++++++++++---------
 2 files changed, 56 insertions(+), 13 deletions(-)

diff --git a/scripts/hyperopt.py b/scripts/hyperopt.py
index 00a987a..2ec0a94 100644
--- a/scripts/hyperopt.py
+++ b/scripts/hyperopt.py
@@ -3,11 +3,12 @@
 import matplotlib.pyplot as plt
 # For hyperopt (parameter optimization)
 from hyperopt import Trials, tpe, fmin
+from functools import partial
 # diagnostics
 from sklearn.metrics import confusion_matrix
 
 
-def run_hyperopt(space, model, max_evals=50, verbose=True):
+def run_hyperopt(space, model, data_dict, max_evals=50, verbose=True):
     '''
     Runs hyperparameter optimization on a model given a parameter space.
     Inputs:
@@ -27,8 +28,12 @@ def run_hyperopt(space, model, max_evals=50, verbose=True):
     '''
 
     trials = Trials()
+
+    # wrap data into objective function
+    fmin_objective = partial(model, data_dict=data_dict, device=None)
+
     # run hyperopt
-    optimizer = fmin(model, 
+    optimizer = fmin(fmin_objective, 
                      space, 
                      algo=tpe.suggest,
                      max_evals=max_evals,
diff --git a/scripts/logreg.py b/scripts/logreg.py
index c799418..f8f3505 100644
--- a/scripts/logreg.py
+++ b/scripts/logreg.py
@@ -4,17 +4,55 @@
 from sklearn import linear_model
 # diagnostics
 from sklearn.metrics import balanced_accuracy_score
+from scripts.hyperopt import run_hyperopt
 
+class LogisticRegression:
+    # only binary so far
+    def __init__(self, params=None):
+        # dictionary of parameters for logistic regression model
+        self.params = params
+        if self.params is None:
+            self.model = linear_model.LogisticRegression()
+        else:
+            self.model = linear_model.LogisticRegression(random_state=0, max_iter=params['max_iter'], tol=params['tol'], C=params['C'])
 
-def f_lr(params):
-    # supervised logistic regression
-    slr = linear_model.LogisticRegression(random_state=0, max_iter=params['max_iter'], tol=params['tol'], C=params['C'])#, multi_class='multinomial')
-    slr.fit(trainx, trainy)
-    slr_pred = slr.predict(testx)
-    acc = balanced_accuracy_score(testy, slr_pred)
+    def fresh_start(self, params, data_dict):
+        # unpack data
+        trainx = data_dict['trainx']
+        trainy = data_dict['trainy']
+        testx = data_dict['testx']
+        testy = data_dict['testy']
 
-    return {'loss': 1-acc,
-            'status': STATUS_OK,
-            'model': slr,
-            'params': params,
-            'accuracy': acc}
+        # supervised logistic regression
+        clr = linear_model.LogisticRegression(random_state=0, max_iter=params['max_iter'], tol=params['tol'], C=params['C'])
+        clr.fit(trainx, trainy)
+        clr_pred = clr.predict(testx)
+        # could alternatively use pure accuracy for a more traditional hyperopt
+        acc = balanced_accuracy_score(testy, clr_pred)
+
+        return {'loss': 1-acc,
+                'status': STATUS_OK,
+                'model': clr,
+                'params': params,
+                'accuracy': acc}
+
+    def optimize(self, space, max_evals=50, verbose=True):
+        best, worst = run_hyperopt(space, self.fresh_start, max_evals, verbose)
+
+        self.best = best
+        self.model = best['model']
+        self.params = best['params']
+        self.worst = worst
+
+    def train(self, trainx, trainy):
+        # supervised logistic regression
+        self.model.fit(trainx, trainy)
+
+    def test(self, testx, testy=None):
+        pred = self.model.predict(testx)
+
+        acc = 0.
+        if testy is not None:
+            acc = balanced_accuracy_score(testy, pred)
+        
+        return pred, acc

From 3126ebe8df7a42b994c89e9b3818830b39711acf Mon Sep 17 00:00:00 2001
From: Jordan Stomps <u9f@ulysses.ornl.gov>
Date: Thu, 4 Aug 2022 11:57:38 -0400
Subject: [PATCH 10/35] adding save function to logreg class and renaming
 hyperopt.py

---
 scripts/logreg.py                    | 14 ++++++++++----
 scripts/{hyperopt.py => optimize.py} |  2 +-
 scripts/ssl/LabelProp.py             |  2 +-
 scripts/ssl/cotraining.py            |  2 +-
 scripts/ssl/shadow_eaat_cnn.py       |  2 +-
 scripts/ssl/shadow_nn.py             |  2 +-
 6 files changed, 15 insertions(+), 9 deletions(-)
 rename scripts/{hyperopt.py => optimize.py} (98%)

diff --git a/scripts/logreg.py b/scripts/logreg.py
index f8f3505..49d0087 100644
--- a/scripts/logreg.py
+++ b/scripts/logreg.py
@@ -1,10 +1,11 @@
 # For hyperopt (parameter optimization)
-from hyperopt import STATUS_OK
+from scripts.optimize import STATUS_OK
 # sklearn models
 from sklearn import linear_model
 # diagnostics
 from sklearn.metrics import balanced_accuracy_score
-from scripts.hyperopt import run_hyperopt
+from scripts.optimize import run_hyperopt
+import joblib
 
 class LogisticRegression:
     # only binary so far
@@ -48,11 +49,16 @@ def train(self, trainx, trainy):
         # supervised logistic regression
         self.model.fit(trainx, trainy)
 
-    def test(self, testx, testy=None):
+    def predict(self, testx, testy=None):
         pred = self.model.predict(testx)
 
-        acc = 0.
+        acc = None
         if testy is not None:
             acc = balanced_accuracy_score(testy, pred)
         
         return pred, acc
+
+    def save(self, filename):
+        if filename[-7:] != '.joblib':
+            filename += '.joblib'
+        joblib.dump(self, filename)
diff --git a/scripts/hyperopt.py b/scripts/optimize.py
similarity index 98%
rename from scripts/hyperopt.py
rename to scripts/optimize.py
index 2ec0a94..556dc3c 100644
--- a/scripts/hyperopt.py
+++ b/scripts/optimize.py
@@ -2,7 +2,7 @@
 import seaborn as sns
 import matplotlib.pyplot as plt
 # For hyperopt (parameter optimization)
-from hyperopt import Trials, tpe, fmin
+from scripts.optimize import Trials, tpe, fmin
 from functools import partial
 # diagnostics
 from sklearn.metrics import confusion_matrix
diff --git a/scripts/ssl/LabelProp.py b/scripts/ssl/LabelProp.py
index 9b09257..503513a 100644
--- a/scripts/ssl/LabelProp.py
+++ b/scripts/ssl/LabelProp.py
@@ -1,6 +1,6 @@
 import numpy as np
 # For hyperopt (parameter optimization)
-from hyperopt import STATUS_OK
+from scripts.optimize import STATUS_OK
 # sklearn models
 from sklearn.semi_supervised import LabelPropagation
 # diagnostics
diff --git a/scripts/ssl/cotraining.py b/scripts/ssl/cotraining.py
index 1b86eee..719d376 100644
--- a/scripts/ssl/cotraining.py
+++ b/scripts/ssl/cotraining.py
@@ -1,7 +1,7 @@
 import numpy as np
 import matplotlib.pyplot as plt
 # For hyperopt (parameter optimization)
-from hyperopt import STATUS_OK
+from scripts.optimize import STATUS_OK
 # sklearn models
 from sklearn import linear_model
 # diagnostics
diff --git a/scripts/ssl/shadow_eaat_cnn.py b/scripts/ssl/shadow_eaat_cnn.py
index e7eac82..4649435 100644
--- a/scripts/ssl/shadow_eaat_cnn.py
+++ b/scripts/ssl/shadow_eaat_cnn.py
@@ -1,6 +1,6 @@
 import numpy as np
 # For hyperopt (parameter optimization)
-from hyperopt import STATUS_OK
+from scripts.optimize import STATUS_OK
 # torch imports
 import torch
 import torch.nn as nn
diff --git a/scripts/ssl/shadow_nn.py b/scripts/ssl/shadow_nn.py
index 99e2159..380afbb 100644
--- a/scripts/ssl/shadow_nn.py
+++ b/scripts/ssl/shadow_nn.py
@@ -1,6 +1,6 @@
 import numpy as np
 # For hyperopt (parameter optimization)
-from hyperopt import STATUS_OK
+from scripts.optimize import STATUS_OK
 # torch imports
 import torch
 # shadow imports

From edcc56e57f083874cbc0f4f76bc542fb3464b70a Mon Sep 17 00:00:00 2001
From: Jordan Stomps <u9f@ulysses.ornl.gov>
Date: Fri, 12 Aug 2022 10:16:10 -0400
Subject: [PATCH 11/35] commenting logistic regression class and methods

---
 scripts/logreg.py                 | 121 +++++++++++++++++++++++++++---
 scripts/ssl/LabelProp.py          |   2 +-
 scripts/ssl/cotraining.py         |   2 +-
 scripts/ssl/shadow_eaat_cnn.py    |   2 +-
 scripts/ssl/shadow_nn.py          |   2 +-
 scripts/{optimize.py => utils.py} |   2 +-
 6 files changed, 117 insertions(+), 14 deletions(-)
 rename scripts/{optimize.py => utils.py} (98%)

diff --git a/scripts/logreg.py b/scripts/logreg.py
index 49d0087..3b7b427 100644
--- a/scripts/logreg.py
+++ b/scripts/logreg.py
@@ -1,23 +1,57 @@
 # For hyperopt (parameter optimization)
-from scripts.optimize import STATUS_OK
+from scripts.utils import STATUS_OK
 # sklearn models
 from sklearn import linear_model
 # diagnostics
 from sklearn.metrics import balanced_accuracy_score
-from scripts.optimize import run_hyperopt
+from scripts.utils import run_hyperopt
 import joblib
 
+
 class LogisticRegression:
+    '''
+    Methods for deploying logistic regression with hyperparameter optimization.
+    Data agnostic (i.e. user supplied data inputs).
+    TODO: Currently only supports binary classification.
+        Add multinomial functions and unit tests.
+    Inputs:
+    params: dictionary of logistic regression input functions.
+        keys max_iter, tol, and C supported.
+    random_state: int/float for reproducible intiailization.
+    '''
+
     # only binary so far
-    def __init__(self, params=None):
+    def __init__(self, params=None, random_state=0):
+        # defaults to a fixed value for reproducibility
+        self.random_state = random_state
         # dictionary of parameters for logistic regression model
         self.params = params
         if self.params is None:
-            self.model = linear_model.LogisticRegression()
+            self.model = linear_model.LogisticRegression(
+                            random_state=self.random_state
+                        )
         else:
-            self.model = linear_model.LogisticRegression(random_state=0, max_iter=params['max_iter'], tol=params['tol'], C=params['C'])
+            self.model = linear_model.LogisticRegression(
+                            random_state=self.random_state,
+                            max_iter=params['max_iter'],
+                            tol=params['tol'],
+                            C=params['C']
+                        )
 
     def fresh_start(self, params, data_dict):
+        '''
+        Required method for hyperopt optimization.
+        Trains and tests a fresh logistic regression model
+        with given input parameters.
+        This method does not overwrite self.model (self.optimize() does).
+        Inputs:
+        params: dictionary of logistic regression input functions.
+            keys max_iter, tol, and C supported.
+        data_dict: compact data representation with the four requisite
+            data structures used for training and testing a model.
+            keys trainx, trainy, testx, testy required.
+        '''
+
         # unpack data
         trainx = data_dict['trainx']
         trainy = data_dict['trainy']
@@ -25,40 +59,109 @@ def fresh_start(self, params, data_dict):
         testy = data_dict['testy']
 
         # supervised logistic regression
-        clr = linear_model.LogisticRegression(random_state=0, max_iter=params['max_iter'], tol=params['tol'], C=params['C'])
+        clr = linear_model.LogisticRegression(
+                random_state=self.random_state,
+                max_iter=params['max_iter'],
+                tol=params['tol'],
+                C=params['C']
+              )
+        # train and test model
         clr.fit(trainx, trainy)
         clr_pred = clr.predict(testx)
+        # balanced_accuracy accounts for class imbalanced data
         # could alternatively use pure accuracy for a more traditional hyperopt
         acc = balanced_accuracy_score(testy, clr_pred)
 
+        # loss function minimizes misclassification
         return {'loss': 1-acc,
                 'status': STATUS_OK,
                 'model': clr,
                 'params': params,
                 'accuracy': acc}
 
-    def optimize(self, space, max_evals=50, verbose=True):
-        best, worst = run_hyperopt(space, self.fresh_start, max_evals, verbose)
+    def optimize(self, space, data_dict, max_evals=50, verbose=True):
+        '''
+        Wrapper method for using hyperopt (see utils.run_hyperopt
+        for more details). After hyperparameter optimization, results
+        are stored, the best model -overwrites- self.model, and the
+        best params -overwrite- self.params.
+        Inputs:
+        space: a hyperopt compliant dictionary with defined optimization
+            spaces. For example:
+                # quniform returns float, some parameters require int;
+                # use this to force int
+                space = {'max_iter': scope.int(hp.quniform('max_iter',
+                                                           10,
+                                                           10000,
+                                                           10)),
+                        'tol'      : hp.loguniform('tol', 1e-5, 1e-1),
+                        'C'        : hp.uniform('C', 0.001,1000.0)
+                        }
+            See hyperopt docs for more information.
+        data_dict: compact data representation with the four requisite
+            data structures used for training and testing a model.
+            keys trainx, trainy, testx, testy required.
+        max_evals: the number of epochs for hyperparameter optimization.
+            Each iteration is one set of hyperparameters trained
+            and tested on a fresh model. Convergence for simpler
+            models like logistic regression typically happens well
+            before 50 epochs, but can increase as more complex models,
+            more hyperparameters, and a larger hyperparameter space is tested.
+        verbose: boolean. If true, print results of hyperopt.
+            If false, print only the progress bar for optimization.
+        '''
+
+        best, worst = run_hyperopt(space=space,
+                                   model=self.fresh_start,
+                                   data_dict=data_dict,
+                                   max_evals=max_evals,
+                                   verbose=verbose)
 
+        # save the results of hyperparameter optimization
         self.best = best
         self.model = best['model']
         self.params = best['params']
         self.worst = worst
 
     def train(self, trainx, trainy):
+        '''
+        Wrapper method for sklearn's logisitic regression training method.
+        Inputs:
+        trainx: nxm feature vector/matrix for training model.
+        trainy: nxk class label vector/matrix for training model.
+        '''
+
         # supervised logistic regression
         self.model.fit(trainx, trainy)
 
     def predict(self, testx, testy=None):
+        '''
+        Wrapper method for sklearn's logistic regression predict method.
+        Inputs:
+        testx: nxm feature vector/matrix for testing model.
+        testy: nxk class label vector/matrix for training model.
+            optional: if included, the predicted classes -and-
+            the resulting classification accuracy will be returned.
+        '''
+
         pred = self.model.predict(testx)
 
         acc = None
         if testy is not None:
+            # uses balanced_accuracy_score to account for class imbalance
             acc = balanced_accuracy_score(testy, pred)
-        
+
         return pred, acc
 
     def save(self, filename):
+        '''
+        Save class instance to file using joblib.
+        Inputs:
+        filename: string filename to save object to file under.
+            The file must be saved with extension .joblib.
+            Added to filename if not included as input.
+        '''
+
         if filename[-7:] != '.joblib':
             filename += '.joblib'
         joblib.dump(self, filename)
diff --git a/scripts/ssl/LabelProp.py b/scripts/ssl/LabelProp.py
index 503513a..fc0f071 100644
--- a/scripts/ssl/LabelProp.py
+++ b/scripts/ssl/LabelProp.py
@@ -1,6 +1,6 @@
 import numpy as np
 # For hyperopt (parameter optimization)
-from scripts.optimize import STATUS_OK
+from scripts.utils import STATUS_OK
 # sklearn models
 from sklearn.semi_supervised import LabelPropagation
 # diagnostics
diff --git a/scripts/ssl/cotraining.py b/scripts/ssl/cotraining.py
index 719d376..60dc11c 100644
--- a/scripts/ssl/cotraining.py
+++ b/scripts/ssl/cotraining.py
@@ -1,7 +1,7 @@
 import numpy as np
 import matplotlib.pyplot as plt
 # For hyperopt (parameter optimization)
-from scripts.optimize import STATUS_OK
+from scripts.utils import STATUS_OK
 # sklearn models
 from sklearn import linear_model
 # diagnostics
diff --git a/scripts/ssl/shadow_eaat_cnn.py b/scripts/ssl/shadow_eaat_cnn.py
index 4649435..44154ba 100644
--- a/scripts/ssl/shadow_eaat_cnn.py
+++ b/scripts/ssl/shadow_eaat_cnn.py
@@ -1,6 +1,6 @@
 import numpy as np
 # For hyperopt (parameter optimization)
-from scripts.optimize import STATUS_OK
+from scripts.utils import STATUS_OK
 # torch imports
 import torch
 import torch.nn as nn
diff --git a/scripts/ssl/shadow_nn.py b/scripts/ssl/shadow_nn.py
index 380afbb..59cde53 100644
--- a/scripts/ssl/shadow_nn.py
+++ b/scripts/ssl/shadow_nn.py
@@ -1,6 +1,6 @@
 import numpy as np
 # For hyperopt (parameter optimization)
-from scripts.optimize import STATUS_OK
+from scripts.utils import STATUS_OK
 # torch imports
 import torch
 # shadow imports
diff --git a/scripts/optimize.py b/scripts/utils.py
similarity index 98%
rename from scripts/optimize.py
rename to scripts/utils.py
index 556dc3c..4a98ef9 100644
--- a/scripts/optimize.py
+++ b/scripts/utils.py
@@ -2,7 +2,7 @@
 import seaborn as sns
 import matplotlib.pyplot as plt
 # For hyperopt (parameter optimization)
-from scripts.optimize import Trials, tpe, fmin
+from scripts.utils import Trials, tpe, fmin
 from functools import partial
 # diagnostics
 from sklearn.metrics import confusion_matrix

From bf630f4539671fedcc4642dca51f37c116d0f770 Mon Sep 17 00:00:00 2001
From: Jordan Stomps <u9f@ulysses.ornl.gov>
Date: Fri, 12 Aug 2022 10:20:45 -0400
Subject: [PATCH 12/35] scripts/utils.py pep8 changes

---
 scripts/utils.py | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/scripts/utils.py b/scripts/utils.py
index 4a98ef9..38c2f5b 100644
--- a/scripts/utils.py
+++ b/scripts/utils.py
@@ -33,24 +33,22 @@ def run_hyperopt(space, model, data_dict, max_evals=50, verbose=True):
     fmin_objective = partial(model, data_dict=data_dict, device=None)
 
     # run hyperopt
-    optimizer = fmin(fmin_objective, 
-                     space, 
-                     algo=tpe.suggest,
-                     max_evals=max_evals,
-                     trials=trials)
+    fmin(fmin_objective,
+         space,
+         algo=tpe.suggest,
+         max_evals=max_evals,
+         trials=trials)
 
     # of all trials, find best and worst loss/accuracy from optimization
-    best = trials.results[np.argmin([r['loss'] for r in 
-        trials.results])]
-    worst = trials.results[np.argmax([r['loss'] for r in 
-        trials.results])]
-    
+    best = trials.results[np.argmin([r['loss'] for r in trials.results])]
+    worst = trials.results[np.argmax([r['loss'] for r in trials.results])]
+
     if verbose:
         print('best accuracy:', 1-best['loss'])
         print('best params:', best['params'])
         print('worst accuracy:', 1-worst['loss'])
         print('worst params:', worst['params'])
-    
+
     return best, worst
 
 
@@ -71,8 +69,8 @@ def plot_cf(testy, predy, title, filename):
     ax.set_xlabel('\nPredicted Values')
     ax.set_ylabel('Actual Values ')
 
-    ## Ticket labels - List must be in alphabetical order
-    ax.xaxis.set_ticklabels(['0(SNM)','1(other)'])
-    ax.yaxis.set_ticklabels(['0(SNM)','1(other)'])
-    ## Save the visualization of the Confusion Matrix.
+    # Ticket labels - List must be in alphabetical order
+    ax.xaxis.set_ticklabels(['0(SNM)', '1(other)'])
+    ax.yaxis.set_ticklabels(['0(SNM)', '1(other)'])
+    # Save the visualization of the Confusion Matrix.
     plt.savefig(filename)

From fd824dd92980ef7c4b488b165880e293b0a6597a Mon Sep 17 00:00:00 2001
From: Jordan Stomps <u9f@ulysses.ornl.gov>
Date: Fri, 12 Aug 2022 10:55:35 -0400
Subject: [PATCH 13/35] implementing LabelProp with hyperopt functionality

---
 scripts/{logreg.py => LogReg.py} |  16 +--
 scripts/ssl/LabelProp.py         | 204 +++++++++++++++++++++++++++++--
 2 files changed, 201 insertions(+), 19 deletions(-)
 rename scripts/{logreg.py => LogReg.py} (94%)

diff --git a/scripts/logreg.py b/scripts/LogReg.py
similarity index 94%
rename from scripts/logreg.py
rename to scripts/LogReg.py
index 3b7b427..58f3a2f 100644
--- a/scripts/logreg.py
+++ b/scripts/LogReg.py
@@ -8,12 +8,14 @@
 import joblib
 
 
-class LogisticRegression:
+class LogReg:
     '''
-    Methods for deploying logistic regression with hyperparameter optimization.
+    Methods for deploying sklearn's logistic regression
+    implementation with hyperparameter optimization.
     Data agnostic (i.e. user supplied data inputs).
     TODO: Currently only supports binary classification.
         Add multinomial functions and unit tests.
+        Add functionality for regression(?)
     Inputs:
     params: dictionary of logistic regression input functions.
         keys max_iter, tol, and C supported.
@@ -59,23 +61,23 @@ def fresh_start(self, params, data_dict):
         testy = data_dict['testy']
 
         # supervised logistic regression
-        clr = linear_model.LogisticRegression(
+        clf = linear_model.LogisticRegression(
                 random_state=self.random_state,
                 max_iter=params['max_iter'],
                 tol=params['tol'],
                 C=params['C']
               )
         # train and test model
-        clr.fit(trainx, trainy)
-        clr_pred = clr.predict(testx)
+        clf.fit(trainx, trainy)
+        clf_pred = clf.predict(testx)
         # balanced_accuracy accounts for class imbalanced data
         # could alternatively use pure accuracy for a more traditional hyperopt
-        acc = balanced_accuracy_score(testy, clr_pred)
+        acc = balanced_accuracy_score(testy, clf_pred)
 
         # loss function minimizes misclassification
         return {'loss': 1-acc,
                 'status': STATUS_OK,
-                'model': clr,
+                'model': clf,
                 'params': params,
                 'accuracy': acc}
 
diff --git a/scripts/ssl/LabelProp.py b/scripts/ssl/LabelProp.py
index fc0f071..aad970a 100644
--- a/scripts/ssl/LabelProp.py
+++ b/scripts/ssl/LabelProp.py
@@ -2,21 +2,201 @@
 # For hyperopt (parameter optimization)
 from scripts.utils import STATUS_OK
 # sklearn models
-from sklearn.semi_supervised import LabelPropagation
+from sklearn import semi_supervised
 # diagnostics
 from sklearn.metrics import balanced_accuracy_score
+from scripts.utils import run_hyperopt
+import joblib
 
-lp_trainx = np.append(trainx, U[:,1:], axis=0)
-lp_trainy = np.append(trainy, U[:,0], axis=0)
 
+class LabelProp:
+    '''
+    Methods for deploying sklearn's Label Propagation
+    implementation with hyperparameter optimization.
+    Data agnostic (i.e. user supplied data inputs).
+    NOTE: Since LabelProp is guaranteed to converge given
+        enough iterations, there is no random_state defined.
+    TODO: Currently only supports binary classification.
+        Add multinomial functions and unit tests.
+        Add functionality for regression(?)
+    Inputs:
+    params: dictionary of logistic regression input functions.
+        keys gamma, n_neighbors, max_iter, and tol supported.
+    '''
 
-def f_lp(params):
-    lp = LabelPropagation(kernel='knn', gamma=params['gamma'], n_neighbors=params['n_neighbors'], max_iter=params['max_iter'], tol=params['tol'], n_jobs=-1)
-    lp.fit(lp_trainx, lp_trainy)
-    acc = balanced_accuracy_score(testy, lp.predict(testx))
+    # only binary so far
+    def __init__(self, params=None, random_state=0):
+        # defaults to a fixed value for reproducibility
+        self.random_state = random_state
+        # dictionary of parameters for logistic regression model
+        self.params = params
+        if self.params is None:
+            # defaults:
+            # knn kernel, although an rbf is equally valid
+            # TODO: allow rbf kernels
+            # n_jobs, use parallelization if available.
+            self.model = semi_supervised.LabelPropagation(
+                            kernel='knn',
+                            n_jobs=-1
+                        )
+        else:
+            self.model = semi_supervised.LabelPropagation(
+                            kernel='knn',
+                            gamma=params['gamma'],
+                            n_neighbors=params['n_neighbors'],
+                            max_iter=params['max_iter'],
+                            tol=params['tol'],
+                            n_jobs=-1
+                        )
 
-    return {'loss': 1-acc,
-            'status': STATUS_OK,
-            'model': lp,
-            'params': params,
-            'accuracy': acc}
\ No newline at end of file
+    def fresh_start(self, params, data_dict):
+        '''
+        Required method for hyperopt optimization.
+        Trains and tests a fresh Label Propagation model
+        with given input parameters.
+        This method does not overwrite self.model (self.optimize() does).
+        Inputs:
+        params: dictionary of logistic regression input functions.
+            keys max_iter, tol, and C supported.
+        data_dict: compact data representation with the five requisite
+            data structures used for training and testing an SSML model.
+            keys trainx, trainy, testx, testy, and Ux required.
+            NOTE: Uy is not needed since labels for unlabeled data
+            instances is not used.
+        '''
+
+        # unpack data
+        trainx = data_dict['trainx']
+        trainy = data_dict['trainy']
+        testx = data_dict['testx']
+        testy = data_dict['testy']
+        Ux = data_dict['Ux']
+
+        # combine labeled and unlabeled instances for training
+        lp_trainx = np.append(trainx, Ux, axis=0)
+        lp_trainy = np.append(trainy,
+                              np.full(shape=(Ux.shape[0],), fill_value=-1),
+                              axis=0)
+
+        # semi-supervised label propagation
+        clf = semi_supervised.LabelPropagation(
+                kernel='knn',
+                gamma=params['gamma'],
+                n_neighbors=params['n_neighbors'],
+                max_iter=params['max_iter'],
+                tol=params['tol'],
+                n_jobs=-1
+            )
+        # train and test model
+        clf.fit(lp_trainx, lp_trainy)
+        clf_pred = clf.predict(testx)
+        # balanced_accuracy accounts for class imbalanced data
+        # could alternatively use pure accuracy for a more traditional hyperopt
+        acc = balanced_accuracy_score(testy, clf_pred)
+
+        # loss function minimizes misclassification
+        return {'loss': 1-acc,
+                'status': STATUS_OK,
+                'model': clf,
+                'params': params,
+                'accuracy': acc}
+
+    def optimize(self, space, data_dict, max_evals=50, verbose=True):
+        '''
+        Wrapper method for using hyperopt (see utils.run_hyperopt
+        for more details). After hyperparameter optimization, results
+        are stored, the best model -overwrites- self.model, and the
+        best params -overwrite- self.params.
+        Inputs:
+        space: a hyperopt compliant dictionary with defined optimization
+            spaces. For example:
+                # quniform returns float, some parameters require int;
+                # use this to force int
+                space = {'max_iter'  : scope.int(hp.quniform('max_iter',
+                                                             10,
+                                                             10000,
+                                                             10)),
+                        'tol'        : hp.loguniform('tol', 1e-6, 1e-4),
+                        'gamma'      : hp.uniform('gamma', 1, 50),
+                        'n_neighbors': scope.int(hp.quniform('n_neighbors',
+                                                             1,
+                                                             200,
+                                                             1))
+                        }
+            See hyperopt docs for more information.
+        data_dict: compact data representation with the five requisite
+            data structures used for training and testing an SSML model.
+            keys trainx, trainy, testx, testy, and Ux required.
+            NOTE: Uy is not needed since labels for unlabeled data
+            instances is not used.
+        max_evals: the number of epochs for hyperparameter optimization.
+            Each iteration is one set of hyperparameters trained
+            and tested on a fresh model. Convergence for simpler
+            models like logistic regression typically happens well
+            before 50 epochs, but can increase as more complex models,
+            more hyperparameters, and a larger hyperparameter space is tested.
+        verbose: boolean. If true, print results of hyperopt.
+            If false, print only the progress bar for optimization.
+        '''
+
+        best, worst = run_hyperopt(space=space,
+                                   model=self.fresh_start,
+                                   data_dict=data_dict,
+                                   max_evals=max_evals,
+                                   verbose=verbose)
+
+        # save the results of hyperparameter optimization
+        self.best = best
+        self.model = best['model']
+        self.params = best['params']
+        self.worst = worst
+
+    def train(self, trainx, trainy, Ux):
+        '''
+        Wrapper method for sklearn's Label Propagation training method.
+        Inputs:
+        trainx: nxm feature vector/matrix for training model.
+        trainy: nxk class label vector/matrix for training model.
+        Ux: feature vector/matrix like labeled trainx but unlabeled data.
+        '''
+
+        # combine labeled and unlabeled instances for training
+        lp_trainx = np.append(trainx, Ux, axis=0)
+        lp_trainy = np.append(trainy,
+                              np.full(shape=(Ux.shape[0],), fill_value=-1),
+                              axis=0)
+
+        # semi-supervised Label Propagation
+        self.model.fit(lp_trainx, lp_trainy)
+
+    def predict(self, testx, testy=None):
+        '''
+        Wrapper method for sklearn's Label Propagation predict method.
+        Inputs:
+        testx: nxm feature vector/matrix for testing model.
+        testy: nxk class label vector/matrix for training model.
+            optional: if included, the predicted classes -and-
+            the resulting classification accuracy will be returned.
+        '''
+
+        pred = self.model.predict(testx)
+
+        acc = None
+        if testy is not None:
+            # uses balanced_accuracy_score to account for class imbalance
+            acc = balanced_accuracy_score(testy, pred)
+
+        return pred, acc
+
+    def save(self, filename):
+        '''
+        Save class instance to file using joblib.
+        Inputs:
+        filename: string filename to save object to file under.
+            The file must be saved with extension .joblib.
+            Added to filename if not included as input.
+        '''
+
+        if filename[-7:] != '.joblib':
+            filename += '.joblib'
+        joblib.dump(self, filename)

From 0c3ae2a27ba5f46032910a0d976ae1d7374b9973 Mon Sep 17 00:00:00 2001
From: Jordan Stomps <u9f@ulysses.ornl.gov>
Date: Fri, 12 Aug 2022 12:16:24 -0400
Subject: [PATCH 14/35] implementing co-training with hyperopt functionality

---
 scripts/LogReg.py         |   2 +-
 scripts/ssl/cotraining.py | 454 +++++++++++++++++++++++++++++++-------
 2 files changed, 380 insertions(+), 76 deletions(-)

diff --git a/scripts/LogReg.py b/scripts/LogReg.py
index 58f3a2f..6e619a2 100644
--- a/scripts/LogReg.py
+++ b/scripts/LogReg.py
@@ -51,7 +51,7 @@ def fresh_start(self, params, data_dict):
             keys max_iter, tol, and C supported.
         data_dict: compact data representation with the four requisite
             data structures used for training and testing a model.
-            keys trainx, trainy, testx, testy required.
+            keys trainx, trainy, testx, and testy required.
         '''
 
         # unpack data
diff --git a/scripts/ssl/cotraining.py b/scripts/ssl/cotraining.py
index 60dc11c..0d33971 100644
--- a/scripts/ssl/cotraining.py
+++ b/scripts/ssl/cotraining.py
@@ -6,79 +6,383 @@
 from sklearn import linear_model
 # diagnostics
 from sklearn.metrics import balanced_accuracy_score
+from scripts.utils import run_hyperopt
+import joblib
 
-split_frac = 0.5
-# labeled training data
-idx = np.random.choice(range(trainy.shape[0]),
-                        size=int(split_frac * trainy.shape[0]),
-                        replace = False)
-
-
-def f_ct(params):
-    slr1 = linear_model.LogisticRegression(random_state=0, max_iter=params['max_iter'], tol=params['tol'], C=params['C'])#, multi_class='multinomial')
-    slr2 = linear_model.LogisticRegression(random_state=0, max_iter=params['max_iter'], tol=params['tol'], C=params['C'])#, multi_class='multinomial')
-
-    L_lr1 = trainx[idx].copy()
-    L_lr2 = trainx[~idx].copy()
-    Ly_lr1 = trainy[idx].copy()
-    Ly_lr2 = trainy[~idx].copy()
-    # unlabeled cotraining data
-    U_lr = U[:,1:].copy()
-
-    model1_accs, model2_accs = np.array([]), np.array([])
-    n_samples = params['n_samples']
-    rep = False
-
-    while U_lr.shape[0] > 1:
-        #print(U_lr.shape[0])
-        slr1.fit(L_lr1, Ly_lr1)
-        slr2.fit(L_lr2, Ly_lr2)
-
-        # pull u1
-        if U_lr.shape[0] < n_samples*2:
-            n_samples = int(U_lr.shape[0]/2)
-        uidx1 = np.random.choice(range(U_lr.shape[0]), n_samples, replace=rep)
-        #u1 = U_lr[uidx1].copy().reshape((1, U_lr[uidx1].shape[0]))
-        u1 = U_lr[uidx1].copy()
-        U_lr = np.delete(U_lr, uidx1, axis=0)
-
-        # pull u2
-        uidx2 = np.random.choice(range(U_lr.shape[0]), n_samples, replace=rep)
-        #u2 = U_lr[uidx2].copy().reshape((1, U_lr[uidx2].shape[0]))
-        u2 = U_lr[uidx2].copy()
-        U_lr = np.delete(U_lr, uidx2, axis=0)
-
-        # predict unlabeled samples
-        u1y = slr1.predict(u1)
-        u2y = slr2.predict(u2)
-
-        model1_accs = np.append(model1_accs, balanced_accuracy_score(testy, slr1.predict(testx)))
-        model2_accs = np.append(model2_accs, balanced_accuracy_score(testy, slr2.predict(testx)))
-
-        # send predictions to cotrained function samples
-        L_lr1 = np.append(L_lr1, u2, axis=0)
-        L_lr2 = np.append(L_lr2, u1, axis=0)
-        Ly_lr1 = np.append(Ly_lr1, u2y, axis=0)
-        Ly_lr2 = np.append(Ly_lr2, u1y, axis=0)
-
-    model1_acc = balanced_accuracy_score(testy, slr1.predict(testx))
-    model2_acc = balanced_accuracy_score(testy, slr2.predict(testx))
-    acc = max(model1_acc, model2_acc)
-    return {'loss': 1-acc,
-            'status': STATUS_OK,
-            'model': slr1,
-            'model2': slr2,
-            'model1_acc_history': model1_accs,
-            'model2_acc_history': model2_accs,
-            'params': params,
-            'accuracy': acc}
-
-
-def plot_cotraining():
-    plt.plot(np.arange(len(best_ct['model1_acc_history'])), best_ct['model1_acc_history'], label='Model 1')
-    plt.plot(np.arange(len(best_ct['model2_acc_history'])), best_ct['model2_acc_history'], label='Model 2')
-    plt.legend()
-    plt.xlabel('Co-Training Iteration')
-    plt.ylabel('Test Accuracy')
-    plt.grid()
-    plt.savefig('lr-cotraining-learningcurves.png')
\ No newline at end of file
+
+class CoTraining:
+    '''
+    Methods for deploying a basic co-training with logistic
+    regression implementation with hyperparameter optimization.
+    Data agnostic (i.e. user supplied data inputs).
+    TODO: Currently only supports binary classification.
+        Add multinomial functions and unit tests.
+        Add functionality for regression(?)
+    Inputs:
+    params: dictionary of logistic regression input functions.
+        keys max_iter, tol, and C supported.
+    random_state: int/float for reproducible intiailization.
+    '''
+
+    # only binary so far
+    def __init__(self, params=None, random_state=0):
+        # defaults to a fixed value for reproducibility
+        self.random_state = random_state
+        # dictionary of parameters for logistic regression model
+        self.params = params
+        if self.params is None:
+            self.model1 = linear_model.LogisticRegression(
+                            random_state=self.random_state)
+            self.model2 = linear_model.LogisticRegression(
+                            random_state=self.random_state)
+        else:
+            self.model1 = linear_model.LogisticRegression(
+                            random_state=self.random_state,
+                            max_iter=params['max_iter'],
+                            tol=params['tol'],
+                            C=params['C']
+                        )
+            self.model2 = linear_model.LogisticRegression(
+                            random_state=self.random_state,
+                            max_iter=params['max_iter'],
+                            tol=params['tol'],
+                            C=params['C']
+                        )
+
+    def training_loop(self, slr1, slr2, L_lr1, L_lr2,
+                      Ly_lr1, Ly_lr2, U_lr, n_samples,
+                      testx=None, testy=None):
+        '''
+        Main training iteration for co-training.
+        Given two models, labeled training data, and unlabeled training data:
+        - Train both models using their respective labeled datasets
+        - Randomly sample n_samples number of unlabeled
+            instances for model 1 and 2 each.
+        - Label the sampled unlabeled instances using
+            model 1 (u1) and model 2 (u2).
+        - Remove u1 and u2 from the unlabeled dataset and
+            include in each model's respective labeled dataset
+            with their associated labels for future training.
+        Inputs:
+        slr1: logistic regression co-training model #1
+        slr2: logistic regression co-training model #2
+        L_lr1: feature training data for co-training model #1
+        L_lr2: feature training data for co-training model #2
+        Ly_lr1: labels for input data for co-training model #1
+        Ly_lr2: labels for input data for co-training model #2
+        U_lr: unlabeled feature training data used by both models
+        n_samples: the number of instances to sample and
+            predict from Ux at one time
+        testx: feature vector/matrix used for testing the performance
+            of each model at every iteration.
+        testy: label vector used for testing the performance
+            of each model at every iteration.
+        '''
+
+        model1_accs, model2_accs = np.array([]), np.array([])
+        # should stay false but if true,
+        # the same unalbeled instance could be sampled multiple times
+        rep = False
+        while U_lr.shape[0] > 1:
+            slr1.fit(L_lr1, Ly_lr1)
+            slr2.fit(L_lr2, Ly_lr2)
+
+            # pull u1
+            # ensuring there is enough instances to sample for each model
+            if U_lr.shape[0] < n_samples*2:
+                n_samples = int(U_lr.shape[0]/2)
+            uidx1 = np.random.choice(range(U_lr.shape[0]),
+                                     n_samples,
+                                     replace=rep)
+            u1 = U_lr[uidx1].copy()
+            # remove instances that will be labeled
+            U_lr = np.delete(U_lr, uidx1, axis=0)
+
+            # pull u2
+            uidx2 = np.random.choice(range(U_lr.shape[0]),
+                                     n_samples,
+                                     replace=rep)
+            u2 = U_lr[uidx2].copy()
+            # remove instances that will be labeled
+            U_lr = np.delete(U_lr, uidx2, axis=0)
+
+            # predict unlabeled samples
+            u1y = slr1.predict(u1)
+            u2y = slr2.predict(u2)
+
+            if testx is not None and testy is not None:
+                # test and save model(s) accuracy over all training iterations
+                model1_accs = np.append(model1_accs,
+                                        balanced_accuracy_score(testy,
+                                                                slr1.predict(
+                                                                    testx)))
+                model2_accs = np.append(model2_accs,
+                                        balanced_accuracy_score(testy,
+                                                                slr2.predict(
+                                                                    testx)))
+
+            # add predictions to cotrained model(s) labeled samples
+            L_lr1 = np.append(L_lr1, u2, axis=0)
+            L_lr2 = np.append(L_lr2, u1, axis=0)
+            Ly_lr1 = np.append(Ly_lr1, u2y, axis=0)
+            Ly_lr2 = np.append(Ly_lr2, u1y, axis=0)
+
+        return slr1, slr2, model1_accs, model2_accs
+
+    def fresh_start(self, params, data_dict):
+        '''
+        Required method for hyperopt optimization.
+        Trains and tests a fresh co-training model
+        with given input parameters.
+        This method does not overwrite self.model (self.optimize() does).
+        Inputs:
+        params: dictionary of logistic regression input functions.
+            keys n_samples, max_iter, tol, and C supported.
+        data_dict: compact data representation with the four requisite
+            data structures used for training and testing a model.
+            keys trainx, trainy, testx, testy, and Ux required.
+            NOTE: Uy is not needed since labels for unlabeled data
+            instances is not used.
+        '''
+
+        # unpack data
+        trainx = data_dict['trainx']
+        trainy = data_dict['trainy']
+        testx = data_dict['testx']
+        testy = data_dict['testy']
+        # unlabeled co-training data
+        Ux = data_dict['Ux']
+        # avoid overwriting when deleting in co-training loop
+        U_lr = Ux.copy()
+
+        # set the random seed of training splits for reproducibility
+        # This can be ignored by fixing params['seed'] to None
+        # in the hyperopt space dictionary
+        if params['seed'] is not None:
+            np.random.seed(params['seed'])
+
+        # TODO: allow a user to specify uneven splits between the two models
+        split_frac = 0.5
+        # labeled training data
+        idx = np.random.choice(range(trainy.shape[0]),
+                               size=int(split_frac * trainy.shape[0]),
+                               replace=False)
+
+        # avoid overwriting when deleting in co-training loop
+        L_lr1 = trainx[idx].copy()
+        L_lr2 = trainx[~idx].copy()
+        Ly_lr1 = trainy[idx].copy()
+        Ly_lr2 = trainy[~idx].copy()
+
+        # initialized logistic regression models for a fresh-start
+        slr1 = linear_model.LogisticRegression(
+                random_state=self.random_state,
+                max_iter=params['max_iter'],
+                tol=params['tol'],
+                C=params['C']
+            )
+        slr2 = linear_model.LogisticRegression(
+                random_state=self.random_state,
+                max_iter=params['max_iter'],
+                tol=params['tol'],
+                C=params['C']
+            )
+
+        slr1, slr2, model1_accs, model2_accs = self.training_loop(
+                                                slr1, slr2,
+                                                L_lr1, L_lr2,
+                                                Ly_lr1, Ly_lr2,
+                                                U_lr, testx, testy,
+                                                params['n_samples']
+                                                )
+
+        # balanced_accuracy accounts for class imbalanced data
+        # could alternatively use pure accuracy for a more traditional hyperopt
+        model1_acc = balanced_accuracy_score(testy, slr1.predict(testx))
+        model2_acc = balanced_accuracy_score(testy, slr2.predict(testx))
+        # select best accuracy for hyperparameter optimization
+        acc = max(model1_acc, model2_acc)
+        return {'loss': 1-acc,
+                'status': STATUS_OK,
+                'model': slr1,
+                'model2': slr2,
+                'model1_acc_history': model1_accs,
+                'model2_acc_history': model2_accs,
+                'params': params,
+                'accuracy': acc}
+
+    def optimize(self, space, data_dict, max_evals=50, verbose=True):
+        '''
+        Wrapper method for using hyperopt (see utils.run_hyperopt
+        for more details). After hyperparameter optimization, results
+        are stored, the best model -overwrites- self.model, and the
+        best params -overwrite- self.params.
+        Inputs:
+        space: a hyperopt compliant dictionary with defined optimization
+            spaces. For example:
+                # quniform returns float, some parameters require int;
+                # use this to force int
+                space = {'max_iter' : scope.int(hp.quniform('max_iter',
+                                                            10,
+                                                            10000,
+                                                            10)),
+                        'tol'       : hp.loguniform('tol', 1e-5, 1e-3),
+                        'C'         : hp.uniform('C', 1.0, 1000.0),
+                        'n_samples' : scope.int(hp.quniform('n_samples',
+                                                            1,
+                                                            20,
+                                                            1))
+                        }
+            See hyperopt docs for more information.
+        data_dict: compact data representation with the five requisite
+            data structures used for training and testing an SSML model.
+            keys trainx, trainy, testx, testy, and Ux required.
+            NOTE: Uy is not needed since labels for unlabeled data
+            instances is not used.
+        max_evals: the number of epochs for hyperparameter optimization.
+            Each iteration is one set of hyperparameters trained
+            and tested on a fresh model. Convergence for simpler
+            models like logistic regression typically happens well
+            before 50 epochs, but can increase as more complex models,
+            more hyperparameters, and a larger hyperparameter space is tested.
+        verbose: boolean. If true, print results of hyperopt.
+            If false, print only the progress bar for optimization.
+        '''
+
+        best, worst = run_hyperopt(space=space,
+                                   model=self.fresh_start,
+                                   data_dict=data_dict,
+                                   max_evals=max_evals,
+                                   verbose=verbose)
+
+        # save the results of hyperparameter optimization
+        self.best = best
+        self.model = best['model']
+        self.params = best['params']
+        self.worst = worst
+
+    def train(self, trainx, trainy, Ux,
+              testx=None, testy=None, n_samples=1, seed=None):
+        '''
+        Wrapper method for a basic co-training with logistic regression
+        implementation training method.
+        Inputs:
+        trainx: nxm feature vector/matrix for training model.
+        trainy: nxk class label vector/matrix for training model.
+        testx: feature vector/matrix used for testing the performance
+            of each model at every iteration.
+        testy: label vector used for testing the performance
+            of each model at every iteration.
+        Ux: feature vector/matrix like labeled trainx but unlabeled data.
+        n_samples: the number of instances to sample and
+            predict from Ux at one time
+        seed: set the random seed of training splits for reproducibility
+        '''
+
+        # avoid overwriting when deleting in co-training loop
+        U_lr = Ux.copy()
+
+        # set the random seed of training splits for reproducibility
+        # This can be ignored by fixing params['seed'] to None
+        # in the hyperopt space dictionary
+        if seed is not None:
+            np.random.seed(seed)
+
+        # TODO: allow a user to specify uneven splits between the two models
+        split_frac = 0.5
+        # labeled training data
+        idx = np.random.choice(range(trainy.shape[0]),
+                               size=int(split_frac * trainy.shape[0]),
+                               replace=False)
+
+        # avoid overwriting when deleting in co-training loop
+        L_lr1 = trainx[idx].copy()
+        L_lr2 = trainx[~idx].copy()
+        Ly_lr1 = trainy[idx].copy()
+        Ly_lr2 = trainy[~idx].copy()
+
+        self.model1, self.model2,
+        model1_accs, model2_accs = self.training_loop(
+                                        self.model1, self.model2,
+                                        L_lr1, L_lr2,
+                                        Ly_lr1, Ly_lr2,
+                                        U_lr, testx, testy,
+                                        n_samples
+                                        )
+
+        # optional returns if a user is interested in training diagnostics
+        return model1_accs, model2_accs
+
+    def predict(self, testx, testy=None):
+        '''
+        Wrapper method for sklearn's Label Propagation predict method.
+        Inputs:
+        testx: nxm feature vector/matrix for testing model.
+        testy: nxk class label vector/matrix for training model.
+            optional: if included, the predicted classes -and-
+            the resulting classification accuracy will be returned.
+        '''
+
+        pred1 = self.model1.predict(testx)
+        pred2 = self.model2.predict(testx)
+
+        acc = None
+        if testy is not None:
+            # balanced_accuracy accounts for class imbalanced data
+            # could alternatively use pure accuracy
+            # for a more traditional hyperopt
+            model1_acc = balanced_accuracy_score(testy, pred1)
+            model2_acc = balanced_accuracy_score(testy, pred2)
+            # select best accuracy for hyperparameter optimization
+            acc = max(model1_acc, model2_acc)
+
+        return pred1, acc, pred2, model1_acc, model2_acc
+
+    def plot_cotraining(self, filename='lr-cotraining-learningcurves.png',
+                        model1_accs=None, model2_accs=None):
+        '''
+        Plots the training error curves for two co-training models.
+        NOTE: The user can either choose to plot what is stored in
+            the class instance by setting model#_accs=None or
+            the model#_accs can be inputted.
+        Inputs:
+        filename: name to store picture under.
+            Must end in .png (or will be added if missing).
+        model1_accs: the accuracy scores over training epochs for model 1
+        model2_accs: the accuracy scores over training epochs for model 2
+        '''
+
+        fig, ax = plt.subplots(figsize=(10, 8))
+        if model1_accs is not None and model2_accs is not None:
+            ax.plot(np.arange(len(model1_accs)), model1_accs, label='Model 1')
+            ax.plot(np.arange(len(model2_accs)), model2_accs, label='Model 2')
+        else:
+            ax.plot(np.arange(len(self.best['model1_acc_history'])),
+                    self.best['model1_acc_history'],
+                    color='tab:blue',
+                    label='Model 1')
+            ax.plot(np.arange(len(self.best['model2_acc_history'])),
+                    self.best['model2_acc_history'],
+                    color='tab:orange',
+                    label='Model 2')
+        ax.legend()
+        ax.set_xlabel('Co-Training Iteration')
+        ax.set_ylabel('Test Accuracy')
+        ax.grid()
+
+        if filename[-4:] != '.png':
+            filename += '.png'
+        fig.savefig(filename)
+
+    def save(self, filename):
+        '''
+        Save class instance to file using joblib.
+        Inputs:
+        filename: string filename to save object to file under.
+            The file must be saved with extension .joblib.
+            Added to filename if not included as input.
+        '''
+
+        if filename[-7:] != '.joblib':
+            filename += '.joblib'
+        joblib.dump(self, filename)

From 42f19f471697d2a028d7c2076e8c46663608f7b4 Mon Sep 17 00:00:00 2001
From: Jordan Stomps <u9f@ulysses.ornl.gov>
Date: Fri, 12 Aug 2022 14:04:02 -0400
Subject: [PATCH 15/35] implementing Shadow fully-connected NN with hyperopt

---
 .../ssl/{shadow_eaat_cnn.py => ShadowCNN.py}  |   2 +-
 scripts/ssl/ShadowNN.py                       | 302 ++++++++++++++++++
 scripts/ssl/cotraining.py                     |   2 +-
 scripts/ssl/shadow_nn.py                      |  55 ----
 4 files changed, 304 insertions(+), 57 deletions(-)
 rename scripts/ssl/{shadow_eaat_cnn.py => ShadowCNN.py} (98%)
 create mode 100644 scripts/ssl/ShadowNN.py
 delete mode 100644 scripts/ssl/shadow_nn.py

diff --git a/scripts/ssl/shadow_eaat_cnn.py b/scripts/ssl/ShadowCNN.py
similarity index 98%
rename from scripts/ssl/shadow_eaat_cnn.py
rename to scripts/ssl/ShadowCNN.py
index 44154ba..bc6a249 100644
--- a/scripts/ssl/shadow_eaat_cnn.py
+++ b/scripts/ssl/ShadowCNN.py
@@ -13,7 +13,7 @@
 from shadow.utils import set_seed
 
 set_seed(0)
-device = torch.device('cpu')  # run on cpu, since model and data are very small
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 class Net(nn.Module):
     def __init__(self, layer1=32, layer2=64, layer3=128, kernel=3, drop_rate=0.1, length=1000):
diff --git a/scripts/ssl/ShadowNN.py b/scripts/ssl/ShadowNN.py
new file mode 100644
index 0000000..6c7377c
--- /dev/null
+++ b/scripts/ssl/ShadowNN.py
@@ -0,0 +1,302 @@
+import numpy as np
+# For hyperopt (parameter optimization)
+from scripts.utils import STATUS_OK
+# torch imports
+import torch
+# shadow imports
+import shadow
+# diagnostics
+from scripts.utils import run_hyperopt
+import joblib
+
+
+class ShadowNN:
+    '''
+    Methods for deploying a Shadow fully-connected NN
+    implementation with hyperparameter optimization.
+    Data agnostic (i.e. user supplied data inputs).
+    TODO: Currently only supports binary classification.
+        Add multinomial functions and unit tests.
+        Add functionality for regression(?)
+    Inputs:
+    params: dictionary of logistic regression input functions.
+        keys binning, hidden_layer, alpha, xi, eps, lr, and momentum
+        are supported.
+    random_state: int/float for reproducible intiailization.
+    TODO: Add input parameter, loss_function, for the other
+        loss function options available in Shadow (besides EAAT).
+    '''
+
+    # only binary so far
+    def __init__(self, params=None, random_state=0):
+        # defaults to a fixed value for reproducibility
+        self.random_state = random_state
+        # set seeds for reproducibility
+        shadow.utils.set_seed(0)
+        # device used for computation
+        self.device = torch.device("cuda" if
+                                   torch.cuda.is_available() else "cpu")
+        # dictionary of parameters for logistic regression model
+        self.params = params
+        if self.params is None:
+            # assumes the input dimensions are measurements of 1000 bins
+            # TODO: Abstract this for arbitrary input size
+            self.eaat = shadow.eaat.EAAT(model=self.model_factory(
+                                            1000//params['binning'],
+                                            params['hidden_layer']),
+                                         alpha=params['alpha'],
+                                         xi=params['xi'],
+                                         eps=params['eps']).to(self.device)
+            self.eaat_opt = torch.optim.SGD(self.eaat.parameters(),
+                                            lr=params['lr'],
+                                            momentum=params['momentum'])
+            # unlabeled instances always have a label of "-1"
+            self.xEnt = torch.nn.CrossEntropyLoss(
+                            ignore_index=-1).to(self.device)
+        else:
+            self.params = {'binning': 1}
+            # assumes the input dimensions are measurements of 1000 bins
+            self.eaat = shadow.eaat.EAAT(
+                            model=self.model_factory()).to(self.device)
+            self.eaat_opt = torch.optim.SGD(self.eaat.parameters())
+            # unlabeled instances always have a label of "-1"
+            self.xEnt = torch.nn.CrossEntropyLoss(
+                            ignore_index=-1).to(self.device)
+
+    def model_factory(self, length=1000, hidden_layer=10000):
+        return torch.nn.Sequential(
+            torch.nn.Linear(length, hidden_layer),
+            torch.nn.ReLU(),
+            torch.nn.Linear(hidden_layer, length),
+            torch.nn.ReLU(),
+            torch.nn.Linear(length, 2)
+        )
+
+    def fresh_start(self, params, data_dict):
+        '''
+        Required method for hyperopt optimization.
+        Trains and tests a fresh Shadow NN model
+        with given input parameters.
+        This method does not overwrite self.model (self.optimize() does).
+        Inputs:
+        params: dictionary of logistic regression input functions.
+            keys binning, hidden_layer, alpha, xi, eps, lr, and momentum
+            are supported.
+        data_dict: compact data representation with the four requisite
+            data structures used for training and testing a model.
+            keys trainx, trainy, testx, testy, and Ux required.
+            NOTE: Uy is not needed since labels for unlabeled data
+            instances is not used.
+        '''
+
+        # unpack data
+        trainx = data_dict['trainx']
+        trainy = data_dict['trainy']
+        testx = data_dict['testx']
+        testy = data_dict['testy']
+        # unlabeled co-training data
+        Ux = data_dict['Ux']
+
+        eaat = shadow.eaat.EAAT(model=self.model_factory(
+                                    testx[:, ::params['binning']].shape[1],
+                                    params['hidden_layer']),
+                                alpha=params['alpha'],
+                                xi=params['xi'],
+                                eps=params['eps']).to(self.device)
+        eaat_opt = torch.optim.SGD(eaat.parameters(),
+                                   lr=params['lr'],
+                                   momentum=params['momentum'])
+        xEnt = torch.nn.CrossEntropyLoss(ignore_index=-1).to(self.device)
+
+        # avoid float round-off by using DoubleTensor
+        xtens = torch.FloatTensor(np.append(trainx,
+                                            Ux,
+                                            axis=0)[:, ::params['binning']])
+        # xtens[xtens == 0.0] = torch.unique(xtens)[1]/1e10
+        ytens = torch.LongTensor(np.append(trainy,
+                                           np.full(shape=(Ux.shape[0],),
+                                                   axis=0)))
+
+        n_epochs = 100
+        xt = torch.Tensor(xtens).to(self.device)
+        yt = torch.LongTensor(ytens).to(self.device)
+        # saves history for max accuracy
+        acc_history = []
+        # set the model into training mode
+        # NOTE: change this to .eval() mode for testing and back again
+        eaat.train()
+        for epoch in range(n_epochs):
+            # Forward/backward pass for training semi-supervised model
+            out = eaat(xt)
+            # supervised + unsupervised loss
+            loss = xEnt(out, yt) + eaat.get_technique_cost(xt)
+            eaat_opt.zero_grad()
+            loss.backward()
+            eaat_opt.step()
+
+            eaat.eval()
+            eaat_pred = torch.max(eaat(
+                                    torch.FloatTensor(
+                                        testx.copy()[:, ::params['binning']]
+                                        )
+                                    ), 1)[-1]
+            acc = shadow.losses.accuracy(eaat_pred,
+                                         torch.LongTensor(testy.copy())
+                                         ).data.item()
+            acc_history.append(acc)
+        max_acc = np.max(acc_history[-20:])
+
+        return {'loss': 1-(max_acc/100.0),
+                'status': STATUS_OK,
+                'model': eaat,
+                'params': params,
+                'accuracy': (max_acc/100.0)}
+
+    def optimize(self, space, data_dict, max_evals=50, verbose=True):
+        '''
+        Wrapper method for using hyperopt (see utils.run_hyperopt
+        for more details). After hyperparameter optimization, results
+        are stored, the best model -overwrites- self.model, and the
+        best params -overwrite- self.params.
+        Inputs:
+        space: a hyperopt compliant dictionary with defined optimization
+            spaces. For example:
+                # quniform returns float, some parameters require int;
+                # use this to force int
+                space = {'hidden_layer' : scope.int(hp.quniform('hidden_layer',
+                                                        1000,
+                                                        10000,
+                                                        10)),
+                         'alpha'        : hp.uniform('alpha', 0.0001, 0.999),
+                         'xi'           : hp.uniform('xi', 1e-2, 1e0),
+                         'eps'          : hp.uniform('eps', 0.5, 1.5),
+                         'lr'           : hp.uniform('lr', 1e-3, 1e-1),
+                         'momentum'     : hp.uniform('momentum', 0.5, 0.99),
+                         'binning'      : scope.int(hp.quniform('binning',
+                                                                1,
+                                                                10,
+                                                                1))
+                        }
+            See hyperopt docs for more information.
+        data_dict: compact data representation with the five requisite
+            data structures used for training and testing an SSML model.
+            keys trainx, trainy, testx, testy, and Ux required.
+            NOTE: Uy is not needed since labels for unlabeled data
+            instances is not used.
+        max_evals: the number of epochs for hyperparameter optimization.
+            Each iteration is one set of hyperparameters trained
+            and tested on a fresh model. Convergence for simpler
+            models like logistic regression typically happens well
+            before 50 epochs, but can increase as more complex models,
+            more hyperparameters, and a larger hyperparameter space is tested.
+        verbose: boolean. If true, print results of hyperopt.
+            If false, print only the progress bar for optimization.
+        '''
+
+        best, worst = run_hyperopt(space=space,
+                                   model=self.fresh_start,
+                                   data_dict=data_dict,
+                                   max_evals=max_evals,
+                                   verbose=verbose)
+
+        # save the results of hyperparameter optimization
+        self.best = best
+        self.model = best['model']
+        self.params = best['params']
+        self.worst = worst
+
+    def train(self, trainx, trainy, Ux, testx=None, testy=None):
+        '''
+        Wrapper method for Shadow NN training method.
+        Inputs:
+        trainx: nxm feature vector/matrix for training model.
+        trainy: nxk class label vector/matrix for training model.
+        Ux: feature vector/matrix like labeled trainx but unlabeled data.
+        testx: feature vector/matrix used for testing the performance
+            of each model at every iteration.
+        testy: label vector used for testing the performance
+            of each model at every iteration.
+        '''
+
+        # avoid float round-off by using DoubleTensor
+        xtens = torch.FloatTensor(np.append(trainx,
+                                            Ux,
+                                            axis=0)[:,
+                                                    ::self.params['binning']])
+        # xtens[xtens == 0.0] = torch.unique(xtens)[1]/1e10
+        ytens = torch.LongTensor(np.append(trainy,
+                                           np.full(shape=(Ux.shape[0],),
+                                                   axis=0)))
+
+        n_epochs = 100
+        xt = torch.Tensor(xtens).to(self.device)
+        yt = torch.LongTensor(ytens).to(self.device)
+        # saves history for max accuracy
+        acc_history = []
+        # set the model into training mode
+        # NOTE: change this to .eval() mode for testing and back again
+        self.eaat.train()
+        for epoch in range(n_epochs):
+            # Forward/backward pass for training semi-supervised model
+            out = self.eaat(xt)
+            # supervised + unsupervised loss
+            loss = self.xEnt(out, yt) + self.eaat.get_technique_cost(xt)
+            self.eaat_opt.zero_grad()
+            loss.backward()
+            self.eaat_opt.step()
+
+            if testx is not None and testy is not None:
+                self.eaat.eval()
+                eaat_pred = torch.max(self.eaat(
+                                        torch.FloatTensor(
+                                            testx.copy()[:,
+                                                         ::self.params[
+                                                            'binning']
+                                                         ]
+                                            )
+                                        ), 1)[-1]
+                acc = shadow.losses.accuracy(eaat_pred,
+                                             torch.LongTensor(testy.copy())
+                                             ).data.item()
+                acc_history.append(acc)
+
+        # optionally return the training accuracy if test data was provided
+        return acc_history
+
+    def predict(self, testx, testy=None):
+        '''
+        Wrapper method for Shadow NN predict method.
+        Inputs:
+        testx: nxm feature vector/matrix for testing model.
+        testy: nxk class label vector/matrix for training model.
+            optional: if included, the predicted classes -and-
+            the resulting classification accuracy will be returned.
+        '''
+
+        self.eaat.eval()
+        eaat_pred = torch.max(self.eaat(
+                                torch.FloatTensor(
+                                    testx.copy()[:, ::self.params['binning']]
+                                    )
+                                ), 1)[-1]
+
+        acc = None
+        if testy is not None:
+            acc = shadow.losses.accuracy(eaat_pred,
+                                         torch.LongTensor(testy.copy())
+                                         ).data.item()
+
+        return eaat_pred, acc
+
+    def save(self, filename):
+        '''
+        Save class instance to file using joblib.
+        Inputs:
+        filename: string filename to save object to file under.
+            The file must be saved with extension .joblib.
+            Added to filename if not included as input.
+        '''
+
+        if filename[-7:] != '.joblib':
+            filename += '.joblib'
+        joblib.dump(self, filename)
diff --git a/scripts/ssl/cotraining.py b/scripts/ssl/cotraining.py
index 0d33971..dd961c2 100644
--- a/scripts/ssl/cotraining.py
+++ b/scripts/ssl/cotraining.py
@@ -269,11 +269,11 @@ def train(self, trainx, trainy, Ux,
         Inputs:
         trainx: nxm feature vector/matrix for training model.
         trainy: nxk class label vector/matrix for training model.
+        Ux: feature vector/matrix like labeled trainx but unlabeled data.
         testx: feature vector/matrix used for testing the performance
             of each model at every iteration.
         testy: label vector used for testing the performance
             of each model at every iteration.
-        Ux: feature vector/matrix like labeled trainx but unlabeled data.
         n_samples: the number of instances to sample and
             predict from Ux at one time
         seed: set the random seed of training splits for reproducibility
diff --git a/scripts/ssl/shadow_nn.py b/scripts/ssl/shadow_nn.py
deleted file mode 100644
index 59cde53..0000000
--- a/scripts/ssl/shadow_nn.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import numpy as np
-# For hyperopt (parameter optimization)
-from scripts.utils import STATUS_OK
-# torch imports
-import torch
-# shadow imports
-import shadow
-
-shadow.utils.set_seed(0)  # set seeds for reproducibility
-
-
-def model_factory(length=1000, hidden_layer=10000):
-    return torch.nn.Sequential(
-        torch.nn.Linear(length, hidden_layer),
-        torch.nn.ReLU(),
-        torch.nn.Linear(hidden_layer, length),
-        torch.nn.ReLU(),
-        torch.nn.Linear(length, 2)
-    )
-
-
-def f_nn(params):
-    device = torch.device('cpu')  # run on cpu, since model and data are very small
-    eaat = shadow.eaat.EAAT(model=model_factory(testx[:,::params['binning']].shape[1], params['hidden_layer']), alpha=params['alpha'], xi=params['xi'], eps=params['eps']).to(device)
-    eaat_opt = torch.optim.SGD(eaat.parameters(), lr=params['lr'], momentum=params['momentum'])
-    xEnt = torch.nn.CrossEntropyLoss(ignore_index=-1).to(device)
-
-    # avoid float round-off by using DoubleTensor
-    xtens = torch.FloatTensor(np.append(trainx, U[:,1:], axis=0)[:,::params['binning']])
-    # xtens[xtens == 0.0] = torch.unique(xtens)[1]/1e10
-    ytens = torch.LongTensor(np.append(trainy, U[:,0], axis=0))
-    #n_epochs = params['n_epochs']
-    n_epochs = 100
-    xt, yt = torch.Tensor(xtens).to(device), torch.LongTensor(ytens).to(device)
-    acc_history = []    # saves history for max accuracy
-    eaat.train()
-    for epoch in range(n_epochs):
-        # Forward/backward pass for training semi-supervised model
-        out = eaat(xt)
-        loss = xEnt(out, yt) + eaat.get_technique_cost(xt)  # supervised + unsupervised loss
-        eaat_opt.zero_grad()
-        loss.backward()
-        eaat_opt.step()
-    
-        eaat.eval()
-        eaat_pred = torch.max(eaat(torch.FloatTensor(testx.copy()[:,::params['binning']])), 1)[-1]
-        acc = shadow.losses.accuracy(eaat_pred, torch.LongTensor(testy.copy())).data.item()
-        acc_history.append(acc)
-    max_acc = np.max(acc_history[-50:])
-
-    return {'loss': 1-(max_acc/100.0),
-            'status': STATUS_OK,
-            'model': eaat,
-            'params': params,
-            'accuracy': (max_acc/100.0)}
\ No newline at end of file

From a629bb3d024f9df038b58d255edd87be4b997cdd Mon Sep 17 00:00:00 2001
From: Jordan Stomps <u9f@ulysses.ornl.gov>
Date: Fri, 12 Aug 2022 14:51:59 -0400
Subject: [PATCH 16/35] implementing Shadow EAAT CNN with hyperopt

---
 scripts/ssl/ShadowCNN.py  | 467 ++++++++++++++++++++++++++++++++------
 scripts/ssl/ShadowNN.py   |   2 +-
 scripts/ssl/cotraining.py |   2 +-
 3 files changed, 404 insertions(+), 67 deletions(-)

diff --git a/scripts/ssl/ShadowCNN.py b/scripts/ssl/ShadowCNN.py
index bc6a249..e1c5d7a 100644
--- a/scripts/ssl/ShadowCNN.py
+++ b/scripts/ssl/ShadowCNN.py
@@ -1,4 +1,5 @@
 import numpy as np
+import matplotlib.pyplot as plt
 # For hyperopt (parameter optimization)
 from scripts.utils import STATUS_OK
 # torch imports
@@ -11,21 +12,54 @@
 import shadow.losses
 import shadow.utils
 from shadow.utils import set_seed
+# diagnostics
+from scripts.utils import run_hyperopt
+import joblib
 
-set_seed(0)
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 class Net(nn.Module):
-    def __init__(self, layer1=32, layer2=64, layer3=128, kernel=3, drop_rate=0.1, length=1000):
+    '''
+    Neural Network constructor .
+    Also includes method for forward pass.
+    nn.Module: PyTorch object for neural networks.
+    Inputs:
+    layer1: int length for first layer.
+    layer2: int length for second layer.
+        Ideally a multiple of layer1.
+    layer3: int length for third layer.
+        Ideally a multiple of layer2.
+    kernel: convolutional kernel size.
+        NOTE: An optimal value is unclear for spectral data.
+    drop_rate: float (<1.) probability for reset/dropout layer.
+    length: single instance data length.
+        NOTE: Assumed to be 1000 for spectral data.
+    TODO: Allow hyperopt to optimize on arbitrary sized networks.
+    '''
+
+    def __init__(self, layer1=32, layer2=64, layer3=128,
+                 kernel=3, drop_rate=0.1, length=1000):
+        '''
+        Defines the structure for each type of layer.
+        The resulting network has fixed length but the
+        user can input arbitrary widths.
+        '''
         super(Net, self).__init__()
         self.conv1 = nn.Conv1d(1, layer1, kernel, 1)
         self.conv2 = nn.Conv1d(layer1, layer2, kernel, 1)
         self.dropout = nn.Dropout2d(drop_rate)
         self.fc1 = nn.Linear(int(layer2*(length-2*(kernel-1))/2), layer3)
-        #self.fc1 = nn.Linear(31744, 128)
+        # self.fc1 = nn.Linear(31744, 128)
         self.fc2 = nn.Linear(layer3, 2)
 
     def forward(self, x):
+        '''
+        The resulting network has a fixed length with
+        two convolutional layers divided by relu activation,
+        a max pooling layer, a dropout layer, and two
+        fully-connected layers separated by a relu and
+        dropout layers.
+        '''
+
         x = self.conv1(x)
         x = F.relu(x)
         x = self.conv2(x)
@@ -38,78 +72,381 @@ def forward(self, x):
         x = self.fc2(x)
         return x
 
+
 class SpectralDataset(torch.utils.data.Dataset):
+    '''
+    Dataset loader for use with PyTorch NN training.
+    torch.utils.data.Dataset: managing user input data for random sampling.
+    Inputs:
+    trainD: the nxm input vector/matrix of data.
+    labels: associated label vector for data.
+    '''
+
     def __init__(self, trainD, labels):
         self.labels = labels
         self.trainD = trainD
 
     def __len__(self):
+        '''
+        Define what length is for the Dataset
+        '''
+
         return len(self.labels)
 
     def __getitem__(self, idx):
+        '''
+        Define how to retrieve an instance from a dataset.
+        Inputs:
+        idx: the index to sample from.
+        '''
+
         label = self.labels[idx]
         data = self.trainD[idx]
         # no need to bother with labels, unpacking both anyways
-        #sample = {"Spectrum": data, "Class": label}
-        #return sample
+        # sample = {"Spectrum": data, "Class": label}
+        # return sample
         return data, label
 
-def eval(eaat, binning):
-    eaat.eval()
-    y_pred, y_true = [], []
-    for i, (data, targets) in enumerate(zip(torch.FloatTensor(testx.copy()[:,::binning]), torch.LongTensor(testy.copy()))):
-        x = data.reshape((1, 1, data.shape[0])).to(device)
-        y = targets.reshape((1,)).to(device)
-        out = eaat(x)
-        y_true.extend(y.detach().cpu().tolist())
-        y_pred.extend(torch.argmax(out, 1).detach().cpu().tolist())
-    test_acc = (np.array(y_true) == np.array(y_pred)).mean() * 100
-    #print('test accuracy: {}'.format(test_acc))
-    return test_acc
-
-def f_eaat(params):
-    #print(params)
-    # avoid float round-off by using DoubleTensor
-    xtens = torch.FloatTensor(np.append(trainx, U[:,1:], axis=0))[:,::params['binning']]
-    # xtens[xtens == 0.0] = torch.unique(xtens)[1]/1e10
-    ytens = torch.LongTensor(np.append(trainy, U[:,0], axis=0))
-    
-    #print(xtens.shape)
-    model = Net(layer1=params['layer1'], layer2=2*params['layer1'], layer3=3*params['layer1'], kernel=params['kernel'], drop_rate=params['drop_rate'], length=xtens.shape[1])
-    eaat = shadow.eaat.EAAT(model=model, alpha=params['alpha'], xi=params['xi'], eps=params['eps'])
-    optimizer = optim.SGD(eaat.parameters(), lr=params['lr'], momentum=params['momentum'])
-
-    # define data set object
-    MINOS_train = SpectralDataset(xtens, ytens)
-
-    # create DataLoader object of DataSet object
-    DL_DS = torch.utils.data.DataLoader(MINOS_train, batch_size=params['batch_size'], shuffle=True)
-
-    xEnt = torch.nn.CrossEntropyLoss(ignore_index=-1)
-
-    n_epochs = 50
-    eaat.to(device)
-    losscurve = []
-    evalcurve = []
-    for epoch in range(n_epochs):
-        eaat.train()
-        lossavg = []
-        for i, (data, targets) in enumerate(DL_DS):
-            x = data.reshape((data.shape[0], 1, data.shape[1])).to(device)
-            y = targets.to(device)
-            optimizer.zero_grad()
-            out = eaat(x)
-            loss = xEnt(out, y) + eaat.get_technique_cost(x)
-            loss.backward()
-            optimizer.step()
-            lossavg.append(loss.item())
-        losscurve.append(np.nanmedian(lossavg))
-        evalcurve.append(eval(eaat, params['binning']))
-    
-    max_acc = np.max(evalcurve[-25:])
-
-    return {'loss': 1-(max_acc/100.0),
-            'status': STATUS_OK,
-            'model': eaat,
-            'params': params,
-            'accuracy': (max_acc/100.0)}
\ No newline at end of file
+
+class ShadowCNN:
+    '''
+    Methods for deploying a Shadow CNN
+    implementation with hyperparameter optimization.
+    Data agnostic (i.e. user supplied data inputs).
+    TODO: Currently only supports binary classification.
+        Add multinomial functions and unit tests.
+        Add functionality for regression(?)
+    Inputs:
+    params: dictionary of logistic regression input functions.
+        keys binning, hidden_layer, alpha, xi, eps, lr, and momentum
+        are supported.
+    TODO: Include functionality for manipulating other
+        CNN architecture parameters in hyperparameter optimization
+    random_state: int/float for reproducible intiailization.
+    TODO: Add input parameter, loss_function, for the other
+        loss function options available in Shadow (besides EAAT).
+    '''
+
+    # only binary so far
+    def __init__(self, params=None, random_state=0):
+        # defaults to a fixed value for reproducibility
+        self.random_state = random_state
+        # set seeds for reproducibility
+        set_seed(0)
+        # device used for computation
+        self.device = torch.device("cuda" if
+                                   torch.cuda.is_available() else "cpu")
+        # dictionary of parameters for logistic regression model
+        self.params = params
+        if self.params is not None:
+            # assumes the input dimensions are measurements of 1000 bins
+            # TODO: Abstract this for arbitrary input size
+            self.model = Net(layer1=params['layer1'],
+                             layer2=2*params['layer1'],
+                             layer3=3*params['layer1'],
+                             kernel=params['kernel'],
+                             drop_rate=params['drop_rate'],
+                             length=1000)
+            self.eaat = shadow.eaat.EAAT(model=self.model,
+                                         alpha=params['alpha'],
+                                         xi=params['xi'],
+                                         eps=params['eps'])
+            self.optimizer = optim.SGD(self.eaat.parameters(),
+                                       lr=params['lr'],
+                                       momentum=params['momentum'])
+        else:
+            # assumes the input dimensions are measurements of 1000 bins
+            # TODO: Abstract this for arbitrary input size
+            self.model = Net()
+            self.eaat = shadow.eaat.EAAT(model=self.model)
+            self.optimizer = optim.SGD(self.eaat.parameters())
+
+    def fresh_start(self, params, data_dict):
+        '''
+        Required method for hyperopt optimization.
+        Trains and tests a fresh Shadow NN model
+        with given input parameters.
+        This method does not overwrite self.model (self.optimize() does).
+        Inputs:
+        params: dictionary of logistic regression input functions.
+            keys binning, layer1, alpha, xi, eps, lr, momentum,
+            kernel, drop_rate, and batch_size are supported.
+        data_dict: compact data representation with the four requisite
+            data structures used for training and testing a model.
+            keys trainx, trainy, testx, testy, and Ux required.
+            NOTE: Uy is not needed since labels for unlabeled data
+            instances is not used.
+        '''
+
+        # unpack data
+        trainx = data_dict['trainx']
+        trainy = data_dict['trainy']
+        testx = data_dict['testx']
+        testy = data_dict['testy']
+        # unlabeled co-training data
+        Ux = data_dict['Ux']
+
+        # avoid float round-off by using DoubleTensor
+        xtens = torch.FloatTensor(np.append(trainx,
+                                            Ux,
+                                            axis=0))[:, ::params['binning']]
+        # xtens[xtens == 0.0] = torch.unique(xtens)[1]/1e10
+        ytens = torch.LongTensor(np.append(trainy,
+                                           np.full(shape=(Ux.shape[0],),
+                                                   axis=0)))
+
+        model = Net(layer1=params['layer1'],
+                    layer2=2*params['layer1'],
+                    layer3=3*params['layer1'],
+                    kernel=params['kernel'],
+                    drop_rate=params['drop_rate'],
+                    length=xtens.shape[1])
+        eaat = shadow.eaat.EAAT(model=model,
+                                alpha=params['alpha'],
+                                xi=params['xi'],
+                                eps=params['eps'])
+        optimizer = optim.SGD(eaat.parameters(),
+                              lr=params['lr'],
+                              momentum=params['momentum'])
+
+        # define data set object
+        dataset = SpectralDataset(xtens, ytens)
+
+        # create DataLoader object of DataSet object
+        DL_DS = torch.utils.data.DataLoader(dataset,
+                                            batch_size=params['batch_size'],
+                                            shuffle=True)
+
+        # labels for unlabeled data are always "-1"
+        xEnt = torch.nn.CrossEntropyLoss(ignore_index=-1)
+
+        n_epochs = 100
+        eaat.to(self.device)
+        losscurve = []
+        evalcurve = []
+        for epoch in range(n_epochs):
+            eaat.train()
+            lossavg = []
+            for i, (data, targets) in enumerate(DL_DS):
+                x = data.reshape((data.shape[0],
+                                  1,
+                                  data.shape[1])).to(self.device)
+                y = targets.to(self.device)
+                optimizer.zero_grad()
+                out = eaat(x)
+                loss = xEnt(out, y) + eaat.get_technique_cost(x)
+                loss.backward()
+                optimizer.step()
+                lossavg.append(loss.item())
+            losscurve.append(np.nanmedian(lossavg))
+            evalcurve.append(self.predict(eaat,
+                                          testx,
+                                          testy,
+                                          params['binning']))
+
+        max_acc = np.max(evalcurve[-25:])
+
+        return {'loss': 1-(max_acc/100.0),
+                'status': STATUS_OK,
+                'model': eaat,
+                'params': params,
+                'losscurve': losscurve,
+                'evalcurve': evalcurve,
+                'accuracy': (max_acc/100.0)}
+
+    def optimize(self, space, data_dict, max_evals=50, verbose=True):
+        '''
+        Wrapper method for using hyperopt (see utils.run_hyperopt
+        for more details). After hyperparameter optimization, results
+        are stored, the best model -overwrites- self.model, and the
+        best params -overwrite- self.params.
+        Inputs:
+        space: a hyperopt compliant dictionary with defined optimization
+            spaces. For example:
+                # quniform returns float, some parameters require int;
+                # use this to force int
+                space = {'layer1'        : scope.int(hp.quniform('layer1',
+                                                                 1000,
+                                                                 10000,
+                                                                 10)),
+                         'kernel'        : scope.int(hp.quniform('kernel',
+                                                                 1,
+                                                                 9,
+                                                                 1)),
+                         'alpha'        : hp.uniform('alpha', 0.0001, 0.999),
+                         'xi'           : hp.uniform('xi', 1e-2, 1e0),
+                         'eps'          : hp.uniform('eps', 0.5, 1.5),
+                         'lr'           : hp.uniform('lr', 1e-3, 1e-1),
+                         'momentum'     : hp.uniform('momentum', 0.5, 0.99),
+                         'binning'      : scope.int(hp.quniform('binning',
+                                                                1,
+                                                                10,
+                                                                1)),
+                         'batch_szie'   : scope.int(hp.quniform('batch_size',
+                                                                1,
+                                                                100,
+                                                                1))
+                        }
+            See hyperopt docs for more information.
+        data_dict: compact data representation with the five requisite
+            data structures used for training and testing an SSML model.
+            keys trainx, trainy, testx, testy, and Ux required.
+            NOTE: Uy is not needed since labels for unlabeled data
+            instances is not used.
+        max_evals: the number of epochs for hyperparameter optimization.
+            Each iteration is one set of hyperparameters trained
+            and tested on a fresh model. Convergence for simpler
+            models like logistic regression typically happens well
+            before 50 epochs, but can increase as more complex models,
+            more hyperparameters, and a larger hyperparameter space is tested.
+        verbose: boolean. If true, print results of hyperopt.
+            If false, print only the progress bar for optimization.
+        '''
+
+        best, worst = run_hyperopt(space=space,
+                                   model=self.fresh_start,
+                                   data_dict=data_dict,
+                                   max_evals=max_evals,
+                                   verbose=verbose)
+
+        # save the results of hyperparameter optimization
+        self.best = best
+        self.model = best['model']
+        self.params = best['params']
+        self.worst = worst
+
+    def train(self, trainx, trainy, Ux, testx=None, testy=None):
+        '''
+        Wrapper method for Shadow NN training method.
+        Inputs:
+        trainx: nxm feature vector/matrix for training model.
+        trainy: nxk class label vector/matrix for training model.
+        Ux: feature vector/matrix like labeled trainx but unlabeled data.
+        testx: feature vector/matrix used for testing the performance
+            of each model at every iteration.
+        testy: label vector used for testing the performance
+            of each model at every iteration.
+        '''
+
+        # avoid float round-off by using DoubleTensor
+        xtens = torch.FloatTensor(np.append(trainx,
+                                            Ux,
+                                            axis=0))[:,
+                                                     ::self.params['binning']]
+        # xtens[xtens == 0.0] = torch.unique(xtens)[1]/1e10
+        ytens = torch.LongTensor(np.append(trainy,
+                                           np.full(shape=(Ux.shape[0],),
+                                                   axis=0)))
+
+        # define data set object
+        dataset = SpectralDataset(xtens, ytens)
+
+        # create DataLoader object of DataSet object
+        DL_DS = torch.utils.data.DataLoader(dataset,
+                                            batch_size=self.params[
+                                                        'batch_size'
+                                                        ],
+                                            shuffle=True)
+
+        # labels for unlabeled data are always "-1"
+        xEnt = torch.nn.CrossEntropyLoss(ignore_index=-1)
+
+        n_epochs = 100
+        self.eaat.to(self.device)
+        losscurve = []
+        evalcurve = []
+        for epoch in range(n_epochs):
+            self.eaat.train()
+            lossavg = []
+            for i, (data, targets) in enumerate(DL_DS):
+                x = data.reshape((data.shape[0],
+                                  1,
+                                  data.shape[1])).to(self.device)
+                y = targets.to(self.device)
+                self.optimizer.zero_grad()
+                out = self.eaat(x)
+                loss = xEnt(out, y) + self.eaat.get_technique_cost(x)
+                loss.backward()
+                self.optimizer.step()
+                lossavg.append(loss.item())
+            losscurve.append(np.nanmedian(lossavg))
+            evalcurve.append(self.predict(self.eaat,
+                                          testx,
+                                          testy,
+                                          self.params['binning']))
+
+        # optionally return the training accuracy if test data was provided
+        return losscurve, evalcurve
+
+    def predict(self, testx, testy=None, binning=1000):
+        '''
+        Wrapper method for Shadow NN predict method.
+        Inputs:
+        testx: nxm feature vector/matrix for testing model.
+        testy: nxk class label vector/matrix for training model.
+            optional: if included, the predicted classes -and-
+            the resulting classification accuracy will be returned.
+        '''
+
+        self.eaat.eval()
+        y_pred, y_true = [], []
+        for i, data in enumerate(torch.FloatTensor(testx.copy()[:,
+                                                                ::binning])):
+            x = data.reshape((1, 1, data.shape[0])).to(self.device)
+            out = self.eaat(x)
+            y_pred.extend(torch.argmax(out, 1).detach().cpu().tolist())
+        acc = None
+        if testy is not None:
+            y_true = torch.LongTensor(testy.copy())
+            acc = (np.array(y_true) == np.array(y_pred)).mean() * 100
+
+        return y_pred, acc
+
+    def plot_cotraining(self, filename='lr-cotraining-learningcurves.png',
+                        losscurve=None, evalcurve=None):
+        '''
+        Plots the training error curves for two co-training models.
+        NOTE: The user can either choose to plot what is stored in
+            the class instance by setting curves=None or
+            the curves can be inputted.
+        Inputs:
+        filename: name to store picture under.
+            Must end in .png (or will be added if missing).
+        losscurve: the loss value over training epochs
+        evalcurve: the accuracy scores over training epochs
+        '''
+
+        fig, (ax1, ax2) = plt.subplots(2,
+                                       1,
+                                       sharex=True,
+                                       figsize=(10, 8),
+                                       dpi=300)
+        if losscurve is not None and evalcurve is not None:
+            ax1.plot(losscurve)
+            ax2.plot(evalcurve)
+        else:
+            ax1.plot(self.best['losscurve'])
+            ax2.plot(self.best['evalcurve'])
+        ax1.set_xlabel('Epoch')
+        ax2.set_xlabel('Epoch')
+        ax1.set_ylabel('Loss Curve')
+        ax2.set_ylabel('Accuracy')
+        ax1.grid()
+        ax2.grid()
+
+        if filename[-4:] != '.png':
+            filename += '.png'
+        fig.savefig(filename)
+
+    def save(self, filename):
+        '''
+        Save class instance to file using joblib.
+        Inputs:
+        filename: string filename to save object to file under.
+            The file must be saved with extension .joblib.
+            Added to filename if not included as input.
+        '''
+
+        if filename[-7:] != '.joblib':
+            filename += '.joblib'
+        joblib.dump(self, filename)
diff --git a/scripts/ssl/ShadowNN.py b/scripts/ssl/ShadowNN.py
index 6c7377c..2bb2ce5 100644
--- a/scripts/ssl/ShadowNN.py
+++ b/scripts/ssl/ShadowNN.py
@@ -38,7 +38,7 @@ def __init__(self, params=None, random_state=0):
                                    torch.cuda.is_available() else "cpu")
         # dictionary of parameters for logistic regression model
         self.params = params
-        if self.params is None:
+        if self.params is not None:
             # assumes the input dimensions are measurements of 1000 bins
             # TODO: Abstract this for arbitrary input size
             self.eaat = shadow.eaat.EAAT(model=self.model_factory(
diff --git a/scripts/ssl/cotraining.py b/scripts/ssl/cotraining.py
index dd961c2..f3193fe 100644
--- a/scripts/ssl/cotraining.py
+++ b/scripts/ssl/cotraining.py
@@ -352,7 +352,7 @@ def plot_cotraining(self, filename='lr-cotraining-learningcurves.png',
         model2_accs: the accuracy scores over training epochs for model 2
         '''
 
-        fig, ax = plt.subplots(figsize=(10, 8))
+        fig, ax = plt.subplots(figsize=(10, 8), dpi=300)
         if model1_accs is not None and model2_accs is not None:
             ax.plot(np.arange(len(model1_accs)), model1_accs, label='Model 1')
             ax.plot(np.arange(len(model2_accs)), model2_accs, label='Model 2')

From ebe247a526a4df9b71bc413e5bc2a0d7093655e3 Mon Sep 17 00:00:00 2001
From: Jordan Stomps <u9f@ulysses.ornl.gov>
Date: Fri, 12 Aug 2022 15:19:09 -0400
Subject: [PATCH 17/35] adding functions for pca analysis

---
 scripts/utils.py | 116 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)

diff --git a/scripts/utils.py b/scripts/utils.py
index 38c2f5b..afe52c9 100644
--- a/scripts/utils.py
+++ b/scripts/utils.py
@@ -6,6 +6,9 @@
 from functools import partial
 # diagnostics
 from sklearn.metrics import confusion_matrix
+# pca
+from sklearn.preprocessing import StandardScaler
+from sklearn.decomposition import PCA
 
 
 def run_hyperopt(space, model, data_dict, max_evals=50, verbose=True):
@@ -52,6 +55,119 @@ def run_hyperopt(space, model, data_dict, max_evals=50, verbose=True):
     return best, worst
 
 
+def pca(Lx, Ly, Ux, Uy, filename):
+    '''
+    A function for computing and plotting 2D PCA.
+    Inputs:
+    Lx: labeled feature data.
+    Ly: class labels for labeled data.
+    Ux: unlabeled feature data.
+    Uy: labels for unlabeled data (all labels should be -1).
+    filename: filename for saved plot.
+        The file must be saved with extension .joblib.
+        Added to filename if not included as input.
+    '''
+
+    plt.rcParams.update({'font.size': 20})
+    # only saving colors for binary classification with unlabeled instances
+    col_dict = {-1: 'tab:gray', 0: 'tab:orange', 1: 'tab:blue'}
+
+    pcadata = np.append(Lx, Ux, axis=0)
+    normalizer = StandardScaler()
+    x = normalizer.fit_transform(pcadata)
+    print(np.mean(pcadata), np.std(pcadata))
+    print(np.mean(x), np.std(x))
+
+    pca = PCA(n_components=2)
+    pca.fit_transform(x)
+    print(pca.explained_variance_ratio_)
+    print(pca.singular_values_)
+    print(pca.components_)
+
+    principalComponents = pca.fit_transform(x)
+
+    fig, ax = plt.subplots(figsize=(10, 8))
+    ax.set_xlabel('Principal Component 1', fontsize=15)
+    ax.set_ylabel('Principal Component 2', fontsize=15)
+    for idx, color in col_dict.items():
+        indices = np.where(np.append(Ly, Uy, axis=0) == idx)[0]
+        ax.scatter(principalComponents[indices, 0],
+                   principalComponents[indices, 1],
+                   c=color,
+                   label='class '+str(idx))
+    ax.grid()
+    ax.legend()
+
+    if filename[-4:] != '.png':
+        filename += '.png'
+    fig.tight_layout()
+    fig.savefig(filename)
+
+
+def multiD_PCA(Lx, Ly, Ux, Uy, filename, n=2):
+    '''
+    A function for computing and plotting n-dimensional PCA.
+    Inputs:
+    Lx: labeled feature data.
+    Ly: class labels for labeled data.
+    Ux: unlabeled feature data.
+    Uy: labels for unlabeled data (all labels should be -1).
+    filename: filename for saved plot.
+        The file must be saved with extension .joblib.
+        Added to filename if not included as input.
+    n: number of singular values to include in PCA analysis.
+    '''
+
+    plt.rcParams.update({'font.size': 20})
+    # only saving colors for binary classification with unlabeled instances
+    col_dict = {-1: 'tab:gray', 0: 'tab:orange', 1: 'tab:blue'}
+
+    pcadata = np.append(Lx, Ux, axis=0)
+    normalizer = StandardScaler()
+    x = normalizer.fit_transform(pcadata)
+    print(np.mean(pcadata), np.std(pcadata))
+    print(np.mean(x), np.std(x))
+
+    n = 2
+    pca = PCA(n_components=n)
+    principalComponents = pca.fit_transform(x)
+    print(pca.explained_variance_ratio_)
+    print(pca.singular_values_)
+    print(pca.components_)
+
+    alph = ["A", "B", "C", "D", "E", "F", "G", "H",
+            "I", "J", "K", "L", "M", "N", "O", "P",
+            "Q", "R", "S", "T", "U", "V", "W", "X",
+            "Y", "Z"]
+    jobs = alph[:n]
+
+    fig, axes = plt.subplots(n, n, figsize=(15, 15))
+
+    for row in range(axes.shape[0]):
+        for col in range(axes.shape[1]):
+            ax = axes[row, col]
+            if row == col:
+                ax.tick_params(
+                    axis='both', which='both',
+                    bottom='off', top='off',
+                    labelbottom='off',
+                    left='off', right='off',
+                    labelleft='off'
+                )
+                ax.text(0.5, 0.5, jobs[row], horizontalalignment='center')
+            else:
+                for idx, color in col_dict.items():
+                    indices = np.where(np.append(Ly, Uy, axis=0) == idx)[0]
+                    ax.scatter(principalComponents[indices, row],
+                               principalComponents[indices, col],
+                               c=color,
+                               label='class '+str(idx))
+    fig.tight_layout()
+    if filename[-4:] != '.png':
+        filename += '.png'
+    fig.savefig(filename)
+
+
 def plot_cf(testy, predy, title, filename):
     '''
     Uses sklearn metric to compute a confusion matrix for visualization

From 7ae467133a429b79df880511850681b52fa7ca7a Mon Sep 17 00:00:00 2001
From: Jordan Stomps <u9f@ulysses.ornl.gov>
Date: Mon, 15 Aug 2022 09:20:21 -0400
Subject: [PATCH 18/35] rearranging model files

---
 {scripts => models}/LogReg.py                          | 0
 scripts/ssl/cotraining.py => models/SSML/CoTraining.py | 0
 {scripts/ssl => models/SSML}/LabelProp.py              | 0
 {scripts/ssl => models/SSML}/ShadowCNN.py              | 0
 {scripts/ssl => models/SSML}/ShadowNN.py               | 0
 5 files changed, 0 insertions(+), 0 deletions(-)
 rename {scripts => models}/LogReg.py (100%)
 rename scripts/ssl/cotraining.py => models/SSML/CoTraining.py (100%)
 rename {scripts/ssl => models/SSML}/LabelProp.py (100%)
 rename {scripts/ssl => models/SSML}/ShadowCNN.py (100%)
 rename {scripts/ssl => models/SSML}/ShadowNN.py (100%)

diff --git a/scripts/LogReg.py b/models/LogReg.py
similarity index 100%
rename from scripts/LogReg.py
rename to models/LogReg.py
diff --git a/scripts/ssl/cotraining.py b/models/SSML/CoTraining.py
similarity index 100%
rename from scripts/ssl/cotraining.py
rename to models/SSML/CoTraining.py
diff --git a/scripts/ssl/LabelProp.py b/models/SSML/LabelProp.py
similarity index 100%
rename from scripts/ssl/LabelProp.py
rename to models/SSML/LabelProp.py
diff --git a/scripts/ssl/ShadowCNN.py b/models/SSML/ShadowCNN.py
similarity index 100%
rename from scripts/ssl/ShadowCNN.py
rename to models/SSML/ShadowCNN.py
diff --git a/scripts/ssl/ShadowNN.py b/models/SSML/ShadowNN.py
similarity index 100%
rename from scripts/ssl/ShadowNN.py
rename to models/SSML/ShadowNN.py

From 6997a6ddb5b7d170f03380eab3804ec567a4cde4 Mon Sep 17 00:00:00 2001
From: Jordan Stomps <u9f@ulysses.ornl.gov>
Date: Mon, 15 Aug 2022 10:18:56 -0400
Subject: [PATCH 19/35] adding unit test for LogReg

---
 models/LogReg.py     |  2 +-
 models/__init__.py   |  0
 scripts/utils.py     |  4 +--
 tests/test_models.py | 82 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 85 insertions(+), 3 deletions(-)
 create mode 100644 models/__init__.py
 create mode 100644 tests/test_models.py

diff --git a/models/LogReg.py b/models/LogReg.py
index 6e619a2..a848ac6 100644
--- a/models/LogReg.py
+++ b/models/LogReg.py
@@ -1,5 +1,5 @@
 # For hyperopt (parameter optimization)
-from scripts.utils import STATUS_OK
+from hyperopt import STATUS_OK
 # sklearn models
 from sklearn import linear_model
 # diagnostics
diff --git a/models/__init__.py b/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/utils.py b/scripts/utils.py
index afe52c9..4c1c593 100644
--- a/scripts/utils.py
+++ b/scripts/utils.py
@@ -2,7 +2,7 @@
 import seaborn as sns
 import matplotlib.pyplot as plt
 # For hyperopt (parameter optimization)
-from scripts.utils import Trials, tpe, fmin
+from hyperopt import Trials, tpe, fmin
 from functools import partial
 # diagnostics
 from sklearn.metrics import confusion_matrix
@@ -33,7 +33,7 @@ def run_hyperopt(space, model, data_dict, max_evals=50, verbose=True):
     trials = Trials()
 
     # wrap data into objective function
-    fmin_objective = partial(model, data_dict=data_dict, device=None)
+    fmin_objective = partial(model, data_dict=data_dict)
 
     # run hyperopt
     fmin(fmin_objective,
diff --git a/tests/test_models.py b/tests/test_models.py
new file mode 100644
index 0000000..4c65016
--- /dev/null
+++ b/tests/test_models.py
@@ -0,0 +1,82 @@
+# diagnostics
+import numpy as np
+from datetime import datetime, timedelta
+# testing models
+from sklearn.model_selection import train_test_split
+import tests.test_data as test_data
+# hyperopt
+from hyperopt.pyll.base import scope
+from hyperopt import hp
+# models
+from models.LogReg import LogReg
+# testing write
+import joblib
+import os
+
+# initialize sample data
+start_date = datetime(2019, 2, 2)
+delta = timedelta(seconds=1)
+timestamps = np.arange(start_date,
+                       start_date + (test_data.timesteps * delta),
+                       delta).astype('datetime64[s]').astype('float64')
+
+live = np.full((len(timestamps),), test_data.livetime)
+sample_val = 1.0
+spectra = np.full((len(timestamps), test_data.energy_bins),
+                  np.full((1, test_data.energy_bins), sample_val))
+# setting up for rejected null hypothesis
+rejected_H0_time = np.random.choice(spectra.shape[0],
+                                    test_data.timesteps//2,
+                                    replace=False)
+spectra[rejected_H0_time] = 100.0
+
+labels = np.full((spectra.shape[0],), 0)
+labels[rejected_H0_time] = 1
+
+
+def test_LogReg():
+    X_train, X_test, y_train, y_test = train_test_split(spectra,
+                                                        labels,
+                                                        test_size=0.2,
+                                                        random_state=0)
+
+    # testing train and predict methods
+    print('------TESTING------')
+    print(spectra[rejected_H0_time])
+    print(timestamps[rejected_H0_time])
+
+    # default behavior
+    model = LogReg(params=None, random_state=0)
+    model.train(X_train, y_train)
+
+    pred, acc = model.predict(X_test, y_test)
+
+    assert acc > 0.7
+    np.testing.assert_equal(pred, y_test)
+
+    # testing hyperopt optimize methods
+    space = {'max_iter': scope.int(hp.quniform('max_iter',
+                                               10,
+                                               10000,
+                                               10)),
+             'tol': hp.loguniform('tol', 1e-5, 1e-1),
+             'C': hp.uniform('C', 0.001, 1000.0)
+             }
+    data_dict = {'trainx': X_train,
+                 'testx': X_test,
+                 'trainy': y_train,
+                 'testy': y_test
+                 }
+    model.optimize(space, data_dict, max_evals=50, verbose=True)
+
+    assert model.best['accuracy'] >= model.worst['accuracy']
+    assert model.best['status'] == 'ok'
+
+    # testing model write to file method
+    filename = 'test_LogReg'
+    ext = '.joblib'
+    model.save(filename)
+    model_file = joblib.load(filename+ext)
+    assert model_file.best['params'] == model.best['params']
+
+    os.remove(filename+ext)

From 73ce1f158cb1b9cb9693e49bd83e40c886922af6 Mon Sep 17 00:00:00 2001
From: Jordan Stomps <u9f@ulysses.ornl.gov>
Date: Mon, 15 Aug 2022 10:30:21 -0400
Subject: [PATCH 20/35] updating dependencies

---
 README.md        | 6 ++++++
 requirements.txt | 7 +++++++
 2 files changed, 13 insertions(+)

diff --git a/README.md b/README.md
index b08bd07..851d352 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,13 @@ Versions 3.6-3.9 are currently supported by tests. The following Python packages
 * h5py
 * numpy
 * progressbar2
+* matplotlib
+* seaborn
 * scipy
+* sklearn
+* hyperopt
+* pytorch
+* shadow-ssml
 
 Modules can be imported from the repository directory (e.g. `from RadClass.H0 import H0`) or `RadClass` can be installed using pip:
 
diff --git a/requirements.txt b/requirements.txt
index 06d1c3a..74e268f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,10 @@ numpy
 h5py
 progressbar2
 scipy>=1.7.0
+scikit-learn
+hyperopt
+matplotlib
+seaborn
+joblib
+pytorch
+shadow-ssml

From 98e33e81ed52e024b24ba2cd3da202493d407c6d Mon Sep 17 00:00:00 2001
From: Jordan Stomps <u9f@ulysses.ornl.gov>
Date: Mon, 15 Aug 2022 10:31:58 -0400
Subject: [PATCH 21/35] correcting pytorch package name

---
 README.md        | 2 +-
 requirements.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 851d352..42245fa 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ Versions 3.6-3.9 are currently supported by tests. The following Python packages
 * scipy
 * sklearn
 * hyperopt
-* pytorch
+* torch
 * shadow-ssml
 
 Modules can be imported from the repository directory (e.g. `from RadClass.H0 import H0`) or `RadClass` can be installed using pip:
diff --git a/requirements.txt b/requirements.txt
index 74e268f..8b22315 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,5 +7,5 @@ hyperopt
 matplotlib
 seaborn
 joblib
-pytorch
+torch
 shadow-ssml

From 12982cac1a212ac3b83b7de41ab447a27c696e97 Mon Sep 17 00:00:00 2001
From: Jordan Stomps <u9f@ulysses.ornl.gov>
Date: Mon, 15 Aug 2022 10:56:22 -0400
Subject: [PATCH 22/35] adding unit test for CoTraining

---
 models/SSML/CoTraining.py | 28 +++++++++---------
 models/SSML/__init__.py   |  0
 tests/test_models.py      | 62 +++++++++++++++++++++++++++++++++++----
 3 files changed, 70 insertions(+), 20 deletions(-)
 create mode 100644 models/SSML/__init__.py

diff --git a/models/SSML/CoTraining.py b/models/SSML/CoTraining.py
index f3193fe..ae2f9f5 100644
--- a/models/SSML/CoTraining.py
+++ b/models/SSML/CoTraining.py
@@ -1,7 +1,7 @@
 import numpy as np
 import matplotlib.pyplot as plt
 # For hyperopt (parameter optimization)
-from scripts.utils import STATUS_OK
+from hyperopt import STATUS_OK
 # sklearn models
 from sklearn import linear_model
 # diagnostics
@@ -156,9 +156,9 @@ def fresh_start(self, params, data_dict):
         U_lr = Ux.copy()
 
         # set the random seed of training splits for reproducibility
-        # This can be ignored by fixing params['seed'] to None
+        # This can be ignored by excluding params['seed']
         # in the hyperopt space dictionary
-        if params['seed'] is not None:
+        if 'seed' in params.keys():
             np.random.seed(params['seed'])
 
         # TODO: allow a user to specify uneven splits between the two models
@@ -192,8 +192,8 @@ def fresh_start(self, params, data_dict):
                                                 slr1, slr2,
                                                 L_lr1, L_lr2,
                                                 Ly_lr1, Ly_lr2,
-                                                U_lr, testx, testy,
-                                                params['n_samples']
+                                                U_lr, params['n_samples'],
+                                                testx, testy,
                                                 )
 
         # balanced_accuracy accounts for class imbalanced data
@@ -283,7 +283,7 @@ def train(self, trainx, trainy, Ux,
         U_lr = Ux.copy()
 
         # set the random seed of training splits for reproducibility
-        # This can be ignored by fixing params['seed'] to None
+        # This can be ignored by excluding params['seed']
         # in the hyperopt space dictionary
         if seed is not None:
             np.random.seed(seed)
@@ -301,14 +301,14 @@ def train(self, trainx, trainy, Ux,
         Ly_lr1 = trainy[idx].copy()
         Ly_lr2 = trainy[~idx].copy()
 
-        self.model1, self.model2,
-        model1_accs, model2_accs = self.training_loop(
-                                        self.model1, self.model2,
-                                        L_lr1, L_lr2,
-                                        Ly_lr1, Ly_lr2,
-                                        U_lr, testx, testy,
-                                        n_samples
-                                        )
+        self.model1, self.model2, model1_accs, model2_accs = \
+            self.training_loop(
+                                self.model1, self.model2,
+                                L_lr1, L_lr2,
+                                Ly_lr1, Ly_lr2,
+                                U_lr, n_samples,
+                                testx, testy,
+                                )
 
         # optional returns if a user is interested in training diagnostics
         return model1_accs, model2_accs
diff --git a/models/SSML/__init__.py b/models/SSML/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_models.py b/tests/test_models.py
index 4c65016..d47a3d1 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -9,6 +9,7 @@
 from hyperopt import hp
 # models
 from models.LogReg import LogReg
+from models.SSML.CoTraining import CoTraining
 # testing write
 import joblib
 import os
@@ -40,15 +41,11 @@ def test_LogReg():
                                                         test_size=0.2,
                                                         random_state=0)
 
-    # testing train and predict methods
-    print('------TESTING------')
-    print(spectra[rejected_H0_time])
-    print(timestamps[rejected_H0_time])
-
     # default behavior
     model = LogReg(params=None, random_state=0)
     model.train(X_train, y_train)
 
+    # testing train and predict methods
     pred, acc = model.predict(X_test, y_test)
 
     assert acc > 0.7
@@ -67,7 +64,60 @@ def test_LogReg():
                  'trainy': y_train,
                  'testy': y_test
                  }
-    model.optimize(space, data_dict, max_evals=50, verbose=True)
+    model.optimize(space, data_dict, max_evals=10, verbose=True)
+
+    assert model.best['accuracy'] >= model.worst['accuracy']
+    assert model.best['status'] == 'ok'
+
+    # testing model write to file method
+    filename = 'test_LogReg'
+    ext = '.joblib'
+    model.save(filename)
+    model_file = joblib.load(filename+ext)
+    assert model_file.best['params'] == model.best['params']
+
+    os.remove(filename+ext)
+
+
+def test_CoTraining():
+    X, Ux, y, Uy = train_test_split(spectra,
+                                    labels,
+                                    test_size=0.5,
+                                    random_state=0)
+    X_train, X_test, y_train, y_test = train_test_split(X,
+                                                        y,
+                                                        test_size=0.2,
+                                                        random_state=0)
+
+    # default behavior
+    model = CoTraining(params=None, random_state=0)
+    model.train(X_train, y_train, Ux)
+
+    # testing train and predict methods
+    pred, acc, *_ = model.predict(X_test, y_test)
+
+    assert acc > 0.7
+    np.testing.assert_equal(pred, y_test)
+
+    # testing hyperopt optimize methods
+    space = {'max_iter': scope.int(hp.quniform('max_iter',
+                                   10,
+                                   10000,
+                                   10)),
+             'tol': hp.loguniform('tol', 1e-5, 1e-3),
+             'C': hp.uniform('C', 1.0, 1000.0),
+             'n_samples': scope.int(hp.quniform('n_samples',
+                                    1,
+                                    20,
+                                    1))
+             }
+    data_dict = {'trainx': X_train,
+                 'testx': X_test,
+                 'trainy': y_train,
+                 'testy': y_test,
+                 'Ux': Ux
+                 }
+    model.optimize(space, data_dict, max_evals=10, verbose=True)
 
     assert model.best['accuracy'] >= model.worst['accuracy']
     assert model.best['status'] == 'ok'

From 1365e303a79e5521813f09089cc892d04c8f4f5c Mon Sep 17 00:00:00 2001
From: Jordan Stomps <u9f@ulysses.ornl.gov>
Date: Mon, 15 Aug 2022 11:18:12 -0400
Subject: [PATCH 23/35] adding unit test for LabelProp

---
 models/SSML/LabelProp.py |  2 +-
 tests/test_models.py     | 97 +++++++++++++++++++++++++++++++++++++---
 2 files changed, 92 insertions(+), 7 deletions(-)

diff --git a/models/SSML/LabelProp.py b/models/SSML/LabelProp.py
index aad970a..aa1e795 100644
--- a/models/SSML/LabelProp.py
+++ b/models/SSML/LabelProp.py
@@ -1,6 +1,6 @@
 import numpy as np
 # For hyperopt (parameter optimization)
-from scripts.utils import STATUS_OK
+from hyperopt import STATUS_OK
 # sklearn models
 from sklearn import semi_supervised
 # diagnostics
diff --git a/tests/test_models.py b/tests/test_models.py
index d47a3d1..f1c5e90 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -3,6 +3,7 @@
 from datetime import datetime, timedelta
 # testing models
 from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
 import tests.test_data as test_data
 # hyperopt
 from hyperopt.pyll.base import scope
@@ -10,6 +11,7 @@
 # models
 from models.LogReg import LogReg
 from models.SSML.CoTraining import CoTraining
+from models.SSML.LabelProp import LabelProp
 # testing write
 import joblib
 import os
@@ -41,6 +43,13 @@ def test_LogReg():
                                                         test_size=0.2,
                                                         random_state=0)
 
+    # normalization
+    normalizer = StandardScaler()
+    normalizer.fit(X_train)
+
+    X_train = normalizer.transform(X_train)
+    X_test = normalizer.transform(X_test)
+
     # default behavior
     model = LogReg(params=None, random_state=0)
     model.train(X_train, y_train)
@@ -89,6 +98,14 @@ def test_CoTraining():
                                                         test_size=0.2,
                                                         random_state=0)
 
+    # normalization
+    normalizer = StandardScaler()
+    normalizer.fit(X_train)
+
+    X_train = normalizer.transform(X_train)
+    X_test = normalizer.transform(X_test)
+    Ux = normalizer.transform(Ux)
+
     # default behavior
     model = CoTraining(params=None, random_state=0)
     model.train(X_train, y_train, Ux)
@@ -101,15 +118,83 @@ def test_CoTraining():
 
     # testing hyperopt optimize methods
     space = {'max_iter': scope.int(hp.quniform('max_iter',
-                                   10,
-                                   10000,
-                                   10)),
+                                               10,
+                                               10000,
+                                               10)),
              'tol': hp.loguniform('tol', 1e-5, 1e-3),
              'C': hp.uniform('C', 1.0, 1000.0),
              'n_samples': scope.int(hp.quniform('n_samples',
-                                    1,
-                                    20,
-                                    1))
+                                                1,
+                                                20,
+                                                1))
+             }
+    data_dict = {'trainx': X_train,
+                 'testx': X_test,
+                 'trainy': y_train,
+                 'testy': y_test,
+                 'Ux': Ux
+                 }
+    model.optimize(space, data_dict, max_evals=10, verbose=True)
+
+    assert model.best['accuracy'] >= model.worst['accuracy']
+    assert model.best['status'] == 'ok'
+
+    # testing model write to file method
+    filename = 'test_LogReg'
+    ext = '.joblib'
+    model.save(filename)
+    model_file = joblib.load(filename+ext)
+    assert model_file.best['params'] == model.best['params']
+
+    os.remove(filename+ext)
+
+
+def test_LabelProp():
+    X, Ux, y, Uy = train_test_split(spectra,
+                                    labels,
+                                    test_size=0.5,
+                                    random_state=0)
+    X_train, X_test, y_train, y_test = train_test_split(X,
+                                                        y,
+                                                        test_size=0.2,
+                                                        random_state=0)
+
+    # normalization
+    normalizer = StandardScaler()
+    normalizer.fit(X_train)
+
+    X_train = normalizer.transform(X_train)
+    X_test = normalizer.transform(X_test)
+    Ux = normalizer.transform(Ux)
+
+    # default behavior
+    model = LabelProp(params=None, random_state=0)
+    model.train(X_train, y_train, Ux)
+
+    # testing train and predict methods
+    pred, acc = model.predict(X_test, y_test)
+
+    # the default n_neighbors(=7) from sklearn is too large
+    # for the size of this dataset
+    # therefore the accuracy is expected to be poor
+    # a better value for this dataset would be n_neighbors=2
+    # (tested when specifying params in LabelProp.__init__)
+    assert acc >= 0.5
+    # uninteresting test if LabelProp predicts all one class
+    # TODO: make the default params test meaningful
+    assert np.count_nonzero(pred == y_test) > 0
+
+    # testing hyperopt optimize methods
+    space = {'max_iter': scope.int(hp.quniform('max_iter',
+                                               10,
+                                               10000,
+                                               10)),
+             'tol': hp.loguniform('tol', 1e-6, 1e-4),
+             'gamma': hp.uniform('gamma', 1, 50),
+             'n_neighbors': scope.int(hp.quniform('n_neighbors',
+                                                  1,
+                                                  X_train.shape[0],
+                                                  1))
              }
     data_dict = {'trainx': X_train,
                  'testx': X_test,

From c97136d6d8a2cafe621c18bd4e7bb3225eb9bae1 Mon Sep 17 00:00:00 2001
From: Jordan Stomps <u9f@ulysses.ornl.gov>
Date: Mon, 15 Aug 2022 11:48:35 -0400
Subject: [PATCH 24/35] adding unit test for ShadowNN

---
 .github/workflows/python-package.yml |  2 +-
 models/SSML/ShadowNN.py              | 18 ++++---
 tests/test_BackgroundEstimator.py    |  1 -
 tests/test_models.py                 | 77 +++++++++++++++++++++++++---
 4 files changed, 82 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index d88f9c7..48b3474 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -41,7 +41,7 @@ jobs:
     - name: Test with pytest
       run: |
         python3 -m pytest
-        python3 -m coverage run --source=./RadClass/ -m pytest
+        python3 -m coverage run --source=./RadClass/,./models/ -m pytest
         python3 -m coverage report
         python3 -m coverage html
         COVERALLS_REPO_TOKEN=${{ secrets.COVERALLS_REPO_TOKEN }} python3 -m coveralls --service=github
diff --git a/models/SSML/ShadowNN.py b/models/SSML/ShadowNN.py
index 2bb2ce5..f178b6c 100644
--- a/models/SSML/ShadowNN.py
+++ b/models/SSML/ShadowNN.py
@@ -1,10 +1,13 @@
 import numpy as np
 # For hyperopt (parameter optimization)
-from scripts.utils import STATUS_OK
+from hyperopt import STATUS_OK
 # torch imports
 import torch
 # shadow imports
-import shadow
+import shadow.eaat
+import shadow.losses
+import shadow.utils
+from shadow.utils import set_seed
 # diagnostics
 from scripts.utils import run_hyperopt
 import joblib
@@ -32,7 +35,7 @@ def __init__(self, params=None, random_state=0):
         # defaults to a fixed value for reproducibility
         self.random_state = random_state
         # set seeds for reproducibility
-        shadow.utils.set_seed(0)
+        set_seed(0)
         # device used for computation
         self.device = torch.device("cuda" if
                                    torch.cuda.is_available() else "cpu")
@@ -58,7 +61,8 @@ def __init__(self, params=None, random_state=0):
             # assumes the input dimensions are measurements of 1000 bins
             self.eaat = shadow.eaat.EAAT(
                             model=self.model_factory()).to(self.device)
-            self.eaat_opt = torch.optim.SGD(self.eaat.parameters())
+            self.eaat_opt = torch.optim.SGD(self.eaat.parameters(),
+                                            lr=0.1, momentum=0.9)
             # unlabeled instances always have a label of "-1"
             self.xEnt = torch.nn.CrossEntropyLoss(
                             ignore_index=-1).to(self.device)
@@ -115,7 +119,8 @@ def fresh_start(self, params, data_dict):
         # xtens[xtens == 0.0] = torch.unique(xtens)[1]/1e10
         ytens = torch.LongTensor(np.append(trainy,
                                            np.full(shape=(Ux.shape[0],),
-                                                   axis=0)))
+                                                   fill_value=-1),
+                                           axis=0))
 
         n_epochs = 100
         xt = torch.Tensor(xtens).to(self.device)
@@ -226,7 +231,8 @@ def train(self, trainx, trainy, Ux, testx=None, testy=None):
         # xtens[xtens == 0.0] = torch.unique(xtens)[1]/1e10
         ytens = torch.LongTensor(np.append(trainy,
                                            np.full(shape=(Ux.shape[0],),
-                                                   axis=0)))
+                                                   fill_value=-1),
+                                           axis=0))
 
         n_epochs = 100
         xt = torch.Tensor(xtens).to(self.device)
diff --git a/tests/test_BackgroundEstimator.py b/tests/test_BackgroundEstimator.py
index 2d10c89..efc1299 100644
--- a/tests/test_BackgroundEstimator.py
+++ b/tests/test_BackgroundEstimator.py
@@ -77,7 +77,6 @@ def test_write():
     bckg.write(ofilename=ofilename)
 
     results = np.loadtxt(fname=ofilename+'.csv', delimiter=',')
-    print(results)
 
     # the resulting observation should be:
     #   counts * integration / live-time
diff --git a/tests/test_models.py b/tests/test_models.py
index f1c5e90..c748845 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -12,6 +12,7 @@
 from models.LogReg import LogReg
 from models.SSML.CoTraining import CoTraining
 from models.SSML.LabelProp import LabelProp
+from models.SSML.ShadowNN import ShadowNN
 # testing write
 import joblib
 import os
@@ -150,6 +151,8 @@ def test_CoTraining():
 
 
 def test_LabelProp():
+    # there should be no normalization on LabelProp data
+    # since it depends on the distances between samples
     X, Ux, y, Uy = train_test_split(spectra,
                                     labels,
                                     test_size=0.5,
@@ -159,14 +162,6 @@ def test_LabelProp():
                                                         test_size=0.2,
                                                         random_state=0)
 
-    # normalization
-    normalizer = StandardScaler()
-    normalizer.fit(X_train)
-
-    X_train = normalizer.transform(X_train)
-    X_test = normalizer.transform(X_test)
-    Ux = normalizer.transform(Ux)
-
     # default behavior
     model = LabelProp(params=None, random_state=0)
     model.train(X_train, y_train, Ux)
@@ -215,3 +210,69 @@ def test_LabelProp():
     assert model_file.best['params'] == model.best['params']
 
     os.remove(filename+ext)
+
+
+def test_ShadowNN():
+    X, Ux, y, Uy = train_test_split(spectra,
+                                    labels,
+                                    test_size=0.5,
+                                    random_state=0)
+    X_train, X_test, y_train, y_test = train_test_split(X,
+                                                        y,
+                                                        test_size=0.2,
+                                                        random_state=0)
+
+    # normalization
+    normalizer = StandardScaler()
+    normalizer.fit(X_train)
+
+    X_train = normalizer.transform(X_train)
+    X_test = normalizer.transform(X_test)
+    Ux = normalizer.transform(Ux)
+
+    # default behavior
+    model = ShadowNN(params=None, random_state=0)
+    model.train(X_train, y_train, Ux)
+
+    # testing train and predict methods
+    pred, acc = model.predict(X_test, y_test)
+
+    # Shadow/PyTorch reports accuracies as percentages
+    # rather than decimals
+    assert acc >= 50.
+    np.testing.assert_equal(pred, y_test)
+
+    # testing hyperopt optimize methods
+    space = {'hidden_layer': scope.int(hp.quniform('hidden_layer',
+                                                   1000,
+                                                   10000,
+                                                   10)),
+             'alpha': hp.uniform('alpha', 0.0001, 0.999),
+             'xi': hp.uniform('xi', 1e-2, 1e0),
+             'eps': hp.uniform('eps', 0.5, 1.5),
+             'lr': hp.uniform('lr', 1e-3, 1e-1),
+             'momentum': hp.uniform('momentum', 0.5, 0.99),
+             'binning': scope.int(hp.quniform('binning',
+                                              1,
+                                              10,
+                                              1))
+             }
+    data_dict = {'trainx': X_train,
+                 'testx': X_test,
+                 'trainy': y_train,
+                 'testy': y_test,
+                 'Ux': Ux
+                 }
+    model.optimize(space, data_dict, max_evals=5, verbose=True)
+
+    assert model.best['accuracy'] >= model.worst['accuracy']
+    assert model.best['status'] == 'ok'
+
+    # testing model write to file method
+    filename = 'test_LogReg'
+    ext = '.joblib'
+    model.save(filename)
+    model_file = joblib.load(filename+ext)
+    assert model_file.best['params'] == model.best['params']
+
+    os.remove(filename+ext)

From 554eb05bca84265a8754c60597566ef90a4b072b Mon Sep 17 00:00:00 2001
From: Jordan Stomps <u9f@ulysses.ornl.gov>
Date: Mon, 15 Aug 2022 11:49:11 -0400
Subject: [PATCH 25/35] including utils scripts in unit tests coverage

---
 .github/workflows/python-package.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 48b3474..973f71c 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -41,7 +41,7 @@ jobs:
     - name: Test with pytest
       run: |
         python3 -m pytest
-        python3 -m coverage run --source=./RadClass/,./models/ -m pytest
+        python3 -m coverage run --source=./RadClass/,./models/,./scripts/ -m pytest
         python3 -m coverage report
         python3 -m coverage html
         COVERALLS_REPO_TOKEN=${{ secrets.COVERALLS_REPO_TOKEN }} python3 -m coveralls --service=github

From 20f768ebe4d4a2ea37c92d59894b1b62552df980 Mon Sep 17 00:00:00 2001
From: Jordan Stomps <u9f@ulysses.ornl.gov>
Date: Mon, 15 Aug 2022 14:36:01 -0400
Subject: [PATCH 26/35] error: training NNs takes too long for a unit test, let
 alone hyperopt

---
 models/SSML/ShadowCNN.py |  48 +++++++++++-------
 tests/test_models.py     | 106 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 132 insertions(+), 22 deletions(-)

diff --git a/models/SSML/ShadowCNN.py b/models/SSML/ShadowCNN.py
index e1c5d7a..0d0651f 100644
--- a/models/SSML/ShadowCNN.py
+++ b/models/SSML/ShadowCNN.py
@@ -1,7 +1,7 @@
 import numpy as np
 import matplotlib.pyplot as plt
 # For hyperopt (parameter optimization)
-from scripts.utils import STATUS_OK
+from hyperopt import STATUS_OK
 # torch imports
 import torch
 import torch.nn as nn
@@ -19,7 +19,7 @@
 
 class Net(nn.Module):
     '''
-    Neural Network constructor .
+    Neural Network constructor.
     Also includes method for forward pass.
     nn.Module: PyTorch object for neural networks.
     Inputs:
@@ -155,11 +155,14 @@ def __init__(self, params=None, random_state=0):
                                        lr=params['lr'],
                                        momentum=params['momentum'])
         else:
+            # fixed value defaults needed by training algorithm
+            self.params = {'binning': 1, 'batch_size': 1}
             # assumes the input dimensions are measurements of 1000 bins
             # TODO: Abstract this for arbitrary input size
             self.model = Net()
             self.eaat = shadow.eaat.EAAT(model=self.model)
-            self.optimizer = optim.SGD(self.eaat.parameters())
+            self.optimizer = optim.SGD(self.eaat.parameters(),
+                                       lr=0.1, momentum=0.9)
 
     def fresh_start(self, params, data_dict):
         '''
@@ -193,7 +196,8 @@ def fresh_start(self, params, data_dict):
         # xtens[xtens == 0.0] = torch.unique(xtens)[1]/1e10
         ytens = torch.LongTensor(np.append(trainy,
                                            np.full(shape=(Ux.shape[0],),
-                                                   axis=0)))
+                                                   fill_value=-1),
+                                           axis=0))
 
         model = Net(layer1=params['layer1'],
                     layer2=2*params['layer1'],
@@ -239,10 +243,11 @@ def fresh_start(self, params, data_dict):
                 optimizer.step()
                 lossavg.append(loss.item())
             losscurve.append(np.nanmedian(lossavg))
-            evalcurve.append(self.predict(eaat,
-                                          testx,
-                                          testy,
-                                          params['binning']))
+            if testx is not None and testy is not None:
+                evalcurve.append(self.predict(testx,
+                                              testy,
+                                              params['binning'],
+                                              eaat))
 
         max_acc = np.max(evalcurve[-25:])
 
@@ -282,7 +287,7 @@ def optimize(self, space, data_dict, max_evals=50, verbose=True):
                                                                 1,
                                                                 10,
                                                                 1)),
-                         'batch_szie'   : scope.int(hp.quniform('batch_size',
+                         'batch_size'   : scope.int(hp.quniform('batch_size',
                                                                 1,
                                                                 100,
                                                                 1))
@@ -336,7 +341,8 @@ def train(self, trainx, trainy, Ux, testx=None, testy=None):
         # xtens[xtens == 0.0] = torch.unique(xtens)[1]/1e10
         ytens = torch.LongTensor(np.append(trainy,
                                            np.full(shape=(Ux.shape[0],),
-                                                   axis=0)))
+                                                   fill_value=-1),
+                                           axis=0))
 
         # define data set object
         dataset = SpectralDataset(xtens, ytens)
@@ -370,15 +376,16 @@ def train(self, trainx, trainy, Ux, testx=None, testy=None):
                 self.optimizer.step()
                 lossavg.append(loss.item())
             losscurve.append(np.nanmedian(lossavg))
-            evalcurve.append(self.predict(self.eaat,
-                                          testx,
-                                          testy,
-                                          self.params['binning']))
+            if testx is not None and testy is not None:
+                evalcurve.append(self.predict(testx,
+                                              testy,
+                                              self.params['binning'],
+                                              self.eaat))
 
         # optionally return the training accuracy if test data was provided
         return losscurve, evalcurve
 
-    def predict(self, testx, testy=None, binning=1000):
+    def predict(self, testx, testy=None, binning=1, eaat=None):
         '''
         Wrapper method for Shadow NN predict method.
         Inputs:
@@ -386,14 +393,21 @@ def predict(self, testx, testy=None, binning=1000):
         testy: nxk class label vector/matrix for training model.
             optional: if included, the predicted classes -and-
             the resulting classification accuracy will be returned.
+        binning: int number of bins sampled in feature vector
+        model: optional input for testing a given model in hyperparameter
+            optimization rather than the class saved model.
         '''
 
-        self.eaat.eval()
+        if eaat is not None:
+            eval_model = eaat
+        else:
+            eval_model = self.eaat
+        eval_model.eval()
         y_pred, y_true = [], []
         for i, data in enumerate(torch.FloatTensor(testx.copy()[:,
                                                                 ::binning])):
             x = data.reshape((1, 1, data.shape[0])).to(self.device)
-            out = self.eaat(x)
+            out = eval_model(x)
             y_pred.extend(torch.argmax(out, 1).detach().cpu().tolist())
         acc = None
         if testy is not None:
diff --git a/tests/test_models.py b/tests/test_models.py
index c748845..75350c4 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -13,6 +13,7 @@
 from models.SSML.CoTraining import CoTraining
 from models.SSML.LabelProp import LabelProp
 from models.SSML.ShadowNN import ShadowNN
+from models.SSML.ShadowCNN import ShadowCNN
 # testing write
 import joblib
 import os
@@ -74,7 +75,7 @@ def test_LogReg():
                  'trainy': y_train,
                  'testy': y_test
                  }
-    model.optimize(space, data_dict, max_evals=10, verbose=True)
+    model.optimize(space, data_dict, max_evals=2, verbose=True)
 
     assert model.best['accuracy'] >= model.worst['accuracy']
     assert model.best['status'] == 'ok'
@@ -135,7 +136,7 @@ def test_CoTraining():
                  'testy': y_test,
                  'Ux': Ux
                  }
-    model.optimize(space, data_dict, max_evals=10, verbose=True)
+    model.optimize(space, data_dict, max_evals=2, verbose=True)
 
     assert model.best['accuracy'] >= model.worst['accuracy']
     assert model.best['status'] == 'ok'
@@ -197,7 +198,7 @@ def test_LabelProp():
                  'testy': y_test,
                  'Ux': Ux
                  }
-    model.optimize(space, data_dict, max_evals=10, verbose=True)
+    model.optimize(space, data_dict, max_evals=2, verbose=True)
 
     assert model.best['accuracy'] >= model.worst['accuracy']
     assert model.best['status'] == 'ok'
@@ -230,6 +231,15 @@ def test_ShadowNN():
     X_test = normalizer.transform(X_test)
     Ux = normalizer.transform(Ux)
 
+    params = {'layer1': 4,
+              'kernel': 3,
+              'alpha': 0.1,
+              'xi': 1e-3,
+              'eps': 1.0,
+              'lr': 0.1,
+              'momentum': 0.9,
+              'binning': 5,
+              'batch_size': 2}
     # default behavior
     model = ShadowNN(params=None, random_state=0)
     model.train(X_train, y_train, Ux)
@@ -241,7 +251,7 @@ def test_ShadowNN():
     # rather than decimals
     assert acc >= 50.
     np.testing.assert_equal(pred, y_test)
-
+    '''
     # testing hyperopt optimize methods
     space = {'hidden_layer': scope.int(hp.quniform('hidden_layer',
                                                    1000,
@@ -263,11 +273,97 @@ def test_ShadowNN():
                  'testy': y_test,
                  'Ux': Ux
                  }
-    model.optimize(space, data_dict, max_evals=5, verbose=True)
+    model.optimize(space, data_dict, max_evals=2, verbose=True)
 
     assert model.best['accuracy'] >= model.worst['accuracy']
     assert model.best['status'] == 'ok'
+    '''
+    # testing model write to file method
+    filename = 'test_LogReg'
+    ext = '.joblib'
+    model.save(filename)
+    model_file = joblib.load(filename+ext)
+    assert model_file.best['params'] == model.best['params']
+
+    os.remove(filename+ext)
+
+
+def test_ShadowCNN():
+    X, Ux, y, Uy = train_test_split(spectra,
+                                    labels,
+                                    test_size=0.5,
+                                    random_state=0)
+    X_train, X_test, y_train, y_test = train_test_split(X,
+                                                        y,
+                                                        test_size=0.2,
+                                                        random_state=0)
+
+    # normalization
+    normalizer = StandardScaler()
+    normalizer.fit(X_train)
+
+    X_train = normalizer.transform(X_train)
+    X_test = normalizer.transform(X_test)
+    Ux = normalizer.transform(Ux)
+
+    params = {'layer1': 4,
+              'kernel': 3,
+              'alpha': 0.1,
+              'xi': 1e-3,
+              'eps': 1.0,
+              'lr': 0.1,
+              'momentum': 0.9,
+              'binning': 1,
+              'batch_size': 2,
+              'drop_rate': 0.1}
 
+    # default behavior
+    model = ShadowCNN(params=params, random_state=0)
+    model.train(X_train, y_train, Ux)
+
+    # testing train and predict methods
+    pred, acc = model.predict(X_test, y_test)
+
+    # Shadow/PyTorch reports accuracies as percentages
+    # rather than decimals
+    assert acc >= 50.
+    np.testing.assert_equal(pred, y_test)
+
+    '''
+    # testing hyperopt optimize methods
+    space = {'layer1': scope.int(hp.quniform('layer1',
+                                             1000,
+                                             10000,
+                                             10)),
+             'kernel': scope.int(hp.quniform('kernel',
+                                             1,
+                                             9,
+                                             1)),
+             'alpha': hp.uniform('alpha', 0.0001, 0.999),
+             'xi': hp.uniform('xi', 1e-2, 1e0),
+             'eps': hp.uniform('eps', 0.5, 1.5),
+             'lr': hp.uniform('lr', 1e-3, 1e-1),
+             'momentum': hp.uniform('momentum', 0.5, 0.99),
+             'binning': scope.int(hp.quniform('binning',
+                                  1,
+                                  10,
+                                  1)),
+             'batch_size': scope.int(hp.quniform('batch_size',
+                                     1,
+                                     100,
+                                     1))
+             }
+    data_dict = {'trainx': X_train,
+                 'testx': X_test,
+                 'trainy': y_train,
+                 'testy': y_test,
+                 'Ux': Ux
+                 }
+    model.optimize(space, data_dict, max_evals=2, verbose=True)
+
+    assert model.best['accuracy'] >= model.worst['accuracy']
+    assert model.best['status'] == 'ok'
+    '''
     # testing model write to file method
     filename = 'test_LogReg'
     ext = '.joblib'

From 5d17d8ccda0ee0e6122516568090d385d63b6678 Mon Sep 17 00:00:00 2001
From: Jordan Stomps <stomps@wisc.edu>
Date: Mon, 15 Aug 2022 17:38:40 -0400
Subject: [PATCH 27/35] error: these cnns are so bad that they can't even make
 predictions

---
 models/SSML/ShadowCNN.py | 20 ++++++-----
 models/SSML/ShadowNN.py  |  7 ++--
 tests/test_models.py     | 77 +++++++++++++++-------------------------
 3 files changed, 44 insertions(+), 60 deletions(-)

diff --git a/models/SSML/ShadowCNN.py b/models/SSML/ShadowCNN.py
index 0d0651f..039b9c5 100644
--- a/models/SSML/ShadowCNN.py
+++ b/models/SSML/ShadowCNN.py
@@ -47,7 +47,7 @@ def __init__(self, layer1=32, layer2=64, layer3=128,
         self.conv1 = nn.Conv1d(1, layer1, kernel, 1)
         self.conv2 = nn.Conv1d(layer1, layer2, kernel, 1)
         self.dropout = nn.Dropout2d(drop_rate)
-        self.fc1 = nn.Linear(int(layer2*(length-2*(kernel-1))/2), layer3)
+        self.fc1 = nn.Linear(int(layer1*(length-(kernel))), layer3)
         # self.fc1 = nn.Linear(31744, 128)
         self.fc2 = nn.Linear(layer3, 2)
 
@@ -123,12 +123,13 @@ class ShadowCNN:
     TODO: Include functionality for manipulating other
         CNN architecture parameters in hyperparameter optimization
     random_state: int/float for reproducible intiailization.
+    length: int input length (i.e. dimensions of feature vectors)
     TODO: Add input parameter, loss_function, for the other
         loss function options available in Shadow (besides EAAT).
     '''
 
     # only binary so far
-    def __init__(self, params=None, random_state=0):
+    def __init__(self, params=None, random_state=0, length=1000):
         # defaults to a fixed value for reproducibility
         self.random_state = random_state
         # set seeds for reproducibility
@@ -146,7 +147,7 @@ def __init__(self, params=None, random_state=0):
                              layer3=3*params['layer1'],
                              kernel=params['kernel'],
                              drop_rate=params['drop_rate'],
-                             length=1000)
+                             length=np.ceil(length/params['binning']))
             self.eaat = shadow.eaat.EAAT(model=self.model,
                                          alpha=params['alpha'],
                                          xi=params['xi'],
@@ -180,7 +181,8 @@ def fresh_start(self, params, data_dict):
             NOTE: Uy is not needed since labels for unlabeled data
             instances is not used.
         '''
-
+        
+        self.params = params
         # unpack data
         trainx = data_dict['trainx']
         trainy = data_dict['trainy']
@@ -204,7 +206,7 @@ def fresh_start(self, params, data_dict):
                     layer3=3*params['layer1'],
                     kernel=params['kernel'],
                     drop_rate=params['drop_rate'],
-                    length=xtens.shape[1])
+                    length=np.ceil(trainx.shape[1]/params['binning']))
         eaat = shadow.eaat.EAAT(model=model,
                                 alpha=params['alpha'],
                                 xi=params['xi'],
@@ -246,7 +248,6 @@ def fresh_start(self, params, data_dict):
             if testx is not None and testy is not None:
                 evalcurve.append(self.predict(testx,
                                               testy,
-                                              params['binning'],
                                               eaat))
 
         max_acc = np.max(evalcurve[-25:])
@@ -385,7 +386,7 @@ def train(self, trainx, trainy, Ux, testx=None, testy=None):
         # optionally return the training accuracy if test data was provided
         return losscurve, evalcurve
 
-    def predict(self, testx, testy=None, binning=1, eaat=None):
+    def predict(self, testx, testy=None, eaat=None):
         '''
         Wrapper method for Shadow NN predict method.
         Inputs:
@@ -404,8 +405,9 @@ def predict(self, testx, testy=None, binning=1, eaat=None):
             eval_model = self.eaat
         eval_model.eval()
         y_pred, y_true = [], []
-        for i, data in enumerate(torch.FloatTensor(testx.copy()[:,
-                                                                ::binning])):
+        for i, data in enumerate(torch.FloatTensor(
+                                    testx.copy()[:, ::self.params['binning']])
+                                 ):
             x = data.reshape((1, 1, data.shape[0])).to(self.device)
             out = eval_model(x)
             y_pred.extend(torch.argmax(out, 1).detach().cpu().tolist())
diff --git a/models/SSML/ShadowNN.py b/models/SSML/ShadowNN.py
index f178b6c..e31e26e 100644
--- a/models/SSML/ShadowNN.py
+++ b/models/SSML/ShadowNN.py
@@ -31,9 +31,10 @@ class ShadowNN:
     '''
 
     # only binary so far
-    def __init__(self, params=None, random_state=0):
+    def __init__(self, params=None, random_state=0, input_length=1000):
         # defaults to a fixed value for reproducibility
         self.random_state = random_state
+        self.input_length = input_length
         # set seeds for reproducibility
         set_seed(0)
         # device used for computation
@@ -45,7 +46,9 @@ def __init__(self, params=None, random_state=0):
             # assumes the input dimensions are measurements of 1000 bins
             # TODO: Abstract this for arbitrary input size
             self.eaat = shadow.eaat.EAAT(model=self.model_factory(
-                                            1000//params['binning'],
+                                            int(np.ceil(
+                                                self.input_length /
+                                                params['binning'])),
                                             params['hidden_layer']),
                                          alpha=params['alpha'],
                                          xi=params['xi'],
diff --git a/tests/test_models.py b/tests/test_models.py
index 75350c4..1f1e5cd 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -231,17 +231,15 @@ def test_ShadowNN():
     X_test = normalizer.transform(X_test)
     Ux = normalizer.transform(Ux)
 
-    params = {'layer1': 4,
-              'kernel': 3,
+    params = {'hidden_layer': 10,
               'alpha': 0.1,
               'xi': 1e-3,
               'eps': 1.0,
               'lr': 0.1,
               'momentum': 0.9,
-              'binning': 5,
-              'batch_size': 2}
+              'binning': 20}
     # default behavior
-    model = ShadowNN(params=None, random_state=0)
+    model = ShadowNN(params=params, random_state=0)
     model.train(X_train, y_train, Ux)
 
     # testing train and predict methods
@@ -249,22 +247,20 @@ def test_ShadowNN():
 
     # Shadow/PyTorch reports accuracies as percentages
     # rather than decimals
-    assert acc >= 50.
-    np.testing.assert_equal(pred, y_test)
-    '''
+    # uninteresting test if Shadow predicts all one class
+    # TODO: make the default params test meaningful
+    assert np.count_nonzero(pred == y_test) > 0
+
     # testing hyperopt optimize methods
-    space = {'hidden_layer': scope.int(hp.quniform('hidden_layer',
-                                                   1000,
-                                                   10000,
-                                                   10)),
-             'alpha': hp.uniform('alpha', 0.0001, 0.999),
-             'xi': hp.uniform('xi', 1e-2, 1e0),
-             'eps': hp.uniform('eps', 0.5, 1.5),
-             'lr': hp.uniform('lr', 1e-3, 1e-1),
-             'momentum': hp.uniform('momentum', 0.5, 0.99),
+    space = {'hidden_layer': 10,
+             'alpha': 0.1,
+             'xi': 1e-3,
+             'eps': 1.0,
+             'lr': 0.1,
+             'momentum': 0.9,
              'binning': scope.int(hp.quniform('binning',
-                                              1,
                                               10,
+                                              20,
                                               1))
              }
     data_dict = {'trainx': X_train,
@@ -277,7 +273,7 @@ def test_ShadowNN():
 
     assert model.best['accuracy'] >= model.worst['accuracy']
     assert model.best['status'] == 'ok'
-    '''
+
     # testing model write to file method
     filename = 'test_LogReg'
     ext = '.joblib'
@@ -306,15 +302,15 @@ def test_ShadowCNN():
     X_test = normalizer.transform(X_test)
     Ux = normalizer.transform(Ux)
 
-    params = {'layer1': 4,
-              'kernel': 3,
+    params = {'layer1': 2,
+              'kernel': 2,
               'alpha': 0.1,
               'xi': 1e-3,
               'eps': 1.0,
               'lr': 0.1,
               'momentum': 0.9,
-              'binning': 1,
-              'batch_size': 2,
+              'binning': 20,
+              'batch_size': 4,
               'drop_rate': 0.1}
 
     # default behavior
@@ -326,33 +322,16 @@ def test_ShadowCNN():
 
     # Shadow/PyTorch reports accuracies as percentages
     # rather than decimals
-    assert acc >= 50.
-    np.testing.assert_equal(pred, y_test)
+    # uninteresting test if Shadow predicts all one class
+    # TODO: make the default params test meaningful
+    assert np.count_nonzero(pred == y_test) > 0
 
-    '''
     # testing hyperopt optimize methods
-    space = {'layer1': scope.int(hp.quniform('layer1',
-                                             1000,
-                                             10000,
-                                             10)),
-             'kernel': scope.int(hp.quniform('kernel',
-                                             1,
-                                             9,
-                                             1)),
-             'alpha': hp.uniform('alpha', 0.0001, 0.999),
-             'xi': hp.uniform('xi', 1e-2, 1e0),
-             'eps': hp.uniform('eps', 0.5, 1.5),
-             'lr': hp.uniform('lr', 1e-3, 1e-1),
-             'momentum': hp.uniform('momentum', 0.5, 0.99),
-             'binning': scope.int(hp.quniform('binning',
-                                  1,
-                                  10,
-                                  1)),
-             'batch_size': scope.int(hp.quniform('batch_size',
-                                     1,
-                                     100,
-                                     1))
-             }
+    space = params
+    space['binning'] = scope.int(hp.quniform('binning',
+                                 10,
+                                 20,
+                                 1))
     data_dict = {'trainx': X_train,
                  'testx': X_test,
                  'trainy': y_train,
@@ -363,7 +342,7 @@ def test_ShadowCNN():
 
     assert model.best['accuracy'] >= model.worst['accuracy']
     assert model.best['status'] == 'ok'
-    '''
+
     # testing model write to file method
     filename = 'test_LogReg'
     ext = '.joblib'

From 80d1e9b7a2a5b73c03e09571b90e31200aecd079 Mon Sep 17 00:00:00 2001
From: Jordan Stomps <stomps@wisc.edu>
Date: Tue, 16 Aug 2022 11:27:07 -0400
Subject: [PATCH 28/35] correcting cnn parameter calculation to include
 max_pool1d

---
 models/SSML/ShadowCNN.py | 35 +++++++++++++++++++++++++++--------
 tests/test_models.py     |  5 +++--
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/models/SSML/ShadowCNN.py b/models/SSML/ShadowCNN.py
index 039b9c5..3653322 100644
--- a/models/SSML/ShadowCNN.py
+++ b/models/SSML/ShadowCNN.py
@@ -43,11 +43,28 @@ def __init__(self, layer1=32, layer2=64, layer3=128,
         The resulting network has fixed length but the
         user can input arbitrary widths.
         '''
+
+        # default max_pool1d kernel set by Shadow MNIST example
+        # NOTE: max_pool1d sets mp_kernel = mp_stride
+        self.mp_kernel = 2
         super(Net, self).__init__()
         self.conv1 = nn.Conv1d(1, layer1, kernel, 1)
         self.conv2 = nn.Conv1d(layer1, layer2, kernel, 1)
-        self.dropout = nn.Dropout2d(drop_rate)
-        self.fc1 = nn.Linear(int(layer1*(length-(kernel))), layer3)
+        self.dropout = nn.Dropout(drop_rate)
+        # calculating the number of parameters/weights before the flattened
+        # fully-connected layer:
+        #   first, there are two convolution layers, so the output length is
+        #   the input length (feature_vector.shape[0] - 2_layers*(kernel-1))
+        #   if, in the future, more layers are desired, 2 must be adjusted
+        #   next, calculate the output of the max_pool1d layer, which is
+        #   round((conv_out - (kernel=stride - 1) - 1)/2 + 1)
+        #   finally, multiply this by the number of channels in the last
+        #   convolutional layer = layer2
+        conv_out = length-2*(kernel-1)
+        parameters = layer2*(
+                        ((conv_out - (self.mp_kernel - 1) - 1)//self.mp_kernel)
+                        + 1)
+        self.fc1 = nn.Linear(int(parameters), layer3)
         # self.fc1 = nn.Linear(31744, 128)
         self.fc2 = nn.Linear(layer3, 2)
 
@@ -63,7 +80,7 @@ def forward(self, x):
         x = self.conv1(x)
         x = F.relu(x)
         x = self.conv2(x)
-        x = F.max_pool1d(x, 2)
+        x = F.max_pool1d(x, self.mp_kernel)
         x = self.dropout(x)
         x = torch.flatten(x, 1)
         x = self.fc1(x)
@@ -181,7 +198,7 @@ def fresh_start(self, params, data_dict):
             NOTE: Uy is not needed since labels for unlabeled data
             instances is not used.
         '''
-        
+
         self.params = params
         # unpack data
         trainx = data_dict['trainx']
@@ -246,11 +263,13 @@ def fresh_start(self, params, data_dict):
                 lossavg.append(loss.item())
             losscurve.append(np.nanmedian(lossavg))
             if testx is not None and testy is not None:
-                evalcurve.append(self.predict(testx,
-                                              testy,
-                                              eaat))
+                pred, acc = self.predict(testx,
+                                         testy,
+                                         eaat)
+                evalcurve.append(acc)
 
-        max_acc = np.max(evalcurve[-25:])
+        if testx is not None and testy is not None:
+            max_acc = np.max(evalcurve[-25:])
 
         return {'loss': 1-(max_acc/100.0),
                 'status': STATUS_OK,
diff --git a/tests/test_models.py b/tests/test_models.py
index 1f1e5cd..4fb04e6 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -249,7 +249,8 @@ def test_ShadowNN():
     # rather than decimals
     # uninteresting test if Shadow predicts all one class
     # TODO: make the default params test meaningful
-    assert np.count_nonzero(pred == y_test) > 0
+    # NOTE: .numpy() needed because model.predict() returns a tensor
+    assert np.count_nonzero(pred.numpy() == y_test) > 0
 
     # testing hyperopt optimize methods
     space = {'hidden_layer': 10,
@@ -303,7 +304,7 @@ def test_ShadowCNN():
     Ux = normalizer.transform(Ux)
 
     params = {'layer1': 2,
-              'kernel': 2,
+              'kernel': 3,
               'alpha': 0.1,
               'xi': 1e-3,
               'eps': 1.0,

From 95ee61b30bcf81a796c8188990decfb0efc2e763 Mon Sep 17 00:00:00 2001
From: Jordan Stomps <stomps@wisc.edu>
Date: Tue, 16 Aug 2022 12:21:49 -0400
Subject: [PATCH 29/35] adding tests for more coverage

---
 models/SSML/CoTraining.py | 25 +++++---------
 models/SSML/ShadowCNN.py  | 25 ++++++--------
 tests/test_models.py      | 70 +++++++++++++++++++++++++++++++++++++--
 3 files changed, 86 insertions(+), 34 deletions(-)

diff --git a/models/SSML/CoTraining.py b/models/SSML/CoTraining.py
index ae2f9f5..a7ae7ec 100644
--- a/models/SSML/CoTraining.py
+++ b/models/SSML/CoTraining.py
@@ -338,13 +338,12 @@ def predict(self, testx, testy=None):
 
         return pred1, acc, pred2, model1_acc, model2_acc
 
-    def plot_cotraining(self, filename='lr-cotraining-learningcurves.png',
-                        model1_accs=None, model2_accs=None):
+    def plot_cotraining(self, model1_accs=None, model2_accs=None,
+                        filename='lr-cotraining-learningcurves.png'):
         '''
         Plots the training error curves for two co-training models.
-        NOTE: The user can either choose to plot what is stored in
-            the class instance by setting model#_accs=None or
-            the model#_accs can be inputted.
+        NOTE: The user must provide the curves to plot, but each curve is
+            saved by the class under self.best and self.worst models.
         Inputs:
         filename: name to store picture under.
             Must end in .png (or will be added if missing).
@@ -353,18 +352,10 @@ def plot_cotraining(self, filename='lr-cotraining-learningcurves.png',
         '''
 
         fig, ax = plt.subplots(figsize=(10, 8), dpi=300)
-        if model1_accs is not None and model2_accs is not None:
-            ax.plot(np.arange(len(model1_accs)), model1_accs, label='Model 1')
-            ax.plot(np.arange(len(model2_accs)), model2_accs, label='Model 2')
-        else:
-            ax.plot(np.arange(len(self.best['model1_acc_history'])),
-                    self.best['model1_acc_history'],
-                    color='tab:blue',
-                    label='Model 1')
-            ax.plot(np.arange(len(self.best['model2_acc_history'])),
-                    self.best['model2_acc_history'],
-                    color='tab:orange',
-                    label='Model 2')
+        ax.plot(np.arange(len(model1_accs)), model1_accs,
+                color='tab:blue', label='Model 1')
+        ax.plot(np.arange(len(model2_accs)), model2_accs,
+                color='tab:orange', label='Model 2')
         ax.legend()
         ax.set_xlabel('Co-Training Iteration')
         ax.set_ylabel('Test Accuracy')
diff --git a/models/SSML/ShadowCNN.py b/models/SSML/ShadowCNN.py
index 3653322..ad68d6c 100644
--- a/models/SSML/ShadowCNN.py
+++ b/models/SSML/ShadowCNN.py
@@ -397,10 +397,10 @@ def train(self, trainx, trainy, Ux, testx=None, testy=None):
                 lossavg.append(loss.item())
             losscurve.append(np.nanmedian(lossavg))
             if testx is not None and testy is not None:
-                evalcurve.append(self.predict(testx,
-                                              testy,
-                                              self.params['binning'],
-                                              self.eaat))
+                pred, acc = self.predict(testx,
+                                         testy,
+                                         self.eaat)
+                evalcurve.append(acc)
 
         # optionally return the training accuracy if test data was provided
         return losscurve, evalcurve
@@ -437,13 +437,12 @@ def predict(self, testx, testy=None, eaat=None):
 
         return y_pred, acc
 
-    def plot_cotraining(self, filename='lr-cotraining-learningcurves.png',
-                        losscurve=None, evalcurve=None):
+    def plot_training(self, losscurve=None, evalcurve=None,
+                      filename='lr-cotraining-learningcurves.png'):
         '''
         Plots the training error curves for two co-training models.
-        NOTE: The user can either choose to plot what is stored in
-            the class instance by setting curves=None or
-            the curves can be inputted.
+        NOTE: The user must provide the curves to plot, but each curve is
+            saved by the class under self.best and self.worst models.
         Inputs:
         filename: name to store picture under.
             Must end in .png (or will be added if missing).
@@ -456,12 +455,8 @@ def plot_cotraining(self, filename='lr-cotraining-learningcurves.png',
                                        sharex=True,
                                        figsize=(10, 8),
                                        dpi=300)
-        if losscurve is not None and evalcurve is not None:
-            ax1.plot(losscurve)
-            ax2.plot(evalcurve)
-        else:
-            ax1.plot(self.best['losscurve'])
-            ax2.plot(self.best['evalcurve'])
+        ax1.plot(losscurve)
+        ax2.plot(evalcurve)
         ax1.set_xlabel('Epoch')
         ax2.set_xlabel('Epoch')
         ax1.set_ylabel('Loss Curve')
diff --git a/tests/test_models.py b/tests/test_models.py
index 4fb04e6..1c6a7e2 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -40,6 +40,14 @@
 
 
 def test_LogReg():
+    # test saving model input parameters
+    params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
+    model = LogReg(params=params)
+
+    assert model.model.max_iter == params['max_iter']
+    assert model.model.tol == params['tol']
+    assert model.model.C == params['C']
+
     X_train, X_test, y_train, y_test = train_test_split(spectra,
                                                         labels,
                                                         test_size=0.2,
@@ -91,6 +99,18 @@ def test_LogReg():
 
 
 def test_CoTraining():
+    # test saving model input parameters
+    params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
+    model = CoTraining(params=params)
+
+    assert model.model1.max_iter == params['max_iter']
+    assert model.model1.tol == params['tol']
+    assert model.model1.C == params['C']
+
+    assert model.model2.max_iter == params['max_iter']
+    assert model.model2.tol == params['tol']
+    assert model.model2.C == params['C']
+
     X, Ux, y, Uy = train_test_split(spectra,
                                     labels,
                                     test_size=0.5,
@@ -141,6 +161,13 @@ def test_CoTraining():
     assert model.best['accuracy'] >= model.worst['accuracy']
     assert model.best['status'] == 'ok'
 
+    # testing model plotting method
+    filename = 'test_plot'
+    model.plot_cotraining(model1_accs=model.best['model1_acc_history'],
+                          model2_accs=model.best['model2_acc_history'],
+                          filename=filename)
+    os.remove(filename+'.png')
+
     # testing model write to file method
     filename = 'test_LogReg'
     ext = '.joblib'
@@ -152,6 +179,15 @@ def test_CoTraining():
 
 
 def test_LabelProp():
+    # test saving model input parameters
+    params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5}
+    model = LabelProp(params=params)
+
+    assert model.model.gamma == params['gamma']
+    assert model.model.n_neighbors == params['n_neighbors']
+    assert model.model.max_iter == params['max_iter']
+    assert model.model.tol == params['tol']
+
     # there should be no normalization on LabelProp data
     # since it depends on the distances between samples
     X, Ux, y, Uy = train_test_split(spectra,
@@ -214,6 +250,14 @@ def test_LabelProp():
 
 
 def test_ShadowNN():
+    # check default parameter settings
+    model = ShadowNN()
+    assert model.params == {'binning': 1}
+    assert model.eaat is not None
+    assert model.eaat_opt is not None
+    assert model.xEnt is not None
+    assert model.input_length == 1000
+
     X, Ux, y, Uy = train_test_split(spectra,
                                     labels,
                                     test_size=0.5,
@@ -240,11 +284,15 @@ def test_ShadowNN():
               'binning': 20}
     # default behavior
     model = ShadowNN(params=params, random_state=0)
-    model.train(X_train, y_train, Ux)
+    acc_history = model.train(X_train, y_train, Ux, X_test, y_test)
 
     # testing train and predict methods
     pred, acc = model.predict(X_test, y_test)
 
+    # test for agreement between training and testing
+    # (since the same data is used for diagnostics in this test)
+    assert acc_history[-1] == acc
+
     # Shadow/PyTorch reports accuracies as percentages
     # rather than decimals
     # uninteresting test if Shadow predicts all one class
@@ -286,6 +334,13 @@ def test_ShadowNN():
 
 
 def test_ShadowCNN():
+    # check default parameter settings
+    model = ShadowCNN()
+    assert model.params == {'binning': 1, 'batch_size': 1}
+    assert model.model is not None
+    assert model.eaat is not None
+    assert model.optimizer is not None
+
     X, Ux, y, Uy = train_test_split(spectra,
                                     labels,
                                     test_size=0.5,
@@ -316,11 +371,15 @@ def test_ShadowCNN():
 
     # default behavior
     model = ShadowCNN(params=params, random_state=0)
-    model.train(X_train, y_train, Ux)
+    losscurve, evalcurve = model.train(X_train, y_train, Ux, X_test, y_test)
 
     # testing train and predict methods
     pred, acc = model.predict(X_test, y_test)
 
+    # test for agreement between training and testing
+    # (since the same data is used for diagnostics in this test)
+    assert evalcurve[-1] == acc
+
     # Shadow/PyTorch reports accuracies as percentages
     # rather than decimals
     # uninteresting test if Shadow predicts all one class
@@ -344,6 +403,13 @@ def test_ShadowCNN():
     assert model.best['accuracy'] >= model.worst['accuracy']
     assert model.best['status'] == 'ok'
 
+    # testing model plotting method
+    filename = 'test_plot'
+    model.plot_training(losscurve=model.best['losscurve'],
+                        evalcurve=model.best['evalcurve'],
+                        filename=filename)
+    os.remove(filename+'.png')
+
     # testing model write to file method
     filename = 'test_LogReg'
     ext = '.joblib'

From 49ed669305dbc34b317b99260897e6f5c848f092 Mon Sep 17 00:00:00 2001
From: Jordan Stomps <stomps@wisc.edu>
Date: Tue, 16 Aug 2022 12:35:43 -0400
Subject: [PATCH 30/35] adding a test for util plots

---
 scripts/__init__.py  |  0
 scripts/utils.py     |  2 +-
 tests/test_models.py | 42 +++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 42 insertions(+), 2 deletions(-)
 create mode 100644 scripts/__init__.py

diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/utils.py b/scripts/utils.py
index 4c1c593..9cd4754 100644
--- a/scripts/utils.py
+++ b/scripts/utils.py
@@ -104,7 +104,7 @@ def pca(Lx, Ly, Ux, Uy, filename):
     fig.savefig(filename)
 
 
-def multiD_PCA(Lx, Ly, Ux, Uy, filename, n=2):
+def multiD_pca(Lx, Ly, Ux, Uy, filename, n=2):
     '''
     A function for computing and plotting n-dimensional PCA.
     Inputs:
diff --git a/tests/test_models.py b/tests/test_models.py
index 1c6a7e2..4eedaa6 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -8,6 +8,8 @@
 # hyperopt
 from hyperopt.pyll.base import scope
 from hyperopt import hp
+# testing utils
+import scripts.utils as utils
 # models
 from models.LogReg import LogReg
 from models.SSML.CoTraining import CoTraining
@@ -39,6 +41,43 @@
 labels[rejected_H0_time] = 1
 
 
+def test_utils():
+    X, Ux, y, Uy = train_test_split(spectra,
+                                    labels,
+                                    test_size=0.5,
+                                    random_state=0)
+    X_train, X_test, y_train, y_test = train_test_split(X,
+                                                        y,
+                                                        test_size=0.2,
+                                                        random_state=0)
+
+    filename = 'test_pca'
+    utils.pca(X_train, y_train, Ux, np.full_like(Uy, -1), filename)
+    os.remove(filename+'.png')
+
+    filename = 'test_multiD_pca'
+    utils.multiD_pca(X_train, y_train, Ux, np.full_like(Uy, -1), filename, n=5)
+    os.remove(filename+'.png')
+
+    # normalization
+    normalizer = StandardScaler()
+    normalizer.fit(X_train)
+
+    X_train = normalizer.transform(X_train)
+    X_test = normalizer.transform(X_test)
+
+    # default behavior
+    model = LogReg(params=None, random_state=0)
+    model.train(X_train, y_train)
+
+    # testing train and predict methods
+    pred, acc = model.predict(X_test, y_test)
+
+    filename = 'test_cf'
+    utils.plot_cf(y_test, pred, title=filename, filename=filename)
+    os.remove(filename+'.png')
+
+
 def test_LogReg():
     # test saving model input parameters
     params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
@@ -148,7 +187,8 @@ def test_CoTraining():
              'n_samples': scope.int(hp.quniform('n_samples',
                                                 1,
                                                 20,
-                                                1))
+                                                1)),
+             'seed': 0
              }
     data_dict = {'trainx': X_train,
                  'testx': X_test,

From 3cb9b441923d87473cc9a4a7f418a86a9aece200 Mon Sep 17 00:00:00 2001
From: Jordan Stomps <stomps@wisc.edu>
Date: Tue, 16 Aug 2022 12:46:10 -0400
Subject: [PATCH 31/35] adding seed test to co-training

---
 tests/test_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_models.py b/tests/test_models.py
index 4eedaa6..4e1070a 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -169,7 +169,7 @@ def test_CoTraining():
 
     # default behavior
     model = CoTraining(params=None, random_state=0)
-    model.train(X_train, y_train, Ux)
+    model.train(X_train, y_train, Ux, seed=0)
 
     # testing train and predict methods
     pred, acc, *_ = model.predict(X_test, y_test)

From c131dcffebe26da94a39fafac06a8e511a51bd80 Mon Sep 17 00:00:00 2001
From: Jordan Stomps <u9f@ulysses.ornl.gov>
Date: Mon, 22 Aug 2022 12:30:33 -0400
Subject: [PATCH 32/35] removing old commented line

---
 models/SSML/ShadowCNN.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/models/SSML/ShadowCNN.py b/models/SSML/ShadowCNN.py
index ad68d6c..60bb4ff 100644
--- a/models/SSML/ShadowCNN.py
+++ b/models/SSML/ShadowCNN.py
@@ -65,7 +65,6 @@ def __init__(self, layer1=32, layer2=64, layer3=128,
                         ((conv_out - (self.mp_kernel - 1) - 1)//self.mp_kernel)
                         + 1)
         self.fc1 = nn.Linear(int(parameters), layer3)
-        # self.fc1 = nn.Linear(31744, 128)
         self.fc2 = nn.Linear(layer3, 2)
 
     def forward(self, x):

From 4c538204b48f50f4c2591c23bd28f1512dbc2f5d Mon Sep 17 00:00:00 2001
From: Jordan Stomps <u9f@ulysses.ornl.gov>
Date: Thu, 29 Sep 2022 11:12:15 -0400
Subject: [PATCH 33/35] changing fresh_start methods of models to use class
 train method instead

---
 models/LogReg.py          | 17 +++-----
 models/SSML/CoTraining.py | 70 ++++++-------------------------
 models/SSML/LabelProp.py  | 28 +++----------
 models/SSML/ShadowCNN.py  | 88 ++++++---------------------------------
 models/SSML/ShadowNN.py   | 57 ++++---------------------
 tests/test_models.py      |  2 +-
 6 files changed, 46 insertions(+), 216 deletions(-)

diff --git a/models/LogReg.py b/models/LogReg.py
index a848ac6..4ebfce2 100644
--- a/models/LogReg.py
+++ b/models/LogReg.py
@@ -61,23 +61,16 @@ def fresh_start(self, params, data_dict):
         testy = data_dict['testy']
 
         # supervised logistic regression
-        clf = linear_model.LogisticRegression(
-                random_state=self.random_state,
-                max_iter=params['max_iter'],
-                tol=params['tol'],
-                C=params['C']
-              )
+        clf = LogReg(params=params, random_state=self.random_state)
         # train and test model
-        clf.fit(trainx, trainy)
-        clf_pred = clf.predict(testx)
-        # balanced_accuracy accounts for class imbalanced data
-        # could alternatively use pure accuracy for a more traditional hyperopt
-        acc = balanced_accuracy_score(testy, clf_pred)
+        clf.train(trainx, trainy)
+        # uses balanced_accuracy accounts for class imbalanced data
+        clf_pred, acc = clf.predict(testx, testy)
 
         # loss function minimizes misclassification
         return {'loss': 1-acc,
                 'status': STATUS_OK,
-                'model': clf,
+                'model': clf.model,
                 'params': params,
                 'accuracy': acc}
 
diff --git a/models/SSML/CoTraining.py b/models/SSML/CoTraining.py
index a7ae7ec..e6757bd 100644
--- a/models/SSML/CoTraining.py
+++ b/models/SSML/CoTraining.py
@@ -35,6 +35,8 @@ def __init__(self, params=None, random_state=0):
                             random_state=self.random_state)
             self.model2 = linear_model.LogisticRegression(
                             random_state=self.random_state)
+            # default needed for training
+            self.params = {'n_samples': 1}
         else:
             self.model1 = linear_model.LogisticRegression(
                             random_state=self.random_state,
@@ -152,60 +154,17 @@ def fresh_start(self, params, data_dict):
         testy = data_dict['testy']
         # unlabeled co-training data
         Ux = data_dict['Ux']
-        # avoid overwriting when deleting in co-training loop
-        U_lr = Ux.copy()
-
-        # set the random seed of training splits for reproducibility
-        # This can be ignored by excluding params['seed']
-        # in the hyperopt space dictionary
-        if 'seed' in params.keys():
-            np.random.seed(params['seed'])
-
-        # TODO: allow a user to specify uneven splits between the two models
-        split_frac = 0.5
-        # labeled training data
-        idx = np.random.choice(range(trainy.shape[0]),
-                               size=int(split_frac * trainy.shape[0]),
-                               replace=False)
 
-        # avoid overwriting when deleting in co-training loop
-        L_lr1 = trainx[idx].copy()
-        L_lr2 = trainx[~idx].copy()
-        Ly_lr1 = trainy[idx].copy()
-        Ly_lr2 = trainy[~idx].copy()
+        clf = CoTraining(params=params, random_state=self.random_state)
+        # training and testing
+        model1_accs, model2_accs = clf.train(trainx, trainy, Ux, testx, testy)
+        # uses balanced_accuracy accounts for class imbalanced data
+        pred1, acc, pred2, model1_acc, model2_acc = clf.predict(testx, testy)
 
-        # initialized logistic regression models for a fresh-start
-        slr1 = linear_model.LogisticRegression(
-                random_state=self.random_state,
-                max_iter=params['max_iter'],
-                tol=params['tol'],
-                C=params['C']
-            )
-        slr2 = linear_model.LogisticRegression(
-                random_state=self.random_state,
-                max_iter=params['max_iter'],
-                tol=params['tol'],
-                C=params['C']
-            )
-
-        slr1, slr2, model1_accs, model2_accs = self.training_loop(
-                                                slr1, slr2,
-                                                L_lr1, L_lr2,
-                                                Ly_lr1, Ly_lr2,
-                                                U_lr, params['n_samples'],
-                                                testx, testy,
-                                                )
-
-        # balanced_accuracy accounts for class imbalanced data
-        # could alternatively use pure accuracy for a more traditional hyperopt
-        model1_acc = balanced_accuracy_score(testy, slr1.predict(testx))
-        model2_acc = balanced_accuracy_score(testy, slr2.predict(testx))
-        # select best accuracy for hyperparameter optimization
-        acc = max(model1_acc, model2_acc)
         return {'loss': 1-acc,
                 'status': STATUS_OK,
-                'model': slr1,
-                'model2': slr2,
+                'model': clf.model1,
+                'model2': clf.model2,
                 'model1_acc_history': model1_accs,
                 'model2_acc_history': model2_accs,
                 'params': params,
@@ -262,7 +221,7 @@ def optimize(self, space, data_dict, max_evals=50, verbose=True):
         self.worst = worst
 
     def train(self, trainx, trainy, Ux,
-              testx=None, testy=None, n_samples=1, seed=None):
+              testx=None, testy=None):
         '''
         Wrapper method for a basic co-training with logistic regression
         implementation training method.
@@ -274,9 +233,6 @@ def train(self, trainx, trainy, Ux,
             of each model at every iteration.
         testy: label vector used for testing the performance
             of each model at every iteration.
-        n_samples: the number of instances to sample and
-            predict from Ux at one time
-        seed: set the random seed of training splits for reproducibility
         '''
 
         # avoid overwriting when deleting in co-training loop
@@ -285,8 +241,8 @@ def train(self, trainx, trainy, Ux,
         # set the random seed of training splits for reproducibility
         # This can be ignored by excluding params['seed']
         # in the hyperopt space dictionary
-        if seed is not None:
-            np.random.seed(seed)
+        if 'seed' in self.params.keys():
+            np.random.seed(self.params['seed'])
 
         # TODO: allow a user to specify uneven splits between the two models
         split_frac = 0.5
@@ -306,7 +262,7 @@ def train(self, trainx, trainy, Ux,
                                 self.model1, self.model2,
                                 L_lr1, L_lr2,
                                 Ly_lr1, Ly_lr2,
-                                U_lr, n_samples,
+                                U_lr, self.params['n_samples'],
                                 testx, testy,
                                 )
 
diff --git a/models/SSML/LabelProp.py b/models/SSML/LabelProp.py
index aa1e795..cb9ff05 100644
--- a/models/SSML/LabelProp.py
+++ b/models/SSML/LabelProp.py
@@ -72,32 +72,16 @@ def fresh_start(self, params, data_dict):
         testy = data_dict['testy']
         Ux = data_dict['Ux']
 
-        # combine labeled and unlabeled instances for training
-        lp_trainx = np.append(trainx, Ux, axis=0)
-        lp_trainy = np.append(trainy,
-                              np.full(shape=(Ux.shape[0],), fill_value=-1),
-                              axis=0)
-
-        # semi-supervised label propagation
-        clf = semi_supervised.LabelPropagation(
-                kernel='knn',
-                gamma=params['gamma'],
-                n_neighbors=params['n_neighbors'],
-                max_iter=params['max_iter'],
-                tol=params['tol'],
-                n_jobs=-1
-            )
-        # train and test model
-        clf.fit(lp_trainx, lp_trainy)
-        clf_pred = clf.predict(testx)
-        # balanced_accuracy accounts for class imbalanced data
-        # could alternatively use pure accuracy for a more traditional hyperopt
-        acc = balanced_accuracy_score(testy, clf_pred)
+        clf = LabelProp(params, random_state=self.random_state)
+        # training and testing
+        clf.train(trainx, trainy, Ux)
+        # uses balanced_accuracy accounts for class imbalanced data
+        pred, acc = clf.predict(testx, testy)
 
         # loss function minimizes misclassification
         return {'loss': 1-acc,
                 'status': STATUS_OK,
-                'model': clf,
+                'model': clf.model,
                 'params': params,
                 'accuracy': acc}
 
diff --git a/models/SSML/ShadowCNN.py b/models/SSML/ShadowCNN.py
index 60bb4ff..aa92a26 100644
--- a/models/SSML/ShadowCNN.py
+++ b/models/SSML/ShadowCNN.py
@@ -207,72 +207,18 @@ def fresh_start(self, params, data_dict):
         # unlabeled co-training data
         Ux = data_dict['Ux']
 
-        # avoid float round-off by using DoubleTensor
-        xtens = torch.FloatTensor(np.append(trainx,
-                                            Ux,
-                                            axis=0))[:, ::params['binning']]
-        # xtens[xtens == 0.0] = torch.unique(xtens)[1]/1e10
-        ytens = torch.LongTensor(np.append(trainy,
-                                           np.full(shape=(Ux.shape[0],),
-                                                   fill_value=-1),
-                                           axis=0))
-
-        model = Net(layer1=params['layer1'],
-                    layer2=2*params['layer1'],
-                    layer3=3*params['layer1'],
-                    kernel=params['kernel'],
-                    drop_rate=params['drop_rate'],
-                    length=np.ceil(trainx.shape[1]/params['binning']))
-        eaat = shadow.eaat.EAAT(model=model,
-                                alpha=params['alpha'],
-                                xi=params['xi'],
-                                eps=params['eps'])
-        optimizer = optim.SGD(eaat.parameters(),
-                              lr=params['lr'],
-                              momentum=params['momentum'])
-
-        # define data set object
-        dataset = SpectralDataset(xtens, ytens)
-
-        # create DataLoader object of DataSet object
-        DL_DS = torch.utils.data.DataLoader(dataset,
-                                            batch_size=params['batch_size'],
-                                            shuffle=True)
-
-        # labels for unlabeled data are always "-1"
-        xEnt = torch.nn.CrossEntropyLoss(ignore_index=-1)
-
-        n_epochs = 100
-        eaat.to(self.device)
-        losscurve = []
-        evalcurve = []
-        for epoch in range(n_epochs):
-            eaat.train()
-            lossavg = []
-            for i, (data, targets) in enumerate(DL_DS):
-                x = data.reshape((data.shape[0],
-                                  1,
-                                  data.shape[1])).to(self.device)
-                y = targets.to(self.device)
-                optimizer.zero_grad()
-                out = eaat(x)
-                loss = xEnt(out, y) + eaat.get_technique_cost(x)
-                loss.backward()
-                optimizer.step()
-                lossavg.append(loss.item())
-            losscurve.append(np.nanmedian(lossavg))
-            if testx is not None and testy is not None:
-                pred, acc = self.predict(testx,
-                                         testy,
-                                         eaat)
-                evalcurve.append(acc)
-
-        if testx is not None and testy is not None:
-            max_acc = np.max(evalcurve[-25:])
+        clf = ShadowCNN(params=params,
+                        random_state=self.random_state,
+                        length=trainx.shape[1])
+        # training and testing
+        losscurve, evalcurve = clf.train(trainx, trainy, Ux, testx, testy)
+        # not used; max acc in past few epochs used instead
+        y_pred, acc = clf.predict(testx, testy)
+        max_acc = np.max(evalcurve[-25:])
 
         return {'loss': 1-(max_acc/100.0),
                 'status': STATUS_OK,
-                'model': eaat,
+                'model': clf.eaat,
                 'params': params,
                 'losscurve': losscurve,
                 'evalcurve': evalcurve,
@@ -396,15 +342,13 @@ def train(self, trainx, trainy, Ux, testx=None, testy=None):
                 lossavg.append(loss.item())
             losscurve.append(np.nanmedian(lossavg))
             if testx is not None and testy is not None:
-                pred, acc = self.predict(testx,
-                                         testy,
-                                         self.eaat)
+                pred, acc = self.predict(testx, testy)
                 evalcurve.append(acc)
 
         # optionally return the training accuracy if test data was provided
         return losscurve, evalcurve
 
-    def predict(self, testx, testy=None, eaat=None):
+    def predict(self, testx, testy=None):
         '''
         Wrapper method for Shadow NN predict method.
         Inputs:
@@ -413,21 +357,15 @@ def predict(self, testx, testy=None, eaat=None):
             optional: if included, the predicted classes -and-
             the resulting classification accuracy will be returned.
         binning: int number of bins sampled in feature vector
-        model: optional input for testing a given model in hyperparameter
-            optimization rather than the class saved model.
         '''
 
-        if eaat is not None:
-            eval_model = eaat
-        else:
-            eval_model = self.eaat
-        eval_model.eval()
+        self.eaat.eval()
         y_pred, y_true = [], []
         for i, data in enumerate(torch.FloatTensor(
                                     testx.copy()[:, ::self.params['binning']])
                                  ):
             x = data.reshape((1, 1, data.shape[0])).to(self.device)
-            out = eval_model(x)
+            out = self.eaat(x)
             y_pred.extend(torch.argmax(out, 1).detach().cpu().tolist())
         acc = None
         if testy is not None:
diff --git a/models/SSML/ShadowNN.py b/models/SSML/ShadowNN.py
index e31e26e..f7e1757 100644
--- a/models/SSML/ShadowNN.py
+++ b/models/SSML/ShadowNN.py
@@ -104,59 +104,18 @@ def fresh_start(self, params, data_dict):
         # unlabeled co-training data
         Ux = data_dict['Ux']
 
-        eaat = shadow.eaat.EAAT(model=self.model_factory(
-                                    testx[:, ::params['binning']].shape[1],
-                                    params['hidden_layer']),
-                                alpha=params['alpha'],
-                                xi=params['xi'],
-                                eps=params['eps']).to(self.device)
-        eaat_opt = torch.optim.SGD(eaat.parameters(),
-                                   lr=params['lr'],
-                                   momentum=params['momentum'])
-        xEnt = torch.nn.CrossEntropyLoss(ignore_index=-1).to(self.device)
-
-        # avoid float round-off by using DoubleTensor
-        xtens = torch.FloatTensor(np.append(trainx,
-                                            Ux,
-                                            axis=0)[:, ::params['binning']])
-        # xtens[xtens == 0.0] = torch.unique(xtens)[1]/1e10
-        ytens = torch.LongTensor(np.append(trainy,
-                                           np.full(shape=(Ux.shape[0],),
-                                                   fill_value=-1),
-                                           axis=0))
-
-        n_epochs = 100
-        xt = torch.Tensor(xtens).to(self.device)
-        yt = torch.LongTensor(ytens).to(self.device)
-        # saves history for max accuracy
-        acc_history = []
-        # set the model into training mode
-        # NOTE: change this to .eval() mode for testing and back again
-        eaat.train()
-        for epoch in range(n_epochs):
-            # Forward/backward pass for training semi-supervised model
-            out = eaat(xt)
-            # supervised + unsupervised loss
-            loss = xEnt(out, yt) + eaat.get_technique_cost(xt)
-            eaat_opt.zero_grad()
-            loss.backward()
-            eaat_opt.step()
-
-            eaat.eval()
-            eaat_pred = torch.max(eaat(
-                                    torch.FloatTensor(
-                                        testx.copy()[:, ::params['binning']]
-                                        )
-                                    ), 1)[-1]
-            acc = shadow.losses.accuracy(eaat_pred,
-                                         torch.LongTensor(testy.copy())
-                                         ).data.item()
-            acc_history.append(acc)
+        clf = ShadowNN(params=params,
+                       random_state=self.random_state,
+                       input_length=testx.shape[1])
+        # training and testing
+        acc_history = clf.train(trainx, trainy, Ux, testx, testy)
+        # not used; max acc in past few epochs used instead
+        eaat_pred, acc = clf.predict(testx, testy)
         max_acc = np.max(acc_history[-20:])
 
         return {'loss': 1-(max_acc/100.0),
                 'status': STATUS_OK,
-                'model': eaat,
+                'model': clf.eaat,
                 'params': params,
                 'accuracy': (max_acc/100.0)}
 
diff --git a/tests/test_models.py b/tests/test_models.py
index 4e1070a..4eedaa6 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -169,7 +169,7 @@ def test_CoTraining():
 
     # default behavior
     model = CoTraining(params=None, random_state=0)
-    model.train(X_train, y_train, Ux, seed=0)
+    model.train(X_train, y_train, Ux)
 
     # testing train and predict methods
     pred, acc, *_ = model.predict(X_test, y_test)

From f0bccf1661bc887e69b8aaba985b9816e9df9616 Mon Sep 17 00:00:00 2001
From: Jordan Stomps <stomps@wisc.edu>
Date: Fri, 7 Oct 2022 17:58:14 -0400
Subject: [PATCH 34/35] adding an EarlyStopper class for managing that
 functionality

---
 models/SSML/ShadowCNN.py | 19 +++++++++++++++++-
 models/SSML/ShadowNN.py  | 42 ++++++++++++++++++++++++++--------------
 scripts/utils.py         | 41 +++++++++++++++++++++++++++++++++++++++
 tests/test_models.py     |  3 +--
 4 files changed, 87 insertions(+), 18 deletions(-)

diff --git a/models/SSML/ShadowCNN.py b/models/SSML/ShadowCNN.py
index aa92a26..a633283 100644
--- a/models/SSML/ShadowCNN.py
+++ b/models/SSML/ShadowCNN.py
@@ -13,7 +13,7 @@
 import shadow.utils
 from shadow.utils import set_seed
 # diagnostics
-from scripts.utils import run_hyperopt
+from scripts.utils import EarlyStopper, run_hyperopt
 import joblib
 
 
@@ -322,6 +322,9 @@ def train(self, trainx, trainy, Ux, testx=None, testy=None):
         # labels for unlabeled data are always "-1"
         xEnt = torch.nn.CrossEntropyLoss(ignore_index=-1)
 
+        # generate early-stopping watchdog
+        # TODO: allow a user of ShadowCNN to specify EarlyStopper's params
+        stopper = EarlyStopper(patience=3, min_delta=0)
         n_epochs = 100
         self.eaat.to(self.device)
         losscurve = []
@@ -345,6 +348,20 @@ def train(self, trainx, trainy, Ux, testx=None, testy=None):
                 pred, acc = self.predict(testx, testy)
                 evalcurve.append(acc)
 
+                self.eaat.train()
+                # test for early stopping
+                x_val = torch.FloatTensor(
+                                    testx.copy()[:, ::self.params['binning']])
+                x_val = x_val.reshape((x_val.shape[0],
+                                       1,
+                                       x_val.shape[1])).to(self.device)
+                y_val = torch.LongTensor(testy).to(self.device)
+                out = self.eaat(x_val)
+                val_loss = xEnt(out, y_val) + \
+                    self.eaat.get_technique_cost(x_val)
+                if stopper.early_stop(val_loss):
+                    break
+
         # optionally return the training accuracy if test data was provided
         return losscurve, evalcurve
 
diff --git a/models/SSML/ShadowNN.py b/models/SSML/ShadowNN.py
index f7e1757..4857ccf 100644
--- a/models/SSML/ShadowNN.py
+++ b/models/SSML/ShadowNN.py
@@ -9,7 +9,7 @@
 import shadow.utils
 from shadow.utils import set_seed
 # diagnostics
-from scripts.utils import run_hyperopt
+from scripts.utils import EarlyStopper, run_hyperopt
 import joblib
 
 
@@ -199,12 +199,15 @@ def train(self, trainx, trainy, Ux, testx=None, testy=None):
         n_epochs = 100
         xt = torch.Tensor(xtens).to(self.device)
         yt = torch.LongTensor(ytens).to(self.device)
+        # generate early-stopping watchdog
+        # TODO: allow a user of ShadowCNN to specify EarlyStopper's params
+        stopper = EarlyStopper(patience=3, min_delta=0)
         # saves history for max accuracy
         acc_history = []
-        # set the model into training mode
-        # NOTE: change this to .eval() mode for testing and back again
-        self.eaat.train()
         for epoch in range(n_epochs):
+            # set the model into training mode
+            # NOTE: change this to .eval() mode for testing and back again
+            self.eaat.train()
             # Forward/backward pass for training semi-supervised model
             out = self.eaat(xt)
             # supervised + unsupervised loss
@@ -214,20 +217,26 @@ def train(self, trainx, trainy, Ux, testx=None, testy=None):
             self.eaat_opt.step()
 
             if testx is not None and testy is not None:
+                x_val = torch.FloatTensor(
+                            testx.copy()
+                        )[:, ::self.params['binning']].to(self.device)
+                y_val = torch.LongTensor(testy.copy()).to(self.device)
+
                 self.eaat.eval()
-                eaat_pred = torch.max(self.eaat(
-                                        torch.FloatTensor(
-                                            testx.copy()[:,
-                                                         ::self.params[
-                                                            'binning']
-                                                         ]
-                                            )
-                                        ), 1)[-1]
+                eaat_pred = torch.max(self.eaat(x_val), 1)[-1]
                 acc = shadow.losses.accuracy(eaat_pred,
-                                             torch.LongTensor(testy.copy())
+                                             y_val
                                              ).data.item()
                 acc_history.append(acc)
 
+                self.eaat.train()
+                # test for early stopping
+                out = self.eaat(x_val)
+                val_loss = self.xEnt(out, y_val) + \
+                    self.eaat.get_technique_cost(x_val)
+                if stopper.early_stop(val_loss):
+                    break
+
         # optionally return the training accuracy if test data was provided
         return acc_history
 
@@ -245,15 +254,18 @@ def predict(self, testx, testy=None):
         eaat_pred = torch.max(self.eaat(
                                 torch.FloatTensor(
                                     testx.copy()[:, ::self.params['binning']]
-                                    )
+                                    ).to(self.device)
                                 ), 1)[-1]
 
         acc = None
         if testy is not None:
             acc = shadow.losses.accuracy(eaat_pred,
-                                         torch.LongTensor(testy.copy())
+                                         torch.LongTensor(
+                                            testy.copy()).to(self.device)
                                          ).data.item()
 
+        # return tensor to cpu if on gpu and convert to numpy for return
+        eaat_pred = eaat_pred.cpu().numpy()
         return eaat_pred, acc
 
     def save(self, filename):
diff --git a/scripts/utils.py b/scripts/utils.py
index 9cd4754..4211d77 100644
--- a/scripts/utils.py
+++ b/scripts/utils.py
@@ -11,6 +11,47 @@
 from sklearn.decomposition import PCA
 
 
+class EarlyStopper:
+    '''
+    Early stopping mechanism for neural networks.
+    Code adapted from user "isle_of_gods" from StackOverflow:
+    https://stackoverflow.com/questions/71998978/early-stopping-in-pytorch
+    Use this class to break a training loop if the validation loss is low.
+    Inputs:
+    patience: integer; forces stop if validation loss has not improved
+        for some time
+    min_delta: "fudge value" for how much loss to tolerate before stopping
+    '''
+
+    def __init__(self, patience=1, min_delta=0):
+        self.patience = patience
+        self.min_delta = min_delta
+        self.counter = 0
+        self.min_validation_loss = np.inf
+
+    def early_stop(self, validation_loss):
+        '''
+        Tests for the early stopping condition if the validation loss
+        has not improved for a certain period of time (patience).
+        Inputs:
+        validation_loss: typically a float value for the loss function of
+            a neural network training loop
+        '''
+
+        if validation_loss < self.min_validation_loss:
+            # keep track of the smallest validation loss
+            # if it has been beaten, restart patience
+            self.min_validation_loss = validation_loss
+            self.counter = 0
+        elif validation_loss > (self.min_validation_loss + self.min_delta):
+            # keep track of whether validation loss has been decreasing
+            # by a tolerable amount
+            self.counter += 1
+            if self.counter >= self.patience:
+                return True
+        return False
+
+
 def run_hyperopt(space, model, data_dict, max_evals=50, verbose=True):
     '''
     Runs hyperparameter optimization on a model given a parameter space.
diff --git a/tests/test_models.py b/tests/test_models.py
index 4eedaa6..d619700 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -337,8 +337,7 @@ def test_ShadowNN():
     # rather than decimals
     # uninteresting test if Shadow predicts all one class
     # TODO: make the default params test meaningful
-    # NOTE: .numpy() needed because model.predict() returns a tensor
-    assert np.count_nonzero(pred.numpy() == y_test) > 0
+    assert np.count_nonzero(pred == y_test) > 0
 
     # testing hyperopt optimize methods
     space = {'hidden_layer': 10,

From a094a251840469bc2ecb5e24a3753d41cc2afe6e Mon Sep 17 00:00:00 2001
From: Jordan Stomps <stomps@wisc.edu>
Date: Mon, 10 Oct 2022 11:09:28 -0400
Subject: [PATCH 35/35] adding cross validation implementation

---
 scripts/utils.py     | 95 ++++++++++++++++++++++++++++++++++++++++++++
 tests/test_models.py | 32 +++++++++++++++
 2 files changed, 127 insertions(+)

diff --git a/scripts/utils.py b/scripts/utils.py
index 4211d77..d91c826 100644
--- a/scripts/utils.py
+++ b/scripts/utils.py
@@ -9,6 +9,8 @@
 # pca
 from sklearn.preprocessing import StandardScaler
 from sklearn.decomposition import PCA
+# Cross Validation
+from sklearn.model_selection import KFold, StratifiedKFold
 
 
 class EarlyStopper:
@@ -96,6 +98,99 @@ def run_hyperopt(space, model, data_dict, max_evals=50, verbose=True):
     return best, worst
 
 
+def cross_validation(model, X, y, params, n_splits=3,
+                     stratified=False, random_state=None):
+    '''
+    Perform K-Fold cross validation using sklearn and a given model.
+    The model *must* have a fresh_start method (see models in RadClass/models).
+    fresh_start() is used instead of train() to be agnostic to the data needed
+        for training (fresh_start requires a data_dict whereas each model's
+        train could take different combinations of labeled & unlabeled data).
+        This also avoids the need to do hyperparameter optimization (and
+        therefore many training epochs) for every K-Fold.
+    NOTE: fresh_start returns the model and results in a dictionary but
+        does not overwrite/save the model to the respective class.
+        You can manually overwrite using model.model = return.model
+    Hyperparameter optimization (model.optimize) can be done before or after
+        cross validation to specify the (optimal) parameters used by the model
+        since they are required here.
+    NOTE: Fixed default to shuffle data during cross validation splits.
+        (See sklearn cross validation docs for more info.)
+    NOTE: Unlabeled data, if provided, will always be included in the training
+        dataset. This means that this cross validation implementation is
+        susceptible to bias in the unlabeled data distribution. To test for
+        this bias, a user can manually run cross validation as a parent to
+        calling this function, splitting the unlabeled data and adding
+        different folds into X.
+    Inputs:
+    model: ML model class object (e.g. RadClass/models).
+        Must have a fresh_start() method.
+        NOTE: If the model expects unlabeled data but unlabed data is not
+        provided in X/y, an error will likely be thrown when training the model
+        through fresh_start.
+    X: array of feature vectors (rows of individual instances, cols of vectors)
+        This should include all data for training and testing (since the
+        testing subset will be split by cross validation), including unlabeled
+        data if needed/used.
+    y: array/vector of labels for X. If including unlabeled data, use -1.
+        This should have the same order as X. That is, each row index in X
+        has an associated label with the same index in y.
+    params: dictionary of hyperparameters. Will depend on model used.
+        Alternatively, use model.params for models in RadClass/models
+    n_splits: int number of splits for K-Fold cross validation
+    stratified: bool; if True, balance the K-Folds to have roughly the same
+        proportion of samples from each class.
+    random_state: seed for reproducility.
+    '''
+
+    # return lists
+    accs = []
+    reports = []
+
+    if stratified:
+        cv = StratifiedKFold(n_splits=n_splits, random_state=random_state,
+                             shuffle=True)
+    else:
+        cv = KFold(n_splits=n_splits, random_state=random_state,
+                   shuffle=True)
+
+    # separate unlabeled data if included
+    Ux = None
+    Uy = None
+    if -1 in y:
+        U_idx = np.where(y == -1)[0]
+        L_idx = np.where(y != -1)[0]
+        Ux = X[U_idx]
+        Uy = y[U_idx]
+        Lx = X[L_idx]
+        Ly = y[L_idx]
+    else:
+        Lx = X
+        Ly = y
+    # conduct K-Fold cross validation
+    cv.get_n_splits(Lx, Ly)
+    for train_idx, test_idx in cv.split(Lx, Ly):
+        trainx, testx = Lx[train_idx], Lx[test_idx]
+        trainy, testy = Ly[train_idx], Ly[test_idx]
+
+        # construct data dictionary for training in fresh_start
+        data_dict = {'trainx': trainx, 'trainy': trainy,
+                     'testx': testx, 'testy': testy}
+        if Ux is not None:
+            data_dict['Ux'] = Ux
+            data_dict['Uy'] = Uy
+        results = model.fresh_start(params, data_dict)
+        accs = np.append(accs, results['accuracy'])
+        reports = np.append(reports, results)
+
+    # report cross validation results
+    print('Average accuracy:', np.mean(accs))
+    print('Max accuracy:', np.max(accs))
+    print('All accuracy:', accs)
+    # return the results of fresh_start for the max accuracy model
+    return reports[np.argmax(accs)]
+
+
 def pca(Lx, Ly, Ux, Uy, filename):
     '''
     A function for computing and plotting 2D PCA.
diff --git a/tests/test_models.py b/tests/test_models.py
index d619700..e3fb086 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -46,6 +46,38 @@ def test_utils():
                                     labels,
                                     test_size=0.5,
                                     random_state=0)
+    Uy = np.full_like(Uy, -1)
+
+    # test cross validation for supervised data using LogReg
+    params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
+    model = LogReg(params=params)
+    max_acc_model = utils.cross_validation(model=model,
+                                           X=X,
+                                           y=y,
+                                           params=params)
+    assert max_acc_model['accuracy'] >= 0.5
+
+    # test cross validation for supervised data and StratifiedKFold with LogReg
+    params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
+    model = LogReg(params=params)
+    max_acc_model = utils.cross_validation(model=model,
+                                           X=X,
+                                           y=y,
+                                           params=params,
+                                           stratified=True)
+    assert max_acc_model['accuracy'] >= 0.5
+
+    # test cross validation for SSML with LabelProp
+    params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5}
+    model = LabelProp(params=params)
+    max_acc_model = utils.cross_validation(model=model,
+                                           X=np.append(X, Ux, axis=0),
+                                           y=np.append(y, Uy, axis=0),
+                                           params=params,
+                                           stratified=True)
+    assert max_acc_model['accuracy'] >= 0.5
+
+    # data split for data visualization
     X_train, X_test, y_train, y_test = train_test_split(X,
                                                         y,
                                                         test_size=0.2,