From c33635a614ea1a92b74dbf37f5f4174c0c5c3c34 Mon Sep 17 00:00:00 2001
From: aryakrekhi <aryakrekhi@gmail.com>
Date: Wed, 23 Feb 2022 11:30:27 -0800
Subject: [PATCH 1/3] requested edits added

---
 de/impute.py | 36 +++++++++++++++---------------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/de/impute.py b/de/impute.py
index bee714e..a1e552a 100644
--- a/de/impute.py
+++ b/de/impute.py
@@ -24,33 +24,27 @@ def split_data(X, n=10):
     return train_X, test_X
 
 
-def impute(data, linear=False):
+def impute(data):
     """ Impute by repeated fitting. """
     missing = np.isnan(data)
 
     si = SoftImpute()
     data = si.fit_transform(data)
 
-    for _ in range(10):
-        U = np.copy(data)
-        np.fill_diagonal(U, 0.0)
-
-        # Fit
-        if linear:
-            model = runFitting(data)
-        else:
-            w, eta = factorizeEstimate(data, maxiter=20)
-
-        # Fill-in with model prediction
-        if linear:
-            predictt = model.predict(U)
-        else:
-            predictt = eta[0][:, np.newaxis] * expit(w @ U) / alpha
-
-        dataLast = np.copy(data)
-        data[missing] = predictt[missing]
-        change = np.linalg.norm(data - dataLast)
-        print(change, np.linalg.norm(dataLast))
+    
+    U = np.copy(data)
+    np.fill_diagonal(U, 0.0)
+
+    # Fit
+    w, eta = factorizeEstimate(data, maxiter=20)
+
+    # Fill-in with model prediction
+    predictt = eta[0][:, np.newaxis] * expit(w @ U) / alpha
+
+    dataLast = np.copy(data)
+    data[missing] = predictt[missing]
+    change = np.linalg.norm(data - dataLast)
+    print(change, np.linalg.norm(dataLast))
 
     return data
 

From 13f9c7c97798f4bea927e624412637dcfbea63f6 Mon Sep 17 00:00:00 2001
From: Aaron Meyer <ameyer@ucla.edu>
Date: Wed, 23 Feb 2022 12:48:20 -0800
Subject: [PATCH 2/3] Move imputation to within the fitting

---
 de/factorization.py           | 18 +++++++++++++++++-
 de/fancyimpute/soft_impute.py |  4 ++++
 de/impute.py                  | 29 ++++-------------------------
 3 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/de/factorization.py b/de/factorization.py
index 48634e4..d7dd760 100644
--- a/de/factorization.py
+++ b/de/factorization.py
@@ -6,6 +6,7 @@
 from scipy.optimize import minimize
 from scipy.special import expit, logit
 from .importData import importLINCS
+from .fancyimpute.soft_impute import SoftImpute
 
 
 alpha = 0.1
@@ -84,6 +85,7 @@ def calcEta(data: np.ndarray, w: np.ndarray, alphaIn: float) -> np.ndarray:
     """
     Calculate an estimate for eta based on data and current iteration of w.
     """
+    assert np.all(np.isfinite(data))
     U = np.copy(data)
     np.fill_diagonal(U, 0.0)
     expM = expit(w @ U)
@@ -92,6 +94,8 @@ def calcEta(data: np.ndarray, w: np.ndarray, alphaIn: float) -> np.ndarray:
     # Least squares with one coefficient and no intercept
     xy = np.sum(expM * aData, axis=1)
     xx = np.sum(expM * expM, axis=1)
+    assert np.all(np.isfinite(xy))
+    assert np.all(np.isfinite(xx))
 
     etta = xy / xx
     assert np.min(etta) >= 0.0
@@ -99,7 +103,7 @@ def calcEta(data: np.ndarray, w: np.ndarray, alphaIn: float) -> np.ndarray:
     return etta
 
 
-def factorizeEstimate(data: Union[list, np.ndarray], maxiter=300, returnCost=False):
+def factorizeEstimate(data: Union[list, np.ndarray], maxiter=300, returnCost=False, returnData=False):
     """
     Iteravely solve for w and eta list based on the data.
     :param data: matrix or list of matrices representing a cell line's gene expression interactions with knockdowns
@@ -117,6 +121,9 @@ def factorizeEstimate(data: Union[list, np.ndarray], maxiter=300, returnCost=Fal
     if isinstance(data, np.ndarray):
         data = [data]
 
+    missing = [np.isnan(d) for d in data]
+    data = [SoftImpute(min_value=0.0, verbose=False).fit_transform(d) for d in data]
+
     w = np.zeros((data[0].shape[0], data[0].shape[0]))
     etas = [calcEta(x, w, alpha) for x in data]
 
@@ -133,6 +140,12 @@ def factorizeEstimate(data: Union[list, np.ndarray], maxiter=300, returnCost=Fal
         else:
             wProposed = fitW(w, data, etas, alpha)
 
+        for jj, dd in enumerate(data):
+            U = np.copy(dd)
+            np.fill_diagonal(U, 0.0)
+            predictt = etas[jj][:, np.newaxis] * expit(wProposed @ U) / alpha
+            data[jj][missing[jj]] = predictt[missing[jj]]
+
         costNew = costF(data, wProposed, etas, alpha)
 
         if cost - costNew > 1e-3:
@@ -150,6 +163,9 @@ def factorizeEstimate(data: Union[list, np.ndarray], maxiter=300, returnCost=Fal
     if returnCost:
         return w, etas, cost
 
+    if returnData:
+        return w, etas, data
+
     return w, etas
 
 
diff --git a/de/fancyimpute/soft_impute.py b/de/fancyimpute/soft_impute.py
index e6161de..b5a75de 100644
--- a/de/fancyimpute/soft_impute.py
+++ b/de/fancyimpute/soft_impute.py
@@ -151,6 +151,10 @@ def solve(self, X, missing_mask):
 
         X_filled = X
         observed_mask = ~missing_mask
+        if np.sum(missing_mask) == 0:
+            if self.verbose:
+                print("[SoftImpute] No missing values.")
+            return X_filled
         max_singular_value = self._max_singular_value(X_filled)
         if self.verbose:
             print("[SoftImpute] Max Singular Value of X_init = %f" % (
diff --git a/de/impute.py b/de/impute.py
index a1e552a..4df3ed5 100644
--- a/de/impute.py
+++ b/de/impute.py
@@ -4,10 +4,7 @@
 import pandas as pd
 import seaborn as sns
 import itertools
-from scipy.special import expit
-from .factorization import alpha, factorizeEstimate
-from .linearModel import runFitting
-from .fancyimpute.soft_impute import SoftImpute
+from .factorization import factorizeEstimate
 from .importData import importLINCS, ImportMelanoma
 
 def split_data(X, n=10):
@@ -26,27 +23,9 @@ def split_data(X, n=10):
 
 def impute(data):
     """ Impute by repeated fitting. """
-    missing = np.isnan(data)
-
-    si = SoftImpute()
-    data = si.fit_transform(data)
-
-    
-    U = np.copy(data)
-    np.fill_diagonal(U, 0.0)
-
     # Fit
-    w, eta = factorizeEstimate(data, maxiter=20)
-
-    # Fill-in with model prediction
-    predictt = eta[0][:, np.newaxis] * expit(w @ U) / alpha
-
-    dataLast = np.copy(data)
-    data[missing] = predictt[missing]
-    change = np.linalg.norm(data - dataLast)
-    print(change, np.linalg.norm(dataLast))
-
-    return data
+    _, _, data = factorizeEstimate(data, maxiter=50, returnData=True)
+    return data[0]
 
 
 def repeatImputation(data, linear=False, numIter=20):
@@ -54,7 +33,7 @@ def repeatImputation(data, linear=False, numIter=20):
     coefs = []
     for _ in range(numIter):
         train_X, test_X = split_data(data)
-        full_X = impute(train_X, linear)
+        full_X = impute(train_X)
         corr_coef = ma.corrcoef(ma.masked_invalid(full_X.flatten()), ma.masked_invalid(test_X.flatten()))
         coefs.append(corr_coef[0][1])
     print(f"average corr coef: {sum(coefs)/len(coefs)}")

From 2e98627077eb83f053165acdb7439859350570b2 Mon Sep 17 00:00:00 2001
From: farnazm <farnazmd95@gmail.com>
Date: Sun, 27 Feb 2022 15:27:37 -0800
Subject: [PATCH 3/3] linear model edits

---
 de/figures/figure3.py |  6 +++++-
 de/impute.py          | 23 +++++++++++++++--------
 de/linearModel.py     | 11 ++++++++++-
 3 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/de/figures/figure3.py b/de/figures/figure3.py
index fc0c3ae..560a1fe 100644
--- a/de/figures/figure3.py
+++ b/de/figures/figure3.py
@@ -4,6 +4,7 @@
 from .common import subplotLabel, getSetup
 from ..graph import Network, load_w, remove, normalize, bar_graph
 from ..grndb_network import load_w_GRNdb, Network_GRNdb
+from ..impute import plot_imputation
 
 
 def makeFigure():
@@ -12,7 +13,7 @@ def makeFigure():
     :type f: Figure
     """
     # Get list of axis objects
-    ax, f = getSetup((10, 8), (2, 3))
+    ax, f = getSetup((16, 8), (2, 3))
     # load w for the Melanoma dataset from Torre paper
     w = load_w()
     w = normalize(w)
@@ -33,6 +34,9 @@ def makeFigure():
     Network_GRNdb(w_GRNdb, ax[3])
     ax[3].set_title("w Network Graph - GRNdb")
 
+    # ax[4] would be the boxplot comparing linear and nonlinear fitting
+    plot_imputation(ax[4])
+
     # Add subplot labels
     subplotLabel(ax)
     return f
diff --git a/de/impute.py b/de/impute.py
index 4df3ed5..2b046ce 100644
--- a/de/impute.py
+++ b/de/impute.py
@@ -5,6 +5,7 @@
 import seaborn as sns
 import itertools
 from .factorization import factorizeEstimate
+from .linearModel import runFitting
 from .importData import importLINCS, ImportMelanoma
 
 def split_data(X, n=10):
@@ -21,10 +22,14 @@ def split_data(X, n=10):
     return train_X, test_X
 
 
-def impute(data):
+def impute(data, linear=False):
     """ Impute by repeated fitting. """
-    # Fit
-    _, _, data = factorizeEstimate(data, maxiter=50, returnData=True)
+    if linear:
+        data = runFitting(data)
+    else:
+        # Fit nonlinear
+        _, _, data = factorizeEstimate(data, maxiter=50, returnData=True)
+
     return data[0]
 
 
@@ -33,7 +38,7 @@ def repeatImputation(data, linear=False, numIter=20):
     coefs = []
     for _ in range(numIter):
         train_X, test_X = split_data(data)
-        full_X = impute(train_X)
+        full_X = impute(train_X, linear)
         corr_coef = ma.corrcoef(ma.masked_invalid(full_X.flatten()), ma.masked_invalid(test_X.flatten()))
         coefs.append(corr_coef[0][1])
     print(f"average corr coef: {sum(coefs)/len(coefs)}")
@@ -49,6 +54,8 @@ def calc_imputation():
     for data in data_list:
         linear_coeffs.append(repeatImputation(data, linear=True))
         nonlinear_coeffs.append(repeatImputation(data))
+    print("linear ", linear_coeffs)
+    print("nonlinear ", nonlinear_coeffs)
 
     return linear_coeffs, nonlinear_coeffs
 
@@ -58,11 +65,11 @@ def plot_imputation(ax):
 
     n = len(linear[0])
     labels = 2 * [["A375"] * n, ["A549"] * n, ["HA1E"] * n, ["HT29"] * n, ["MCF7"] * n, ["PC3"] * n, ["Mel"] * n]
-    hue = [["linear"] * 5 * n, ["nonlinear"] * 5 * n]
-    df = pd.DataFrame({'correlation coef.': list(itertools.chain(linear + nonlinear)), 'cellLines': labels, 'model': hue})
-    sns.boxplot(x='cellLines', y='correlation coef', hue='model', data=df, ax=ax, split=True, jitter=0.2, palette=sns.color_palette('Paired'))
+    hue = [["linear"] * 7 * n, ["nonlinear"] * 7 * n]
+    df = pd.DataFrame({'correlation_coef.': list(itertools.chain(*(linear + nonlinear))), 'cellLines': list(itertools.chain(*labels)), 'model': list(itertools.chain(*hue))})
+    sns.boxplot(x='cellLines', y='correlation_coef.', hue='model', data=df, ax=ax, palette=sns.color_palette('Paired'))
     handles, labels = ax.get_legend_handles_labels()
     lgd = ax.legend(handles[0:2], labels[0:2],
                        loc='upper left',
                        fontsize='large',
-                       handletextpad=0.5)
\ No newline at end of file
+                       handletextpad=0.5)
diff --git a/de/linearModel.py b/de/linearModel.py
index 312447d..d0efe21 100644
--- a/de/linearModel.py
+++ b/de/linearModel.py
@@ -1,9 +1,14 @@
 from sklearn.linear_model import Lasso
 import numpy as np
+from .fancyimpute.soft_impute import SoftImpute
 
 
 def runFitting(data, U=None, alpha=1.0):
     """ Creates Lasso object, fits model to data. """
+
+    missing = np.isnan(data)
+    data = SoftImpute(min_value=0.0, verbose=False).fit_transform(data)
+
     if U is None:
         U = np.copy(data)
         np.fill_diagonal(U, 0.0)
@@ -11,4 +16,8 @@ def runFitting(data, U=None, alpha=1.0):
     model = Lasso(max_iter=300000, alpha=alpha)
     model.fit(U, data)
 
-    return model
+    predictt = model.predict(U)
+    dataLast = np.copy(data)
+    data[missing] = predictt[missing]
+
+    return [data]