de/factorization.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -6,6 +6,7 @@ @@
     from scipy.optimize import minimize
     from scipy.special import expit, logit
     from .importData import importLINCS
+    from .fancyimpute.soft_impute import SoftImpute
     alpha = 0.1
@@ Expand Down Expand Up @@
         """
         Calculate an estimate for eta based on data and current iteration of w.
         """
+        assert np.all(np.isfinite(data))
         U = np.copy(data)
         np.fill_diagonal(U, 0.0)
         expM = expit(w @ U)
@@ Expand All @@
         # Least squares with one coefficient and no intercept
         xy = np.sum(expM * aData, axis=1)
         xx = np.sum(expM * expM, axis=1)
+        assert np.all(np.isfinite(xy))
+        assert np.all(np.isfinite(xx))
         etta = xy / xx
         assert np.min(etta) >= 0.0
         assert np.max(etta) < 1e10
         return etta
-    def factorizeEstimate(data: Union[list, np.ndarray], maxiter=300, returnCost=False):
+    def factorizeEstimate(data: Union[list, np.ndarray], maxiter=300, returnCost=False, returnData=False):
         """
         Iteratively solve for w and eta list based on the data.
         :param data: matrix or list of matrices representing a cell line's gene expression interactions with knockdowns
@@ Expand All @@
         if isinstance(data, np.ndarray):
             data = [data]
+        missing = [np.isnan(d) for d in data]
+        data = [SoftImpute(min_value=0.0, verbose=False).fit_transform(d) for d in data]
         w = np.zeros((data[0].shape[0], data[0].shape[0]))
         etas = [calcEta(x, w, alpha) for x in data]
@@ Expand All @@
             else:
                 wProposed = fitW(w, data, etas, alpha)
+            for jj, dd in enumerate(data):
+                U = np.copy(dd)
+                np.fill_diagonal(U, 0.0)
+                predictt = etas[jj][:, np.newaxis] * expit(wProposed @ U) / alpha
+                data[jj][missing[jj]] = predictt[missing[jj]]
             costNew = costF(data, wProposed, etas, alpha)
             if cost - costNew > 1e-3:
@@ Expand All @@
         if returnCost:
             return w, etas, cost
+        if returnData:
+            return w, etas, data
         return w, etas
@@ Expand Down @@

de/fancyimpute/soft_impute.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -151,6 +151,10 @@ def solve(self, X, missing_mask): @@
             X_filled = X
             observed_mask = ~missing_mask
+            if np.sum(missing_mask) == 0:
+                if self.verbose:
+                    print("[SoftImpute] No missing values.")
+                return X_filled
             max_singular_value = self._max_singular_value(X_filled)
             if self.verbose:
                 print("[SoftImpute] Max Singular Value of X_init = %f" % (
@@ Expand Down @@

de/figures/figure3.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -4,6 +4,7 @@ @@
     from .common import subplotLabel, getSetup
     from ..graph import Network, load_w, remove, normalize, bar_graph
     from ..grndb_network import load_w_GRNdb, Network_GRNdb
+    from ..impute import plot_imputation
     def makeFigure():
@@ Expand All / @@ -12,7 +13,7 @@ def makeFigure(): @@
         :type f: Figure
         """
         # Get list of axis objects
-        ax, f = getSetup((10, 8), (2, 3))
+        ax, f = getSetup((16, 8), (2, 3))
         # load w for the Melanoma dataset from Torre paper
         w = load_w()
         w = normalize(w)
@@ Expand All / @@ -33,6 +34,9 @@ def makeFigure(): @@
         Network_GRNdb(w_GRNdb, ax[3])
         ax[3].set_title("w Network Graph - GRNdb")
+        # ax[4] would be the boxplot comparing linear and nonlinear fitting
+        plot_imputation(ax[4])
         # Add subplot labels
         subplotLabel(ax)
         return f

de/impute.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -4,10 +4,8 @@
  
    import pandas as pd

    import seaborn as sns

    import itertools

    from scipy.special import expit

    from .factorization import alpha, factorizeEstimate

    from .factorization import factorizeEstimate

    from .linearModel import runFitting

    from .fancyimpute.soft_impute import SoftImpute

    from .importData import importLINCS, ImportMelanoma

    def split_data(X, n=10):

    @@ -26,33 +24,13 @@ def split_data(X, n=10):
  
    def impute(data, linear=False):

        """ Impute by repeated fitting. """

        missing = np.isnan(data)

        if linear:

            data = runFitting(data)

        else:

            # Fit nonlinear

            _, _, data = factorizeEstimate(data, maxiter=50, returnData=True)

        si = SoftImpute()

        data = si.fit_transform(data)

        for _ in range(10):

            U = np.copy(data)

            np.fill_diagonal(U, 0.0)

            # Fit

            if linear:

                model = runFitting(data)

            else:

                w, eta = factorizeEstimate(data, maxiter=20)

            # Fill-in with model prediction

            if linear:

                predictt = model.predict(U)

            else:

                predictt = eta[0][:, np.newaxis] * expit(w @ U) / alpha

            dataLast = np.copy(data)

            data[missing] = predictt[missing]

            change = np.linalg.norm(data - dataLast)

            print(change, np.linalg.norm(dataLast))

        return data

        return data[0]

    def repeatImputation(data, linear=False, numIter=20):

    @@ -76,6 +54,8 @@ def calc_imputation():
  
        for data in data_list:

            linear_coeffs.append(repeatImputation(data, linear=True))

            nonlinear_coeffs.append(repeatImputation(data))

        print("linear ", linear_coeffs)

        print("nonlinear ", nonlinear_coeffs)

        return linear_coeffs, nonlinear_coeffs

    @@ -85,11 +65,11 @@ def plot_imputation(ax):
  
        n = len(linear[0])

        labels = 2 * [["A375"] * n, ["A549"] * n, ["HA1E"] * n, ["HT29"] * n, ["MCF7"] * n, ["PC3"] * n, ["Mel"] * n]

        hue = [["linear"] * 5 * n, ["nonlinear"] * 5 * n]

        df = pd.DataFrame({'correlation coef.': list(itertools.chain(linear + nonlinear)), 'cellLines': labels, 'model': hue})

        sns.boxplot(x='cellLines', y='correlation coef', hue='model', data=df, ax=ax, split=True, jitter=0.2, palette=sns.color_palette('Paired'))

        hue = [["linear"] * 7 * n, ["nonlinear"] * 7 * n]

        df = pd.DataFrame({'correlation_coef.': list(itertools.chain(*(linear + nonlinear))), 'cellLines': list(itertools.chain(*labels)), 'model': list(itertools.chain(*hue))})

        sns.boxplot(x='cellLines', y='correlation_coef.', hue='model', data=df, ax=ax, palette=sns.color_palette('Paired'))

        handles, labels = ax.get_legend_handles_labels()

        lgd = ax.legend(handles[0:2], labels[0:2],

                           loc='upper left',

                           fontsize='large',

                           handletextpad=0.5)
      
                           handletextpad=0.5)

de/linearModel.py

-Original file line number
+Diff line change
@@ -1,14 +1,23 @@
     from sklearn.linear_model import Lasso
     import numpy as np
+    from .fancyimpute.soft_impute import SoftImpute
     def runFitting(data, U=None, alpha=1.0):
         """ Creates Lasso object, fits model to data. """
+        missing = np.isnan(data)
+        data = SoftImpute(min_value=0.0, verbose=False).fit_transform(data)
         if U is None:
             U = np.copy(data)
             np.fill_diagonal(U, 0.0)
         model = Lasso(max_iter=300000, alpha=alpha)
         model.fit(U, data)
-        return model
+        predictt = model.predict(U)
+        dataLast = np.copy(data)
+        data[missing] = predictt[missing]
+        return [data]

requested edits added #161

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open

aryakrekhi wants to merge 3 commits into main from impute_edits

-Original file line number
+Diff line change
@@ Expand Up / @@ -6,6 +6,7 @@ @@
     from scipy.optimize import minimize
     from scipy.special import expit, logit
     from .importData import importLINCS
+    from .fancyimpute.soft_impute import SoftImpute
     alpha = 0.1
@@ Expand Down Expand Up @@
         """
         Calculate an estimate for eta based on data and current iteration of w.
         """
+        assert np.all(np.isfinite(data))
         U = np.copy(data)
         np.fill_diagonal(U, 0.0)
         expM = expit(w @ U)
@@ Expand All @@
         # Least squares with one coefficient and no intercept
         xy = np.sum(expM * aData, axis=1)
         xx = np.sum(expM * expM, axis=1)
+        assert np.all(np.isfinite(xy))
+        assert np.all(np.isfinite(xx))
         etta = xy / xx
         assert np.min(etta) >= 0.0
         assert np.max(etta) < 1e10
         return etta
-    def factorizeEstimate(data: Union[list, np.ndarray], maxiter=300, returnCost=False):
+    def factorizeEstimate(data: Union[list, np.ndarray], maxiter=300, returnCost=False, returnData=False):
         """
         Iteratively solve for w and eta list based on the data.
         :param data: matrix or list of matrices representing a cell line's gene expression interactions with knockdowns
@@ Expand All @@
         if isinstance(data, np.ndarray):
             data = [data]
+        missing = [np.isnan(d) for d in data]
+        data = [SoftImpute(min_value=0.0, verbose=False).fit_transform(d) for d in data]
         w = np.zeros((data[0].shape[0], data[0].shape[0]))
         etas = [calcEta(x, w, alpha) for x in data]
@@ Expand All @@
             else:
                 wProposed = fitW(w, data, etas, alpha)
+            for jj, dd in enumerate(data):
+                U = np.copy(dd)
+                np.fill_diagonal(U, 0.0)
+                predictt = etas[jj][:, np.newaxis] * expit(wProposed @ U) / alpha
+                data[jj][missing[jj]] = predictt[missing[jj]]
             costNew = costF(data, wProposed, etas, alpha)
             if cost - costNew > 1e-3:
@@ Expand All @@
         if returnCost:
             return w, etas, cost
+        if returnData:
+            return w, etas, data
         return w, etas
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -151,6 +151,10 @@ def solve(self, X, missing_mask): @@
             X_filled = X
             observed_mask = ~missing_mask
+            if np.sum(missing_mask) == 0:
+                if self.verbose:
+                    print("[SoftImpute] No missing values.")
+                return X_filled
             max_singular_value = self._max_singular_value(X_filled)
             if self.verbose:
                 print("[SoftImpute] Max Singular Value of X_init = %f" % (
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -4,6 +4,7 @@ @@
     from .common import subplotLabel, getSetup
     from ..graph import Network, load_w, remove, normalize, bar_graph
     from ..grndb_network import load_w_GRNdb, Network_GRNdb
+    from ..impute import plot_imputation
     def makeFigure():
@@ Expand All / @@ -12,7 +13,7 @@ def makeFigure(): @@
         :type f: Figure
         """
         # Get list of axis objects
-        ax, f = getSetup((10, 8), (2, 3))
+        ax, f = getSetup((16, 8), (2, 3))
         # load w for the Melanoma dataset from Torre paper
         w = load_w()
         w = normalize(w)
@@ Expand All / @@ -33,6 +34,9 @@ def makeFigure(): @@
         Network_GRNdb(w_GRNdb, ax[3])
         ax[3].set_title("w Network Graph - GRNdb")
+        # ax[4] would be the boxplot comparing linear and nonlinear fitting
+        plot_imputation(ax[4])
         # Add subplot labels
         subplotLabel(ax)
         return f

-Original file line number
+Diff line change
@@ -1,14 +1,23 @@
     from sklearn.linear_model import Lasso
     import numpy as np
+    from .fancyimpute.soft_impute import SoftImpute
     def runFitting(data, U=None, alpha=1.0):
         """ Creates Lasso object, fits model to data. """
+        missing = np.isnan(data)
+        data = SoftImpute(min_value=0.0, verbose=False).fit_transform(data)
         if U is None:
             U = np.copy(data)
             np.fill_diagonal(U, 0.0)
         model = Lasso(max_iter=300000, alpha=alpha)
         model.fit(U, data)
-        return model
+        predictt = model.predict(U)
+        dataLast = np.copy(data)
+        data[missing] = predictt[missing]
+        return [data]

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

requested edits added #161

Uh oh!

Diff view

Diff view

There are no files selected for viewing

requested edits added #161

Are you sure you want to change the base?

Uh oh!

requested edits added #161

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing