From c33635a614ea1a92b74dbf37f5f4174c0c5c3c34 Mon Sep 17 00:00:00 2001 From: aryakrekhi Date: Wed, 23 Feb 2022 11:30:27 -0800 Subject: [PATCH 1/3] requested edits added --- de/impute.py | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/de/impute.py b/de/impute.py index bee714e..a1e552a 100644 --- a/de/impute.py +++ b/de/impute.py @@ -24,33 +24,27 @@ def split_data(X, n=10): return train_X, test_X -def impute(data, linear=False): +def impute(data): """ Impute by repeated fitting. """ missing = np.isnan(data) si = SoftImpute() data = si.fit_transform(data) - for _ in range(10): - U = np.copy(data) - np.fill_diagonal(U, 0.0) - - # Fit - if linear: - model = runFitting(data) - else: - w, eta = factorizeEstimate(data, maxiter=20) - - # Fill-in with model prediction - if linear: - predictt = model.predict(U) - else: - predictt = eta[0][:, np.newaxis] * expit(w @ U) / alpha - - dataLast = np.copy(data) - data[missing] = predictt[missing] - change = np.linalg.norm(data - dataLast) - print(change, np.linalg.norm(dataLast)) + + U = np.copy(data) + np.fill_diagonal(U, 0.0) + + # Fit + w, eta = factorizeEstimate(data, maxiter=20) + + # Fill-in with model prediction + predictt = eta[0][:, np.newaxis] * expit(w @ U) / alpha + + dataLast = np.copy(data) + data[missing] = predictt[missing] + change = np.linalg.norm(data - dataLast) + print(change, np.linalg.norm(dataLast)) return data From 13f9c7c97798f4bea927e624412637dcfbea63f6 Mon Sep 17 00:00:00 2001 From: Aaron Meyer Date: Wed, 23 Feb 2022 12:48:20 -0800 Subject: [PATCH 2/3] Move imputation to within the fitting --- de/factorization.py | 18 +++++++++++++++++- de/fancyimpute/soft_impute.py | 4 ++++ de/impute.py | 29 ++++------------------------- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/de/factorization.py b/de/factorization.py index 48634e4..d7dd760 100644 --- a/de/factorization.py +++ b/de/factorization.py @@ -6,6 +6,7 @@ from scipy.optimize import minimize from scipy.special import expit, logit from .importData import importLINCS +from .fancyimpute.soft_impute import SoftImpute alpha = 0.1 @@ -84,6 +85,7 @@ def calcEta(data: np.ndarray, w: np.ndarray, alphaIn: float) -> np.ndarray: """ Calculate an estimate for eta based on data and current iteration of w. """ + assert np.all(np.isfinite(data)) U = np.copy(data) np.fill_diagonal(U, 0.0) expM = expit(w @ U) @@ -92,6 +94,8 @@ def calcEta(data: np.ndarray, w: np.ndarray, alphaIn: float) -> np.ndarray: # Least squares with one coefficient and no intercept xy = np.sum(expM * aData, axis=1) xx = np.sum(expM * expM, axis=1) + assert np.all(np.isfinite(xy)) + assert np.all(np.isfinite(xx)) etta = xy / xx assert np.min(etta) >= 0.0 @@ -99,7 +103,7 @@ def calcEta(data: np.ndarray, w: np.ndarray, alphaIn: float) -> np.ndarray: return etta -def factorizeEstimate(data: Union[list, np.ndarray], maxiter=300, returnCost=False): +def factorizeEstimate(data: Union[list, np.ndarray], maxiter=300, returnCost=False, returnData=False): """ Iteravely solve for w and eta list based on the data. :param data: matrix or list of matrices representing a cell line's gene expression interactions with knockdowns @@ -117,6 +121,9 @@ def factorizeEstimate(data: Union[list, np.ndarray], maxiter=300, returnCost=Fal if isinstance(data, np.ndarray): data = [data] + missing = [np.isnan(d) for d in data] + data = [SoftImpute(min_value=0.0, verbose=False).fit_transform(d) for d in data] + w = np.zeros((data[0].shape[0], data[0].shape[0])) etas = [calcEta(x, w, alpha) for x in data] @@ -133,6 +140,12 @@ def factorizeEstimate(data: Union[list, np.ndarray], maxiter=300, returnCost=Fal else: wProposed = fitW(w, data, etas, alpha) + for jj, dd in enumerate(data): + U = np.copy(dd) + np.fill_diagonal(U, 0.0) + predictt = etas[jj][:, np.newaxis] * expit(wProposed @ U) / alpha + data[jj][missing[jj]] = predictt[missing[jj]] + costNew = costF(data, wProposed, etas, alpha) if cost - costNew > 1e-3: @@ -150,6 +163,9 @@ def factorizeEstimate(data: Union[list, np.ndarray], maxiter=300, returnCost=Fal if returnCost: return w, etas, cost + if returnData: + return w, etas, data + return w, etas diff --git a/de/fancyimpute/soft_impute.py b/de/fancyimpute/soft_impute.py index e6161de..b5a75de 100644 --- a/de/fancyimpute/soft_impute.py +++ b/de/fancyimpute/soft_impute.py @@ -151,6 +151,10 @@ def solve(self, X, missing_mask): X_filled = X observed_mask = ~missing_mask + if np.sum(missing_mask) == 0: + if self.verbose: + print("[SoftImpute] No missing values.") + return X_filled max_singular_value = self._max_singular_value(X_filled) if self.verbose: print("[SoftImpute] Max Singular Value of X_init = %f" % ( diff --git a/de/impute.py b/de/impute.py index a1e552a..4df3ed5 100644 --- a/de/impute.py +++ b/de/impute.py @@ -4,10 +4,7 @@ import pandas as pd import seaborn as sns import itertools -from scipy.special import expit -from .factorization import alpha, factorizeEstimate -from .linearModel import runFitting -from .fancyimpute.soft_impute import SoftImpute +from .factorization import factorizeEstimate from .importData import importLINCS, ImportMelanoma def split_data(X, n=10): @@ -26,27 +23,9 @@ def split_data(X, n=10): def impute(data): """ Impute by repeated fitting. """ - missing = np.isnan(data) - - si = SoftImpute() - data = si.fit_transform(data) - - - U = np.copy(data) - np.fill_diagonal(U, 0.0) - # Fit - w, eta = factorizeEstimate(data, maxiter=20) - - # Fill-in with model prediction - predictt = eta[0][:, np.newaxis] * expit(w @ U) / alpha - - dataLast = np.copy(data) - data[missing] = predictt[missing] - change = np.linalg.norm(data - dataLast) - print(change, np.linalg.norm(dataLast)) - - return data + _, _, data = factorizeEstimate(data, maxiter=50, returnData=True) + return data[0] def repeatImputation(data, linear=False, numIter=20): @@ -54,7 +33,7 @@ def repeatImputation(data, linear=False, numIter=20): coefs = [] for _ in range(numIter): train_X, test_X = split_data(data) - full_X = impute(train_X, linear) + full_X = impute(train_X) corr_coef = ma.corrcoef(ma.masked_invalid(full_X.flatten()), ma.masked_invalid(test_X.flatten())) coefs.append(corr_coef[0][1]) print(f"average corr coef: {sum(coefs)/len(coefs)}") From 2e98627077eb83f053165acdb7439859350570b2 Mon Sep 17 00:00:00 2001 From: farnazm Date: Sun, 27 Feb 2022 15:27:37 -0800 Subject: [PATCH 3/3] linear model edits --- de/figures/figure3.py | 6 +++++- de/impute.py | 23 +++++++++++++++-------- de/linearModel.py | 11 ++++++++++- 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/de/figures/figure3.py b/de/figures/figure3.py index fc0c3ae..560a1fe 100644 --- a/de/figures/figure3.py +++ b/de/figures/figure3.py @@ -4,6 +4,7 @@ from .common import subplotLabel, getSetup from ..graph import Network, load_w, remove, normalize, bar_graph from ..grndb_network import load_w_GRNdb, Network_GRNdb +from ..impute import plot_imputation def makeFigure(): @@ -12,7 +13,7 @@ def makeFigure(): :type f: Figure """ # Get list of axis objects - ax, f = getSetup((10, 8), (2, 3)) + ax, f = getSetup((16, 8), (2, 3)) # load w for the Melanoma dataset from Torre paper w = load_w() w = normalize(w) @@ -33,6 +34,9 @@ def makeFigure(): Network_GRNdb(w_GRNdb, ax[3]) ax[3].set_title("w Network Graph - GRNdb") + # ax[4] would be the boxplot comparing linear and nonlinear fitting + plot_imputation(ax[4]) + # Add subplot labels subplotLabel(ax) return f diff --git a/de/impute.py b/de/impute.py index 4df3ed5..2b046ce 100644 --- a/de/impute.py +++ b/de/impute.py @@ -5,6 +5,7 @@ import seaborn as sns import itertools from .factorization import factorizeEstimate +from .linearModel import runFitting from .importData import importLINCS, ImportMelanoma def split_data(X, n=10): @@ -21,10 +22,14 @@ def split_data(X, n=10): return train_X, test_X -def impute(data): +def impute(data, linear=False): """ Impute by repeated fitting. """ - # Fit - _, _, data = factorizeEstimate(data, maxiter=50, returnData=True) + if linear: + data = runFitting(data) + else: + # Fit nonlinear + _, _, data = factorizeEstimate(data, maxiter=50, returnData=True) + return data[0] @@ -33,7 +38,7 @@ def repeatImputation(data, linear=False, numIter=20): coefs = [] for _ in range(numIter): train_X, test_X = split_data(data) - full_X = impute(train_X) + full_X = impute(train_X, linear) corr_coef = ma.corrcoef(ma.masked_invalid(full_X.flatten()), ma.masked_invalid(test_X.flatten())) coefs.append(corr_coef[0][1]) print(f"average corr coef: {sum(coefs)/len(coefs)}") @@ -49,6 +54,8 @@ def calc_imputation(): for data in data_list: linear_coeffs.append(repeatImputation(data, linear=True)) nonlinear_coeffs.append(repeatImputation(data)) + print("linear ", linear_coeffs) + print("nonlinear ", nonlinear_coeffs) return linear_coeffs, nonlinear_coeffs @@ -58,11 +65,11 @@ def plot_imputation(ax): n = len(linear[0]) labels = 2 * [["A375"] * n, ["A549"] * n, ["HA1E"] * n, ["HT29"] * n, ["MCF7"] * n, ["PC3"] * n, ["Mel"] * n] - hue = [["linear"] * 5 * n, ["nonlinear"] * 5 * n] - df = pd.DataFrame({'correlation coef.': list(itertools.chain(linear + nonlinear)), 'cellLines': labels, 'model': hue}) - sns.boxplot(x='cellLines', y='correlation coef', hue='model', data=df, ax=ax, split=True, jitter=0.2, palette=sns.color_palette('Paired')) + hue = [["linear"] * 7 * n, ["nonlinear"] * 7 * n] + df = pd.DataFrame({'correlation_coef.': list(itertools.chain(*(linear + nonlinear))), 'cellLines': list(itertools.chain(*labels)), 'model': list(itertools.chain(*hue))}) + sns.boxplot(x='cellLines', y='correlation_coef.', hue='model', data=df, ax=ax, palette=sns.color_palette('Paired')) handles, labels = ax.get_legend_handles_labels() lgd = ax.legend(handles[0:2], labels[0:2], loc='upper left', fontsize='large', - handletextpad=0.5) \ No newline at end of file + handletextpad=0.5) diff --git a/de/linearModel.py b/de/linearModel.py index 312447d..d0efe21 100644 --- a/de/linearModel.py +++ b/de/linearModel.py @@ -1,9 +1,14 @@ from sklearn.linear_model import Lasso import numpy as np +from .fancyimpute.soft_impute import SoftImpute def runFitting(data, U=None, alpha=1.0): """ Creates Lasso object, fits model to data. """ + + missing = np.isnan(data) + data = SoftImpute(min_value=0.0, verbose=False).fit_transform(data) + if U is None: U = np.copy(data) np.fill_diagonal(U, 0.0) @@ -11,4 +16,8 @@ def runFitting(data, U=None, alpha=1.0): model = Lasso(max_iter=300000, alpha=alpha) model.fit(U, data) - return model + predictt = model.predict(U) + dataLast = np.copy(data) + data[missing] = predictt[missing] + + return [data]