Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion de/factorization.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from scipy.optimize import minimize
from scipy.special import expit, logit
from .importData import importLINCS
from .fancyimpute.soft_impute import SoftImpute


alpha = 0.1
Expand Down Expand Up @@ -84,6 +85,7 @@ def calcEta(data: np.ndarray, w: np.ndarray, alphaIn: float) -> np.ndarray:
"""
Calculate an estimate for eta based on data and current iteration of w.
"""
assert np.all(np.isfinite(data))
U = np.copy(data)
np.fill_diagonal(U, 0.0)
expM = expit(w @ U)
Expand All @@ -92,14 +94,16 @@ def calcEta(data: np.ndarray, w: np.ndarray, alphaIn: float) -> np.ndarray:
# Least squares with one coefficient and no intercept
xy = np.sum(expM * aData, axis=1)
xx = np.sum(expM * expM, axis=1)
assert np.all(np.isfinite(xy))
assert np.all(np.isfinite(xx))

etta = xy / xx
assert np.min(etta) >= 0.0
assert np.max(etta) < 1e10
return etta


def factorizeEstimate(data: Union[list, np.ndarray], maxiter=300, returnCost=False):
def factorizeEstimate(data: Union[list, np.ndarray], maxiter=300, returnCost=False, returnData=False):
"""
Iteratively solve for w and eta list based on the data.
:param data: matrix or list of matrices representing a cell line's gene expression interactions with knockdowns
Expand All @@ -117,6 +121,9 @@ def factorizeEstimate(data: Union[list, np.ndarray], maxiter=300, returnCost=Fal
if isinstance(data, np.ndarray):
data = [data]

missing = [np.isnan(d) for d in data]
data = [SoftImpute(min_value=0.0, verbose=False).fit_transform(d) for d in data]

w = np.zeros((data[0].shape[0], data[0].shape[0]))
etas = [calcEta(x, w, alpha) for x in data]

Expand All @@ -133,6 +140,12 @@ def factorizeEstimate(data: Union[list, np.ndarray], maxiter=300, returnCost=Fal
else:
wProposed = fitW(w, data, etas, alpha)

for jj, dd in enumerate(data):
U = np.copy(dd)
np.fill_diagonal(U, 0.0)
predictt = etas[jj][:, np.newaxis] * expit(wProposed @ U) / alpha
data[jj][missing[jj]] = predictt[missing[jj]]

costNew = costF(data, wProposed, etas, alpha)

if cost - costNew > 1e-3:
Expand All @@ -150,6 +163,9 @@ def factorizeEstimate(data: Union[list, np.ndarray], maxiter=300, returnCost=Fal
if returnCost:
return w, etas, cost

if returnData:
return w, etas, data

return w, etas


Expand Down
4 changes: 4 additions & 0 deletions de/fancyimpute/soft_impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,10 @@ def solve(self, X, missing_mask):

X_filled = X
observed_mask = ~missing_mask
if np.sum(missing_mask) == 0:
if self.verbose:
print("[SoftImpute] No missing values.")
return X_filled
max_singular_value = self._max_singular_value(X_filled)
if self.verbose:
print("[SoftImpute] Max Singular Value of X_init = %f" % (
Expand Down
6 changes: 5 additions & 1 deletion de/figures/figure3.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .common import subplotLabel, getSetup
from ..graph import Network, load_w, remove, normalize, bar_graph
from ..grndb_network import load_w_GRNdb, Network_GRNdb
from ..impute import plot_imputation


def makeFigure():
Expand All @@ -12,7 +13,7 @@ def makeFigure():
:type f: Figure
"""
# Get list of axis objects
ax, f = getSetup((10, 8), (2, 3))
ax, f = getSetup((16, 8), (2, 3))
# load w for the Melanoma dataset from Torre paper
w = load_w()
w = normalize(w)
Expand All @@ -33,6 +34,9 @@ def makeFigure():
Network_GRNdb(w_GRNdb, ax[3])
ax[3].set_title("w Network Graph - GRNdb")

# ax[4] would be the boxplot comparing linear and nonlinear fitting
plot_imputation(ax[4])

# Add subplot labels
subplotLabel(ax)
return f
46 changes: 13 additions & 33 deletions de/impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,8 @@
import pandas as pd
import seaborn as sns
import itertools
from scipy.special import expit
from .factorization import alpha, factorizeEstimate
from .factorization import factorizeEstimate
from .linearModel import runFitting
from .fancyimpute.soft_impute import SoftImpute
from .importData import importLINCS, ImportMelanoma

def split_data(X, n=10):
Expand All @@ -26,33 +24,13 @@ def split_data(X, n=10):

def impute(data, linear=False):
    """
    Impute by fitting a model and returning the data it filled in.

    :param data: matrix to impute; passed unchanged to the selected fitter
    :param linear: if True, fit with runFitting; otherwise fit the nonlinear
        model with factorizeEstimate (maxiter=50, returnData=True)
    :return: the first element of the list of filled-in data matrices
        produced by the selected fitter
    """
    if linear:
        data = runFitting(data)
    else:
        # Fit nonlinear; only the filled-in data (third return value) is needed
        _, _, data = factorizeEstimate(data, maxiter=50, returnData=True)

    # Both fitters hand back a list of matrices; a single matrix went in,
    # so the result of interest is the first entry.
    return data[0]


def repeatImputation(data, linear=False, numIter=20):
Expand All @@ -76,6 +54,8 @@ def calc_imputation():
for data in data_list:
linear_coeffs.append(repeatImputation(data, linear=True))
nonlinear_coeffs.append(repeatImputation(data))
print("linear ", linear_coeffs)
print("nonlinear ", nonlinear_coeffs)

return linear_coeffs, nonlinear_coeffs

Expand All @@ -85,11 +65,11 @@ def plot_imputation(ax):

n = len(linear[0])
labels = 2 * [["A375"] * n, ["A549"] * n, ["HA1E"] * n, ["HT29"] * n, ["MCF7"] * n, ["PC3"] * n, ["Mel"] * n]
hue = [["linear"] * 5 * n, ["nonlinear"] * 5 * n]
df = pd.DataFrame({'correlation coef.': list(itertools.chain(linear + nonlinear)), 'cellLines': labels, 'model': hue})
sns.boxplot(x='cellLines', y='correlation coef', hue='model', data=df, ax=ax, split=True, jitter=0.2, palette=sns.color_palette('Paired'))
hue = [["linear"] * 7 * n, ["nonlinear"] * 7 * n]
df = pd.DataFrame({'correlation_coef.': list(itertools.chain(*(linear + nonlinear))), 'cellLines': list(itertools.chain(*labels)), 'model': list(itertools.chain(*hue))})
sns.boxplot(x='cellLines', y='correlation_coef.', hue='model', data=df, ax=ax, palette=sns.color_palette('Paired'))
handles, labels = ax.get_legend_handles_labels()
lgd = ax.legend(handles[0:2], labels[0:2],
loc='upper left',
fontsize='large',
handletextpad=0.5)
handletextpad=0.5)
11 changes: 10 additions & 1 deletion de/linearModel.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,23 @@
from sklearn.linear_model import Lasso
import numpy as np
from .fancyimpute.soft_impute import SoftImpute


def runFitting(data, U=None, alpha=1.0):
    """
    Fit a Lasso model to data and fill in its missing entries.

    NaN entries are first filled with SoftImpute (min_value=0.0), a Lasso
    model is fit to predict the filled data from U, and the entries that
    were originally NaN are then overwritten with the model's predictions.

    :param data: array with NaN marking the missing entries
        (assumed to be a 2-D matrix -- TODO confirm against callers)
    :param U: matrix the Lasso model is fit on; when None, a copy of the
        SoftImpute-filled data with its diagonal set to 0.0 is used
    :param alpha: regularization strength passed to Lasso
    :return: a one-element list containing the filled-in data matrix
    """
    # Record which entries are missing before they are filled in
    missing = np.isnan(data)
    data = SoftImpute(min_value=0.0, verbose=False).fit_transform(data)

    if U is None:
        U = np.copy(data)
        np.fill_diagonal(U, 0.0)

    model = Lasso(max_iter=300000, alpha=alpha)
    model.fit(U, data)

    # Replace only the originally-missing entries with the model prediction
    predictt = model.predict(U)
    data[missing] = predictt[missing]

    return [data]