Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions dlomix/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,49 @@
"W": 19,
"Y": 20,
}

# Amino-acid vocabulary extended with two modification tokens:
# "^" -> 21 and "}" -> 22. Indices start at 1 (0 is reserved for padding
# by convention in the unmodified alphabet above).
ALPHABET_MOD = {
    residue: index
    for index, residue in enumerate("ACDEFGHIKLMNPQRSTVWY^}", start=1)
}

# Configuration for the Monte-Carlo-dropout (MCD) uncertainty pipeline.
MCD_pipeline_parameters = {
    # params for full dataset with modified alphabet:
    # "model_params": {"seq_length": 30, "vocab_dict": ALPHABET_MOD.copy()},
    # "data_params": {"seq_length": 30, 'sequence_col': 'modified_sequence_single_letter', 'target_col': 'median'},
    "model_params": dict(seq_length=30, vocab_dict=ALPHABET_UNMOD.copy()),
    "data_params": dict(seq_length=30),
    # TODO upload model in dlomix and change path
    "base_model_path": "../data/models/rtmodel_prosit_epoch20",
    "test_set_path": "https://raw.githubusercontent.com/wilhelm-lab/dlomix/develop/example_dataset/proteomTools_test.csv",
}


# Configuration for the quantile-regression (QR) uncertainty pipeline.
QR_pipeline_parameters = {
    "model_params": dict(seq_length=30, vocab_dict=ALPHABET_UNMOD.copy()),
    "data_params": dict(seq_length=30),
    # specify path of desired model here
    "base_model_path": "../../data/models/rtmodel_prosit_epoch20",
    "train_val_path": 'https://raw.githubusercontent.com/wilhelm-lab/dlomix/develop/example_dataset/proteomTools_train_val.csv',
    "test_path": 'https://raw.githubusercontent.com/wilhelm-lab/dlomix/develop/example_dataset/proteomTools_test.csv',
}
4 changes: 3 additions & 1 deletion dlomix/eval/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from .rt_eval import TimeDeltaMetric
from .interval_conformal import (
    IntervalSize,
    AbsoluteIntervalSize,
    RelativeCentralDistance,
    IntervalConformalScore,
    IntervalConformalQuantile,
)
from .scalar_conformal import ScalarConformalScore, ScalarConformalQuantile

# __all__ must contain the *names* as strings, not the objects themselves:
# with objects, `from dlomix.eval import *` raises
# "TypeError: Item in __all__ must be str".
__all__ = [
    "TimeDeltaMetric",
    "IntervalSize",
    "AbsoluteIntervalSize",
    "RelativeCentralDistance",
    "IntervalConformalScore",
    "IntervalConformalQuantile",
    "ScalarConformalScore",
    "ScalarConformalQuantile",
]
104 changes: 104 additions & 0 deletions dlomix/eval/interval_conformal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import tensorflow as tf

class IntervalSize(tf.keras.losses.Loss):
    '''
    Signed size of the prediction interval.

    Expects y_pred of shape (batch, 2) holding lower and upper interval
    boundaries; y_true is ignored. The size is negative when the
    boundaries are crossed.
    '''
    def __init__(self, name="interval_size", **kwargs):
        super().__init__(name=name, **kwargs)

    def call(self, y_true, y_pred):
        lower = y_pred[:, 0]
        upper = y_pred[:, 1]
        return upper - lower

    def get_config(self):
        # No extra hyper-parameters beyond the base Loss config.
        return dict(super().get_config())


class AbsoluteIntervalSize(tf.keras.losses.Loss):
    '''
    Absolute size of the prediction interval.

    Same layout as IntervalSize (y_pred is (batch, 2), y_true unused),
    but the width is returned as a magnitude so crossed boundaries
    still yield a positive size.
    '''
    def __init__(self, name="abs_interval_size", **kwargs):
        super().__init__(name=name, **kwargs)

    def call(self, y_true, y_pred):
        width = y_pred[:, 1] - y_pred[:, 0]
        return tf.abs(width)

    def get_config(self):
        # No extra hyper-parameters beyond the base Loss config.
        return dict(super().get_config())


class RelativeCentralDistance(tf.keras.losses.Loss):
    '''
    Distance of the true value from the center of the prediction interval,
    divided by half of the prediction interval size.
    The result is 0 if the true value is at the center of the prediction
    interval, 1 if at the (symmetric) interval boundary and >1 otherwise.

    Expects y_pred of shape (batch, 2) with interval boundaries and
    y_true with the true value in column 0.
    '''
    def __init__(self, name="relative_central_distance", **kwargs):
        super().__init__(name=name, **kwargs)

    def call(self, y_true, y_pred):
        # absolute distance between the two interval boundaries
        # (if upper boundary is below lower, this is nevertheless positive)
        interval_size = tf.math.abs(tf.subtract(y_pred[:,1], y_pred[:,0]))
        # absolute distance of the true value from the interval center/mean
        central_dist = tf.math.abs(tf.subtract(y_true[:,0], tf.math.reduce_mean(y_pred, axis=-1)))
        # divide by half of interval size
        res = tf.divide(central_dist, tf.divide(interval_size, 2.))
        # Zero-width intervals make the division produce inf; those
        # entries are overwritten with 0 below.
        # NOTE(review): 0/0 yields nan, which is_inf does NOT catch —
        # confirm whether a degenerate interval with the truth exactly at
        # its center can occur, and whether mapping inf to 0 (i.e. "perfect")
        # is the intended treatment of zero-width intervals.
        indices = tf.where(tf.math.is_inf(res))
        return tf.tensor_scatter_nd_update(
            res,
            indices,
            tf.ones((tf.shape(indices)[0])) * 0.
        )

    def get_config(self):
        config = {}
        base_config = super().get_config()
        return {**base_config, **config}


class IntervalConformalScore(tf.keras.losses.Loss):
    '''
    Conformal scores for prediction intervals pred_intervals and true
    values y_true.

    Per sample the score is max(lower - y, y - upper): negative when the
    true value lies inside the interval, positive when it lies outside.
    '''
    def __init__(self, name="interval_conformal_score", **kwargs):
        super().__init__(name=name, **kwargs)

    def call(self, y_true, pred_intervals):
        diff = pred_intervals - y_true
        lower_gap = diff[:, 0]    # lower boundary minus truth
        upper_gap = -diff[:, 1]   # truth minus upper boundary
        return tf.maximum(lower_gap, upper_gap)

    def get_config(self):
        # No extra hyper-parameters beyond the base Loss config.
        return dict(super().get_config())


class IntervalConformalQuantile(tf.keras.losses.Loss):
    '''
    Computes the conformal quantile based on the distribution of conformal
    scores for prediction intervals pred_intervals and true values y_true.

    With a calibration set of size n and miscoverage level alpha, the
    quantile is the ceil((n + 1) * (1 - alpha))-th smallest score
    (clamped into range for small n / small alpha).
    '''
    def __init__(self, alpha=0.1, name="interval_conformal_quantile", **kwargs):
        super().__init__(name=name, **kwargs)
        # target miscoverage level (e.g. 0.1 for 90% coverage)
        self.alpha = alpha

    def call(self, y_true, pred_intervals):
        # per-sample conformal score: max(lower - y, y - upper)
        diff = tf.subtract(pred_intervals, y_true)
        scores = tf.maximum(diff[:, 0], -diff[:, 1])
        # cast to float: tf.math.ceil rejects integer tensors
        n = tf.cast(tf.shape(y_true)[0], tf.float32)
        # 0-based rank of the ceil((n+1)(1-alpha))-th smallest score.
        # The previous version indexed with int(q * n) (no "-1"), selecting
        # one rank too high relative to the standard conformal formula;
        # tensor ops also replace Python int(), which only worked eagerly.
        rank = tf.cast(tf.math.ceil((n + 1.) * (1. - self.alpha)), tf.int32) - 1
        # clamp so the index stays valid when ceil((n+1)(1-alpha)) > n
        rank = tf.clip_by_value(rank, 0, tf.cast(n, tf.int32) - 1)
        return tf.sort(scores, axis=-1, direction='ASCENDING')[rank]

    def get_config(self):
        config = {'alpha' : self.alpha}
        base_config = super().get_config()
        return {**base_config, **config}
38 changes: 38 additions & 0 deletions dlomix/eval/scalar_conformal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import tensorflow as tf

class ScalarConformalScore(tf.keras.losses.Loss):
    '''
    Conformal scores for predicted scalar error estimates pred_err and
    true values y_true.

    pred_err has shape (batch, 2): column 0 is the point prediction and
    column 1 the (positive) error estimate; the score is the error-scaled
    absolute residual |y - prediction| / error.
    '''
    def __init__(self, name="scalar_conformal_scores", **kwargs):
        super().__init__(name=name, **kwargs)

    def call(self, y_true, pred_err):
        prediction = pred_err[:, 0]
        spread = pred_err[:, 1]
        return tf.abs(y_true - prediction) / spread

    def get_config(self):
        # No extra hyper-parameters beyond the base Loss config.
        return dict(super().get_config())

class ScalarConformalQuantile(tf.keras.losses.Loss):
    '''
    Computes the conformal quantile based on the distribution of conformal
    scores for scalar error estimates pred_err and true values y_true.

    With a calibration set of size n and miscoverage level alpha, the
    quantile is the ceil((n + 1) * (1 - alpha))-th smallest score
    (clamped into range for small n / small alpha).
    '''
    def __init__(self, alpha=0.1, name="scalar_conformal_quantile", **kwargs):
        super().__init__(name=name, **kwargs)
        # target miscoverage level (e.g. 0.1 for 90% coverage)
        self.alpha = alpha

    def call(self, y_true, pred_err):
        # error-scaled absolute residual, as in ScalarConformalScore
        scores = tf.divide(tf.math.abs(y_true - pred_err[:, 0]), pred_err[:, 1])
        # cast to float: tf.math.ceil rejects integer tensors
        n = tf.cast(tf.shape(y_true)[0], tf.float32)
        # 0-based rank of the ceil((n+1)(1-alpha))-th smallest score.
        # The previous version indexed with int(q * n): unclamped, it goes
        # out of bounds whenever ceil((n+1)(1-alpha)) == n (e.g. n=10,
        # alpha=0.1), and it was one rank higher than the standard formula.
        rank = tf.cast(tf.math.ceil((n + 1.) * (1. - self.alpha)), tf.int32) - 1
        # clamp so the index stays valid when ceil((n+1)(1-alpha)) > n
        rank = tf.clip_by_value(rank, 0, tf.cast(n, tf.int32) - 1)
        return tf.sort(scores, axis=-1, direction='ASCENDING')[rank]

    def get_config(self):
        config = {'alpha' : self.alpha}
        base_config = super().get_config()
        return {**base_config, **config}
3 changes: 2 additions & 1 deletion dlomix/losses/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .intensity import masked_spectral_distance, masked_pearson_correlation_distance
from .quantile import QuantileLoss

# __all__ must contain the *names* as strings, not the objects themselves:
# with objects, `from dlomix.losses import *` raises
# "TypeError: Item in __all__ must be str".
__all__ = [
    "masked_spectral_distance",
    "masked_pearson_correlation_distance",
    "QuantileLoss",
]
20 changes: 20 additions & 0 deletions dlomix/losses/quantile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import tensorflow as tf

class QuantileLoss(tf.keras.losses.Loss):
    '''
    Quantile loss (pinball loss).

    `quantile` holds the target quantile levels and is broadcast against
    the last axis of the predictions; the per-quantile penalties are
    summed over that axis.
    '''
    def __init__(self, quantile=tf.constant([[0.1, 0.9]]), name="quantile_loss", **kwargs):
        super().__init__(name=name, **kwargs)
        self.quantile = quantile

    def call(self, y_true, y_pred):
        residual = y_true - y_pred
        # pinball loss: q * e for under-prediction (e >= 0),
        # (q - 1) * e for over-prediction (e < 0) — both non-negative.
        penalty = tf.where(
            residual >= 0,
            self.quantile * residual,
            (self.quantile - 1) * residual,
        )
        return tf.reduce_sum(penalty, axis=-1)

    def get_config(self):
        base_config = super().get_config()
        base_config.update(quantile=self.quantile)
        return base_config
2 changes: 1 addition & 1 deletion dlomix/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@
"PrositRetentionTimePredictor",
"DeepLCRetentionTimePredictor",
"PrositIntensityPredictor",
]
]
101 changes: 101 additions & 0 deletions dlomix/pipelines/MCDpipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import pickle
import numpy as np
import tensorflow as tf
from scipy.stats import ks_2samp

from dlomix.constants import MCD_pipeline_parameters
from dlomix.data.RetentionTimeDataset import RetentionTimeDataset
from dlomix.models import PrositRetentionTimePredictor
from dlomix.eval.scalar_conformal import ScalarConformalScore, ScalarConformalQuantile
from dlomix.losses import QuantileLoss
from dlomix.reports import UncertaintyReport


class MCDPipeline:
    """End-to-end Monte-Carlo-dropout (MCD) uncertainty pipeline.

    Loads a pretrained Prosit retention-time predictor, runs repeated
    stochastic forward passes (dropout active) over a test set, turns the
    per-sample mean/std into conformalized prediction intervals, and
    produces diagnostic plots via UncertaintyReport.
    """

    def __init__(self, alpha=0.1):
        # alpha: target miscoverage level for the conformal intervals
        self.base_model = None      # PrositRetentionTimePredictor, set by _build_base_model
        self.test_dataset = None    # RetentionTimeDataset, set by load_data
        self.alpha = alpha
        self.res = None             # {label: {'data': np.array([means, stds])}}, set by predict
        self.label = None           # tag of the last prediction run, set by predict

        self._build_base_model()

    def _build_base_model(self):
        """Instantiate the Prosit RT predictor and load its pretrained weights.

        expect_partial() suppresses warnings about checkpoint values not
        consumed by this model (e.g. optimizer state).
        """
        self.base_model = PrositRetentionTimePredictor(**MCD_pipeline_parameters["model_params"])
        self.base_model.load_weights(MCD_pipeline_parameters["base_model_path"]).expect_partial()

    def _predict_with_dropout(self, n):
        """Run n stochastic forward passes over the test data.

        training=True keeps dropout layers active at inference time, which
        is what makes the passes stochastic. Returns an array of shape
        (num_samples, n) with one prediction column per pass.
        """
        predictions = []
        for i in range(n):
            res = np.concatenate([self.base_model(batch[0], training=True).numpy() for batch in list(self.test_dataset.test_data)])
            predictions.append(res)
        return np.column_stack(predictions)

    def load_data(self, data=MCD_pipeline_parameters["test_set_path"], batchsize=32):
        """Build the test RetentionTimeDataset from a file path/URL or array.

        data: str path/URL or numpy array of sequences/targets.
        batchsize: batch size used when iterating the dataset.
        """
        if not (isinstance(data, str) or isinstance(data, np.ndarray)):
            raise ValueError(
                "Dataset should be provided either as a numpy array or a string pointing to a file."
            )

        self.test_dataset = RetentionTimeDataset(
            data_source=data,
            **MCD_pipeline_parameters["data_params"],
            batch_size=batchsize,
            test=True)

    def predict(self, n=10):
        """Run n dropout passes and store per-sample (mean, std) in self.res.

        Results are keyed by a label of the form "PROSIT_MCD_n=<n>";
        self.res[label]['data'] has shape (2, num_samples): row 0 means,
        row 1 standard deviations across the n passes.
        """
        res = {}
        self.label = f"PROSIT_MCD_n={n}"
        res[self.label] = {}
        print("model :", self.label, ", n =", n)
        pred = self._predict_with_dropout(n=n)
        res[self.label]['data'] = np.array((pred.mean(axis=1), pred.std(axis=1)))
        self.res = res

        #return res

    def report(self):
        """Conformalize the MCD intervals and emit diagnostic plots.

        Computes conformal scores/quantile from the stored (mean, std)
        pairs, builds intervals mean +/- std * quantile, and runs a
        two-sample KS test comparing interval sizes of covered vs.
        non-covered samples. Must be called after load_data and predict.
        """
        test_targets = self.test_dataset.get_split_targets(split="test")
        avgs, stds = self.res[self.label]['data'][0], self.res[self.label]['data'][1]
        print(f'#### {self.label} ####')

        # reduction='none' keeps one conformal score per sample
        conf_scores = ScalarConformalScore(reduction='none')(test_targets, self.res[self.label]['data'].T).numpy()
        conf_quantile = ScalarConformalQuantile()(test_targets, self.res[self.label]['data'].T).numpy()

        print(f"alpha = {self.alpha}, conformal quantile: {conf_quantile:.2f}")

        # symmetric conformalized interval around the MCD mean
        intervals = np.array([avgs - stds * conf_quantile, avgs + stds * conf_quantile]).T
        interval_sizes = intervals[:,1] - intervals[:,0]
        # boolean mask: true target covered by its interval
        within = (test_targets >= intervals[:,0]) & (test_targets <= intervals[:,1])

        UncertaintyReport.plot_conformal_scores(conf_scores, quantile=conf_quantile)
        UncertaintyReport.plot_predictions_with_intervals(test_targets, avgs, intervals)
        UncertaintyReport.plot_conformalized_interval_size(interval_sizes)

        pvalue = ks_2samp(interval_sizes[within], interval_sizes[~within]).pvalue # prob. for Null: distr are identical
        print(f"p = {pvalue:.5f} : {'Reject' if pvalue < 0.01 else 'Accept'} Null Hypothesis (Distr. identical)")

        UncertaintyReport.plot_conformalized_interval_size_PDFs(interval_sizes, within, pvalue)

    def save_results(self, path="MCD_results.pkl"):
        """Pickle self.res (the prediction summary dict) to `path`."""
        with open(path, 'wb') as f:
            pickle.dump(self.res, f)

    def run_full_pipeline(self, n=10, save=True, save_path='MCD_results.pkl'):
        """Convenience wrapper: load data, predict with n passes, report,
        and optionally pickle the results to save_path."""
        self.load_data()
        self.predict(n=n)
        self.report()
        if save:
            self.save_results(path=save_path)




Loading