Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions dlomix/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,49 @@
"W": 19,
"Y": 20,
}

# Amino-acid vocabulary extended with two modification tokens:
# "^" -> 21 and "}" -> 22. Indices start at 1 (0 is reserved for padding
# by convention in the unmodified alphabet above).
ALPHABET_MOD = {
    residue: index
    for index, residue in enumerate("ACDEFGHIKLMNPQRSTVWY^}", start=1)
}

# Configuration for the Monte-Carlo-dropout (MCD) uncertainty pipeline.
MCD_pipeline_parameters = {
    # params for full dataset with modified alphabet:
    # "model_params": {"seq_length": 30, "vocab_dict": ALPHABET_MOD.copy()},
    # "data_params": {"seq_length": 30, 'sequence_col': 'modified_sequence_single_letter', 'target_col': 'median'},
    "model_params": dict(seq_length=30, vocab_dict=ALPHABET_UNMOD.copy()),
    "data_params": dict(seq_length=30),
    # TODO upload model in dlomix and change path
    "base_model_path": "../data/models/rtmodel_prosit_epoch20",
    "test_set_path": "https://raw.githubusercontent.com/wilhelm-lab/dlomix/develop/example_dataset/proteomTools_test.csv",
}


# Configuration for the quantile-regression (QR) uncertainty pipeline.
QR_pipeline_parameters = {
    "model_params": dict(seq_length=30, vocab_dict=ALPHABET_UNMOD.copy()),
    "data_params": dict(seq_length=30),
    # specify path of desired model here
    "base_model_path": "../../data/models/rtmodel_prosit_epoch20",
    "train_val_path": 'https://raw.githubusercontent.com/wilhelm-lab/dlomix/develop/example_dataset/proteomTools_train_val.csv',
    "test_path": 'https://raw.githubusercontent.com/wilhelm-lab/dlomix/develop/example_dataset/proteomTools_test.csv',
}
4 changes: 3 additions & 1 deletion dlomix/eval/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from .rt_eval import TimeDeltaMetric
from .interval_conformal import (
    IntervalSize,
    AbsoluteIntervalSize,
    RelativeCentralDistance,
    IntervalConformalScore,
    IntervalConformalQuantile,
)
from .scalar_conformal import ScalarConformalScore, ScalarConformalQuantile

# __all__ must contain the *names* as strings, not the objects themselves:
# with objects, `from dlomix.eval import *` raises
# "TypeError: Item in __all__ must be str".
__all__ = [
    "TimeDeltaMetric",
    "IntervalSize",
    "AbsoluteIntervalSize",
    "RelativeCentralDistance",
    "IntervalConformalScore",
    "IntervalConformalQuantile",
    "ScalarConformalScore",
    "ScalarConformalQuantile",
]
104 changes: 104 additions & 0 deletions dlomix/eval/interval_conformal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import tensorflow as tf

class IntervalSize(tf.keras.losses.Loss):
    '''
    Signed size of the prediction interval.

    Expects y_pred of shape (batch, 2) holding lower and upper interval
    boundaries; y_true is ignored. The size is negative when the
    boundaries are crossed.
    '''
    def __init__(self, name="interval_size", **kwargs):
        super().__init__(name=name, **kwargs)

    def call(self, y_true, y_pred):
        lower = y_pred[:, 0]
        upper = y_pred[:, 1]
        return upper - lower

    def get_config(self):
        # No extra hyper-parameters beyond the base Loss config.
        return dict(super().get_config())


class AbsoluteIntervalSize(tf.keras.losses.Loss):
    '''
    Absolute size of the prediction interval.

    Same layout as IntervalSize (y_pred is (batch, 2), y_true unused),
    but the width is returned as a magnitude so crossed boundaries
    still yield a positive size.
    '''
    def __init__(self, name="abs_interval_size", **kwargs):
        super().__init__(name=name, **kwargs)

    def call(self, y_true, y_pred):
        width = y_pred[:, 1] - y_pred[:, 0]
        return tf.abs(width)

    def get_config(self):
        # No extra hyper-parameters beyond the base Loss config.
        return dict(super().get_config())


class RelativeCentralDistance(tf.keras.losses.Loss):
    '''
    Distance of the true value from the center of the prediction interval,
    divided by half of the prediction interval size.
    The result is 0 if the true value is at the center of the prediction
    interval, 1 if at the (symmetric) interval boundary and >1 otherwise.

    Expects y_pred of shape (batch, 2) with interval boundaries and
    y_true with the true value in column 0.
    '''
    def __init__(self, name="relative_central_distance", **kwargs):
        super().__init__(name=name, **kwargs)

    def call(self, y_true, y_pred):
        # absolute distance between the two interval boundaries
        # (if upper boundary is below lower, this is nevertheless positive)
        interval_size = tf.math.abs(tf.subtract(y_pred[:,1], y_pred[:,0]))
        # absolute distance of the true value from the interval center/mean
        central_dist = tf.math.abs(tf.subtract(y_true[:,0], tf.math.reduce_mean(y_pred, axis=-1)))
        # divide by half of interval size
        res = tf.divide(central_dist, tf.divide(interval_size, 2.))
        # Zero-width intervals make the division produce inf; those
        # entries are overwritten with 0 below.
        # NOTE(review): 0/0 yields nan, which is_inf does NOT catch —
        # confirm whether a degenerate interval with the truth exactly at
        # its center can occur, and whether mapping inf to 0 (i.e. "perfect")
        # is the intended treatment of zero-width intervals.
        indices = tf.where(tf.math.is_inf(res))
        return tf.tensor_scatter_nd_update(
            res,
            indices,
            tf.ones((tf.shape(indices)[0])) * 0.
        )

    def get_config(self):
        config = {}
        base_config = super().get_config()
        return {**base_config, **config}


class IntervalConformalScore(tf.keras.losses.Loss):
    '''
    Conformal scores for prediction intervals pred_intervals and true
    values y_true.

    Per sample the score is max(lower - y, y - upper): negative when the
    true value lies inside the interval, positive when it lies outside.
    '''
    def __init__(self, name="interval_conformal_score", **kwargs):
        super().__init__(name=name, **kwargs)

    def call(self, y_true, pred_intervals):
        diff = pred_intervals - y_true
        lower_gap = diff[:, 0]    # lower boundary minus truth
        upper_gap = -diff[:, 1]   # truth minus upper boundary
        return tf.maximum(lower_gap, upper_gap)

    def get_config(self):
        # No extra hyper-parameters beyond the base Loss config.
        return dict(super().get_config())


class IntervalConformalQuantile(tf.keras.losses.Loss):
    '''
    Computes the conformal quantile based on the distribution of conformal
    scores for prediction intervals pred_intervals and true values y_true.

    With a calibration set of size n and miscoverage level alpha, the
    quantile is the ceil((n + 1) * (1 - alpha))-th smallest score
    (clamped into range for small n / small alpha).
    '''
    def __init__(self, alpha=0.1, name="interval_conformal_quantile", **kwargs):
        super().__init__(name=name, **kwargs)
        # target miscoverage level (e.g. 0.1 for 90% coverage)
        self.alpha = alpha

    def call(self, y_true, pred_intervals):
        # per-sample conformal score: max(lower - y, y - upper)
        diff = tf.subtract(pred_intervals, y_true)
        scores = tf.maximum(diff[:, 0], -diff[:, 1])
        # cast to float: tf.math.ceil rejects integer tensors
        n = tf.cast(tf.shape(y_true)[0], tf.float32)
        # 0-based rank of the ceil((n+1)(1-alpha))-th smallest score.
        # The previous version indexed with int(q * n) (no "-1"), selecting
        # one rank too high relative to the standard conformal formula;
        # tensor ops also replace Python int(), which only worked eagerly.
        rank = tf.cast(tf.math.ceil((n + 1.) * (1. - self.alpha)), tf.int32) - 1
        # clamp so the index stays valid when ceil((n+1)(1-alpha)) > n
        rank = tf.clip_by_value(rank, 0, tf.cast(n, tf.int32) - 1)
        return tf.sort(scores, axis=-1, direction='ASCENDING')[rank]

    def get_config(self):
        config = {'alpha' : self.alpha}
        base_config = super().get_config()
        return {**base_config, **config}
38 changes: 38 additions & 0 deletions dlomix/eval/scalar_conformal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import tensorflow as tf

class ScalarConformalScore(tf.keras.losses.Loss):
    '''
    Conformal scores for predicted scalar error estimates pred_err and
    true values y_true.

    pred_err has shape (batch, 2): column 0 is the point prediction and
    column 1 the (positive) error estimate; the score is the error-scaled
    absolute residual |y - prediction| / error.
    '''
    def __init__(self, name="scalar_conformal_scores", **kwargs):
        super().__init__(name=name, **kwargs)

    def call(self, y_true, pred_err):
        prediction = pred_err[:, 0]
        spread = pred_err[:, 1]
        return tf.abs(y_true - prediction) / spread

    def get_config(self):
        # No extra hyper-parameters beyond the base Loss config.
        return dict(super().get_config())

class ScalarConformalQuantile(tf.keras.losses.Loss):
    '''
    Computes the conformal quantile based on the distribution of conformal
    scores for scalar error estimates pred_err and true values y_true.

    With a calibration set of size n and miscoverage level alpha, the
    quantile is the ceil((n + 1) * (1 - alpha))-th smallest score
    (clamped into range for small n / small alpha).
    '''
    def __init__(self, alpha=0.1, name="scalar_conformal_quantile", **kwargs):
        super().__init__(name=name, **kwargs)
        # target miscoverage level (e.g. 0.1 for 90% coverage)
        self.alpha = alpha

    def call(self, y_true, pred_err):
        # error-scaled absolute residual, as in ScalarConformalScore
        scores = tf.divide(tf.math.abs(y_true - pred_err[:, 0]), pred_err[:, 1])
        # cast to float: tf.math.ceil rejects integer tensors
        n = tf.cast(tf.shape(y_true)[0], tf.float32)
        # 0-based rank of the ceil((n+1)(1-alpha))-th smallest score.
        # The previous version indexed with int(q * n): unclamped, it goes
        # out of bounds whenever ceil((n+1)(1-alpha)) == n (e.g. n=10,
        # alpha=0.1), and it was one rank higher than the standard formula.
        rank = tf.cast(tf.math.ceil((n + 1.) * (1. - self.alpha)), tf.int32) - 1
        # clamp so the index stays valid when ceil((n+1)(1-alpha)) > n
        rank = tf.clip_by_value(rank, 0, tf.cast(n, tf.int32) - 1)
        return tf.sort(scores, axis=-1, direction='ASCENDING')[rank]

    def get_config(self):
        config = {'alpha' : self.alpha}
        base_config = super().get_config()
        return {**base_config, **config}
3 changes: 2 additions & 1 deletion dlomix/losses/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .intensity import masked_spectral_distance, masked_pearson_correlation_distance
from .quantile import QuantileLoss

# __all__ must contain the *names* as strings, not the objects themselves:
# with objects, `from dlomix.losses import *` raises
# "TypeError: Item in __all__ must be str".
__all__ = [
    "masked_spectral_distance",
    "masked_pearson_correlation_distance",
    "QuantileLoss",
]
20 changes: 20 additions & 0 deletions dlomix/losses/quantile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import tensorflow as tf

class QuantileLoss(tf.keras.losses.Loss):
    '''
    Quantile loss (pinball loss).

    `quantile` holds the target quantile levels and is broadcast against
    the last axis of the predictions; the per-quantile penalties are
    summed over that axis.
    '''
    def __init__(self, quantile=tf.constant([[0.1, 0.9]]), name="quantile_loss", **kwargs):
        super().__init__(name=name, **kwargs)
        self.quantile = quantile

    def call(self, y_true, y_pred):
        residual = y_true - y_pred
        # pinball loss: q * e for under-prediction (e >= 0),
        # (q - 1) * e for over-prediction (e < 0) — both non-negative.
        penalty = tf.where(
            residual >= 0,
            self.quantile * residual,
            (self.quantile - 1) * residual,
        )
        return tf.reduce_sum(penalty, axis=-1)

    def get_config(self):
        base_config = super().get_config()
        base_config.update(quantile=self.quantile)
        return base_config
2 changes: 1 addition & 1 deletion dlomix/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@
"PrositRetentionTimePredictor",
"DeepLCRetentionTimePredictor",
"PrositIntensityPredictor",
]
]
101 changes: 101 additions & 0 deletions dlomix/pipelines/MCDpipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import pickle
import numpy as np
import tensorflow as tf
from scipy.stats import ks_2samp

from dlomix.constants import MCD_pipeline_parameters
from dlomix.data.RetentionTimeDataset import RetentionTimeDataset
from dlomix.models import PrositRetentionTimePredictor
from dlomix.eval.scalar_conformal import ScalarConformalScore, ScalarConformalQuantile
from dlomix.losses import QuantileLoss
from dlomix.reports import UncertaintyReport


class MCDPipeline:
    """End-to-end Monte-Carlo-dropout (MCD) uncertainty pipeline.

    Loads a pretrained Prosit retention-time predictor, runs repeated
    stochastic forward passes (dropout active) over a test set, turns the
    per-sample mean/std into conformalized prediction intervals, and
    produces diagnostic plots via UncertaintyReport.
    """

    def __init__(self, alpha=0.1):
        # alpha: target miscoverage level for the conformal intervals
        self.base_model = None      # PrositRetentionTimePredictor, set by _build_base_model
        self.test_dataset = None    # RetentionTimeDataset, set by load_data
        self.alpha = alpha
        self.res = None             # {label: {'data': np.array([means, stds])}}, set by predict
        self.label = None           # tag of the last prediction run, set by predict

        self._build_base_model()

    def _build_base_model(self):
        """Instantiate the Prosit RT predictor and load its pretrained weights.

        expect_partial() suppresses warnings about checkpoint values not
        consumed by this model (e.g. optimizer state).
        """
        self.base_model = PrositRetentionTimePredictor(**MCD_pipeline_parameters["model_params"])
        self.base_model.load_weights(MCD_pipeline_parameters["base_model_path"]).expect_partial()

    def _predict_with_dropout(self, n):
        """Run n stochastic forward passes over the test data.

        training=True keeps dropout layers active at inference time, which
        is what makes the passes stochastic. Returns an array of shape
        (num_samples, n) with one prediction column per pass.
        """
        predictions = []
        for i in range(n):
            res = np.concatenate([self.base_model(batch[0], training=True).numpy() for batch in list(self.test_dataset.test_data)])
            predictions.append(res)
        return np.column_stack(predictions)

    def load_data(self, data=MCD_pipeline_parameters["test_set_path"], batchsize=32):
        """Build the test RetentionTimeDataset from a file path/URL or array.

        data: str path/URL or numpy array of sequences/targets.
        batchsize: batch size used when iterating the dataset.
        """
        if not (isinstance(data, str) or isinstance(data, np.ndarray)):
            raise ValueError(
                "Dataset should be provided either as a numpy array or a string pointing to a file."
            )

        self.test_dataset = RetentionTimeDataset(
            data_source=data,
            **MCD_pipeline_parameters["data_params"],
            batch_size=batchsize,
            test=True)

    def predict(self, n=10):
        """Run n dropout passes and store per-sample (mean, std) in self.res.

        Results are keyed by a label of the form "PROSIT_MCD_n=<n>";
        self.res[label]['data'] has shape (2, num_samples): row 0 means,
        row 1 standard deviations across the n passes.
        """
        res = {}
        self.label = f"PROSIT_MCD_n={n}"
        res[self.label] = {}
        print("model :", self.label, ", n =", n)
        pred = self._predict_with_dropout(n=n)
        res[self.label]['data'] = np.array((pred.mean(axis=1), pred.std(axis=1)))
        self.res = res

        #return res

    def report(self):
        """Conformalize the MCD intervals and emit diagnostic plots.

        Computes conformal scores/quantile from the stored (mean, std)
        pairs, builds intervals mean +/- std * quantile, and runs a
        two-sample KS test comparing interval sizes of covered vs.
        non-covered samples. Must be called after load_data and predict.
        """
        test_targets = self.test_dataset.get_split_targets(split="test")
        avgs, stds = self.res[self.label]['data'][0], self.res[self.label]['data'][1]
        print(f'#### {self.label} ####')

        # reduction='none' keeps one conformal score per sample
        conf_scores = ScalarConformalScore(reduction='none')(test_targets, self.res[self.label]['data'].T).numpy()
        conf_quantile = ScalarConformalQuantile()(test_targets, self.res[self.label]['data'].T).numpy()

        print(f"alpha = {self.alpha}, conformal quantile: {conf_quantile:.2f}")

        # symmetric conformalized interval around the MCD mean
        intervals = np.array([avgs - stds * conf_quantile, avgs + stds * conf_quantile]).T
        interval_sizes = intervals[:,1] - intervals[:,0]
        # boolean mask: true target covered by its interval
        within = (test_targets >= intervals[:,0]) & (test_targets <= intervals[:,1])

        UncertaintyReport.plot_conformal_scores(conf_scores, quantile=conf_quantile)
        UncertaintyReport.plot_predictions_with_intervals(test_targets, avgs, intervals)
        UncertaintyReport.plot_conformalized_interval_size(interval_sizes)

        pvalue = ks_2samp(interval_sizes[within], interval_sizes[~within]).pvalue # prob. for Null: distr are identical
        print(f"p = {pvalue:.5f} : {'Reject' if pvalue < 0.01 else 'Accept'} Null Hypothesis (Distr. identical)")

        UncertaintyReport.plot_conformalized_interval_size_PDFs(interval_sizes, within, pvalue)

    def save_results(self, path="MCD_results.pkl"):
        """Pickle self.res (the prediction summary dict) to `path`."""
        with open(path, 'wb') as f:
            pickle.dump(self.res, f)

    def run_full_pipeline(self, n=10, save=True, save_path='MCD_results.pkl'):
        """Convenience wrapper: load data, predict with n passes, report,
        and optionally pickle the results to save_path."""
        self.load_data()
        self.predict(n=n)
        self.report()
        if save:
            self.save_results(path=save_path)




Loading