From a2d5da149da27f923ed5c165b5a3e90a8978d931 Mon Sep 17 00:00:00 2001 From: omsh Date: Mon, 9 Feb 2026 11:31:14 +0100 Subject: [PATCH 1/3] version 0.2.5.dev0 --- src/dlomix/_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dlomix/_metadata.py b/src/dlomix/_metadata.py index 72338d62..cb854c3f 100644 --- a/src/dlomix/_metadata.py +++ b/src/dlomix/_metadata.py @@ -1,4 +1,4 @@ -__version__ = "0.2.4" +__version__ = "0.2.5.dev0" __author__ = "Wilhelm Lab" __author_email__ = "o.shouman@tum.de" __license__ = "MIT" From af0c19572dd220109a78075d5e5d51358f0da111 Mon Sep 17 00:00:00 2001 From: Omar Shouman Date: Thu, 12 Feb 2026 20:17:00 +0100 Subject: [PATCH 2/3] Fix/tfdata first epoch latency (#87) * reverting to using to_tf_dataset args for tfdata control * num_workers active on linux * missing import * exclude num workers with to_tf_dataset --- src/dlomix/data/dataset.py | 63 +++++++++----------------------------- 1 file changed, 15 insertions(+), 48 deletions(-) diff --git a/src/dlomix/data/dataset.py b/src/dlomix/data/dataset.py index 42b42bbc..35befdff 100644 --- a/src/dlomix/data/dataset.py +++ b/src/dlomix/data/dataset.py @@ -26,27 +26,6 @@ logger = logging.getLogger(__name__) -# TensorFlow import for tf.data is deferred until needed to avoid unnecessary imports for users who only want to use PyTorch datasets or other functionalities of the PeptideDataset class. -# This also helps to reduce the initial loading time and memory footprint for users who do not need TensorFlow. - -_tf = None - - -def _get_tensorflow(): - """Lazy import of TensorFlow. Only imports when needed.""" - global _tf - if _tf is None: - try: - import tensorflow as tf - - _tf = tf - except ImportError: - raise ImportError( - "TensorFlow backend requires tensorflow to be installed. " - "Install with: pip install tensorflow" - ) - return _tf - class PeptideDataset: """ @@ -189,13 +168,20 @@ def __init__(self, dataset_config: DatasetConfig, **kwargs): self.processed = True def _set_num_proc(self): + n_processors = get_num_processors() if self._num_proc: - n_processors = get_num_processors() if self._num_proc > n_processors: warnings.warn( f"Number of processors provided is greater than the available processors. Using the maximum number of processors available: {n_processors}." ) self._num_proc = n_processors + else: + warnings.warn( + f"Number of processors not provided. Using the maximum number of processors available: {n_processors}.\n" + f"If you want to specify a different number of processors, please provide num_proc= parameter in the dataset configuration.\n" + f"If you face issues with memory usage, please consider providing a smaller number of processors or setting num_proc=1 to disable multi-processing." + ) + self._num_proc = n_processors def _set_hf_cache_management(self): if self.disable_cache: @@ -715,8 +701,6 @@ def tensor_train_data(self): if self.dataset_type == "pt": return self._get_split_torch_dataset(PeptideDataset.DEFAULT_SPLIT_NAMES[0]) else: - tf = _get_tensorflow() - dataset_len = len(self.hf_dataset[PeptideDataset.DEFAULT_SPLIT_NAMES[0]]) tf_dataset = self._get_split_tf_dataset( PeptideDataset.DEFAULT_SPLIT_NAMES[0] ) @@ -724,17 +708,6 @@ def tensor_train_data(self): if self.enable_tf_dataset_cache: tf_dataset = tf_dataset.cache() - if self.shuffle: - tf_dataset = tf_dataset.shuffle( - buffer_size=min(10000, dataset_len), - reshuffle_each_iteration=True, - ) - - # Batch the data - tf_dataset = tf_dataset.batch(self.batch_size) - - tf_dataset = tf_dataset.prefetch(tf.data.AUTOTUNE) - return tf_dataset @property @@ -743,7 +716,6 @@ def tensor_val_data(self): if self.dataset_type == "pt": return self._get_split_torch_dataset(PeptideDataset.DEFAULT_SPLIT_NAMES[1]) else: - tf = _get_tensorflow() tf_dataset = self._get_split_tf_dataset( PeptideDataset.DEFAULT_SPLIT_NAMES[1] ) @@ -751,9 +723,6 @@ def tensor_val_data(self): if self.enable_tf_dataset_cache: tf_dataset = tf_dataset.cache() - tf_dataset = tf_dataset.batch(self.batch_size) - tf_dataset = tf_dataset.prefetch(tf.data.AUTOTUNE) - return tf_dataset @property @@ -762,17 +731,10 @@ def tensor_test_data(self): if self.dataset_type == "pt": return self._get_split_torch_dataset(PeptideDataset.DEFAULT_SPLIT_NAMES[2]) else: - tf = _get_tensorflow() tf_dataset = self._get_split_tf_dataset( PeptideDataset.DEFAULT_SPLIT_NAMES[2] ) - if self.enable_tf_dataset_cache: - tf_dataset = tf_dataset.cache() - - tf_dataset = tf_dataset.batch(self.batch_size) - tf_dataset = tf_dataset.prefetch(tf.data.AUTOTUNE) - return tf_dataset def _check_if_split_exists(self, split_name: str): @@ -796,7 +758,10 @@ def _get_split_tf_dataset(self, split_name: str): return self.hf_dataset[split_name].to_tf_dataset( columns=self._get_input_tensor_column_names(), label_cols=label_cols, - shuffle=False, + shuffle=self.shuffle + if split_name == PeptideDataset.DEFAULT_SPLIT_NAMES[0] + else False, + batch_size=self.batch_size, ) def _get_split_torch_dataset(self, split_name: str): @@ -811,7 +776,9 @@ def _get_split_torch_dataset(self, split_name: str): columns=[*self._get_input_tensor_column_names(), *self.label_column], ), "batch_size": self.batch_size, - "shuffle": self.shuffle, + "shuffle": self.shuffle + if split_name == PeptideDataset.DEFAULT_SPLIT_NAMES[0] + else False, } # Update with user-provided torch_dataloader_kwargs if available From ee643cf713280ba4697da093ebd72b2d715a0ede Mon Sep 17 00:00:00 2001 From: omsh Date: Thu, 12 Feb 2026 21:19:10 +0200 Subject: [PATCH 3/3] version 0.2.5 --- src/dlomix/_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dlomix/_metadata.py b/src/dlomix/_metadata.py index cb854c3f..f1223c8e 100644 --- a/src/dlomix/_metadata.py +++ b/src/dlomix/_metadata.py @@ -1,4 +1,4 @@ -__version__ = "0.2.5.dev0" +__version__ = "0.2.5" __author__ = "Wilhelm Lab" __author_email__ = "o.shouman@tum.de" __license__ = "MIT"