From a2d5da149da27f923ed5c165b5a3e90a8978d931 Mon Sep 17 00:00:00 2001
From: omsh <omar.shouman@gmail.com>
Date: Mon, 9 Feb 2026 11:31:14 +0100
Subject: [PATCH 1/3] version 0.2.5.dev0

---
 src/dlomix/_metadata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dlomix/_metadata.py b/src/dlomix/_metadata.py
index 72338d62..cb854c3f 100644
--- a/src/dlomix/_metadata.py
+++ b/src/dlomix/_metadata.py
@@ -1,4 +1,4 @@
-__version__ = "0.2.4"
+__version__ = "0.2.5.dev0"
 __author__ = "Wilhelm Lab"
 __author_email__ = "o.shouman@tum.de"
 __license__ = "MIT"

From af0c19572dd220109a78075d5e5d51358f0da111 Mon Sep 17 00:00:00 2001
From: Omar Shouman <omar.shouman@gmail.com>
Date: Thu, 12 Feb 2026 20:17:00 +0100
Subject: [PATCH 2/3] Fix/tfdata first epoch latency (#87)

* revert to using to_tf_dataset arguments (shuffle, batch_size) to control the tf.data pipeline instead of manual shuffle/batch/prefetch calls

* num_workers active on linux

* missing import

* exclude num workers with to_tf_dataset
---
 src/dlomix/data/dataset.py | 63 +++++++++-----------------------------
 1 file changed, 15 insertions(+), 48 deletions(-)

diff --git a/src/dlomix/data/dataset.py b/src/dlomix/data/dataset.py
index 42b42bbc..35befdff 100644
--- a/src/dlomix/data/dataset.py
+++ b/src/dlomix/data/dataset.py
@@ -26,27 +26,6 @@
 
 logger = logging.getLogger(__name__)
 
-# TensorFlow import for tf.data is deferred until needed to avoid unnecessary imports for users who only want to use PyTorch datasets or other functionalities of the PeptideDataset class.
-# This also helps to reduce the initial loading time and memory footprint for users who do not need TensorFlow.
-
-_tf = None
-
-
-def _get_tensorflow():
-    """Lazy import of TensorFlow. Only imports when needed."""
-    global _tf
-    if _tf is None:
-        try:
-            import tensorflow as tf
-
-            _tf = tf
-        except ImportError:
-            raise ImportError(
-                "TensorFlow backend requires tensorflow to be installed. "
-                "Install with: pip install tensorflow"
-            )
-    return _tf
-
 
 class PeptideDataset:
     """
@@ -189,13 +168,20 @@ def __init__(self, dataset_config: DatasetConfig, **kwargs):
                 self.processed = True
 
     def _set_num_proc(self):
+        n_processors = get_num_processors()
         if self._num_proc:
-            n_processors = get_num_processors()
             if self._num_proc > n_processors:
                 warnings.warn(
                     f"Number of processors provided is greater than the available processors. Using the maximum number of processors available: {n_processors}."
                 )
                 self._num_proc = n_processors
+        else:
+            warnings.warn(
+                f"Number of processors not provided. Using the maximum number of processors available: {n_processors}.\n"
+                f"If you want to specify a different number of processors, please provide num_proc=<desired_number> parameter in the dataset configuration.\n"
+                f"If you face issues with memory usage, please consider providing a smaller number of processors or setting num_proc=1 to disable multi-processing."
+            )
+            self._num_proc = n_processors
 
     def _set_hf_cache_management(self):
         if self.disable_cache:
@@ -715,8 +701,6 @@ def tensor_train_data(self):
         if self.dataset_type == "pt":
             return self._get_split_torch_dataset(PeptideDataset.DEFAULT_SPLIT_NAMES[0])
         else:
-            tf = _get_tensorflow()
-            dataset_len = len(self.hf_dataset[PeptideDataset.DEFAULT_SPLIT_NAMES[0]])
             tf_dataset = self._get_split_tf_dataset(
                 PeptideDataset.DEFAULT_SPLIT_NAMES[0]
             )
@@ -724,17 +708,6 @@ def tensor_train_data(self):
             if self.enable_tf_dataset_cache:
                 tf_dataset = tf_dataset.cache()
 
-            if self.shuffle:
-                tf_dataset = tf_dataset.shuffle(
-                    buffer_size=min(10000, dataset_len),
-                    reshuffle_each_iteration=True,
-                )
-
-            # Batch the data
-            tf_dataset = tf_dataset.batch(self.batch_size)
-
-            tf_dataset = tf_dataset.prefetch(tf.data.AUTOTUNE)
-
             return tf_dataset
 
     @property
@@ -743,7 +716,6 @@ def tensor_val_data(self):
         if self.dataset_type == "pt":
             return self._get_split_torch_dataset(PeptideDataset.DEFAULT_SPLIT_NAMES[1])
         else:
-            tf = _get_tensorflow()
             tf_dataset = self._get_split_tf_dataset(
                 PeptideDataset.DEFAULT_SPLIT_NAMES[1]
             )
@@ -751,9 +723,6 @@ def tensor_val_data(self):
             if self.enable_tf_dataset_cache:
                 tf_dataset = tf_dataset.cache()
 
-            tf_dataset = tf_dataset.batch(self.batch_size)
-            tf_dataset = tf_dataset.prefetch(tf.data.AUTOTUNE)
-
             return tf_dataset
 
     @property
@@ -762,17 +731,10 @@ def tensor_test_data(self):
         if self.dataset_type == "pt":
             return self._get_split_torch_dataset(PeptideDataset.DEFAULT_SPLIT_NAMES[2])
         else:
-            tf = _get_tensorflow()
             tf_dataset = self._get_split_tf_dataset(
                 PeptideDataset.DEFAULT_SPLIT_NAMES[2]
             )
 
-            if self.enable_tf_dataset_cache:
-                tf_dataset = tf_dataset.cache()
-
-            tf_dataset = tf_dataset.batch(self.batch_size)
-            tf_dataset = tf_dataset.prefetch(tf.data.AUTOTUNE)
-
             return tf_dataset
 
     def _check_if_split_exists(self, split_name: str):
@@ -796,7 +758,10 @@ def _get_split_tf_dataset(self, split_name: str):
         return self.hf_dataset[split_name].to_tf_dataset(
             columns=self._get_input_tensor_column_names(),
             label_cols=label_cols,
-            shuffle=False,
+            shuffle=self.shuffle
+            if split_name == PeptideDataset.DEFAULT_SPLIT_NAMES[0]
+            else False,
+            batch_size=self.batch_size,
         )
 
     def _get_split_torch_dataset(self, split_name: str):
@@ -811,7 +776,9 @@ def _get_split_torch_dataset(self, split_name: str):
                 columns=[*self._get_input_tensor_column_names(), *self.label_column],
             ),
             "batch_size": self.batch_size,
-            "shuffle": self.shuffle,
+            "shuffle": self.shuffle
+            if split_name == PeptideDataset.DEFAULT_SPLIT_NAMES[0]
+            else False,
         }
 
         # Update with user-provided torch_dataloader_kwargs if available

From ee643cf713280ba4697da093ebd72b2d715a0ede Mon Sep 17 00:00:00 2001
From: omsh <omar.shouman@gmail.com>
Date: Thu, 12 Feb 2026 21:19:10 +0200
Subject: [PATCH 3/3] version 0.2.5

---
 src/dlomix/_metadata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dlomix/_metadata.py b/src/dlomix/_metadata.py
index cb854c3f..f1223c8e 100644
--- a/src/dlomix/_metadata.py
+++ b/src/dlomix/_metadata.py
@@ -1,4 +1,4 @@
-__version__ = "0.2.5.dev0"
+__version__ = "0.2.5"
 __author__ = "Wilhelm Lab"
 __author_email__ = "o.shouman@tum.de"
 __license__ = "MIT"