From 94e14aa3823c5a24771e1818d9ff3461d99d8426 Mon Sep 17 00:00:00 2001
From: sfluegel
Date: Thu, 31 Jul 2025 13:28:02 +0200
Subject: [PATCH 1/4] use test_split and validation_split for pubchem

---
 chebai/preprocessing/datasets/pubchem.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/chebai/preprocessing/datasets/pubchem.py b/chebai/preprocessing/datasets/pubchem.py
index 24e1e986..f7c5e668 100644
--- a/chebai/preprocessing/datasets/pubchem.py
+++ b/chebai/preprocessing/datasets/pubchem.py
@@ -154,9 +154,13 @@ def setup_processed(self):
         print("Load data from file", filename)
         data = self._load_data_from_file(filename)
         print("Create splits")
-        train, test = train_test_split(data, train_size=self.train_split)
+        train, test = train_test_split(
+            data, train_size=1 - (self.validation_split + self.test_split)
+        )
         del data
-        test, val = train_test_split(test, train_size=self.train_split)
+        test, val = train_test_split(
+            test, train_size=self.test_split / (self.validation_split + self.test_split)
+        )
         torch.save(train, os.path.join(self.processed_dir, "train.pt"))
         torch.save(test, os.path.join(self.processed_dir, "test.pt"))
         torch.save(val, os.path.join(self.processed_dir, "validation.pt"))

From ba96980cb98587a0d94d31aa026be400e73f6a73 Mon Sep 17 00:00:00 2001
From: sfluegel
Date: Thu, 31 Jul 2025 13:47:21 +0200
Subject: [PATCH 2/4] set processed data props for pubchem

---
 chebai/preprocessing/datasets/pubchem.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/chebai/preprocessing/datasets/pubchem.py b/chebai/preprocessing/datasets/pubchem.py
index f7c5e668..dacf7a8f 100644
--- a/chebai/preprocessing/datasets/pubchem.py
+++ b/chebai/preprocessing/datasets/pubchem.py
@@ -183,6 +183,23 @@ def processed_file_names(self) -> List[str]:
         """
         return ["test.pt", "train.pt", "validation.pt"]
 
+    def _set_processed_data_props(self):
+        """
+        Load processed data and extract metadata.
+
+        Sets:
+        - self._num_of_labels: Number of target labels in the dataset.
+        - self._feature_vector_size: 0.
+        """
+        with open(self.processed_dir_main, "classes.txt") as f:
+            classes = [f.strip() for f in f.readlines() if f.strip()]
+
+        self._num_of_labels = len(classes)
+        self._feature_vector_size = 0
+
+        print(f"Number of labels for loaded data: {self._num_of_labels}")
+        print(f"Feature vector size: {self._feature_vector_size}")
+
     def _perform_data_preparation(self, *args, **kwargs):
         """
         Checks for raw data and downloads if necessary.

From feea315151b2a8085b54b3dcba06cd45e7642b60 Mon Sep 17 00:00:00 2001
From: sfluegel
Date: Thu, 31 Jul 2025 13:49:03 +0200
Subject: [PATCH 3/4] fix file path

---
 chebai/preprocessing/datasets/pubchem.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chebai/preprocessing/datasets/pubchem.py b/chebai/preprocessing/datasets/pubchem.py
index dacf7a8f..3cb085ff 100644
--- a/chebai/preprocessing/datasets/pubchem.py
+++ b/chebai/preprocessing/datasets/pubchem.py
@@ -191,7 +191,7 @@ def _set_processed_data_props(self):
         - self._num_of_labels: Number of target labels in the dataset.
         - self._feature_vector_size: 0.
         """
-        with open(self.processed_dir_main, "classes.txt") as f:
+        with open(os.path.join(self.processed_dir_main, "classes.txt")) as f:
             classes = [f.strip() for f in f.readlines() if f.strip()]
 
         self._num_of_labels = len(classes)

From 4043b23a5cf44bd4fa985c76a068cce4c0070c42 Mon Sep 17 00:00:00 2001
From: sfluegel
Date: Thu, 31 Jul 2025 13:52:41 +0200
Subject: [PATCH 4/4] skip metadata calculation

---
 chebai/preprocessing/datasets/pubchem.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/chebai/preprocessing/datasets/pubchem.py b/chebai/preprocessing/datasets/pubchem.py
index 3cb085ff..a1879fe7 100644
--- a/chebai/preprocessing/datasets/pubchem.py
+++ b/chebai/preprocessing/datasets/pubchem.py
@@ -185,16 +185,14 @@ def processed_file_names(self) -> List[str]:
 
     def _set_processed_data_props(self):
         """
-        Load processed data and extract metadata.
+        Self-supervised learning with PubChem does not use this metadata, therefore set them to zero.
 
         Sets:
-        - self._num_of_labels: Number of target labels in the dataset.
+        - self._num_of_labels: 0
        - self._feature_vector_size: 0.
         """
-        with open(os.path.join(self.processed_dir_main, "classes.txt")) as f:
-            classes = [f.strip() for f in f.readlines() if f.strip()]
 
-        self._num_of_labels = len(classes)
+        self._num_of_labels = 0
         self._feature_vector_size = 0
 
         print(f"Number of labels for loaded data: {self._num_of_labels}")