From 94e14aa3823c5a24771e1818d9ff3461d99d8426 Mon Sep 17 00:00:00 2001
From: sfluegel
Date: Thu, 31 Jul 2025 13:28:02 +0200
Subject: [PATCH 1/4] use test_split and validation_split for pubchem

---
 chebai/preprocessing/datasets/pubchem.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/chebai/preprocessing/datasets/pubchem.py b/chebai/preprocessing/datasets/pubchem.py
index 24e1e986..f7c5e668 100644
--- a/chebai/preprocessing/datasets/pubchem.py
+++ b/chebai/preprocessing/datasets/pubchem.py
@@ -154,9 +154,13 @@ def setup_processed(self):
         print("Load data from file", filename)
         data = self._load_data_from_file(filename)
         print("Create splits")
-        train, test = train_test_split(data, train_size=self.train_split)
+        train, test = train_test_split(
+            data, train_size=1 - (self.validation_split + self.test_split)
+        )
         del data
-        test, val = train_test_split(test, train_size=self.train_split)
+        test, val = train_test_split(
+            test, train_size=self.test_split / (self.validation_split + self.test_split)
+        )
         torch.save(train, os.path.join(self.processed_dir, "train.pt"))
         torch.save(test, os.path.join(self.processed_dir, "test.pt"))
         torch.save(val, os.path.join(self.processed_dir, "validation.pt"))

From ba96980cb98587a0d94d31aa026be400e73f6a73 Mon Sep 17 00:00:00 2001
From: sfluegel
Date: Thu, 31 Jul 2025 13:47:21 +0200
Subject: [PATCH 2/4] set processed data props for pubchem

---
 chebai/preprocessing/datasets/pubchem.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/chebai/preprocessing/datasets/pubchem.py b/chebai/preprocessing/datasets/pubchem.py
index f7c5e668..dacf7a8f 100644
--- a/chebai/preprocessing/datasets/pubchem.py
+++ b/chebai/preprocessing/datasets/pubchem.py
@@ -183,6 +183,23 @@ def processed_file_names(self) -> List[str]:
         """
         return ["test.pt", "train.pt", "validation.pt"]
 
+    def _set_processed_data_props(self):
+        """
+        Load processed data and extract metadata.
+
+        Sets:
+        - self._num_of_labels: Number of target labels in the dataset.
+        - self._feature_vector_size: 0.
+        """
+        with open(self.processed_dir_main, "classes.txt") as f:
+            classes = [f.strip() for f in f.readlines() if f.strip()]
+
+        self._num_of_labels = len(classes)
+        self._feature_vector_size = 0
+
+        print(f"Number of labels for loaded data: {self._num_of_labels}")
+        print(f"Feature vector size: {self._feature_vector_size}")
+
     def _perform_data_preparation(self, *args, **kwargs):
         """
         Checks for raw data and downloads if necessary.

From feea315151b2a8085b54b3dcba06cd45e7642b60 Mon Sep 17 00:00:00 2001
From: sfluegel
Date: Thu, 31 Jul 2025 13:49:03 +0200
Subject: [PATCH 3/4] fix file path

---
 chebai/preprocessing/datasets/pubchem.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chebai/preprocessing/datasets/pubchem.py b/chebai/preprocessing/datasets/pubchem.py
index dacf7a8f..3cb085ff 100644
--- a/chebai/preprocessing/datasets/pubchem.py
+++ b/chebai/preprocessing/datasets/pubchem.py
@@ -191,7 +191,7 @@ def _set_processed_data_props(self):
         - self._num_of_labels: Number of target labels in the dataset.
         - self._feature_vector_size: 0.
         """
-        with open(self.processed_dir_main, "classes.txt") as f:
+        with open(os.path.join(self.processed_dir_main, "classes.txt")) as f:
             classes = [f.strip() for f in f.readlines() if f.strip()]
 
         self._num_of_labels = len(classes)

From 4043b23a5cf44bd4fa985c76a068cce4c0070c42 Mon Sep 17 00:00:00 2001
From: sfluegel
Date: Thu, 31 Jul 2025 13:52:41 +0200
Subject: [PATCH 4/4] skip metadata calculation

---
 chebai/preprocessing/datasets/pubchem.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/chebai/preprocessing/datasets/pubchem.py b/chebai/preprocessing/datasets/pubchem.py
index 3cb085ff..a1879fe7 100644
--- a/chebai/preprocessing/datasets/pubchem.py
+++ b/chebai/preprocessing/datasets/pubchem.py
@@ -185,16 +185,14 @@ def processed_file_names(self) -> List[str]:
 
     def _set_processed_data_props(self):
         """
-        Load processed data and extract metadata.
+        Self-supervised learning with PubChem does not use this metadata, therefore set them to zero.
 
         Sets:
-        - self._num_of_labels: Number of target labels in the dataset.
+        - self._num_of_labels: 0
        - self._feature_vector_size: 0.
         """
-        with open(os.path.join(self.processed_dir_main, "classes.txt")) as f:
-            classes = [f.strip() for f in f.readlines() if f.strip()]
 
-        self._num_of_labels = len(classes)
+        self._num_of_labels = 0
         self._feature_vector_size = 0
 
         print(f"Number of labels for loaded data: {self._num_of_labels}")