From 567c68da0eff4b7c8b513307264660ced29c9b81 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 9 Aug 2025 17:44:53 +0200
Subject: [PATCH 1/3] itertuples instead of iterrows

---
 chebai/preprocessing/datasets/chebi.py   |  6 +++---
 chebai/preprocessing/datasets/pubchem.py | 12 ++++++------
 chebai/train.py                          | 18 +++++++++---------
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py
index 9fa1c1c7..cda77f3b 100644
--- a/chebai/preprocessing/datasets/chebi.py
+++ b/chebai/preprocessing/datasets/chebi.py
@@ -300,7 +300,7 @@ def _graph_to_raw_dataset(self, g: nx.DiGraph) -> pd.DataFrame:
 
         data = pd.DataFrame(data)
         data = data[~data["SMILES"].isnull()]
-        data = data[[name not in CHEBI_BLACKLIST for name, _ in data.iterrows()]]
+        data = data[~data.index.isin(CHEBI_BLACKLIST)]
 
         return data
 
@@ -491,10 +491,10 @@ def _setup_pruned_test_set(
         ]
 
         # Iterate over each data instance in the test set which is derived from chebi_version
-        for _, row in df_test_chebi_version.iterrows():
+        for row in df_test_chebi_version.itertuples(index=False):
             # Size = Number of classes in chebi_version_train
             new_labels = [False for _ in new_classes]
-            for ind, label in enumerate(row["labels"]):
+            for ind, label in enumerate(row.labels):
                 # If the chebi_version class exists in the chebi_version_train and has a True label,
                 # set the corresponding label in new_labels to True
                 if mapping[ind] is not None and label:
diff --git a/chebai/preprocessing/datasets/pubchem.py b/chebai/preprocessing/datasets/pubchem.py
index a1879fe7..48dd1efc 100644
--- a/chebai/preprocessing/datasets/pubchem.py
+++ b/chebai/preprocessing/datasets/pubchem.py
@@ -628,8 +628,8 @@ def download(self):
                     if not os.path.exists(os.path.join(self.raw_dir, f"{name}.txt")):
                         open(os.path.join(self.raw_dir, f"{name}.txt"), "x").close()
                     with open(os.path.join(self.raw_dir, f"{name}.txt"), "w") as f:
-                        for id, row in splits[i].iterrows():
-                            f.writelines(f"{id}\t{row['smiles']}\n")
+                        for idx, smiles in splits[i]["smiles"].items():
+                            f.writelines(f"{idx}\t{smiles}\n")
 
 
 class PubChemDissimilarSMILES(PubChemDissimilar):
@@ -809,12 +809,12 @@ def download(self):
         csv_path = os.path.join(self.raw_dir, "pubchem_hazardous_compound_list.csv")
         compounds = pd.read_csv(csv_path)
         smiles_list = []
-        for id, compound in compounds.iterrows():
+        for compound in compounds.itertuples(index=False):
             if (
-                not isinstance(compound["cmpdsynonym"], str)
-                or "CHEBI" not in compound["cmpdsynonym"]
+                not isinstance(compound.cmpdsynonym, str)
+                or "CHEBI" not in compound.cmpdsynonym
             ):
-                smiles_list.append(f"{compound['cid']}\t{compound['isosmiles']}")
+                smiles_list.append(f"{compound.cid}\t{compound.isosmiles}")
         with open(os.path.join(self.raw_dir, "smiles.txt"), "w") as f:
             f.write("\n".join(smiles_list))
 
diff --git a/chebai/train.py b/chebai/train.py
index 9c524f1a..bab5f089 100644
--- a/chebai/train.py
+++ b/chebai/train.py
@@ -246,11 +246,11 @@ def prepare_data(infile: pickle.Pickler) -> pd.DataFrame:
         data_frame[col] = data_frame[col].astype(int)
 
     train_data = []
-    for index, row in data_frame.iterrows():
+    for row in data_frame.itertuples(index=False):
         train_data.append(
             [
-                data_frame.iloc[index].values[1],
-                data_frame.iloc[index].values[2:502].tolist(),
+                row[1],
+                list(row[2:502]),
             ]
         )
 
@@ -309,13 +309,13 @@ def load_data() -> (
         train_dataset = []
         train_actual_labels = []
 
-        for index, row in prepare_data(train_infile).iterrows():
+        for row in prepare_data(train_infile).itertuples(index=False):
             try:
-                mol = Molecule(row["SMILES"], True)
+                mol = Molecule(row.SMILES, True)
 
                 # DAGs_meta_info = mol.dag_to_node
                 train_dataset.append(mol)
-                train_actual_labels.append(torch.tensor(row["LABELS"]).float())
+                train_actual_labels.append(torch.tensor(row.LABELS).float())
             except Exception:
                 pass
 
@@ -323,14 +323,14 @@ def load_data() -> (
         validation_dataset = []
         validation_actual_labels = []
 
-        for index, row in prepare_data(validation_infile).iterrows():
+        for row in prepare_data(validation_infile).itertuples(index=False):
             try:
-                mol = Molecule(row["SMILES"], True)
+                mol = Molecule(row.SMILES, True)
 
                 # DAGs_meta_info = mol.dag_to_node
 
                 validation_dataset.append(mol)
-                validation_actual_labels.append(torch.tensor(row["LABELS"]).float())
+                validation_actual_labels.append(torch.tensor(row.LABELS).float())
             except Exception:
                 pass
 

From 30ca5f6b2247294a26863638771dabdbadaa0f39 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 9 Aug 2025 18:36:16 +0200
Subject: [PATCH 2/3] optimize _setup_pruned_test_set logic

---
 chebai/preprocessing/datasets/chebi.py | 61 +++++++++++++-------------
 1 file changed, 31 insertions(+), 30 deletions(-)

diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py
index cda77f3b..95e3a43f 100644
--- a/chebai/preprocessing/datasets/chebi.py
+++ b/chebai/preprocessing/datasets/chebi.py
@@ -17,6 +17,7 @@
 
 import fastobo
 import networkx as nx
+import numpy as np
 import pandas as pd
 import requests
 import torch
@@ -465,43 +466,43 @@ def _setup_pruned_test_set(
         Returns:
             pd.DataFrame: The pruned test dataset.
         """
-        # TODO: find a more efficient way to do this
-        filename_old = "classes.txt"
-        # filename_new = f"classes_v{self.chebi_version_train}.txt"
-        # dataset = torch.load(os.path.join(self.processed_dir, "test.pt"))
+        classes_file_name = "classes.txt"
 
-        # Load original classes (from the current ChEBI version - chebi_version)
-        with open(os.path.join(self.processed_dir_main, filename_old), "r") as file:
-            orig_classes = file.readlines()
-
-        # Load new classes (from the training ChEBI version - chebi_version_train)
+        # Load original and new classes
+        with open(os.path.join(self.processed_dir_main, classes_file_name), "r") as f:
+            orig_classes = f.readlines()
         with open(
             os.path.join(
-                self._chebi_version_train_obj.processed_dir_main, filename_old
+                self._chebi_version_train_obj.processed_dir_main, classes_file_name
             ),
             "r",
-        ) as file:
-            new_classes = file.readlines()
-
-        # Create a mapping which give index of a class from chebi_version, if the corresponding
-        # class exists in chebi_version_train, Size = Number of classes in chebi_version
-        mapping = [
-            None if or_class not in new_classes else new_classes.index(or_class)
-            for or_class in orig_classes
-        ]
+        ) as f:
+            new_classes = f.readlines()
+
+        # Mapping array (-1 means no match in new classes); a dict gives O(1)
+        # lookups, and first-occurrence-wins mirrors list.index() semantics
+        new_class_index = {}
+        for pos, cls in enumerate(new_classes):
+            new_class_index.setdefault(cls, pos)
+        mapping_array = np.array(
+            [new_class_index.get(oc, -1) for oc in orig_classes], dtype=int
+        )
+
+        # Convert labels column to 2D NumPy array
+        labels_matrix = np.array(df_test_chebi_version["labels"].tolist(), dtype=bool)
+
+        # Allocate new labels matrix
+        num_new_classes = len(new_classes)
+        new_labels_matrix = np.zeros(
+            (labels_matrix.shape[0], num_new_classes), dtype=bool
+        )
 
-        # Iterate over each data instance in the test set which is derived from chebi_version
-        for row in df_test_chebi_version.itertuples(index=False):
-            # Size = Number of classes in chebi_version_train
-            new_labels = [False for _ in new_classes]
-            for ind, label in enumerate(row.labels):
-                # If the chebi_version class exists in the chebi_version_train and has a True label,
-                # set the corresponding label in new_labels to True
-                if mapping[ind] is not None and label:
-                    new_labels[mapping[ind]] = label
-            # Update the labels from test instance from chebi_version to the new labels, which are compatible to both versions
-            row["labels"] = new_labels
+        # Copy only valid columns
+        valid_mask = mapping_array != -1
+        new_labels_matrix[:, mapping_array[valid_mask]] = labels_matrix[:, valid_mask]
 
+        # Assign back
+        df_test_chebi_version["labels"] = new_labels_matrix.tolist()
         return df_test_chebi_version
 
     # ------------------------------ Phase: Raw Properties -----------------------------------

From 3785bb55be1e4365453220b6579799aaa1fad8b5 Mon Sep 17 00:00:00 2001
From: aditya0by0 <aditya0by0@gmail.com>
Date: Sat, 9 Aug 2025 18:54:41 +0200
Subject: [PATCH 3/3] avoid repeated slicing in loop

---
 chebai/preprocessing/datasets/chebi.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py
index 95e3a43f..dfa64fa4 100644
--- a/chebai/preprocessing/datasets/chebi.py
+++ b/chebai/preprocessing/datasets/chebi.py
@@ -359,18 +359,18 @@ def _load_dict(self, input_file_path: str) -> Generator[Dict[str, Any], None, No
         """
         with open(input_file_path, "rb") as input_file:
             df = pd.read_pickle(input_file)
-            if self.single_class is not None:
-                single_cls_index = list(df.columns).index(int(self.single_class))
-            for row in df.values:
-                if self.single_class is None:
-                    labels = row[self._LABELS_START_IDX :].astype(bool)
-                else:
-                    labels = [bool(row[single_cls_index])]
-                yield dict(
-                    features=row[self._DATA_REPRESENTATION_IDX],
-                    labels=labels,
-                    ident=row[self._ID_IDX],
-                )
+
+            if self.single_class is None:
+                all_labels = df.iloc[:, self._LABELS_START_IDX :].to_numpy(dtype=bool)
+            else:
+                single_cls_index = df.columns.get_loc(int(self.single_class))
+                all_labels = df.iloc[:, [single_cls_index]].to_numpy(dtype=bool)
+
+            features = df.iloc[:, self._DATA_REPRESENTATION_IDX].tolist()
+            idents = df.iloc[:, self._ID_IDX].tolist()
+
+            for feat, labels, ident in zip(features, all_labels, idents):
+                yield dict(features=feat, labels=labels, ident=ident)
 
     # ------------------------------ Phase: Dynamic Splits -----------------------------------
     def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: