From 567c68da0eff4b7c8b513307264660ced29c9b81 Mon Sep 17 00:00:00 2001
From: aditya0by0
Date: Sat, 9 Aug 2025 17:44:53 +0200
Subject: [PATCH 1/3] itertuples instead of iterrows

---
Review notes (ignored by `git am`; not part of the commit message):

* pubchem.py, download(): DataFrame.itertuples(index=True) yields ONE
  namedtuple per row whose first field is `Index`; it is not an
  (index, row) pair like iterrows(). The loop now takes the tuple as a
  whole and reads `row.Index` / `row.smiles`.
* train.py, prepare_data(): the replaced code read column position 1 and
  column positions 2..501 of every row. The refactor keeps that positional
  access on plain tuples (`name=None`) instead of assuming that the input
  frame has `SMILES` / `LABELS` attributes.
* chebi.py, _graph_to_raw_dataset(): the replaced comprehension tested the
  index label produced by iterrows() (the loop variable was merely called
  `name`), so the vectorised form filters on `data.index` to keep the
  behaviour of this refactor unchanged.
  NOTE(review): if filtering on a column was intended, do it in a separate
  commit - confirm with the author.
* chebi.py, _setup_pruned_test_set(): `row["labels"] = new_labels` is still
  present after this patch (see patch 2/3, which removes it). Tuples do not
  support item assignment, so this intermediate commit fails when that line
  runs; consider squashing 1/3 and 2/3 if bisectability matters.
* Indentation and blank context lines of this series were reconstructed
  from a whitespace-collapsed copy; run `git apply --check` before applying.

 chebai/preprocessing/datasets/chebi.py   |  6 +++---
 chebai/preprocessing/datasets/pubchem.py | 12 ++++++------
 chebai/train.py                          | 18 +++++++++---------
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py
index 9fa1c1c7..cda77f3b 100644
--- a/chebai/preprocessing/datasets/chebi.py
+++ b/chebai/preprocessing/datasets/chebi.py
@@ -300,7 +300,7 @@ def _graph_to_raw_dataset(self, g: nx.DiGraph) -> pd.DataFrame:
 
         data = pd.DataFrame(data)
         data = data[~data["SMILES"].isnull()]
-        data = data[[name not in CHEBI_BLACKLIST for name, _ in data.iterrows()]]
+        data = data[~data.index.isin(CHEBI_BLACKLIST)]
 
         return data
 
@@ -491,10 +491,10 @@ def _setup_pruned_test_set(
         ]
 
         # Iterate over each data instance in the test set which is derived from chebi_version
-        for _, row in df_test_chebi_version.iterrows():
+        for row in df_test_chebi_version.itertuples(index=False):
             # Size = Number of classes in chebi_version_train
             new_labels = [False for _ in new_classes]
-            for ind, label in enumerate(row["labels"]):
+            for ind, label in enumerate(row.labels):
                 # If the chebi_version class exists in the chebi_version_train and has a True label,
                 # set the corresponding label in new_labels to True
                 if mapping[ind] is not None and label:
diff --git a/chebai/preprocessing/datasets/pubchem.py b/chebai/preprocessing/datasets/pubchem.py
index a1879fe7..48dd1efc 100644
--- a/chebai/preprocessing/datasets/pubchem.py
+++ b/chebai/preprocessing/datasets/pubchem.py
@@ -628,8 +628,8 @@ def download(self):
             if not os.path.exists(os.path.join(self.raw_dir, f"{name}.txt")):
                 open(os.path.join(self.raw_dir, f"{name}.txt"), "x").close()
             with open(os.path.join(self.raw_dir, f"{name}.txt"), "w") as f:
-                for id, row in splits[i].iterrows():
-                    f.writelines(f"{id}\t{row['smiles']}\n")
+                for row in splits[i].itertuples(index=True):
+                    f.writelines(f"{row.Index}\t{row.smiles}\n")
 
 
 class PubChemDissimilarSMILES(PubChemDissimilar):
@@ -809,12 +809,12 @@ def download(self):
         csv_path = os.path.join(self.raw_dir, "pubchem_hazardous_compound_list.csv")
         compounds = pd.read_csv(csv_path)
         smiles_list = []
-        for id, compound in compounds.iterrows():
+        for compound in compounds.itertuples(index=False):
             if (
-                not isinstance(compound["cmpdsynonym"], str)
-                or "CHEBI" not in compound["cmpdsynonym"]
+                not isinstance(compound.cmpdsynonym, str)
+                or "CHEBI" not in compound.cmpdsynonym
             ):
-                smiles_list.append(f"{compound['cid']}\t{compound['isosmiles']}")
+                smiles_list.append(f"{compound.cid}\t{compound.isosmiles}")
 
         with open(os.path.join(self.raw_dir, "smiles.txt"), "w") as f:
             f.write("\n".join(smiles_list))
diff --git a/chebai/train.py b/chebai/train.py
index 9c524f1a..bab5f089 100644
--- a/chebai/train.py
+++ b/chebai/train.py
@@ -246,11 +246,11 @@ def prepare_data(infile: pickle.Pickler) -> pd.DataFrame:
         data_frame[col] = data_frame[col].astype(int)
 
     train_data = []
-    for index, row in data_frame.iterrows():
+    for row in data_frame.itertuples(index=False, name=None):
         train_data.append(
             [
-                data_frame.iloc[index].values[1],
-                data_frame.iloc[index].values[2:502].tolist(),
+                row[1],
+                list(row[2:502]),
             ]
         )
 
@@ -309,13 +309,13 @@ def load_data() -> (
 
     train_dataset = []
     train_actual_labels = []
-    for index, row in prepare_data(train_infile).iterrows():
+    for row in prepare_data(train_infile).itertuples(index=False):
         try:
-            mol = Molecule(row["SMILES"], True)
+            mol = Molecule(row.SMILES, True)
 
             # DAGs_meta_info = mol.dag_to_node
             train_dataset.append(mol)
-            train_actual_labels.append(torch.tensor(row["LABELS"]).float())
+            train_actual_labels.append(torch.tensor(row.LABELS).float())
         except Exception:
             pass
 
@@ -323,14 +323,14 @@ def load_data() -> (
 
     validation_dataset = []
     validation_actual_labels = []
-    for index, row in prepare_data(validation_infile).iterrows():
+    for row in prepare_data(validation_infile).itertuples(index=False):
         try:
-            mol = Molecule(row["SMILES"], True)
+            mol = Molecule(row.SMILES, True)
 
             # DAGs_meta_info = mol.dag_to_node
 
             validation_dataset.append(mol)
-            validation_actual_labels.append(torch.tensor(row["LABELS"]).float())
+            validation_actual_labels.append(torch.tensor(row.LABELS).float())
         except Exception:
             pass
 

From 30ca5f6b2247294a26863638771dabdbadaa0f39 Mon Sep 17 00:00:00 2001
From: aditya0by0
Date: Sat, 9 Aug 2025 18:36:16 +0200
Subject: [PATCH 2/3] optimize _setup_pruned_test_set logic

---
Review notes (ignored by `git am`; not part of the commit message):

* The per-row Python loop is replaced by one NumPy fancy-index assignment:
  column `i` of the old label matrix is copied to column `mapping_array[i]`
  of the new one whenever `mapping_array[i] != -1`.
* NOTE(review): the old loop only ever set entries to True, so two old
  classes mapping to the same new class were OR-ed together; the assignment
  below keeps the last writer instead. This is equivalent as long as
  classes.txt contains no duplicate lines - confirm.
* NOTE(review): for an empty test frame `labels_matrix` is one-dimensional
  and `labels_matrix[:, valid_mask]` raises IndexError - confirm callers
  never pass an empty frame, or guard for it.
* The blob ids on the `index` line predate the review fix made to
  chebi.py in patch 1/3; apply without `--3way` or regenerate the series.

 chebai/preprocessing/datasets/chebi.py | 61 +++++++++++++-------------
 1 file changed, 31 insertions(+), 30 deletions(-)

diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py
index cda77f3b..95e3a43f 100644
--- a/chebai/preprocessing/datasets/chebi.py
+++ b/chebai/preprocessing/datasets/chebi.py
@@ -17,6 +17,7 @@
 
 import fastobo
 import networkx as nx
+import numpy as np
 import pandas as pd
 import requests
 import torch
@@ -465,43 +466,43 @@ def _setup_pruned_test_set(
         Returns:
             pd.DataFrame: The pruned test dataset.
         """
-        # TODO: find a more efficient way to do this
-        filename_old = "classes.txt"
-        # filename_new = f"classes_v{self.chebi_version_train}.txt"
-        # dataset = torch.load(os.path.join(self.processed_dir, "test.pt"))
+        classes_file_name = "classes.txt"
 
-        # Load original classes (from the current ChEBI version - chebi_version)
-        with open(os.path.join(self.processed_dir_main, filename_old), "r") as file:
-            orig_classes = file.readlines()
-
-        # Load new classes (from the training ChEBI version - chebi_version_train)
+        # Load original and new classes
+        with open(os.path.join(self.processed_dir_main, classes_file_name), "r") as f:
+            orig_classes = f.readlines()
         with open(
             os.path.join(
-                self._chebi_version_train_obj.processed_dir_main, filename_old
+                self._chebi_version_train_obj.processed_dir_main, classes_file_name
             ),
             "r",
-        ) as file:
-            new_classes = file.readlines()
-
-        # Create a mapping which give index of a class from chebi_version, if the corresponding
-        # class exists in chebi_version_train, Size = Number of classes in chebi_version
-        mapping = [
-            None if or_class not in new_classes else new_classes.index(or_class)
-            for or_class in orig_classes
-        ]
+        ) as f:
+            new_classes = f.readlines()
+
+        # Mapping array (-1 means no match in new classes)
+        mapping_array = np.array(
+            [
+                -1 if oc not in new_classes else new_classes.index(oc)
+                for oc in orig_classes
+            ],
+            dtype=int,
+        )
+
+        # Convert labels column to 2D NumPy array
+        labels_matrix = np.array(df_test_chebi_version["labels"].tolist(), dtype=bool)
+
+        # Allocate new labels matrix
+        num_new_classes = len(new_classes)
+        new_labels_matrix = np.zeros(
+            (labels_matrix.shape[0], num_new_classes), dtype=bool
+        )
 
-        # Iterate over each data instance in the test set which is derived from chebi_version
-        for row in df_test_chebi_version.itertuples(index=False):
-            # Size = Number of classes in chebi_version_train
-            new_labels = [False for _ in new_classes]
-            for ind, label in enumerate(row.labels):
-                # If the chebi_version class exists in the chebi_version_train and has a True label,
-                # set the corresponding label in new_labels to True
-                if mapping[ind] is not None and label:
-                    new_labels[mapping[ind]] = label
-            # Update the labels from test instance from chebi_version to the new labels, which are compatible to both versions
-            row["labels"] = new_labels
+        # Copy only valid columns
+        valid_mask = mapping_array != -1
+        new_labels_matrix[:, mapping_array[valid_mask]] = labels_matrix[:, valid_mask]
 
+        # Assign back
+        df_test_chebi_version["labels"] = new_labels_matrix.tolist()
         return df_test_chebi_version
 
     # ------------------------------ Phase: Raw Properties -----------------------------------

From 3785bb55be1e4365453220b6579799aaa1fad8b5 Mon Sep 17 00:00:00 2001
From: aditya0by0
Date: Sat, 9 Aug 2025 18:54:41 +0200
Subject: [PATCH 3/3] avoid repeated slicing in loop

---
Review notes (ignored by `git am`; not part of the commit message):

* Labels, features and identifiers are each extracted from the frame once
  and then zipped row by row, instead of slicing every row inside the loop.
* NOTE(review): with `single_class` set, each yielded `labels` value is now
  a length-1 boolean ndarray (one row of an (n, 1) array); it used to be a
  one-element Python list. Confirm that consumers accept either.

 chebai/preprocessing/datasets/chebi.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py
index 95e3a43f..dfa64fa4 100644
--- a/chebai/preprocessing/datasets/chebi.py
+++ b/chebai/preprocessing/datasets/chebi.py
@@ -359,18 +359,18 @@ def _load_dict(self, input_file_path: str) -> Generator[Dict[str, Any], None, No
         """
         with open(input_file_path, "rb") as input_file:
             df = pd.read_pickle(input_file)
-            if self.single_class is not None:
-                single_cls_index = list(df.columns).index(int(self.single_class))
-            for row in df.values:
-                if self.single_class is None:
-                    labels = row[self._LABELS_START_IDX :].astype(bool)
-                else:
-                    labels = [bool(row[single_cls_index])]
-                yield dict(
-                    features=row[self._DATA_REPRESENTATION_IDX],
-                    labels=labels,
-                    ident=row[self._ID_IDX],
-                )
+
+        if self.single_class is None:
+            all_labels = df.iloc[:, self._LABELS_START_IDX :].to_numpy(dtype=bool)
+        else:
+            single_cls_index = df.columns.get_loc(int(self.single_class))
+            all_labels = df.iloc[:, [single_cls_index]].to_numpy(dtype=bool)
+
+        features = df.iloc[:, self._DATA_REPRESENTATION_IDX].to_numpy()
+        idents = df.iloc[:, self._ID_IDX].to_numpy()
+
+        for feat, labels, ident in zip(features, all_labels, idents):
+            yield dict(features=feat, labels=labels, ident=ident)
 
     # ------------------------------ Phase: Dynamic Splits -----------------------------------
     def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: