From 44070e6ee9ad667a3b04642a0924998c3fbbb14e Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 15 Apr 2025 14:49:04 +0200 Subject: [PATCH 1/4] remove protein code files --- .../preprocessing/datasets/deepGO/__init__.py | 0 .../datasets/deepGO/go_uniprot.py | 1007 ------------- .../datasets/deepGO/protein_pretraining.py | 279 ---- .../preprocessing/datasets/scope/__init__.py | 0 chebai/preprocessing/datasets/scope/scope.py | 972 ------------ .../migration/deep_go/__init__.py | 0 .../deep_go/migrate_deep_go_1_data.py | 316 ---- .../deep_go/migrate_deep_go_2_data.py | 366 ----- configs/data/deepGO/deepgo2_esm2.yml | 5 - .../data/deepGO/deepgo_1_migrated_data.yml | 4 - .../data/deepGO/deepgo_2_migrated_data.yml | 5 - configs/data/deepGO/go250.yml | 3 - configs/data/deepGO/go50.yml | 1 - configs/data/scope/scope2000.yml | 3 - configs/data/scope/scope50.yml | 3 - .../testGOUniProDataExtractor.py | 229 --- .../dataset_classes/testGoUniProtOverX.py | 140 -- .../testProteinPretrainingData.py | 76 - tests/unit/mock_data/ontology_mock_data.py | 407 ----- tests/unit/readers/testProteinDataReader.py | 139 -- tutorials/data_exploration_go.ipynb | 1341 ----------------- tutorials/data_exploration_scope.ipynb | 1182 --------------- 22 files changed, 6478 deletions(-) delete mode 100644 chebai/preprocessing/datasets/deepGO/__init__.py delete mode 100644 chebai/preprocessing/datasets/deepGO/go_uniprot.py delete mode 100644 chebai/preprocessing/datasets/deepGO/protein_pretraining.py delete mode 100644 chebai/preprocessing/datasets/scope/__init__.py delete mode 100644 chebai/preprocessing/datasets/scope/scope.py delete mode 100644 chebai/preprocessing/migration/deep_go/__init__.py delete mode 100644 chebai/preprocessing/migration/deep_go/migrate_deep_go_1_data.py delete mode 100644 chebai/preprocessing/migration/deep_go/migrate_deep_go_2_data.py delete mode 100644 configs/data/deepGO/deepgo2_esm2.yml delete mode 100644 configs/data/deepGO/deepgo_1_migrated_data.yml delete mode 100644 configs/data/deepGO/deepgo_2_migrated_data.yml delete mode 100644 configs/data/deepGO/go250.yml delete mode 100644 configs/data/deepGO/go50.yml delete mode 100644 configs/data/scope/scope2000.yml delete mode 100644 configs/data/scope/scope50.yml delete mode 100644 tests/unit/dataset_classes/testGOUniProDataExtractor.py delete mode 100644 tests/unit/dataset_classes/testGoUniProtOverX.py delete mode 100644 tests/unit/dataset_classes/testProteinPretrainingData.py delete mode 100644 tests/unit/readers/testProteinDataReader.py delete mode 100644 tutorials/data_exploration_go.ipynb delete mode 100644 tutorials/data_exploration_scope.ipynb diff --git a/chebai/preprocessing/datasets/deepGO/__init__.py b/chebai/preprocessing/datasets/deepGO/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/chebai/preprocessing/datasets/deepGO/go_uniprot.py b/chebai/preprocessing/datasets/deepGO/go_uniprot.py deleted file mode 100644 index 1b0eb2aa..00000000 --- a/chebai/preprocessing/datasets/deepGO/go_uniprot.py +++ /dev/null @@ -1,1007 +0,0 @@ -# References for this file : -# Reference 1: -# Maxat Kulmanov, Mohammed Asif Khan, Robert Hoehndorf; -# DeepGO: Predicting protein functions from sequence and interactions -# using a deep ontology-aware classifier, Bioinformatics, 2017. 
-# https://doi.org/10.1093/bioinformatics/btx624
-# Github: https://github.com/bio-ontology-research-group/deepgo
-
-# Reference 2:
-# https://www.ebi.ac.uk/GOA/downloads
-# https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/keywlist.txt
-# https://www.uniprot.org/uniprotkb
-
-# Reference 3:
-# Kulmanov, M., Guzmán-Vega, F.J., Duek Roggli,
-# P. et al. Protein function prediction as approximate semantic entailment. Nat Mach Intell 6, 220–228 (2024).
-# https://doi.org/10.1038/s42256-024-00795-w
-# https://github.com/bio-ontology-research-group/deepgo2
-
-__all__ = [
-    "GOUniProtOver250",
-    "GOUniProtOver50",
-    "EXPERIMENTAL_EVIDENCE_CODES",
-    "AMBIGUOUS_AMINO_ACIDS",
-    "DeepGO1MigratedData",
-    "DeepGO2MigratedData",
-]
-
-import gzip
-import itertools
-import os
-import shutil
-from abc import ABC, abstractmethod
-from collections import OrderedDict
-from tempfile import NamedTemporaryFile
-from typing import Any, Dict, Generator, List, Optional, Tuple, Union
-
-import fastobo
-import networkx as nx
-import pandas as pd
-import requests
-import torch
-import tqdm
-from Bio import SwissProt
-
-from chebai.preprocessing import reader as dr
-from chebai.preprocessing.datasets.base import _DynamicDataset
-
-# https://github.com/bio-ontology-research-group/deepgo/blob/master/utils.py#L15
-EXPERIMENTAL_EVIDENCE_CODES = {
-    "EXP",
-    "IDA",
-    "IPI",
-    "IMP",
-    "IGI",
-    "IEP",
-    "TAS",
-    "IC",
-    # New evidence codes added in the latest paper (year 2024, Reference number 3)
-    # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/utils.py#L24-L26
-    "HTP",
-    "HDA",
-    "HMP",
-    "HGI",
-    "HEP",
-}
-
-# https://github.com/bio-ontology-research-group/deepgo/blob/d97447a05c108127fee97982fd2c57929b2cf7eb/aaindex.py#L8
-# https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L10
-# `X` is now considered a valid amino acid, as per the latest paper (year 2024, Reference number 3)
-AMBIGUOUS_AMINO_ACIDS = {"B", "O", "J", "U", "Z", "*"}
-
-
-class _GOUniProtDataExtractor(_DynamicDataset, ABC):
-    """
-    A class for extracting and processing data from the Gene Ontology (GO) dataset and the Swiss UniProt dataset.
-
-    Args:
-        dynamic_data_split_seed (int, optional): The seed for random data splitting. Defaults to 42.
-        splits_file_path (str, optional): Path to the splits CSV file. Defaults to None.
-        max_sequence_length (int, optional): Specifies the maximum allowed sequence length for a protein, with a
-            default of 1002. During data preprocessing, any proteins exceeding this length will be excluded from further
-            processing.
-        **kwargs: Additional keyword arguments passed to DynamicDataset and XYBaseDataModule.
-
-    Attributes:
-        dynamic_data_split_seed (int): The seed for random data splitting, default is 42.
-        max_sequence_length (int, optional): Specifies the maximum allowed sequence length for a protein, with a
-            default of 1002. During data preprocessing, any proteins exceeding this length will be excluded from further
-            processing.
-        splits_file_path (Optional[str]): Path to the CSV file containing split assignments.
- """ - - _GO_DATA_INIT = "GO" - _SWISS_DATA_INIT = "SWISS" - - # -- Index for columns of processed `data.pkl` (derived from `_get_swiss_to_go_mapping` & `_graph_to_raw_dataset` - # "swiss_id" at row index 0 - # "accession" at row index 1 - # "go_ids" at row index 2 - # "sequence" at row index 3 - # labels starting from row index 4 - _ID_IDX: int = 0 - _DATA_REPRESENTATION_IDX: int = 3 # here `sequence` column - _LABELS_START_IDX: int = 4 - - _GO_DATA_URL: str = "https://purl.obolibrary.org/obo/go/go-basic.obo" - _SWISS_DATA_URL: str = ( - "https://ftp.uniprot.org/pub/databases/uniprot/knowledgebase/complete/uniprot_sprot.dat.gz" - ) - - # Gene Ontology (GO) has three major branches, one for biological processes (BP), molecular functions (MF) and - # cellular components (CC). The value "all" will take data related to all three branches into account. - _ALL_GO_BRANCHES: str = "all" - _GO_BRANCH_NAMESPACE: Dict[str, str] = { - "BP": "biological_process", - "MF": "molecular_function", - "CC": "cellular_component", - } - - def __init__(self, **kwargs): - self.go_branch: str = self._get_go_branch(**kwargs) - - self.max_sequence_length: int = int(kwargs.get("max_sequence_length", 1002)) - assert ( - self.max_sequence_length >= 1 - ), "Max sequence length should be greater than or equal to 1." - - super(_GOUniProtDataExtractor, self).__init__(**kwargs) - - if self.reader.n_gram is not None: - assert self.max_sequence_length >= self.reader.n_gram, ( - f"max_sequence_length ({self.max_sequence_length}) must be greater than " - f"or equal to n_gram ({self.reader.n_gram})." - ) - - @classmethod - def _get_go_branch(cls, **kwargs) -> str: - """ - Retrieves the Gene Ontology (GO) branch based on provided keyword arguments. - This method checks if a valid GO branch value is provided in the keyword arguments. - - Args: - **kwargs: Arbitrary keyword arguments. Specifically looks for: - - "go_branch" (str): The desired GO branch. - Returns: - str: The GO branch value. This will be one of the allowed values. - - Raises: - ValueError: If the provided 'go_branch' value is not in the allowed list of values. - """ - - go_branch_value = kwargs.get("go_branch", cls._ALL_GO_BRANCHES) - allowed_values = list(cls._GO_BRANCH_NAMESPACE.keys()) + [cls._ALL_GO_BRANCHES] - if go_branch_value not in allowed_values: - raise ValueError( - f"Invalid value for go_branch: {go_branch_value}, Allowed values: {allowed_values}" - ) - return go_branch_value - - # ------------------------------ Phase: Prepare data ----------------------------------- - def _download_required_data(self) -> str: - """ - Downloads the required raw data related to Gene Ontology (GO) and Swiss-UniProt dataset. - - Returns: - str: Path to the downloaded data. - """ - self._download_swiss_uni_prot_data() - return self._download_gene_ontology_data() - - def _download_gene_ontology_data(self) -> str: - """ - Download the Gene Ontology data `.obo` file. - - Note: - Quote from : https://geneontology.org/docs/download-ontology/ - Three versions of the ontology are available, the one use in this method is described below: - https://purl.obolibrary.org/obo/go/go-basic.obo - The basic version of the GO, filtered such that the graph is guaranteed to be acyclic and annotations - can be propagated up the graph. The relations included are `is a, part of, regulates, negatively` - `regulates` and `positively regulates`. This version excludes relationships that cross the 3 GO - hierarchies. This version should be used with most GO-based annotation tools. 
- - Returns: - str: The file path of the loaded Gene Ontology data. - """ - go_path = os.path.join(self.raw_dir, self.raw_file_names_dict["GO"]) - os.makedirs(os.path.dirname(go_path), exist_ok=True) - - if not os.path.isfile(go_path): - print("Missing Gene Ontology raw data") - print(f"Downloading Gene Ontology data....") - r = requests.get(self._GO_DATA_URL, allow_redirects=True) - r.raise_for_status() # Check if the request was successful - open(go_path, "wb").write(r.content) - return go_path - - def _download_swiss_uni_prot_data(self) -> Optional[str]: - """ - Download the Swiss-Prot data file from UniProt Knowledgebase. - - Note: - UniProt Knowledgebase is collection of functional information on proteins, with accurate, consistent - and rich annotation. - - Swiss-Prot contains manually-annotated records with information extracted from literature and - curator-evaluated computational analysis. - - Returns: - str: The file path of the loaded Swiss-Prot data file. - """ - uni_prot_file_path = os.path.join( - self.raw_dir, self.raw_file_names_dict["SwissUniProt"] - ) - os.makedirs(os.path.dirname(uni_prot_file_path), exist_ok=True) - - if not os.path.isfile(uni_prot_file_path): - print(f"Downloading Swiss UniProt data....") - - # Create a temporary file - with NamedTemporaryFile(delete=False) as tf: - temp_filename = tf.name - print(f"Downloading to temporary file {temp_filename}") - - # Download the file - response = requests.get(self._SWISS_DATA_URL, stream=True) - with open(temp_filename, "wb") as temp_file: - shutil.copyfileobj(response.raw, temp_file) - - print(f"Downloaded to {temp_filename}") - - # Unpack the gzipped file - try: - print(f"Unzipping the file....") - with gzip.open(temp_filename, "rb") as f_in: - output_file_path = uni_prot_file_path - with open(output_file_path, "wb") as f_out: - shutil.copyfileobj(f_in, f_out) - print(f"Unpacked and saved to {output_file_path}") - - except Exception as e: - print(f"Failed to unpack the file: {e}") - finally: - # Clean up the temporary file - os.remove(temp_filename) - print(f"Removed temporary file {temp_filename}") - - return uni_prot_file_path - - def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph: - """ - Extracts the class hierarchy from the GO ontology. - Constructs a directed graph (DiGraph) using NetworkX, where nodes are annotated with GO term data. - - Args: - data_path (str): The path to the GO ontology. - - Returns: - nx.DiGraph: A directed graph representing the class hierarchy, where nodes are GO terms and edges - represent parent-child relationships. 
- """ - print("Extracting class hierarchy...") - elements = [] - for term in fastobo.load(data_path): - if isinstance(term, fastobo.typedef.TypedefFrame): - # ---- To avoid term frame of the below format/structure ---- - # [Typedef] - # id: part_of - # name: part of - # namespace: external - # xref: BFO:0000050 - # is_transitive: true - continue - - if ( - term - and isinstance(term.id, fastobo.id.PrefixedIdent) - and term.id.prefix == self._GO_DATA_INIT - ): - # Consider only terms with id in following format - GO:2001271 - term_dict = self.term_callback(term) - if term_dict: - elements.append(term_dict) - - g = nx.DiGraph() - - # Add GO term nodes to the graph and their hierarchical ontology - for n in elements: - g.add_node(n["go_id"], **n) - g.add_edges_from( - [ - (parent_id, node_id) - for node_id in g.nodes - for parent_id in g.nodes[node_id]["parents"] - if parent_id in g.nodes - ] - ) - - print("Compute transitive closure") - return nx.transitive_closure_dag(g) - - def term_callback(self, term: fastobo.term.TermFrame) -> Union[Dict, bool]: - """ - Extracts information from a Gene Ontology (GO) term document. - - Args: - term: A Gene Ontology term Frame document. - - Returns: - Optional[Dict]: A dictionary containing the extracted information if the term is not obsolete, - otherwise None. The dictionary includes: - - "id" (str): The ID of the GO term. - - "parents" (List[str]): A list of parent term IDs. - - "name" (str): The name of the GO term. - """ - parents = [] - name = None - - for clause in term: - if isinstance(clause, fastobo.term.NamespaceClause): - if ( - self.go_branch != self._ALL_GO_BRANCHES - and clause.namespace.escaped - != self._GO_BRANCH_NAMESPACE[self.go_branch] - ): - # if the term document is not related to given go branch (except `all`), skip this document. - return False - - if isinstance(clause, fastobo.term.IsObsoleteClause): - if clause.obsolete: - # if the term document contains clause as obsolete as true, skips this document. - return False - - if isinstance(clause, fastobo.term.IsAClause): - parents.append(self._parse_go_id(clause.term)) - elif isinstance(clause, fastobo.term.NameClause): - name = clause.name - - return { - "go_id": self._parse_go_id(term.id), - "parents": parents, - "name": name, - } - - @staticmethod - def _parse_go_id(go_id: str) -> int: - """ - Helper function to parse and normalize GO term IDs. - - Args: - go_id: The raw GO term ID string. - - Returns: - str: The parsed and normalized GO term ID. - """ - # `is_a` clause has GO id in the following formats: - # GO:0009968 ! negative regulation of signal transduction - # GO:0046780 - return int(str(go_id).split(":")[1].split("!")[0].strip()) - - def _graph_to_raw_dataset(self, g: nx.DiGraph) -> pd.DataFrame: - """ - Processes a directed acyclic graph (DAG) to create a raw dataset in DataFrame format. The dataset includes - Swiss-Prot protein data and their associations with Gene Ontology (GO) terms. - - Note: - - GO classes are used as labels in the dataset. Each GO term is represented as a column, and its value - indicates whether a Swiss-Prot protein is associated with that GO term. - - Swiss-Prot proteins serve as samples. There is no 1-to-1 correspondence between Swiss-Prot proteins - and GO terms. 
- - Data Format: pd.DataFrame - - Column 0 : swiss_id (Identifier for SwissProt protein) - - Column 1 : Accession of the protein - - Column 2 : GO IDs (associated GO terms) - - Column 3 : Sequence of the protein - - Column 4 to Column "n": Each column corresponding to a class with value True/False indicating whether the - protein is associated with this GO term. - - Args: - g (nx.DiGraph): The class hierarchy graph. - - Returns: - pd.DataFrame: The raw dataset created from the graph. - """ - print(f"Processing graph") - - data_df = self._get_swiss_to_go_mapping() - # add ancestors to go ids - data_df["go_ids"] = data_df["go_ids"].apply( - lambda go_ids: sorted( - set( - itertools.chain.from_iterable( - [ - [go_id] + list(g.predecessors(go_id)) - for go_id in go_ids - if go_id in g.nodes - ] - ) - ) - ) - ) - # Initialize the GO term labels/columns to False - selected_classes = self.select_classes(g, data_df=data_df) - new_label_columns = pd.DataFrame( - False, index=data_df.index, columns=selected_classes - ) - data_df = pd.concat([data_df, new_label_columns], axis=1) - - # Set True for the corresponding GO IDs in the DataFrame go labels/columns - for index, row in data_df.iterrows(): - for go_id in row["go_ids"]: - if go_id in data_df.columns: - data_df.at[index, go_id] = True - - # This filters the DataFrame to include only the rows where at least one value in the row from 5th column - # onwards is True/non-zero. - # Quote from DeepGo Paper: `For training and testing, we use proteins which have been annotated with at least - # one GO term from the set of the GO terms for the model` - data_df = data_df[data_df.iloc[:, self._LABELS_START_IDX :].any(axis=1)] - return data_df - - def _get_swiss_to_go_mapping(self) -> pd.DataFrame: - """ - Parses the Swiss-Prot data and returns a DataFrame mapping Swiss-Prot records to Gene Ontology (GO) data. - - The DataFrame includes the following columns: - - "swiss_id": The unique identifier for each Swiss-Prot record. - - "sequence": The protein sequence. - - "accessions": Comma-separated list of accession numbers. - - "go_ids": List of GO IDs associated with the Swiss-Prot record. - - Note: - This mapping is necessary because the GO data does not include the protein sequence representation. - We select proteins with annotations having experimental evidence codes, as specified in - `EXPERIMENTAL_EVIDENCE_CODES` and filter the proteins by a maximum length of 1002, ignoring proteins with - ambiguous amino acid codes specified in `AMBIGUOUS_AMINO_ACIDS` in their sequence. - - Check the link below for keyword details: - https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/keywlist.txt - - Returns: - pd.DataFrame: A DataFrame where each row corresponds to a Swiss-Prot record with its associated GO data. - """ - - print("Parsing swiss uniprot raw data....") - - swiss_ids, sequences, accessions, go_ids_list = [], [], [], [] - - swiss_data = SwissProt.parse( - open( - os.path.join(self.raw_dir, self.raw_file_names_dict["SwissUniProt"]), - "r", - ) - ) - - for record in swiss_data: - if record.data_class != "Reviewed": - # To consider only manually-annotated swiss data - continue - - if not record.sequence or len(record.sequence) > self.max_sequence_length: - # Consider protein with only sequence representation and seq. length not greater than max seq. 
length - continue - - if any(aa in AMBIGUOUS_AMINO_ACIDS for aa in record.sequence): - # Skip proteins with ambiguous amino acid codes - continue - - go_ids = [] - - for cross_ref in record.cross_references: - if cross_ref[0] == self._GO_DATA_INIT: - # One swiss data protein can correspond to many GO data instances - - if len(cross_ref) <= 3: - # No evidence code - continue - - # https://github.com/bio-ontology-research-group/deepgo/blob/master/get_functions.py#L63-L66 - evidence_code = cross_ref[3].split(":")[0] - if evidence_code not in EXPERIMENTAL_EVIDENCE_CODES: - # Skip GO id without the required experimental evidence codes - continue - - go_ids.append(self._parse_go_id(cross_ref[1])) - - if not go_ids: - # Skip Swiss proteins without mapping to GO data - continue - - swiss_ids.append(record.entry_name) - sequences.append(record.sequence) - accessions.append(",".join(record.accessions)) - go_ids.sort() - go_ids_list.append(go_ids) - - data_dict = OrderedDict( - swiss_id=swiss_ids, # swiss_id column at index 0 - accession=accessions, # Accession column at index 1 - go_ids=go_ids_list, # Go_ids (data representation) column at index 2 - sequence=sequences, # Sequence column at index 3 - ) - - return pd.DataFrame(data_dict) - - # ------------------------------ Phase: Setup data ----------------------------------- - def _load_dict(self, input_file_path: str) -> Generator[Dict[str, Any], None, None]: - """ - Loads data from a pickled file and yields individual dictionaries for each row. - - The pickled file is expected to contain rows with the following structure: - - Data at row index `self._ID_IDX`: ID of go data instance - - Data at row index `self._DATA_REPRESENTATION_IDX`: Sequence representation of protein - - Data from row index `self._LABELS_START_IDX` onwards: Labels - - This method is used by `_load_data_from_file` to generate dictionaries that are then - processed and converted into a list of dictionaries containing the features and labels. - - Args: - input_file_path (str): The path to the pickled input file. - - Yields: - Dict[str, Any]: A dictionary containing: - - `features` (str): The sequence data from the file. - - `labels` (np.ndarray): A boolean array of labels starting from row index 4. - - `ident` (Any): The identifier from row index 0. - """ - with open(input_file_path, "rb") as input_file: - df = pd.read_pickle(input_file) - for row in df.values: - labels = row[self._LABELS_START_IDX :].astype(bool) - # chebai.preprocessing.reader.DataReader only needs features, labels, ident, group - # "group" set to None, by default as no such entity for this data - yield dict( - features=row[self._DATA_REPRESENTATION_IDX], - labels=labels, - ident=row[self._ID_IDX], - ) - - # ------------------------------ Phase: Dynamic Splits ----------------------------------- - def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: - """ - Loads encoded data and generates training, validation, and test splits. - - This method attempts to load encoded data from a file named `data.pt`. It then splits this data into - training, validation, and test sets. - - Raises: - FileNotFoundError: If the `data.pt` file does not exist. Ensure that `prepare_data` and/or - `setup` methods are called to generate the necessary dataset files. 
- - Returns: - Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing three DataFrames: - - Training set - - Validation set - - Test set - """ - try: - filename = self.processed_file_names_dict["data"] - data_go = torch.load( - os.path.join(self.processed_dir, filename), weights_only=False - ) - except FileNotFoundError: - raise FileNotFoundError( - f"File data.pt doesn't exists. " - f"Please call 'prepare_data' and/or 'setup' methods to generate the dataset files" - ) - - df_go_data = pd.DataFrame(data_go) - train_df_go, df_test = self.get_test_split( - df_go_data, seed=self.dynamic_data_split_seed - ) - - # Get all splits - df_train, df_val = self.get_train_val_splits_given_test( - train_df_go, - df_test, - seed=self.dynamic_data_split_seed, - ) - - return df_train, df_val, df_test - - # ------------------------------ Phase: Raw Properties ----------------------------------- - @property - def base_dir(self) -> str: - """ - Returns the base directory path for storing GO-Uniprot data. - - Returns: - str: The path to the base directory, which is "data/GO_UniProt". - """ - return os.path.join("data", f"GO_UniProt") - - @property - def raw_file_names_dict(self) -> dict: - """ - Returns a dictionary of raw file names used in data processing. - - Returns: - dict: A dictionary mapping dataset names to their respective file names. - For example, {"GO": "go-basic.obo", "SwissUniProt": "uniprot_sprot.dat"}. - """ - return {"GO": "go-basic.obo", "SwissUniProt": "uniprot_sprot.dat"} - - -class _GOUniProtOverX(_GOUniProtDataExtractor, ABC): - """ - A class for extracting data from the Gene Ontology (GO) dataset with a threshold for selecting classes based on - the number of subclasses. - - This class is designed to filter GO classes based on a specified threshold, selecting only those classes - which have a certain number of subclasses in the hierarchy. - - Attributes: - READER (dr.ProteinDataReader): The reader used for reading the dataset. - THRESHOLD (int): The threshold for selecting classes based on the number of subclasses. - """ - - READER: dr.ProteinDataReader = dr.ProteinDataReader - THRESHOLD: int = None - - @property - def _name(self) -> str: - """ - Returns the name of the dataset. - - 'max_sequence_length' in the name indicates that proteins with sequence lengths exceeding are ignored - in the dataset. - - Returns: - str: The dataset name, formatted with the current threshold value and/or given go_branch. - """ - if self.go_branch != self._ALL_GO_BRANCHES: - return f"GO{self.THRESHOLD}_{self.go_branch}_{self.max_sequence_length}" - - return f"GO{self.THRESHOLD}_{self.max_sequence_length}" - - def select_classes( - self, g: nx.DiGraph, *args: Any, **kwargs: Dict[str, Any] - ) -> List[int]: - """ - Selects classes (GO terms) from the Gene Ontology (GO) dataset based on the number of annotations meeting a - specified threshold. - - The selection process is based on the annotations of the GO terms with its ancestors across the dataset. - - Annotations are calculated by counting how many times each GO term, along with its ancestral hierarchy, - is annotated per protein across the dataset. - This means that for each protein, the GO terms associated with it are considered, and the entire hierarchical - structure (ancestors) of each GO term is taken into account. The total count for each GO term and its ancestors - reflects how frequently these terms are annotated across all proteins in the dataset. 
- - Args: - g (nx.DiGraph): The directed acyclic graph representing the GO dataset, where each node corresponds to a GO term. - *args: Additional positional arguments (not used). - **kwargs: Additional keyword arguments, including: - - data_df (pd.DataFrame): A DataFrame containing the GO annotations for various proteins. - It should include a 'go_ids' column with the GO terms associated with each protein. - - Returns: - List[int]: A sorted list of selected GO term IDs that meet the annotation threshold criteria. - - Side Effects: - - Writes the list of selected GO term IDs to a file named "classes.txt" in the specified processed directory. - - Raises: - AttributeError: If the 'data_df' argument is not provided in kwargs. - - Notes: - - The `THRESHOLD` attribute, which defines the minimum number of annotations required to select a GO term, should be defined in the subclass. - """ - # Retrieve the DataFrame containing GO annotations per protein from the keyword arguments - data_df = kwargs.get("data_df", None) - if data_df is None or not isinstance(data_df, pd.DataFrame) or data_df.empty: - raise AttributeError( - "The 'data_df' argument must be provided and must be a non-empty pandas DataFrame." - ) - - print(f"Selecting GO terms based on given threshold: {self.THRESHOLD} ...") - - # https://github.com/bio-ontology-research-group/deepgo/blob/master/get_functions.py#L59-L77 - go_term_annot: Dict[int, int] = {} - for idx, row in data_df.iterrows(): - # Count the annotations for each go_id **`per protein`** - for go_id in row["go_ids"]: - if go_id not in go_term_annot: - go_term_annot[go_id] = 0 - go_term_annot[go_id] += 1 - - # Select GO terms that meet or exceed the threshold of annotations - selected_nodes: List[int] = [ - go_id - for go_id in g.nodes - if go_id in go_term_annot and go_term_annot[go_id] >= self.THRESHOLD - ] - - # Sort the selected nodes (optional but often useful for consistent output) - selected_nodes.sort() - - # Write the selected node IDs/classes to the file - filename = "classes.txt" - with open(os.path.join(self.processed_dir_main, filename), "wt") as fout: - fout.writelines(str(node) + "\n" for node in selected_nodes) - - return selected_nodes - - -class GOUniProtOver250(_GOUniProtOverX): - """ - A class for extracting data from the Gene Ontology (GO) dataset with a threshold of 250 for selecting classes. - - Inherits from `_GOUniProtOverX` and sets the threshold for selecting classes to 250. - - Attributes: - THRESHOLD (int): The threshold for selecting classes (250). - """ - - THRESHOLD: int = 250 - - -class GOUniProtOver50(_GOUniProtOverX): - """ - A class for extracting data from the Gene Ontology (GO) dataset with a threshold of 50 for selecting classes. - - Inherits from `_GOUniProtOverX` and sets the threshold for selecting classes to 50. - - Attributes: - THRESHOLD (int): The threshold for selecting classes (50). - """ - - THRESHOLD: int = 50 - - -class _DeepGOMigratedData(_GOUniProtDataExtractor, ABC): - """ - Base class for use of the migrated DeepGO data with common properties, name formatting, and file paths. - - Attributes: - READER (dr.ProteinDataReader): Protein data reader class. - THRESHOLD (Optional[int]): Threshold value for GO class selection, - determined by the GO branch type in derived classes. 
- """ - - READER: dr.ProteinDataReader = dr.ProteinDataReader - THRESHOLD: Optional[int] = None - - # Mapping from GO branch conventions used in DeepGO to our conventions - GO_BRANCH_MAPPING: dict = { - "cc": "CC", - "mf": "MF", - "bp": "BP", - } - - @property - def _name(self) -> str: - """ - Generates a unique identifier for the migrated data based on the GO - branch and max sequence length, optionally including a threshold. - - Returns: - str: A formatted name string for the data. - """ - threshold_part = f"GO{self.THRESHOLD}_" if self.THRESHOLD is not None else "GO_" - - if self.go_branch != self._ALL_GO_BRANCHES: - return f"{threshold_part}{self.go_branch}_{self.max_sequence_length}" - - return f"{threshold_part}{self.max_sequence_length}" - - # ------------------------------ Phase: Prepare data ----------------------------------- - def prepare_data(self, *args: Any, **kwargs: Any) -> None: - """ - Checks for the existence of migrated DeepGO data in the specified directory. - Raises an error if the required data file is not found, prompting - migration from DeepGO to this data structure. - - Args: - *args (Any): Additional positional arguments. - **kwargs (Any): Additional keyword arguments. - - Raises: - FileNotFoundError: If the processed data file does not exist. - """ - print("Checking for processed data in", self.processed_dir_main) - - processed_name = self.processed_main_file_names_dict["data"] - if not os.path.isfile(os.path.join(self.processed_dir_main, processed_name)): - raise FileNotFoundError( - f"File {processed_name} not found.\n" - f"You must run the appropriate DeepGO migration script " - f"(chebai/preprocessing/migration/deep_go) before executing this configuration " - f"to migrate data from DeepGO to this data structure." - ) - - def select_classes(self, g: nx.DiGraph, *args, **kwargs) -> List: - # Selection of GO classes not needed for migrated data - pass - - # ------------------------------ Phase: Raw Properties ----------------------------------- - @property - @abstractmethod - def processed_main_file_names_dict(self) -> Dict[str, str]: - """ - Abstract property for defining main processed file names. - These files are stored in the same directory as the generated data files - but have distinct names to differentiate them during training. - - Returns: - dict: A dictionary with key-value pairs for main processed file names. - """ - pass - - @property - @abstractmethod - def processed_file_names_dict(self) -> Dict[str, str]: - """ - Abstract property for defining additional processed file names. - These files are stored in the same directory as the generated data files - but have distinct names to differentiate them during training. - - Returns: - dict: A dictionary with key-value pairs for processed file names. - """ - pass - - -class DeepGO1MigratedData(_DeepGOMigratedData): - """ - Migrated data class specific to DeepGO1. Sets threshold values according - to the research paper based on the GO branch. - - Note: - Refer reference number 1 at the top of this file for the corresponding research paper. - - Args: - **kwargs: Arbitrary keyword arguments passed to the superclass. - - Raises: - ValueError: If an unsupported GO branch is provided. - """ - - def __init__(self, **kwargs): - # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L11 - assert int(kwargs.get("max_sequence_length")) == 1002 - - # Set threshold based on GO branch, as per DeepGO1 paper and its data. 
- if kwargs.get("go_branch") in ["CC", "MF"]: - self.THRESHOLD = 50 - elif kwargs.get("go_branch") == "BP": - self.THRESHOLD = 250 - else: - raise ValueError( - f"DeepGO1 paper has no defined threshold for branch {self.go_branch}" - ) - - super(_DeepGOMigratedData, self).__init__(**kwargs) - - @property - def processed_main_file_names_dict(self) -> Dict[str, str]: - """ - Returns main processed file names specific to DeepGO1. - - Returns: - dict: Dictionary with the main data file name for DeepGO1. - """ - return {"data": "data_deep_go1.pkl"} - - @property - def processed_file_names_dict(self) -> Dict[str, str]: - """ - Returns processed file names specific to DeepGO1. - - Returns: - dict: Dictionary with data file name for DeepGO1. - """ - return {"data": "data_deep_go1.pt"} - - -class DeepGO2MigratedData(_DeepGOMigratedData): - """ - Migrated data class specific to DeepGO2, inheriting from DeepGO1MigratedData - with different processed file names. - - Note: - Refer reference number 3 at the top of this file for the corresponding research paper. - - Returns: - dict: Dictionary with file names specific to DeepGO2. - """ - - _LABELS_START_IDX: int = 5 # additional esm2_embeddings column in the dataframe - _ESM_EMBEDDINGS_COL_IDX: int = 4 - - def __init__(self, use_esm2_embeddings=False, **kwargs): - # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L11 - assert int(kwargs.get("max_sequence_length")) == 1000 - self.use_esm2_embeddings: bool = use_esm2_embeddings - super(_DeepGOMigratedData, self).__init__(**kwargs) - - # ------------------------------ Phase: Setup data ----------------------------------- - def _load_data_from_file(self, path: str) -> List[Dict[str, Any]]: - """ - Load and process data from a file into a list of dictionaries containing features and labels. - - This method processes data differently based on the `use_esm2_embeddings` flag: - - If `use_esm2_embeddings` is True, raw dictionaries from `_load_dict` are returned, _load_dict already returns - the numerical features (esm2 embeddings) from the data file, hence no reader is required. - - Otherwise, a reader is used to process the data (generate numerical features). - - Args: - path (str): The path to the input file. - - Returns: - List[Dict[str, Any]]: A list of dictionaries with the following keys: - - `features`: Sequence or embedding data, depending on the context. - - `labels`: A boolean array of labels. - - `ident`: The identifier for the sequence. - """ - lines = self._get_data_size(path) - print(f"Processing {lines} lines...") - - if self.use_esm2_embeddings: - data = [ - d - for d in tqdm.tqdm(self._load_dict(path), total=lines) - if d["features"] is not None - ] - else: - data = [ - self.reader.to_data(d) - for d in tqdm.tqdm(self._load_dict(path), total=lines) - if d["features"] is not None - ] - - # filter for missing features in resulting data - data = [val for val in data if val["features"] is not None] - - return data - - def _load_dict(self, input_file_path: str) -> Generator[Dict[str, Any], None, None]: - """ - Loads data from a pickled file and yields individual dictionaries for each row. 
- - The pickled file is expected to contain rows with the following structure: - - Data at row index `self._ID_IDX`: ID of go data instance - - Data at row index `self._DATA_REPRESENTATION_IDX`: Sequence representation of protein - - Data at row index `self._ESM2_EMBEDDINGS_COL_IDX`: ESM2 embeddings of the protein - - Data from row index `self._LABELS_START_IDX` onwards: Labels - - The method adapts based on the `use_esm2_embeddings` flag: - - If `use_esm2_embeddings` is True, features are loaded from the column specified by `self._ESM_EMBEDDINGS_COL_IDX`. - - Otherwise, features are loaded from the column specified by `self._DATA_REPRESENTATION_IDX`. - - Args: - input_file_path (str): The path to the pickled input file. - - Yields: - Dict[str, Any]: A dictionary containing: - - `features` (Any): Sequence or embedding data for the instance. - - `labels` (np.ndarray): A boolean array of labels starting from row index 4. - - `ident` (Any): The identifier from row index 0. - """ - with open(input_file_path, "rb") as input_file: - df = pd.read_pickle(input_file) - - if self.use_esm2_embeddings: - features_idx = self._ESM_EMBEDDINGS_COL_IDX - else: - features_idx = self._DATA_REPRESENTATION_IDX - - for row in df.values: - labels = row[self._LABELS_START_IDX :].astype(bool) - yield dict( - features=row[features_idx], - labels=labels, - ident=row[self._ID_IDX], - ) - - # ------------------------------ Phase: Raw Properties ----------------------------------- - @property - def processed_main_file_names_dict(self) -> Dict[str, str]: - """ - Returns main processed file names specific to DeepGO2. - - Returns: - dict: Dictionary with the main data file name for DeepGO2. - """ - return {"data": "data_deep_go2.pkl"} - - @property - def processed_file_names_dict(self) -> Dict[str, str]: - """ - Returns processed file names specific to DeepGO2. - - Returns: - dict: Dictionary with data file name for DeepGO2. - """ - return {"data": "data_deep_go2.pt"} - - @property - def identifier(self) -> tuple: - """Identifier for the dataset.""" - if self.use_esm2_embeddings: - return (dr.ESM2EmbeddingReader.name(),) - return (self.reader.name(),) diff --git a/chebai/preprocessing/datasets/deepGO/protein_pretraining.py b/chebai/preprocessing/datasets/deepGO/protein_pretraining.py deleted file mode 100644 index 8f7e9c4d..00000000 --- a/chebai/preprocessing/datasets/deepGO/protein_pretraining.py +++ /dev/null @@ -1,279 +0,0 @@ -__all__ = ["SwissProteinPretrain"] - -import os -from abc import ABC -from collections import OrderedDict -from typing import Any, Dict, Generator, List, Tuple - -import networkx as nx -import pandas as pd -import torch -from Bio import SwissProt -from sklearn.model_selection import train_test_split - -from chebai.preprocessing.datasets.base import _DynamicDataset -from chebai.preprocessing.datasets.deepGO.go_uniprot import ( - AMBIGUOUS_AMINO_ACIDS, - EXPERIMENTAL_EVIDENCE_CODES, - GOUniProtOver250, -) -from chebai.preprocessing.reader import ProteinDataReader - - -class _ProteinPretrainingData(_DynamicDataset, ABC): - """ - Data module for pretraining protein sequences, specifically designed for Swiss-UniProt data. It includes methods for - data preparation, loading, and dynamic splitting of protein sequences. - The data is parsed and filtered to only select proteins with no associated `valid` Gene Ontology (GO) labels. - A valid GO label is the one which has one of evidence codes defined in `EXPERIMENTAL_EVIDENCE_CODES`. 
- """ - - _ID_IDX: int = 0 - _DATA_REPRESENTATION_IDX: int = 1 # Index of `sequence` column - - def __init__(self, **kwargs): - """ - Initializes the data module with any GOUniProt extractor class object. - - Args: - **kwargs: Additional arguments for the superclass initialization. - """ - self._go_uniprot_extractor = GOUniProtOver250() - assert self._go_uniprot_extractor.go_branch == GOUniProtOver250._ALL_GO_BRANCHES - - self.max_sequence_length: int = int(kwargs.get("max_sequence_length", 1002)) - assert ( - self.max_sequence_length >= 1 - ), "Max sequence length should be greater than or equal to 1." - - super(_ProteinPretrainingData, self).__init__(**kwargs) - - if self.reader.n_gram is not None: - assert self.max_sequence_length >= self.reader.n_gram, ( - f"max_sequence_length ({self.max_sequence_length}) must be greater than " - f"or equal to n_gram ({self.reader.n_gram})." - ) - - # ------------------------------ Phase: Prepare data ----------------------------------- - def prepare_data(self, *args: Any, **kwargs: Any) -> None: - """ - Prepares the data by downloading and parsing Swiss-Prot data if not already available. Saves the processed data - for further use. - - Args: - *args: Additional positional arguments. - **kwargs: Additional keyword arguments. - """ - processed_name = self.processed_main_file_names_dict["data"] - if not os.path.isfile(os.path.join(self.processed_dir_main, processed_name)): - print("Missing processed data file (`data.pkl` file)") - os.makedirs(self.processed_dir_main, exist_ok=True) - self._download_required_data() - protein_df = self._parse_protein_data_for_pretraining() - self.save_processed(protein_df, processed_name) - - def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph: - # method not required as no Swiss-UniProt has no ontological data - pass - - def _graph_to_raw_dataset(self, graph: nx.DiGraph) -> pd.DataFrame: - # method not required as no Swiss-UniProt has no ontological data - pass - - def select_classes(self, g: nx.DiGraph, *args, **kwargs) -> List: - # method not required as no Swiss-UniProt has no ontological data - pass - - def _download_required_data(self) -> str: - """ - Downloads the required Swiss-Prot data using the GOUniProt extractor class. - - Returns: - str: Path to the downloaded data. - """ - return self._go_uniprot_extractor._download_swiss_uni_prot_data() - - def _parse_protein_data_for_pretraining(self) -> pd.DataFrame: - """ - Parses the Swiss-Prot data and returns a DataFrame containing Swiss-Prot proteins which does not have any valid - Gene Ontology(GO) label. A valid GO label is the one which has one of the following evidence codes, as specified in - `EXPERIMENTAL_EVIDENCE_CODES`. - - The DataFrame includes the following columns: - - "swiss_id": The unique identifier for each Swiss-Prot record. - - "sequence": The protein sequence. - - Note: - We ignore proteins with ambiguous amino acid specified in `AMBIGUOUS_AMINO_ACIDS` in their sequence.` - - Returns: - pd.DataFrame: A DataFrame where each row corresponds to a Swiss-Prot record with not associated valid GO. 
- """ - print("Parsing swiss uniprot raw data....") - - swiss_ids, sequences = [], [] - - swiss_data = SwissProt.parse( - open( - os.path.join( - self._go_uniprot_extractor.raw_dir, - self._go_uniprot_extractor.raw_file_names_dict["SwissUniProt"], - ), - "r", - ) - ) - - for record in swiss_data: - if record.data_class != "Reviewed": - # To consider only manually-annotated swiss data - continue - - if not record.sequence: - # Consider protein with only sequence representation - continue - - if len(record.sequence) > self.max_sequence_length: - # Consider protein with only sequence length not greater than max seq. length - continue - - if any(aa in AMBIGUOUS_AMINO_ACIDS for aa in record.sequence): - # Skip proteins with ambiguous amino acid codes - continue - - has_valid_associated_go_label = False - for cross_ref in record.cross_references: - if cross_ref[0] == self._go_uniprot_extractor._GO_DATA_INIT: - - if len(cross_ref) <= 3: - # No evidence code - continue - - # https://github.com/bio-ontology-research-group/deepgo/blob/master/get_functions.py#L63-L66 - evidence_code = cross_ref[3].split(":")[0] - if evidence_code in EXPERIMENTAL_EVIDENCE_CODES: - has_valid_associated_go_label = True - break - - if has_valid_associated_go_label: - # Skip proteins which has at least one associated go label - continue - - swiss_ids.append(record.entry_name) - sequences.append(record.sequence) - - data_dict = OrderedDict( - swiss_id=swiss_ids, # swiss_id column at index 0 - sequence=sequences, # Sequence column at index 1 - ) - - return pd.DataFrame(data_dict) - - # ------------------------------ Phase: Setup data ----------------------------------- - def _load_dict(self, input_file_path: str) -> Generator[Dict[str, Any], None, None]: - """ - Loads data from a pickled file and yields individual dictionaries for each row. - - The pickled file is expected to contain rows with the following structure: - - Data at row index `self._ID_IDX`: ID of go data instance - - Data at row index `self._DATA_REPRESENTATION_IDX`: Sequence representation of protein - - This method is used by `_load_data_from_file` to generate dictionaries that are then - processed and converted into a list of dictionaries containing the features and labels. - - Args: - input_file_path (str): The path to the pickled input file. - - Yields: - Dict[str, Any]: A dictionary containing: - - `features` (str): The sequence data from the file. - - `ident` (Any): The identifier from row index 0. - - `labels`: Set to None - """ - with open(input_file_path, "rb") as input_file: - df = pd.read_pickle(input_file) - for row in df.values: - yield dict( - features=row[self._DATA_REPRESENTATION_IDX], - ident=row[self._ID_IDX], - labels=None, - ) - - # ------------------------------ Phase: Dynamic Splits ----------------------------------- - def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: - """ - Loads encoded data and generates training, validation, and test splits. - - This method attempts to load encoded data from a file named `data.pt`. It then splits this data into - training, validation, and test sets. - - Raises: - FileNotFoundError: If the `data.pt` file does not exist. Ensure that `prepare_data` and/or - `setup` methods are called to generate the necessary dataset files. 
- - Returns: - Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing three DataFrames: - - Training set - - Validation set - - Test set - """ - try: - filename = self.processed_file_names_dict["data"] - data_go = torch.load( - os.path.join(self.processed_dir, filename), weights_only=False - ) - except FileNotFoundError: - raise FileNotFoundError( - f"File data.pt doesn't exists. " - f"Please call 'prepare_data' and/or 'setup' methods to generate the dataset files" - ) - - df_go_data = pd.DataFrame(data_go) - train_df_go, df_test = train_test_split( - df_go_data, - train_size=self.train_split, - random_state=self.dynamic_data_split_seed, - ) - - # Get all splits - df_train, df_val = train_test_split( - train_df_go, - train_size=self.train_split, - random_state=self.dynamic_data_split_seed, - ) - - return df_train, df_val, df_test - - # ------------------------------ Phase: Raw Properties ----------------------------------- - @property - def base_dir(self) -> str: - """ - str: The base directory for pretraining data storage. - """ - return os.path.join(self._go_uniprot_extractor.base_dir, "Pretraining") - - @property - def raw_dir(self) -> str: - """Name of the directory where the raw data is stored.""" - return self._go_uniprot_extractor.raw_dir - - -class SwissProteinPretrain(_ProteinPretrainingData): - """ - Data module for Swiss-Prot protein pretraining, inheriting from `_ProteinPretrainingData`. - This class is specifically designed to handle data processing and loading for Swiss-Prot-based protein datasets. - - Attributes: - READER (Type): The data reader class used to load and process protein pretraining data. - """ - - READER = ProteinDataReader - - @property - def _name(self) -> str: - """ - The name identifier for this data module. - - Returns: - str: A string identifier, "SwissProteinPretrain", representing the name of this data module. - """ - return f"Swiss_{self.max_sequence_length}" diff --git a/chebai/preprocessing/datasets/scope/__init__.py b/chebai/preprocessing/datasets/scope/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/chebai/preprocessing/datasets/scope/scope.py b/chebai/preprocessing/datasets/scope/scope.py deleted file mode 100644 index e9127b25..00000000 --- a/chebai/preprocessing/datasets/scope/scope.py +++ /dev/null @@ -1,972 +0,0 @@ -# References for this file : - -# Reference 1: -# John-Marc Chandonia, Naomi K Fox, Steven E Brenner, SCOPe: classification of large macromolecular structures -# in the structural classification of proteins—extended database, Nucleic Acids Research, Volume 47, -# Issue D1, 08 January 2019, Pages D475–D481, https://doi.org/10.1093/nar/gky1134 -# https://scop.berkeley.edu/about/ver=2.08 - -# Reference 2: -# Murzin AG, Brenner SE, Hubbard TJP, Chothia C. 1995. SCOP: a structural classification of proteins database for -# the investigation of sequences and structures. Journal of Molecular Biology 247:536-540 - -import gzip -import os -import re -import shutil -from abc import ABC, abstractmethod -from tempfile import NamedTemporaryFile -from typing import Any, Dict, Generator, List, Optional, Tuple - -import networkx as nx -import pandas as pd -import requests -import torch -from Bio import SeqIO - -from chebai.preprocessing.datasets.base import _DynamicDataset -from chebai.preprocessing.reader import ProteinDataReader - - -class _SCOPeDataExtractor(_DynamicDataset, ABC): - """ - A class for extracting and processing data from the SCOPe (Structural Classification of Proteins - extended) dataset. 
- - This class is designed to handle the parsing, preprocessing, and hierarchical structure extraction from various - SCOPe dataset files, such as classification (CLA), hierarchy (HIE), and description (DES) files. - Additionally, it supports downloading related data like PDB sequence files. - - Args: - scope_version (str): The SCOPe version to use. - scope_version_train (Optional[str]): The training SCOPe version, if different. - dynamic_data_split_seed (int, optional): The seed for random data splitting. Defaults to 42. - splits_file_path (str, optional): Path to the splits CSV file. Defaults to None. - **kwargs: Additional keyword arguments passed to DynamicDataset and XYBaseDataModule. - """ - - # -- Index for columns of processed `data.pkl` (derived from `_graph_to_raw_dataset`) - # "id" at row index 0 - # "sids" at row index 1 - # "sequence" at row index 2 - # labels starting from row index 3 - _ID_IDX: int = 0 - _DATA_REPRESENTATION_IDX: int = 2 # here `sequence` column - _LABELS_START_IDX: int = 3 - - _SCOPE_GENERAL_URL = "https://scop.berkeley.edu/downloads/parse/dir.{data_type}.scope.{version_number}-stable.txt" - _PDB_SEQUENCE_DATA_URL = ( - "https://files.rcsb.org/pub/pdb/derived_data/pdb_seqres.txt.gz" - ) - - SCOPE_HIERARCHY: Dict[str, str] = { - "cl": "class", - "cf": "fold", - "sf": "superfamily", - "fa": "family", - "dm": "protein", - "sp": "species", - "px": "domain", - } - - def __init__( - self, - scope_version: str, - scope_version_train: Optional[str] = None, - max_sequence_len: int = 1000, - **kwargs, - ): - self.scope_version: str = scope_version - self.scope_version_train: str = scope_version_train - self.max_sequence_len: int = max_sequence_len - - super(_SCOPeDataExtractor, self).__init__(**kwargs) - - if self.scope_version_train is not None: - # Instantiate another same class with "scope_version" as "scope_version_train", if train_version is given - # This is to get the data from respective directory related to "scope_version_train" - _init_kwargs = kwargs - _init_kwargs["scope_version"] = self.scope_version_train - self._scope_version_train_obj = self.__class__( - **_init_kwargs, - ) - - @staticmethod - def _get_scope_url(data_type: str, version_number: str) -> str: - """ - Generates the URL for downloading SCOPe files. - - Args: - data_type (str): The type of data (e.g., 'cla', 'hie', 'des'). - version_number (str): The version of the SCOPe file. - - Returns: - str: The formatted SCOPe file URL. - """ - return _SCOPeDataExtractor._SCOPE_GENERAL_URL.format( - data_type=data_type, version_number=version_number - ) - - # ------------------------------ Phase: Prepare data ----------------------------------- - def _download_required_data(self) -> str: - """ - Downloads the required raw data for SCOPe and PDB sequence datasets. - - Returns: - str: Path to the downloaded data. - """ - self._download_pdb_sequence_data() - return self._download_scope_raw_data() - - def _download_pdb_sequence_data(self) -> None: - """ - Downloads and unzips the PDB sequence dataset from the RCSB PDB repository. - - The file is downloaded as a temporary gzip file, which is then extracted to the - specified directory. 
- """ - pdb_seq_file_path = os.path.join( - self.scope_root_dir, self.raw_file_names_dict["PDB"] - ) - os.makedirs(os.path.dirname(pdb_seq_file_path), exist_ok=True) - - if not os.path.isfile(pdb_seq_file_path): - print(f"Missing PDB raw data, Downloading PDB sequence data....") - - # Create a temporary file - with NamedTemporaryFile(delete=False) as tf: - temp_filename = tf.name - print(f"Downloading to temporary file {temp_filename}") - - # Download the file - response = requests.get(self._PDB_SEQUENCE_DATA_URL, stream=True) - with open(temp_filename, "wb") as temp_file: - shutil.copyfileobj(response.raw, temp_file) - - print(f"Downloaded to {temp_filename}") - - # Unpack the gzipped file - try: - print(f"Unzipping the file....") - with gzip.open(temp_filename, "rb") as f_in: - output_file_path = pdb_seq_file_path - with open(output_file_path, "wb") as f_out: - shutil.copyfileobj(f_in, f_out) - print(f"Unpacked and saved to {output_file_path}") - - except Exception as e: - print(f"Failed to unpack the file: {e}") - finally: - # Clean up the temporary file - os.remove(temp_filename) - print(f"Removed temporary file {temp_filename}") - - def _download_scope_raw_data(self) -> str: - """ - Downloads the raw SCOPe dataset files (CLA, HIE, DES, and COM). - - Each file is downloaded from the SCOPe repository and saved to the specified directory. - Files are only downloaded if they do not already exist. - - Returns: - str: A dummy path to indicate completion (can be extended for custom behavior). - """ - os.makedirs(self.raw_dir, exist_ok=True) - for data_type in ["CLA", "HIE", "DES"]: - data_file_name = self.raw_file_names_dict[data_type] - scope_path = os.path.join(self.raw_dir, data_file_name) - if not os.path.isfile(scope_path): - print(f"Missing Scope: {data_file_name} raw data, Downloading...") - r = requests.get( - self._get_scope_url(data_type.lower(), self.scope_version), - allow_redirects=False, - verify=False, # Disable SSL verification - ) - r.raise_for_status() # Check if the request was successful - open(scope_path, "wb").write(r.content) - return "dummy/path" - - def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph: - """ - Extracts the class hierarchy from SCOPe data and computes its transitive closure. - - Args: - data_path (str): Path to the processed SCOPe dataset. - - Returns: - nx.DiGraph: A directed acyclic graph representing the SCOPe class hierarchy. 
- """ - print("Extracting class hierarchy...") - df_scope = self._get_scope_data() - pdb_chain_df = self._parse_pdb_sequence_file() - pdb_id_set = set(pdb_chain_df["pdb_id"]) # Search time complexity - O(1) - - # Initialize sets and dictionaries for storing edges and attributes - parent_node_edges, node_child_edges = set(), set() - node_attrs = {} - px_level_nodes = set() - sequence_nodes = dict() - px_to_seq_edges = set() - required_graph_nodes = set() - - # Create a lookup dictionary for PDB chain sequences - lookup_dict = ( - pdb_chain_df.groupby("pdb_id")[["chain_id", "sequence"]] - .apply(lambda x: dict(zip(x["chain_id"], x["sequence"]))) - .to_dict() - ) - - def add_sequence_nodes_edges(chain_sequence, px_sun_id): - """Adds sequence nodes and edges connecting px-level nodes to sequence nodes.""" - if chain_sequence not in sequence_nodes: - sequence_nodes[chain_sequence] = f"seq_{len(sequence_nodes)}" - px_to_seq_edges.add((px_sun_id, sequence_nodes[chain_sequence])) - - # Step 1: Build the graph structure and store node attributes - for row in df_scope.itertuples(index=False): - if row.level == "px": - - pdb_id, chain_id = row.sid[1:5], row.sid[5] - - if pdb_id not in pdb_id_set or chain_id == "_": - # Don't add domain level nodes that don't have pdb_id in pdb_sequences.txt file - # Also chain_id with "_" which corresponds to no chain - continue - px_level_nodes.add(row.sunid) - - # Add edges between px-level nodes and sequence nodes - if chain_id != ".": - if chain_id not in lookup_dict[pdb_id]: - continue - add_sequence_nodes_edges(lookup_dict[pdb_id][chain_id], row.sunid) - else: - # If chain_id is '.', connect all chains of this PDB ID - for chain, chain_sequence in lookup_dict[pdb_id].items(): - add_sequence_nodes_edges(chain_sequence, row.sunid) - else: - required_graph_nodes.add(row.sunid) - - node_attrs[row.sunid] = {"sid": row.sid, "level": row.level} - - if row.parent_sunid != -1: - parent_node_edges.add((row.parent_sunid, row.sunid)) - - for child_id in row.children_sunids: - node_child_edges.add((row.sunid, child_id)) - - del df_scope, pdb_chain_df, pdb_id_set - - g = nx.DiGraph() - g.add_nodes_from(node_attrs.items()) - # Note - `add_edges` internally create a node, if a node doesn't exist already - g.add_edges_from({(p, c) for p, c in parent_node_edges if p in node_attrs}) - g.add_edges_from({(p, c) for p, c in node_child_edges if c in node_attrs}) - - seq_nodes = set(sequence_nodes.values()) - g.add_nodes_from([(seq_id, {"level": "sequence"}) for seq_id in seq_nodes]) - g.add_edges_from( - { - (px_node, seq_node) - for px_node, seq_node in px_to_seq_edges - if px_node in node_attrs and seq_node in seq_nodes - } - ) - - # Step 2: Count sequence successors for required graph nodes only - for node in required_graph_nodes: - num_seq_successors = sum( - g.nodes[child]["level"] == "sequence" - for child in nx.descendants(g, node) - ) - g.nodes[node]["num_seq_successors"] = num_seq_successors - - # Step 3: Remove nodes which are not required before computing transitive closure for better efficiency - g.remove_nodes_from(px_level_nodes | seq_nodes) - - print("Computing Transitive Closure.........") - # Transitive closure is not needed in `select_classes` method but is required in _SCOPeOverXPartial - return nx.transitive_closure_dag(g) - - def _get_scope_data(self) -> pd.DataFrame: - """ - Merges and preprocesses the SCOPe classification, hierarchy, and description files into a unified DataFrame. 
- - Returns: - pd.DataFrame: A DataFrame containing combined SCOPe data with classification and hierarchy details. - """ - df_cla = self._get_classification_data() - df_hie = self._get_hierarchy_data() - df_des = self._get_node_description_data() - df_hie_with_cla = pd.merge(df_hie, df_cla, how="left", on="sunid") - df_all = pd.merge( - df_hie_with_cla, - df_des.drop(columns=["sid"], axis=1), - how="left", - on="sunid", - ) - return df_all - - def _get_classification_data(self) -> pd.DataFrame: - """ - Parses and processes the SCOPe CLA (classification) file. - - Returns: - pd.DataFrame: A DataFrame containing classification details, including hierarchy levels. - """ - df_cla = pd.read_csv( - os.path.join(self.raw_dir, self.raw_file_names_dict["CLA"]), - sep="\t", - header=None, - comment="#", - ) - df_cla.columns = [ - "sid", - "PDB_ID", - "description", - "sccs", - "sunid", - "hie_levels", - ] - - # Convert to dict - {cl:46456, cf:46457, sf:46458, fa:46459, dm:46460, sp:116748, px:113449} - df_cla["hie_levels"] = df_cla["hie_levels"].apply( - lambda x: {k: int(v) for k, v in (item.split("=") for item in x.split(","))} - ) - - # Split ancestor_nodes into separate columns and assign values - for key in self.SCOPE_HIERARCHY.keys(): - df_cla[self.SCOPE_HIERARCHY[key]] = df_cla["hie_levels"].apply( - lambda x: x[key] - ) - - df_cla["sunid"] = df_cla["sunid"].astype("int64") - - return df_cla - - def _get_hierarchy_data(self) -> pd.DataFrame: - """ - Parses and processes the SCOPe HIE (hierarchy) file. - - Returns: - pd.DataFrame: A DataFrame containing hierarchy details, including parent-child relationships. - """ - df_hie = pd.read_csv( - os.path.join(self.raw_dir, self.raw_file_names_dict["HIE"]), - sep="\t", - header=None, - comment="#", - low_memory=False, - ) - df_hie.columns = ["sunid", "parent_sunid", "children_sunids"] - - # if not parent id, then insert -1 - df_hie["parent_sunid"] = df_hie["parent_sunid"].replace("-", -1).astype(int) - # convert children ids to list of ids - df_hie["children_sunids"] = df_hie["children_sunids"].apply( - lambda x: list(map(int, x.split(","))) if x != "-" else [] - ) - - # Ensure the 'sunid' column in both DataFrames has the same type - df_hie["sunid"] = df_hie["sunid"].astype("int64") - return df_hie - - def _get_node_description_data(self) -> pd.DataFrame: - """ - Parses and processes the SCOPe DES (description) file. - - Returns: - pd.DataFrame: A DataFrame containing node-level descriptions from the SCOPe dataset. - """ - df_des = pd.read_csv( - os.path.join(self.raw_dir, self.raw_file_names_dict["DES"]), - sep="\t", - header=None, - comment="#", - low_memory=False, - ) - df_des.columns = ["sunid", "level", "scss", "sid", "description"] - df_des.loc[len(df_des)] = {"sunid": 0, "level": "root"} - - # Ensure the 'sunid' column in both DataFrames has the same type - df_des["sunid"] = df_des["sunid"].astype("int64") - return df_des - - def _graph_to_raw_dataset(self, graph: nx.DiGraph) -> pd.DataFrame: - """ - Processes a directed acyclic graph (DAG) to generate a raw dataset in DataFrame format. This dataset includes - chain-level sequences and their corresponding labels based on the hierarchical structure of the associated domains. - - The process: - - Extracts SCOPe domain identifiers (sids) from the graph. - - Retrieves class labels for each domain based on all applicable taxonomy levels. - - Fetches the chain-level sequences from the Protein Data Bank (PDB) for each domain. 
-          - For each sequence, identifies all domains associated with the same chain and assigns their corresponding labels.
-
-        Notes:
-            - SCOPe hierarchy levels are used as labels, with each level represented by a column. The value in each column
-                indicates whether a PDB chain is associated with that particular hierarchy level.
-            - PDB chains are treated as samples. The method considers only domains that are mapped to the selected hierarchy levels.
-
-        Data Format: pd.DataFrame
-            - Column 0 : id (Unique identifier for each sequence entry)
-            - Column 1 : sids (List of domain identifiers associated with the sequence)
-            - Column 2 : sequence (Amino acid sequence of the chain)
-            - Column 3 to Column "n": Each column corresponds to a SCOPe class hierarchy level with a value
-                of True/False indicating whether the chain is associated with the corresponding level.
-
-        Args:
-            graph (nx.DiGraph): The class hierarchy graph.
-
-        Returns:
-            pd.DataFrame: The raw dataset created from the graph.
-
-        Raises:
-            RuntimeError: If no sunids are selected.
-        """
-        print("Processing graph...")
-
-        selected_sun_ids_per_lvl = self.select_classes(graph)
-
-        if not selected_sun_ids_per_lvl:
-            raise RuntimeError("No sunid selected.")
-
-        df_cla = self._get_classification_data()
-        hierarchy_levels = list(self.SCOPE_HIERARCHY.values())
-        hierarchy_levels.remove("domain")
-
-        df_cla = df_cla[["sid", "sunid"] + hierarchy_levels]
-
-        # Initialize selected target columns
-        df_encoded = df_cla[["sid", "sunid"]].copy()
-
-        # Collect all new columns in a dictionary first (avoids fragmentation)
-        encoded_df_columns = {}
-
-        lvl_to_target_cols_mapping = {}
-        # Iterate over only the selected sun_ids (nodes) to one-hot encode them
-        for level, selected_sun_ids in selected_sun_ids_per_lvl.items():
-            level_column = self.SCOPE_HIERARCHY[level]
-            if level_column in df_cla.columns:
-                # Create binary encoding for only relevant sun_ids
-                for sun_id in selected_sun_ids:
-                    col_name = f"{level_column}_{sun_id}"
-                    encoded_df_columns[col_name] = (
-                        df_cla[level_column] == sun_id
-                    ).astype(bool)
-
-                    lvl_to_target_cols_mapping.setdefault(level_column, []).append(
-                        col_name
-                    )
-
-        # Convert the dictionary into a DataFrame and concatenate at once (prevents fragmentation)
-        df_encoded = pd.concat([df_encoded, pd.DataFrame(encoded_df_columns)], axis=1)
-
-        encoded_target_columns = []
-        for level in hierarchy_levels:
-            if level in lvl_to_target_cols_mapping:
-                encoded_target_columns.extend(lvl_to_target_cols_mapping[level])
-
-        print(
-            f"{len(encoded_target_columns)} labels have been selected for the specified threshold."
-        )
-        print("Constructing data.pkl file...")
-
-        df_encoded = df_encoded[["sid", "sunid"] + encoded_target_columns]
-
-        # Keep only domains that map to at least one selected sunid at any level
-        df_encoded = df_encoded[df_encoded.iloc[:, 2:].any(axis=1)]
-
-        df_encoded["pdb_id"] = df_encoded["sid"].str[1:5]
-        df_encoded["chain_id"] = df_encoded["sid"].str[5]
-
-        # "_" (underscore) means it has no chain
-        df_encoded = df_encoded[df_encoded["chain_id"] != "_"]
-
-        pdb_chain_df = self._parse_pdb_sequence_file()
-
-        # Handle chain_id == "." (the multiple-chain case):
-        # Split df_encoded into two: One for specific chains, one for "multiple chains" (".")
-        df_specific_chains = df_encoded[df_encoded["chain_id"] != "."]
-        df_multiple_chains = df_encoded[df_encoded["chain_id"] == "."].drop(
-            columns=["chain_id"]
-        )
-
-        # Merge specific chains normally
-        merged_specific = df_specific_chains.merge(
-            pdb_chain_df, on=["pdb_id", "chain_id"], how="left"
-        )
-
-        # Merge all chains case -> Join by pdb_id (not chain_id)
-        merged_all_chains = df_multiple_chains.merge(
-            pdb_chain_df, on="pdb_id", how="left"
-        )
-
-        # Combine both cases
-        sequence_hierarchy_df = pd.concat(
-            [merged_specific, merged_all_chains], ignore_index=True
-        ).dropna(subset=["sequence"])
-
-        # Vectorized aggregation instead of row-wise updates
-        sequence_hierarchy_df = (
-            sequence_hierarchy_df.groupby("sequence", as_index=False)
-            .agg(
-                {
-                    "sid": list,  # Collect all SIDs per sequence
-                    **{
-                        col: "max" for col in encoded_target_columns
-                    },  # Max works as a bitwise OR for boolean labels
-                }
-            )
-            .rename(columns={"sid": "sids"})  # Rename for clarity
-        )
-
-        sequence_hierarchy_df = sequence_hierarchy_df.assign(
-            id=range(1, len(sequence_hierarchy_df) + 1)
-        )[["id", "sids", "sequence"] + encoded_target_columns]
-
-        # Ensure at least one label is true for each protein sequence
-        sequence_hierarchy_df = sequence_hierarchy_df[
-            sequence_hierarchy_df.iloc[:, self._LABELS_START_IDX :].any(axis=1)
-        ]
-
-        with open(os.path.join(self.processed_dir_main, "classes.txt"), "wt") as fout:
-            fout.writelines(str(sun_id) + "\n" for sun_id in encoded_target_columns)
-
-        return sequence_hierarchy_df
-
-    def _parse_pdb_sequence_file(self) -> pd.DataFrame:
-        """
-        Parses the PDB sequence file and returns a DataFrame containing PDB IDs, chain IDs, and sequences.
-
-        Returns:
-            pd.DataFrame: A DataFrame with columns ["pdb_id", "chain_id", "sequence"].
-        """
-        records = []
-        valid_amino_acids = "".join(ProteinDataReader.AA_LETTER)
-
-        for record in SeqIO.parse(
-            os.path.join(self.scope_root_dir, self.raw_file_names_dict["PDB"]), "fasta"
-        ):
-            if not record.seq or len(record.seq) > self.max_sequence_len:
-                continue
-
-            pdb_id, chain = record.id.split("_")
-            sequence = re.sub(f"[^{valid_amino_acids}]", "X", str(record.seq))
-
-            # Store as a dictionary entry (list of dicts -> DataFrame later)
-            records.append(
-                {
-                    "pdb_id": pdb_id.lower(),
-                    "chain_id": chain.lower(),
-                    "sequence": sequence,
-                }
-            )
-
-        # Convert list of dictionaries to a DataFrame
-        pdb_chain_df = pd.DataFrame.from_records(records)
-
-        return pdb_chain_df
-
-    @abstractmethod
-    def select_classes(self, g: nx.DiGraph, *args, **kwargs) -> Dict[str, List[int]]:
-        # Override the return type of the method from the superclass
-        pass
-
-    # ------------------------------ Phase: Setup data -----------------------------------
-    def setup_processed(self) -> None:
-        """
-        Transform and prepare processed data for the SCOPe dataset.
-
-        The main function of this method is to transform `data.pkl` into a model input data format (`data.pt`),
-        ensuring that the data is in a format compatible with the model.
-        The transformed data must contain the following keys: `ident`, `features`, `labels`, and `group`.
-        This method uses a subclass of Data Reader to perform the transformation.
-
-        It will also transform the data related to `scope_version_train`, if specified.
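-
-        For illustration, a single transformed record is expected to look roughly
-        like this (hypothetical values)::
-
-            {"ident": 1, "features": [12, 5, 9], "labels": [True, False], "group": None}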
-        """
-        super().setup_processed()
-
-        # Transform the data related to "scope_version_train" to encoded data, if it doesn't exist
-        if self.scope_version_train is not None and not os.path.isfile(
-            os.path.join(
-                self._scope_version_train_obj.processed_dir,
-                self._scope_version_train_obj.processed_file_names_dict["data"],
-            )
-        ):
-            print(
-                f"Missing encoded data related to train version: {self.scope_version_train}"
-            )
-            print("Calling the setup method for it")
-            self._scope_version_train_obj.setup()
-
-    def _load_dict(self, input_file_path: str) -> Generator[Dict[str, Any], None, None]:
-        """
-        Loads data from a pickled file and yields individual dictionaries for each row.
-
-        The pickled file is expected to contain rows with the following structure:
-        - Data at row index `self._ID_IDX`: ID of the SCOPe data instance
-        - Data at row index `self._DATA_REPRESENTATION_IDX`: Sequence representation of the protein
-        - Data from row index `self._LABELS_START_IDX` onwards: Labels
-
-        This method is used by `_load_data_from_file` to generate dictionaries that are then
-        processed and converted into a list of dictionaries containing the features and labels.
-
-        Args:
-            input_file_path (str): The path to the pickled input file.
-
-        Yields:
-            Dict[str, Any]: A dictionary containing:
-                - `features` (str): The sequence data from the file.
-                - `labels` (np.ndarray): A boolean array of labels starting from row index `self._LABELS_START_IDX`.
-                - `ident` (Any): The identifier from row index `self._ID_IDX`.
-        """
-        with open(input_file_path, "rb") as input_file:
-            df = pd.read_pickle(input_file)
-            for row in df.values:
-                labels = row[self._LABELS_START_IDX :].astype(bool)
-                # chebai.preprocessing.reader.DataReader only needs features, labels, ident, group
-                # "group" defaults to None, as there is no such entity for this data
-                yield dict(
-                    features=row[self._DATA_REPRESENTATION_IDX],
-                    labels=labels,
-                    ident=row[self._ID_IDX],
-                )
-
-    # ------------------------------ Phase: Dynamic Splits -----------------------------------
-    def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-        """
-        Loads encoded/transformed data and generates training, validation, and test splits.
-
-        This method first loads encoded data from a file named `data.pt`, which is derived from either
-        `scope_version` or `scope_version_train`. It then splits the data into training, validation, and test sets.
-
-        If `scope_version_train` is provided:
-        - Loads additional encoded data from `scope_version_train`.
-        - Splits this data into training and validation sets, while using the test set from `scope_version`.
-        - Prunes the test set from `scope_version` to include only labels that exist in `scope_version_train`.
-
-        If `scope_version_train` is not provided:
-        - Splits the data from `scope_version` into training, validation, and test sets without modification.
-
-        Raises:
-            FileNotFoundError: If the required `data.pt` file(s) do not exist. Ensure that `prepare_data`
-            and/or `setup` methods have been called to generate the dataset files.
-
-        Returns:
-            Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing three DataFrames:
-                - Training set
-                - Validation set
-                - Test set
-        """
-        try:
-            filename = self.processed_file_names_dict["data"]
-            data_scope_version = torch.load(
-                os.path.join(self.processed_dir, filename), weights_only=False
-            )
-        except FileNotFoundError:
-            raise FileNotFoundError(
-                "File data.pt doesn't exist. "
-                "Please call 'prepare_data' and/or 'setup' methods to generate the dataset files"
-            )
-
-        df_scope_version = pd.DataFrame(data_scope_version)
-        train_df_scope_ver, df_test_scope_ver = self.get_test_split(
-            df_scope_version, seed=self.dynamic_data_split_seed
-        )
-
-        if self.scope_version_train is not None:
-            # Load encoded data derived from "scope_version_train"
-            try:
-                filename_train = (
-                    self._scope_version_train_obj.processed_file_names_dict["data"]
-                )
-                data_scope_train_version = torch.load(
-                    os.path.join(
-                        self._scope_version_train_obj.processed_dir, filename_train
-                    ),
-                    weights_only=False,
-                )
-            except FileNotFoundError:
-                raise FileNotFoundError(
-                    f"File data.pt doesn't exist for scope_version_train {self.scope_version_train}. "
-                    f"Please call 'prepare_data' and/or 'setup' methods to generate the dataset files"
-                )
-
-            df_scope_train_version = pd.DataFrame(data_scope_train_version)
-            # Get the train/val split of the data based on "scope_version_train", but
-            # use the test set from "scope_version"
-            df_train, df_val = self.get_train_val_splits_given_test(
-                df_scope_train_version,
-                df_test_scope_ver,
-                seed=self.dynamic_data_split_seed,
-            )
-            # Modify the test set from "scope_version" to include only the labels that
-            # exist in "scope_version_train"; all other entries remain the same.
-            df_test = self._setup_pruned_test_set(df_test_scope_ver)
-        else:
-            # Get all splits based on "scope_version"
-            df_train, df_val = self.get_train_val_splits_given_test(
-                train_df_scope_ver,
-                df_test_scope_ver,
-                seed=self.dynamic_data_split_seed,
-            )
-            df_test = df_test_scope_ver
-
-        return df_train, df_val, df_test
-
-    def _setup_pruned_test_set(
-        self, df_test_scope_version: pd.DataFrame
-    ) -> pd.DataFrame:
-        """
-        Create a test set with the same leaf nodes, but use only classes that appear in the training set.
-
-        Args:
-            df_test_scope_version (pd.DataFrame): The test dataset.
-
-        Returns:
-            pd.DataFrame: The pruned test dataset.
-        """
-        # TODO: find a more efficient way to do this
-        filename_old = "classes.txt"
-        # filename_new = f"classes_v{self.scope_version_train}.txt"
-        # dataset = torch.load(os.path.join(self.processed_dir, "test.pt"))
-
-        # Load original classes (from the current SCOPe version - scope_version)
-        with open(os.path.join(self.processed_dir_main, filename_old), "r") as file:
-            orig_classes = file.readlines()
-
-        # Load new classes (from the training SCOPe version - scope_version_train)
-        with open(
-            os.path.join(
-                self._scope_version_train_obj.processed_dir_main, filename_old
-            ),
-            "r",
-        ) as file:
-            new_classes = file.readlines()
-
-        # Create a mapping which gives the index of a class from scope_version, if the corresponding
-        # class exists in scope_version_train. Size = number of classes in scope_version
-        mapping = [
-            None if or_class not in new_classes else new_classes.index(or_class)
-            for or_class in orig_classes
-        ]
-
-        # Iterate over each data instance in the test set which is derived from scope_version
-        for idx, row in df_test_scope_version.iterrows():
-            # Size = number of classes in scope_version_train
-            new_labels = [False for _ in new_classes]
-            for ind, label in enumerate(row["labels"]):
-                # If the scope_version class exists in scope_version_train and has a True label,
-                # set the corresponding label in new_labels to True
-                if mapping[ind] is not None and label:
-                    new_labels[mapping[ind]] = label
-            # Write the new labels back via `.at`, since mutating the row Series yielded
-            # by `iterrows()` would not reliably update the underlying DataFrame
-            df_test_scope_version.at[idx, "labels"] = new_labels
-
-        return df_test_scope_version
-
-    # ------------------------------ Phase: Raw Properties -----------------------------------
-    @property
-    def scope_root_dir(self) -> str:
-        """
-        Returns the root directory of SCOPe data.
-
-        Returns:
-            str: The path to the base directory, which is "data/SCOPe".
-        """
-        return os.path.join("data", "SCOPe")
-
-    @property
-    def base_dir(self) -> str:
-        """
-        Returns the base directory path for storing SCOPe data.
-
-        Returns:
-            str: The version-specific path, e.g. "data/SCOPe/version_2.08".
-        """
-        return os.path.join(self.scope_root_dir, f"version_{self.scope_version}")
-
-    @property
-    def raw_file_names_dict(self) -> dict:
-        """
-        Returns a dictionary of raw file names used in data processing.
-
-        Returns:
-            dict: A dictionary mapping dataset names to their respective file names.
-        """
-        return {
-            "CLA": "cla.txt",
-            "DES": "des.txt",
-            "HIE": "hie.txt",
-            "PDB": "pdb_sequences.txt",
-        }
-
-
-class _SCOPeOverX(_SCOPeDataExtractor, ABC):
-    """
-    A class for extracting data from the SCOPe dataset with a threshold for selecting classes/labels based on
-    the number of subclasses.
-
-    This class is designed to filter SCOPe classes/labels based on a specified threshold, selecting only those classes
-    which have a certain number of subclasses in the hierarchy.
-
-    Attributes:
-        READER (dr.ProteinDataReader): The reader used for reading the dataset.
-        THRESHOLD (int): The threshold for selecting classes/labels based on the number of subclasses.
-
-    """
-
-    READER = ProteinDataReader
-    THRESHOLD: int = None
-
-    @property
-    def _name(self) -> str:
-        """
-        Returns the name of the dataset.
-
-        Returns:
-            str: The dataset name, formatted with the current threshold.
-        """
-        return f"SCOPe{self.THRESHOLD}"
-
-    def select_classes(self, g: nx.DiGraph, *args, **kwargs) -> Dict[str, List[int]]:
-        """
-        Selects classes from the SCOPe dataset based on the number of successors meeting a specified threshold.
- - This method iterates over the nodes in the graph, counting the number of successors for each node. - Nodes with a number of successors greater than or equal to the defined threshold are selected. - - Note: - The input graph must be transitive closure of a directed acyclic graph. - - Args: - g (nx.Graph): The graph representing the dataset. - *args: Additional positional arguments (not used). - **kwargs: Additional keyword arguments (not used). - - Returns: - Dict: A dict containing selected nodes at each hierarchy level. - - Notes: - - The `THRESHOLD` attribute should be defined in the subclass of this class. - """ - selected_sunids_for_level = {} - for node, attr_dict in g.nodes(data=True): - if attr_dict["level"] in {"root", "px", "sequence"}: - # Skip nodes with level "root", "px", or "sequence" - continue - - # Check if the number of "sequence"-level successors meets or exceeds the threshold - if g.nodes[node]["num_seq_successors"] >= self.THRESHOLD: - selected_sunids_for_level.setdefault(attr_dict["level"], []).append( - node - ) - return selected_sunids_for_level - - -class _SCOPeOverXPartial(_SCOPeOverX, ABC): - """ - Dataset that doesn't use the full SCOPe dataset, but extracts a part of SCOPe (subclasses of a given top class) - - Attributes: - top_class_sunid (int): The Sun-ID of the top class from which to extract subclasses. - """ - - def __init__(self, top_class_sunid: int, **kwargs): - """ - Initializes the _SCOPeOverXPartial dataset. - - Args: - top_class_sunid (int): The Sun-ID of the top class from which to extract subclasses. - **kwargs: Additional keyword arguments passed to the superclass initializer. - """ - if "top_class_sunid" not in kwargs: - kwargs["top_class_sunid"] = top_class_sunid - - self.top_class_sunid: int = top_class_sunid - super().__init__(**kwargs) - - @property - def processed_dir_main(self) -> str: - """ - Returns the main processed data directory specific to the top class. - - Returns: - str: The processed data directory path. - """ - return os.path.join( - self.base_dir, - self._name, - f"partial_{self.top_class_sunid}", - "processed", - ) - - def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph: - """ - Extracts a subset of SCOPe based on subclasses of the top class ID. - - This method calls the superclass method to extract the full class hierarchy, - then extracts the subgraph containing only the descendants of the top class ID, including itself. - - Args: - data_path (str): The file path to the SCOPe ontology file. - - Returns: - nx.DiGraph: The extracted class hierarchy as a directed graph, limited to the - descendants of the top class ID. - """ - g = super()._extract_class_hierarchy(data_path) - g = g.subgraph( - list(g.successors(self.top_class_sunid)) + [self.top_class_sunid] - ) - return g - - -class SCOPeOver2000(_SCOPeOverX): - """ - A class for extracting data from the SCOPe dataset with a threshold of 2000 for selecting classes. - - Inherits from `_SCOPeOverX` and sets the threshold for selecting classes to 2000. - - Attributes: - THRESHOLD (int): The threshold for selecting classes (2000). - """ - - THRESHOLD: int = 2000 - - -class SCOPeOver50(_SCOPeOverX): - - THRESHOLD = 50 - - -class SCOPeOverPartial2000(_SCOPeOverXPartial): - """ - A class for extracting data from the SCOPe dataset with a threshold of 2000 for selecting classes. - - Inherits from `_SCOPeOverXPartial` and sets the threshold for selecting classes to 2000. - - Attributes: - THRESHOLD (int): The threshold for selecting classes (2000). 
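-
-    Illustrative instantiation (the sun-ID and version below are placeholders):
-
-        >>> ds = SCOPeOverPartial2000(top_class_sunid=46456, scope_version="2.08")  # doctest: +SKIP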
- """ - - THRESHOLD: int = 2000 - - -if __name__ == "__main__": - scope = SCOPeOver50(scope_version="2.08") - - # g = scope._extract_class_hierarchy("dummy/path") - # # Save graph - # import pickle - # with open("graph.gpickle", "wb") as f: - # pickle.dump(g, f) - - # Load graph - import pickle - - with open("graph.gpickle", "rb") as f: - g = pickle.load(f) - - # print(len([node for node in g.nodes() if g.out_degree(node) > 10000])) - scope._graph_to_raw_dataset(g) diff --git a/chebai/preprocessing/migration/deep_go/__init__.py b/chebai/preprocessing/migration/deep_go/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/chebai/preprocessing/migration/deep_go/migrate_deep_go_1_data.py b/chebai/preprocessing/migration/deep_go/migrate_deep_go_1_data.py deleted file mode 100644 index 7d59c699..00000000 --- a/chebai/preprocessing/migration/deep_go/migrate_deep_go_1_data.py +++ /dev/null @@ -1,316 +0,0 @@ -import os -from collections import OrderedDict -from typing import List, Literal, Optional, Tuple - -import pandas as pd -from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit -from jsonargparse import CLI - -from chebai.preprocessing.datasets.deepGO.go_uniprot import DeepGO1MigratedData - - -class DeepGo1DataMigration: - """ - A class to handle data migration and processing for the DeepGO project. - It migrates the DeepGO data to our data structure followed for GO-UniProt data. - - This class handles migration of data from the DeepGO paper below: - Maxat Kulmanov, Mohammed Asif Khan, Robert Hoehndorf, - DeepGO: predicting protein functions from sequence and interactions using a deep ontology-aware classifier, - Bioinformatics, Volume 34, Issue 4, February 2018, Pages 660–668 - (https://doi.org/10.1093/bioinformatics/btx624). - """ - - # Max sequence length as per DeepGO1 - _MAXLEN = 1002 - _LABELS_START_IDX = DeepGO1MigratedData._LABELS_START_IDX - - def __init__(self, data_dir: str, go_branch: Literal["cc", "mf", "bp"]): - """ - Initializes the data migration object with a data directory and GO branch. - - Args: - data_dir (str): Directory containing the data files. - go_branch (Literal["cc", "mf", "bp"]): GO branch to use. - """ - valid_go_branches = list(DeepGO1MigratedData.GO_BRANCH_MAPPING.keys()) - if go_branch not in valid_go_branches: - raise ValueError(f"go_branch must be one of {valid_go_branches}") - self._go_branch = go_branch - - self._data_dir: str = rf"{data_dir}" - self._train_df: Optional[pd.DataFrame] = None - self._test_df: Optional[pd.DataFrame] = None - self._validation_df: Optional[pd.DataFrame] = None - self._terms_df: Optional[pd.DataFrame] = None - self._classes: Optional[List[str]] = None - - def migrate(self) -> None: - """ - Executes the data migration by loading, processing, and saving the data. - """ - print("Starting the migration process...") - self._load_data() - if not all( - df is not None - for df in [ - self._train_df, - self._validation_df, - self._test_df, - self._terms_df, - ] - ): - raise Exception( - "Data splits or terms data is not available in instance variables." - ) - splits_df = self._record_splits() - data_with_labels_df = self._extract_required_data_from_splits() - - if not all( - var is not None for var in [data_with_labels_df, splits_df, self._classes] - ): - raise Exception( - "Data splits or terms data is not available in instance variables." 
- ) - - self.save_migrated_data(data_with_labels_df, splits_df) - - def _load_data(self) -> None: - """ - Loads the test, train, validation, and terms data from the pickled files - in the data directory. - """ - try: - print(f"Loading data files from directory: {self._data_dir}") - self._test_df = pd.DataFrame( - pd.read_pickle( - os.path.join(self._data_dir, f"test-{self._go_branch}.pkl") - ) - ) - - # DeepGO 1 lacks a validation split, so we will create one by further splitting the training set. - # Although this reduces the training data slightly compared to the original DeepGO setup, - # given the data size, the impact should be minimal. - train_df = pd.DataFrame( - pd.read_pickle( - os.path.join(self._data_dir, f"train-{self._go_branch}.pkl") - ) - ) - - self._train_df, self._validation_df = self._get_train_val_split(train_df) - - self._terms_df = pd.DataFrame( - pd.read_pickle(os.path.join(self._data_dir, f"{self._go_branch}.pkl")) - ) - - except FileNotFoundError as e: - raise FileNotFoundError( - f"Data file not found in directory: {e}. " - "Please ensure all required files are available in the specified directory." - ) - - @staticmethod - def _get_train_val_split( - train_df: pd.DataFrame, - ) -> Tuple[pd.DataFrame, pd.DataFrame]: - """ - Splits the training data into a smaller training set and a validation set. - - Args: - train_df (pd.DataFrame): Original training DataFrame. - - Returns: - Tuple[pd.DataFrame, pd.DataFrame]: Training and validation DataFrames. - """ - labels_list_train = train_df["labels"].tolist() - train_split = 0.85 - test_size = ((1 - train_split) ** 2) / train_split - - splitter = MultilabelStratifiedShuffleSplit( - n_splits=1, test_size=test_size, random_state=42 - ) - - train_indices, validation_indices = next( - splitter.split(labels_list_train, labels_list_train) - ) - - df_validation = train_df.iloc[validation_indices] - df_train = train_df.iloc[train_indices] - return df_train, df_validation - - def _record_splits(self) -> pd.DataFrame: - """ - Creates a DataFrame that stores the IDs and their corresponding data splits. - - Returns: - pd.DataFrame: A combined DataFrame containing split assignments. - """ - print("Recording data splits for train, validation, and test sets.") - split_assignment_list: List[pd.DataFrame] = [ - pd.DataFrame({"id": self._train_df["proteins"], "split": "train"}), - pd.DataFrame( - {"id": self._validation_df["proteins"], "split": "validation"} - ), - pd.DataFrame({"id": self._test_df["proteins"], "split": "test"}), - ] - - combined_split_assignment = pd.concat(split_assignment_list, ignore_index=True) - return combined_split_assignment - - def _extract_required_data_from_splits(self) -> pd.DataFrame: - """ - Extracts required columns from the combined data splits. - - Returns: - pd.DataFrame: A DataFrame containing the essential columns for processing. 
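-
-        For illustration (assuming `_parse_go_id` strips the "GO:" prefix and
-        casts the remainder to int)::
-
-            extract_go_id(["GO:0000002", "GO:0000005"])  ->  [2, 5]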
- """ - print("Combining data splits into a single DataFrame with required columns.") - required_columns = [ - "proteins", - "accessions", - "sequences", - "gos", - "labels", - ] - - new_df = pd.concat( - [ - self._train_df[required_columns], - self._validation_df[required_columns], - self._test_df[required_columns], - ], - ignore_index=True, - ) - new_df["go_ids"] = new_df.apply( - lambda row: self.extract_go_id(row["gos"]), axis=1 - ) - - labels_df = self._get_labels_columns(new_df) - - data_df = pd.DataFrame( - OrderedDict( - swiss_id=new_df["proteins"], - accession=new_df["accessions"], - go_ids=new_df["go_ids"], - sequence=new_df["sequences"], - ) - ) - - df = pd.concat([data_df, labels_df], axis=1) - - return df - - @staticmethod - def extract_go_id(go_list: List[str]) -> List[int]: - """ - Extracts and parses GO IDs from a list of GO annotations. - - Args: - go_list (List[str]): List of GO annotation strings. - - Returns: - List[int]: List of parsed GO IDs. - """ - return [DeepGO1MigratedData._parse_go_id(go_id_str) for go_id_str in go_list] - - def _get_labels_columns(self, data_df: pd.DataFrame) -> pd.DataFrame: - """ - Generates columns for labels based on provided selected terms. - - Args: - data_df (pd.DataFrame): DataFrame with GO annotations and labels. - - Returns: - pd.DataFrame: DataFrame with label columns. - """ - print("Generating label columns from provided selected terms.") - parsed_go_ids: pd.Series = self._terms_df["functions"].apply( - lambda gos: DeepGO1MigratedData._parse_go_id(gos) - ) - all_go_ids_list = parsed_go_ids.values.tolist() - self._classes = all_go_ids_list - - new_label_columns = pd.DataFrame( - data_df["labels"].tolist(), index=data_df.index, columns=all_go_ids_list - ) - - return new_label_columns - - def save_migrated_data( - self, data_df: pd.DataFrame, splits_df: pd.DataFrame - ) -> None: - """ - Saves the processed data and split information. - - Args: - data_df (pd.DataFrame): Data with GO labels. - splits_df (pd.DataFrame): Split assignment DataFrame. - """ - print("Saving transformed data files.") - - deepgo_migr_inst: DeepGO1MigratedData = DeepGO1MigratedData( - go_branch=DeepGO1MigratedData.GO_BRANCH_MAPPING[self._go_branch], - max_sequence_length=self._MAXLEN, - ) - - # Save data file - deepgo_migr_inst.save_processed( - data_df, deepgo_migr_inst.processed_main_file_names_dict["data"] - ) - print( - f"{deepgo_migr_inst.processed_main_file_names_dict['data']} saved to {deepgo_migr_inst.processed_dir_main}" - ) - - # Save splits file - splits_df.to_csv( - os.path.join(deepgo_migr_inst.processed_dir_main, "splits_deep_go1.csv"), - index=False, - ) - print(f"splits_deep_go1.csv saved to {deepgo_migr_inst.processed_dir_main}") - - # Save classes file - classes = sorted(self._classes) - with open( - os.path.join(deepgo_migr_inst.processed_dir_main, "classes_deep_go1.txt"), - "wt", - ) as fout: - fout.writelines(str(node) + "\n" for node in classes) - print(f"classes_deep_go1.txt saved to {deepgo_migr_inst.processed_dir_main}") - - print("Migration process completed!") - - -class Main: - """ - Main class to handle the migration process for DeepGo1DataMigration. - - Methods: - migrate(data_dir: str, go_branch: Literal["cc", "mf", "bp"]): - Initiates the migration process for the specified data directory and GO branch. - """ - - @staticmethod - def migrate(data_dir: str, go_branch: Literal["cc", "mf", "bp"]) -> None: - """ - Initiates the migration process by creating a DeepGoDataMigration instance - and invoking its migrate method. 
-
-        Args:
-            data_dir (str): Directory containing the data files.
-            go_branch (Literal["cc", "mf", "bp"]): GO branch to use
-                ("cc" for cellular_component,
-                "mf" for molecular_function,
-                or "bp" for biological_process).
-        """
-        DeepGo1DataMigration(data_dir, go_branch).migrate()
-
-
-if __name__ == "__main__":
-    # Example: python script_name.py migrate --data_dir="data/deep_go1" --go_branch="mf"
-    # --data_dir specifies the directory containing the data files.
-    # --go_branch specifies the GO branch (cc, mf, or bp) you want to use for the migration.
-    CLI(
-        Main,
-        description="DeepGo1DataMigration CLI tool to handle migration of GO data for specified branches (cc, mf, bp).",
-        as_positional=False,
-    )
diff --git a/chebai/preprocessing/migration/deep_go/migrate_deep_go_2_data.py b/chebai/preprocessing/migration/deep_go/migrate_deep_go_2_data.py
deleted file mode 100644
index d23247c0..00000000
--- a/chebai/preprocessing/migration/deep_go/migrate_deep_go_2_data.py
+++ /dev/null
@@ -1,366 +0,0 @@
-import os
-import re
-from collections import OrderedDict
-from typing import List, Literal, Optional
-
-import pandas as pd
-from jsonargparse import CLI
-
-from chebai.preprocessing.datasets.deepGO.go_uniprot import DeepGO2MigratedData
-from chebai.preprocessing.reader import ProteinDataReader
-
-
-class DeepGo2DataMigration:
-    """
-    A class to handle data migration and processing for the DeepGO project. It migrates the data from the DeepGO-SE
-    data structure to the data structure we follow for GO-UniProt data.
-
-    This class handles migration of data from the DeepGO-SE paper below:
-    Kulmanov, M., Guzmán-Vega, F.J., Duek Roggli, P. et al.,
-    Protein function prediction as approximate semantic entailment,
-    Nature Machine Intelligence, Volume 6, 2024, Pages 220-228
-    (https://doi.org/10.1038/s42256-024-00795-w)
-    """
-
-    _LABELS_START_IDX = DeepGO2MigratedData._LABELS_START_IDX
-
-    def __init__(
-        self, data_dir: str, go_branch: Literal["cc", "mf", "bp"], max_len: int = 1000
-    ):
-        """
-        Initializes the data migration object with a data directory and GO branch.
-
-        Args:
-            data_dir (str): Directory containing the data files.
-            go_branch (Literal["cc", "mf", "bp"]): GO branch to use.
-            max_len (int): Used to truncate the sequence to this length. Default is 1000.
-                # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L11
-        """
-        valid_go_branches = list(DeepGO2MigratedData.GO_BRANCH_MAPPING.keys())
-        if go_branch not in valid_go_branches:
-            raise ValueError(f"go_branch must be one of {valid_go_branches}")
-        self._go_branch = go_branch
-
-        self._data_dir: str = os.path.join(rf"{data_dir}", go_branch)
-        self._max_len: int = max_len
-
-        self._train_df: Optional[pd.DataFrame] = None
-        self._test_df: Optional[pd.DataFrame] = None
-        self._validation_df: Optional[pd.DataFrame] = None
-        self._terms_df: Optional[pd.DataFrame] = None
-        self._classes: Optional[List[str]] = None
-
-    def migrate(self) -> None:
-        """
-        Executes the data migration by loading, processing, and saving the data.
-        """
-        print("Starting the migration process...")
-        self._load_data()
-        if not all(
-            df is not None
-            for df in [
-                self._train_df,
-                self._validation_df,
-                self._test_df,
-                self._terms_df,
-            ]
-        ):
-            raise Exception(
-                "Data splits or terms data is not available in instance variables."
- ) - splits_df = self._record_splits() - - data_df = self._extract_required_data_from_splits() - data_with_labels_df = self._generate_labels(data_df) - - if not all( - var is not None for var in [data_with_labels_df, splits_df, self._classes] - ): - raise Exception( - "Data splits or terms data is not available in instance variables." - ) - - self.save_migrated_data(data_with_labels_df, splits_df) - - def _load_data(self) -> None: - """ - Loads the test, train, validation, and terms data from the pickled files - in the data directory. - """ - - try: - print(f"Loading data from directory: {self._data_dir}......") - - print( - "Pre-processing the data before loading them into instance variables\n" - f"2-Steps preprocessing: \n" - f"\t 1: Truncating every sequence to {self._max_len}\n" - f"\t 2: Replacing every amino acid which is not in {ProteinDataReader.AA_LETTER}" - ) - - self._test_df = self._pre_process_data( - pd.DataFrame( - pd.read_pickle(os.path.join(self._data_dir, "test_data.pkl")) - ) - ) - self._train_df = self._pre_process_data( - pd.DataFrame( - pd.read_pickle(os.path.join(self._data_dir, "train_data.pkl")) - ) - ) - self._validation_df = self._pre_process_data( - pd.DataFrame( - pd.read_pickle(os.path.join(self._data_dir, "valid_data.pkl")) - ) - ) - - self._terms_df = pd.DataFrame( - pd.read_pickle(os.path.join(self._data_dir, "terms.pkl")) - ) - - except FileNotFoundError as e: - raise FileNotFoundError( - f"Data file not found in directory: {e}. " - "Please ensure all required files are available in the specified directory." - ) - - def _pre_process_data(self, df: pd.DataFrame) -> pd.DataFrame: - """ - Pre-processes the input dataframe by truncating sequences to the maximum - length and replacing invalid amino acids with 'X'. - - Args: - df (pd.DataFrame): The dataframe to preprocess. - - Returns: - pd.DataFrame: The processed dataframe. - """ - df = self._truncate_sequences(df) - df = self._replace_invalid_amino_acids(df) - return df - - def _truncate_sequences( - self, df: pd.DataFrame, column: str = "sequences" - ) -> pd.DataFrame: - """ - Truncate sequences in a specified column of a dataframe to the maximum length. - - https://github.com/bio-ontology-research-group/deepgo2/blob/main/train_cnn.py#L206-L217 - - Args: - df (pd.DataFrame): The input dataframe containing the data to be processed. - column (str, optional): The column containing sequences to truncate. - Defaults to "sequences". - - Returns: - pd.DataFrame: The dataframe with sequences truncated to `self._max_len`. - """ - df[column] = df[column].apply(lambda x: x[: self._max_len]) - return df - - @staticmethod - def _replace_invalid_amino_acids( - df: pd.DataFrame, column: str = "sequences" - ) -> pd.DataFrame: - """ - Replaces invalid amino acids in a sequence with 'X' using regex. - - https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L26-L33 - https://github.com/ChEB-AI/python-chebai/pull/64#issuecomment-2517067073 - - Args: - df (pd.DataFrame): The dataframe containing the sequences to be processed. - column (str, optional): The column containing the sequences. Defaults to "sequences". - - Returns: - pd.DataFrame: The dataframe with invalid amino acids replaced by 'X'. 
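-
-        Illustrative behaviour (assuming the valid alphabet is the 20 standard
-        amino acids plus ``X``)::
-
-            >>> import re
-            >>> re.sub("[^ACDEFGHIKLMNPQRSTVWYX]", "X", "ACB*Z")
-            'ACXXX'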
- """ - valid_amino_acids = "".join(ProteinDataReader.AA_LETTER) - # Replace any character not in the valid set with 'X' - df[column] = df[column].apply( - lambda x: re.sub(f"[^{valid_amino_acids}]", "X", x) - ) - return df - - def _record_splits(self) -> pd.DataFrame: - """ - Creates a DataFrame that stores the IDs and their corresponding data splits. - - Returns: - pd.DataFrame: A combined DataFrame containing split assignments. - """ - print("Recording data splits for train, validation, and test sets.") - split_assignment_list: List[pd.DataFrame] = [ - pd.DataFrame({"id": self._train_df["proteins"], "split": "train"}), - pd.DataFrame( - {"id": self._validation_df["proteins"], "split": "validation"} - ), - pd.DataFrame({"id": self._test_df["proteins"], "split": "test"}), - ] - - combined_split_assignment = pd.concat(split_assignment_list, ignore_index=True) - return combined_split_assignment - - def _extract_required_data_from_splits(self) -> pd.DataFrame: - """ - Extracts required columns from the combined data splits. - - Returns: - pd.DataFrame: A DataFrame containing the essential columns for processing. - """ - print("Combining the data splits with required data..... ") - required_columns = [ - "proteins", - "accessions", - "sequences", - # https://github.com/bio-ontology-research-group/deepgo2/blob/main/gendata/uni2pandas.py#L60-L69 - "prop_annotations", # Direct and Transitively associated GO ids - "esm2", - ] - - new_df = pd.concat( - [ - self._train_df[required_columns], - self._validation_df[required_columns], - self._test_df[required_columns], - ], - ignore_index=True, - ) - new_df["go_ids"] = new_df["prop_annotations"].apply( - lambda x: self.extract_go_id(x) - ) - - data_df = pd.DataFrame( - OrderedDict( - swiss_id=new_df["proteins"], - accession=new_df["accessions"], - go_ids=new_df["go_ids"], - sequence=new_df["sequences"], - esm2_embeddings=new_df["esm2"], - ) - ) - return data_df - - @staticmethod - def extract_go_id(go_list: List[str]) -> List[int]: - """ - Extracts and parses GO IDs from a list of GO annotations. - - Args: - go_list (List[str]): List of GO annotation strings. - - Returns: - List[str]: List of parsed GO IDs. - """ - return [DeepGO2MigratedData._parse_go_id(go_id_str) for go_id_str in go_list] - - def _generate_labels(self, data_df: pd.DataFrame) -> pd.DataFrame: - """ - Generates label columns for each GO term in the dataset. - - Args: - data_df (pd.DataFrame): DataFrame containing data with GO IDs. - - Returns: - pd.DataFrame: DataFrame with new label columns. - """ - print("Generating labels based on terms.pkl file.......") - parsed_go_ids: pd.Series = self._terms_df["gos"].apply( - DeepGO2MigratedData._parse_go_id - ) - all_go_ids_list = parsed_go_ids.values.tolist() - self._classes = all_go_ids_list - new_label_columns = pd.DataFrame( - False, index=data_df.index, columns=all_go_ids_list - ) - data_df = pd.concat([data_df, new_label_columns], axis=1) - - for index, row in data_df.iterrows(): - for go_id in row["go_ids"]: - if go_id in data_df.columns: - data_df.at[index, go_id] = True - - data_df = data_df[data_df.iloc[:, self._LABELS_START_IDX :].any(axis=1)] - return data_df - - def save_migrated_data( - self, data_df: pd.DataFrame, splits_df: pd.DataFrame - ) -> None: - """ - Saves the processed data and split information. - - Args: - data_df (pd.DataFrame): Data with GO labels. - splits_df (pd.DataFrame): Split assignment DataFrame. 
- """ - print("Saving transformed data......") - deepgo_migr_inst: DeepGO2MigratedData = DeepGO2MigratedData( - go_branch=DeepGO2MigratedData.GO_BRANCH_MAPPING[self._go_branch], - max_sequence_length=self._max_len, - ) - - # Save data file - deepgo_migr_inst.save_processed( - data_df, deepgo_migr_inst.processed_main_file_names_dict["data"] - ) - print( - f"{deepgo_migr_inst.processed_main_file_names_dict['data']} saved to {deepgo_migr_inst.processed_dir_main}" - ) - - # Save split file - splits_df.to_csv( - os.path.join(deepgo_migr_inst.processed_dir_main, "splits_deep_go2.csv"), - index=False, - ) - print(f"splits_deep_go2.csv saved to {deepgo_migr_inst.processed_dir_main}") - - # Save classes.txt file - classes = sorted(self._classes) - with open( - os.path.join(deepgo_migr_inst.processed_dir_main, "classes_deep_go2.txt"), - "wt", - ) as fout: - fout.writelines(str(node) + "\n" for node in classes) - print(f"classes_deep_go2.txt saved to {deepgo_migr_inst.processed_dir_main}") - - print("Migration completed!") - - -class Main: - """ - Main class to handle the migration process for DeepGoDataMigration. - - Methods: - migrate(data_dir: str, go_branch: Literal["cc", "mf", "bp"]): - Initiates the migration process for the specified data directory and GO branch. - """ - - @staticmethod - def migrate( - data_dir: str, go_branch: Literal["cc", "mf", "bp"], max_len: int = 1000 - ) -> None: - """ - Initiates the migration process by creating a DeepGoDataMigration instance - and invoking its migrate method. - - Args: - data_dir (str): Directory containing the data files. - go_branch (Literal["cc", "mf", "bp"]): GO branch to use - ("cc" for cellular_component, - "mf" for molecular_function, - or "bp" for biological_process). - max_len (int): Used to truncate the sequence to this length. Default is 1000. - # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L11 - """ - DeepGo2DataMigration(data_dir, go_branch, max_len).migrate() - - -if __name__ == "__main__": - # Example: python script_name.py migrate --data_dir="data/deep_go_se_training_data" --go_branch="bp" - # --data_dir specifies the directory containing the data files. - # --go_branch specifies the GO branch (cc, mf, or bp) you want to use for the migration. 
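-    # A hypothetical invocation that also overrides the truncation length:
-    #   python migrate_deep_go_2_data.py migrate --data_dir="data/deep_go2" --go_branch="mf" --max_len=1000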
- CLI( - Main, - description="DeepGoDataMigration CLI tool to handle migration of GO data for specified branches (cc, mf, bp).", - as_positional=False, - ) diff --git a/configs/data/deepGO/deepgo2_esm2.yml b/configs/data/deepGO/deepgo2_esm2.yml deleted file mode 100644 index 5a0436e3..00000000 --- a/configs/data/deepGO/deepgo2_esm2.yml +++ /dev/null @@ -1,5 +0,0 @@ -class_path: chebai.preprocessing.datasets.deepGO.go_uniprot.DeepGO2MigratedData -init_args: - go_branch: "MF" - max_sequence_length: 1000 - use_esm2_embeddings: True diff --git a/configs/data/deepGO/deepgo_1_migrated_data.yml b/configs/data/deepGO/deepgo_1_migrated_data.yml deleted file mode 100644 index 0924e023..00000000 --- a/configs/data/deepGO/deepgo_1_migrated_data.yml +++ /dev/null @@ -1,4 +0,0 @@ -class_path: chebai.preprocessing.datasets.deepGO.go_uniprot.DeepGO1MigratedData -init_args: - go_branch: "MF" - max_sequence_length: 1002 diff --git a/configs/data/deepGO/deepgo_2_migrated_data.yml b/configs/data/deepGO/deepgo_2_migrated_data.yml deleted file mode 100644 index 5a0436e3..00000000 --- a/configs/data/deepGO/deepgo_2_migrated_data.yml +++ /dev/null @@ -1,5 +0,0 @@ -class_path: chebai.preprocessing.datasets.deepGO.go_uniprot.DeepGO2MigratedData -init_args: - go_branch: "MF" - max_sequence_length: 1000 - use_esm2_embeddings: True diff --git a/configs/data/deepGO/go250.yml b/configs/data/deepGO/go250.yml deleted file mode 100644 index 01e34aa4..00000000 --- a/configs/data/deepGO/go250.yml +++ /dev/null @@ -1,3 +0,0 @@ -class_path: chebai.preprocessing.datasets.go_uniprot.deepGO.GOUniProtOver250 -init_args: - go_branch: "BP" diff --git a/configs/data/deepGO/go50.yml b/configs/data/deepGO/go50.yml deleted file mode 100644 index bee43773..00000000 --- a/configs/data/deepGO/go50.yml +++ /dev/null @@ -1 +0,0 @@ -class_path: chebai.preprocessing.datasets.deepGO.go_uniprot.GOUniProtOver50 diff --git a/configs/data/scope/scope2000.yml b/configs/data/scope/scope2000.yml deleted file mode 100644 index d75c807f..00000000 --- a/configs/data/scope/scope2000.yml +++ /dev/null @@ -1,3 +0,0 @@ -class_path: chebai.preprocessing.datasets.scope.scope.SCOPeOver2000 -init_args: - scope_version: "2.08" diff --git a/configs/data/scope/scope50.yml b/configs/data/scope/scope50.yml deleted file mode 100644 index c65028e2..00000000 --- a/configs/data/scope/scope50.yml +++ /dev/null @@ -1,3 +0,0 @@ -class_path: chebai.preprocessing.datasets.scope.scope.SCOPeOver50 -init_args: - scope_version: "2.08" \ No newline at end of file diff --git a/tests/unit/dataset_classes/testGOUniProDataExtractor.py b/tests/unit/dataset_classes/testGOUniProDataExtractor.py deleted file mode 100644 index 96ff9a3a..00000000 --- a/tests/unit/dataset_classes/testGOUniProDataExtractor.py +++ /dev/null @@ -1,229 +0,0 @@ -import unittest -from collections import OrderedDict -from unittest.mock import PropertyMock, mock_open, patch - -import fastobo -import networkx as nx -import pandas as pd - -from chebai.preprocessing.datasets.deepGO.go_uniprot import _GOUniProtDataExtractor -from chebai.preprocessing.reader import ProteinDataReader -from tests.unit.mock_data.ontology_mock_data import GOUniProtMockData - - -class TestGOUniProtDataExtractor(unittest.TestCase): - """ - Unit tests for the _GOUniProtDataExtractor class. 
- """ - - @classmethod - @patch.multiple(_GOUniProtDataExtractor, __abstractmethods__=frozenset()) - @patch.object(_GOUniProtDataExtractor, "base_dir", new_callable=PropertyMock) - @patch.object(_GOUniProtDataExtractor, "_name", new_callable=PropertyMock) - @patch("os.makedirs", return_value=None) - def setUpClass( - cls, - mock_makedirs, - mock_name_property: PropertyMock, - mock_base_dir_property: PropertyMock, - ) -> None: - """ - Class setup for mocking abstract properties of _GOUniProtDataExtractor. - """ - mock_base_dir_property.return_value = "MockedBaseDirPropGOUniProtDataExtractor" - mock_name_property.return_value = "MockedNamePropGOUniProtDataExtractor" - - _GOUniProtDataExtractor.READER = ProteinDataReader - - cls.extractor = _GOUniProtDataExtractor() - - def test_term_callback(self) -> None: - """ - Test the term_callback method for correct parsing and filtering of GO terms. - """ - self.extractor.go_branch = "all" - term_mapping = {} - for term in fastobo.loads(GOUniProtMockData.get_GO_raw_data()): - if isinstance(term, fastobo.typedef.TypedefFrame): - continue - term_mapping[self.extractor._parse_go_id(term.id)] = term - - # Test individual term callback - term_dict = self.extractor.term_callback(term_mapping[4]) - expected_dict = {"go_id": 4, "parents": [3, 2], "name": "GO_4"} - self.assertEqual( - term_dict, - expected_dict, - "The term_callback did not return the expected dictionary.", - ) - - # Test filtering valid terms - valid_terms_docs = set() - for term_id, term_doc in term_mapping.items(): - if self.extractor.term_callback(term_doc): - valid_terms_docs.add(term_id) - - self.assertEqual( - valid_terms_docs, - set(GOUniProtMockData.get_nodes()), - "The valid terms do not match expected nodes.", - ) - - # Test that obsolete terms are filtered out - self.assertFalse( - any( - self.extractor.term_callback(term_mapping[obs_id]) - for obs_id in GOUniProtMockData.get_obsolete_nodes_ids() - ), - "Obsolete terms should not be present.", - ) - - # Test filtering by GO branch (e.g., BP) - self.extractor.go_branch = "BP" - BP_terms = { - term_id - for term_id, term in term_mapping.items() - if self.extractor.term_callback(term) - } - self.assertEqual( - BP_terms, {2, 4}, "The BP terms do not match the expected set." - ) - - @patch( - "fastobo.load", return_value=fastobo.loads(GOUniProtMockData.get_GO_raw_data()) - ) - def test_extract_class_hierarchy(self, mock_load) -> None: - """ - Test the extraction of the class hierarchy from the ontology. - """ - graph = self.extractor._extract_class_hierarchy("fake_path") - - # Validate the graph structure - self.assertIsInstance( - graph, nx.DiGraph, "The result should be a directed graph." 
- ) - - # Check nodes - actual_nodes = set(graph.nodes) - self.assertEqual( - set(GOUniProtMockData.get_nodes()), - actual_nodes, - "The graph nodes do not match the expected nodes.", - ) - - # Check edges - actual_edges = set(graph.edges) - self.assertEqual( - GOUniProtMockData.get_edges_of_transitive_closure_graph(), - actual_edges, - "The graph edges do not match the expected edges.", - ) - - # Check number of nodes and edges - self.assertEqual( - GOUniProtMockData.get_number_of_nodes(), - len(actual_nodes), - "The number of nodes should match the actual number of nodes in the graph.", - ) - - self.assertEqual( - GOUniProtMockData.get_number_of_transitive_edges(), - len(actual_edges), - "The number of transitive edges should match the actual number of transitive edges in the graph.", - ) - - @patch( - "builtins.open", - new_callable=mock_open, - read_data=GOUniProtMockData.get_UniProt_raw_data(), - ) - def test_get_swiss_to_go_mapping(self, mock_open) -> None: - """ - Test the extraction of SwissProt to GO term mapping. - """ - mapping_df = self.extractor._get_swiss_to_go_mapping() - expected_df = pd.DataFrame( - OrderedDict( - swiss_id=["Swiss_Prot_1", "Swiss_Prot_2"], - accession=["Q6GZX4", "DCGZX4"], - go_ids=[[2, 3, 5], [2, 5]], - sequence=list(GOUniProtMockData.protein_sequences().values()), - ) - ) - - pd.testing.assert_frame_equal( - mapping_df, - expected_df, - obj="The SwissProt to GO mapping DataFrame does not match the expected DataFrame.", - ) - - @patch( - "fastobo.load", return_value=fastobo.loads(GOUniProtMockData.get_GO_raw_data()) - ) - @patch( - "builtins.open", - new_callable=mock_open, - read_data=GOUniProtMockData.get_UniProt_raw_data(), - ) - @patch.object( - _GOUniProtDataExtractor, - "select_classes", - return_value=GOUniProtMockData.get_nodes(), - ) - def test_graph_to_raw_dataset( - self, mock_select_classes, mock_open, mock_load - ) -> None: - """ - Test the conversion of the class hierarchy graph to a raw dataset. - """ - graph = self.extractor._extract_class_hierarchy("fake_path") - actual_df = self.extractor._graph_to_raw_dataset(graph) - expected_df = GOUniProtMockData.get_data_in_dataframe() - - pd.testing.assert_frame_equal( - actual_df, - expected_df, - obj="The raw dataset DataFrame does not match the expected DataFrame.", - ) - - @patch("builtins.open", new_callable=mock_open, read_data=b"Mocktestdata") - @patch("pandas.read_pickle") - def test_load_dict( - self, mock_read_pickle: PropertyMock, mock_open: mock_open - ) -> None: - """ - Test the loading of the dictionary from a DataFrame. 
- """ - mock_df = GOUniProtMockData.get_data_in_dataframe() - mock_read_pickle.return_value = mock_df - - generator = self.extractor._load_dict("data/tests") - result = list(generator) - - # Convert NumPy arrays to lists for comparison - for item in result: - item["labels"] = list(item["labels"]) - - # Expected output for comparison - expected_result = [ - { - "features": mock_df["sequence"][0], - "labels": mock_df.iloc[0, 4:].to_list(), - "ident": mock_df["swiss_id"][0], - }, - { - "features": mock_df["sequence"][1], - "labels": mock_df.iloc[1, 4:].to_list(), - "ident": mock_df["swiss_id"][1], - }, - ] - - self.assertEqual( - result, - expected_result, - "The loaded dictionary does not match the expected structure.", - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/unit/dataset_classes/testGoUniProtOverX.py b/tests/unit/dataset_classes/testGoUniProtOverX.py deleted file mode 100644 index 3f329c56..00000000 --- a/tests/unit/dataset_classes/testGoUniProtOverX.py +++ /dev/null @@ -1,140 +0,0 @@ -import unittest -from typing import List -from unittest.mock import mock_open, patch - -import networkx as nx -import pandas as pd - -from chebai.preprocessing.datasets.deepGO.go_uniprot import _GOUniProtOverX -from tests.unit.mock_data.ontology_mock_data import GOUniProtMockData - - -class TestGOUniProtOverX(unittest.TestCase): - @classmethod - @patch.multiple(_GOUniProtOverX, __abstractmethods__=frozenset()) - @patch("os.makedirs", return_value=None) - def setUpClass(cls, mock_makedirs) -> None: - """ - Set up the class for tests by initializing the extractor, graph, and input DataFrame. - """ - cls.extractor = _GOUniProtOverX() - cls.test_graph: nx.DiGraph = GOUniProtMockData.get_transitively_closed_graph() - cls.input_df: pd.DataFrame = GOUniProtMockData.get_data_in_dataframe().iloc[ - :, :4 - ] - - @patch("builtins.open", new_callable=mock_open) - def test_select_classes(self, mock_open_file: mock_open) -> None: - """ - Test the `select_classes` method to ensure it selects classes based on the threshold. - - Args: - mock_open_file (mock_open): Mocked open function to intercept file operations. - """ - # Set threshold for testing - self.extractor.THRESHOLD = 2 - selected_classes: List[int] = self.extractor.select_classes( - self.test_graph, data_df=self.input_df - ) - - # Expected result: GO terms 1, 2, and 5 should be selected based on the threshold - expected_selected_classes: List[int] = sorted([1, 2, 5]) - - # Check if the selected classes are as expected - self.assertEqual( - selected_classes, - expected_selected_classes, - msg="The selected classes do not match the expected output for threshold 2.", - ) - - # Expected data as string - expected_lines: str = "\n".join(map(str, expected_selected_classes)) + "\n" - - # Extract the generator passed to writelines - written_generator = mock_open_file().writelines.call_args[0][0] - written_lines: str = "".join(written_generator) - - # Ensure the data matches - self.assertEqual( - written_lines, - expected_lines, - msg="The written lines do not match the expected lines for the given threshold of 2.", - ) - - @patch("builtins.open", new_callable=mock_open) - def test_no_classes_meet_threshold(self, mock_open_file: mock_open) -> None: - """ - Test the `select_classes` method when no nodes meet the successor threshold. - - Args: - mock_open_file (mock_open): Mocked open function to intercept file operations. 
- """ - self.extractor.THRESHOLD = 5 - selected_classes: List[int] = self.extractor.select_classes( - self.test_graph, data_df=self.input_df - ) - - # Expected result: No classes should meet the threshold of 5 - expected_selected_classes: List[int] = [] - - # Check if the selected classes are as expected - self.assertEqual( - selected_classes, - expected_selected_classes, - msg="The selected classes list should be empty when no nodes meet the threshold of 5.", - ) - - # Expected data as string - expected_lines: str = "" - - # Extract the generator passed to writelines - written_generator = mock_open_file().writelines.call_args[0][0] - written_lines: str = "".join(written_generator) - - # Ensure the data matches - self.assertEqual( - written_lines, - expected_lines, - msg="The written lines do not match the expected lines when no nodes meet the threshold of 5.", - ) - - @patch("builtins.open", new_callable=mock_open) - def test_all_nodes_meet_threshold(self, mock_open_file: mock_open) -> None: - """ - Test the `select_classes` method when all nodes meet the successor threshold. - - Args: - mock_open_file (mock_open): Mocked open function to intercept file operations. - """ - self.extractor.THRESHOLD = 0 - selected_classes: List[int] = self.extractor.select_classes( - self.test_graph, data_df=self.input_df - ) - - # Expected result: All nodes except those not referenced by any protein (4 and 6) should be selected - expected_classes: List[int] = sorted([1, 2, 3, 5]) - - # Check if the returned selected classes match the expected list - self.assertListEqual( - selected_classes, - expected_classes, - msg="The selected classes do not match the expected output when all nodes meet the threshold of 0.", - ) - - # Expected data as string - expected_lines: str = "\n".join(map(str, expected_classes)) + "\n" - - # Extract the generator passed to writelines - written_generator = mock_open_file().writelines.call_args[0][0] - written_lines: str = "".join(written_generator) - - # Ensure the data matches - self.assertEqual( - written_lines, - expected_lines, - msg="The written lines do not match the expected lines when all nodes meet the threshold of 0.", - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/unit/dataset_classes/testProteinPretrainingData.py b/tests/unit/dataset_classes/testProteinPretrainingData.py deleted file mode 100644 index caac3eac..00000000 --- a/tests/unit/dataset_classes/testProteinPretrainingData.py +++ /dev/null @@ -1,76 +0,0 @@ -import unittest -from unittest.mock import PropertyMock, mock_open, patch - -from chebai.preprocessing.datasets.deepGO.protein_pretraining import ( - _ProteinPretrainingData, -) -from chebai.preprocessing.reader import ProteinDataReader -from tests.unit.mock_data.ontology_mock_data import GOUniProtMockData - - -class TestProteinPretrainingData(unittest.TestCase): - """ - Unit tests for the _ProteinPretrainingData class. - Tests focus on data parsing and validation checks for protein pretraining. - """ - - @classmethod - @patch.multiple(_ProteinPretrainingData, __abstractmethods__=frozenset()) - @patch.object(_ProteinPretrainingData, "base_dir", new_callable=PropertyMock) - @patch.object(_ProteinPretrainingData, "_name", new_callable=PropertyMock) - @patch("os.makedirs", return_value=None) - def setUpClass( - cls, - mock_makedirs, - mock_name_property: PropertyMock, - mock_base_dir_property: PropertyMock, - ) -> None: - """ - Class setup for mocking abstract properties of _ProteinPretrainingData. 
- - Mocks the required abstract properties and sets up the data extractor. - """ - mock_base_dir_property.return_value = "MockedBaseDirPropProteinPretrainingData" - mock_name_property.return_value = "MockedNameProp_ProteinPretrainingData" - - # Set the READER class for the pretraining data - _ProteinPretrainingData.READER = ProteinDataReader - - # Initialize the extractor instance - cls.extractor = _ProteinPretrainingData() - - @patch( - "builtins.open", - new_callable=mock_open, - read_data=GOUniProtMockData.get_UniProt_raw_data(), - ) - def test_parse_protein_data_for_pretraining( - self, mock_open_file: mock_open - ) -> None: - """ - Tests the _parse_protein_data_for_pretraining method. - - Verifies that: - - The parsed DataFrame contains the expected protein IDs. - - The protein sequences are not empty. - """ - # Parse the pretraining data - pretrain_df = self.extractor._parse_protein_data_for_pretraining() - list_of_pretrain_swiss_ids = GOUniProtMockData.proteins_for_pretraining() - - # Assert that all expected Swiss-Prot IDs are present in the DataFrame - self.assertEqual( - set(pretrain_df["swiss_id"]), - set(list_of_pretrain_swiss_ids), - msg="The parsed DataFrame does not contain the expected Swiss-Prot IDs for pretraining.", - ) - - # Assert that all sequences are not empty - self.assertTrue( - pretrain_df["sequence"].str.len().gt(0).all(), - msg="Some protein sequences in the pretraining DataFrame are empty.", - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py index 552d2918..d94a8d94 100644 --- a/tests/unit/mock_data/ontology_mock_data.py +++ b/tests/unit/mock_data/ontology_mock_data.py @@ -404,410 +404,3 @@ def get_transitively_closed_graph() -> nx.DiGraph: g.add_edges_from(ChebiMockOntology.get_edges_of_transitive_closure_graph()) return g - - -class GOUniProtMockData(MockOntologyGraphData): - """ - A mock ontology representing a simplified version of the Gene Ontology (GO) structure with nodes and edges - representing GO terms and their relationships in a directed acyclic graph (DAG). - - Nodes: - - GO_1 - - GO_2 - - GO_3 - - GO_4 - - GO_5 - - GO_6 - - Edges (Parent-Child Relationships): - - GO_1 -> GO_2 - - GO_1 -> GO_3 - - GO_2 -> GO_4 - - GO_2 -> GO_5 - - GO_3 -> GO_4 - - GO_4 -> GO_6 - - This mock ontology structure is useful for testing methods related to GO hierarchy, graph extraction, and transitive - closure operations. - - The class also includes methods to retrieve nodes, edges, and transitive closure of the graph. - - Visual Representation Graph with Valid Nodes and Edges: - - GO_1 - / \ - GO_2 GO_3 - / \ / - GO_5 GO_4 - \ - GO_6 - - Valid Swiss Proteins with mapping to valid GO ids - Swiss_Prot_1 -> GO_2, GO_3, GO_5 - Swiss_Prot_2 -> GO_2, GO_5 - """ - - @staticmethod - def get_nodes() -> List[int]: - """ - Get a sorted list of node IDs. - - Returns: - List[int]: A sorted list of node IDs in the ontology graph. - """ - return sorted([1, 2, 3, 4, 5, 6]) - - @staticmethod - def get_number_of_nodes() -> int: - """ - Get the total number of nodes in the ontology graph. - - Returns: - int: The number of nodes. - """ - return len(GOUniProtMockData.get_nodes()) - - @staticmethod - def get_edges() -> Set[Tuple[int, int]]: - """ - Get the set of edges in the ontology graph. - - Returns: - Set[Tuple[int, int]]: A set of tuples where each tuple represents an edge between two nodes. 
- """ - return {(1, 2), (1, 3), (2, 4), (2, 5), (3, 4), (4, 6)} - - @staticmethod - def get_number_of_edges() -> int: - """ - Get the total number of edges in the ontology graph. - - Returns: - int: The number of edges. - """ - return len(GOUniProtMockData.get_edges()) - - @staticmethod - def get_edges_of_transitive_closure_graph() -> Set[Tuple[int, int]]: - """ - Get the set of edges in the transitive closure of the ontology graph. - - Returns: - Set[Tuple[int, int]]: A set of tuples representing edges in the transitive closure graph. - """ - return { - (1, 2), - (1, 3), - (1, 4), - (1, 5), - (1, 6), - (2, 4), - (2, 5), - (2, 6), - (3, 4), - (3, 6), - (4, 6), - } - - @staticmethod - def get_number_of_transitive_edges() -> int: - """ - Get the total number of edges in the transitive closure graph. - - Returns: - int: The number of transitive edges. - """ - return len(GOUniProtMockData.get_edges_of_transitive_closure_graph()) - - @staticmethod - def get_obsolete_nodes_ids() -> Set[int]: - """ - Get the set of obsolete node IDs in the ontology graph. - - Returns: - Set[int]: A set of node IDs representing obsolete nodes. - """ - return {7, 8} - - @staticmethod - def get_GO_raw_data() -> str: - """ - Get raw data in string format for a basic Gene Ontology (GO) structure. - - This data simulates a basic GO ontology format typically used for testing purposes. - The data will include valid and obsolete GO terms with various relationships between them. - - Scenarios covered: - - Obsolete terms being the parent of valid terms. - - Valid terms being the parent of obsolete terms. - - Both direct and indirect hierarchical relationships between terms. - - The data is designed to help test the proper handling of obsolete and valid GO terms, - ensuring that the ontology parser can correctly manage both cases. - - Returns: - str: The raw GO data in string format, structured as test input. - """ - return """ - [Term] - id: GO:0000001 - name: GO_1 - namespace: molecular_function - def: "OBSOLETE. Assists in the correct assembly of ribosomes or ribosomal subunits in vivo, but is not a component of the assembled ribosome when performing its normal biological function." [GOC:jl, PMID:12150913] - comment: This term was made obsolete because it refers to a class of gene products and a biological process rather than a molecular function. - synonym: "ribosomal chaperone activity" EXACT [] - xref: MetaCyc:BETAGALACTOSID-RXN - xref: Reactome:R-HSA-189062 "lactose + H2O => D-glucose + D-galactose" - xref: Reactome:R-HSA-5658001 "Defective LCT does not hydrolyze Lac" - xref: RHEA:10076 - - [Term] - id: GO:0000002 - name: GO_2 - namespace: biological_process - is_a: GO:0000001 ! hydrolase activity, hydrolyzing O-glycosyl compounds - is_a: GO:0000008 ! hydrolase activity, hydrolyzing O-glycosyl compounds - - [Term] - id: GO:0000003 - name: GO_3 - namespace: cellular_component - is_a: GO:0000001 ! regulation of DNA recombination - - [Term] - id: GO:0000004 - name: GO_4 - namespace: biological_process - is_a: GO:0000003 ! regulation of DNA recombination - is_a: GO:0000002 ! hydrolase activity, hydrolyzing O-glycosyl compounds - - [Term] - id: GO:0000005 - name: GO_5 - namespace: molecular_function - is_a: GO:0000002 ! regulation of DNA recombination - - [Term] - id: GO:0000006 - name: GO_6 - namespace: cellular_component - is_a: GO:0000004 ! glucoside transport - - [Term] - id: GO:0000007 - name: GO_7 - namespace: biological_process - is_a: GO:0000003 ! 
glucoside transport
-        is_obsolete: true
-
-        [Term]
-        id: GO:0000008
-        name: GO_8
-        namespace: molecular_function
-        is_obsolete: true
-
-        [Typedef]
-        id: term_tracker_item
-        name: term tracker item
-        namespace: external
-        xref: IAO:0000233
-        is_metadata_tag: true
-        is_class_level: true
-        """
-
-    @staticmethod
-    def protein_sequences() -> Dict[str, str]:
-        """
-        Get the protein sequences for Swiss-Prot proteins.
-
-        Returns:
-            Dict[str, str]: A dictionary where keys are Swiss-Prot IDs and values are their respective sequences.
-        """
-        return {
-            "Swiss_Prot_1": "MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK".replace(
-                " ", ""
-            ),
-            "Swiss_Prot_2": "EKGLIVGHFS GIKYKGEKAQ ASEVDVNKMC CWVSKFKDAM RRYQGIQTCK".replace(
-                " ", ""
-            ),
-        }
-
-    @staticmethod
-    def proteins_for_pretraining() -> List[str]:
-        """
-        Returns a list of protein IDs that will be used for pretraining, based on mock UniProt data.
-
-        Proteins include those with:
-        - No GO classes or invalid GO classes (missing required evidence codes).
-
-        Returns:
-            List[str]: A list of protein IDs that do not meet validation criteria.
-        """
-        return [
-            "Swiss_Prot_5",  # No GO classes associated
-            "Swiss_Prot_6",  # GO class with no evidence code
-            "Swiss_Prot_7",  # GO class with invalid evidence code
-        ]
-
-    @staticmethod
-    def get_UniProt_raw_data() -> str:
-        """
-        Get raw data in string format for UniProt proteins.
-
-        This mock data contains eleven Swiss-Prot proteins with different properties:
-
-        - **Swiss_Prot_1**: A valid protein with three valid GO classes and one invalid GO class.
-        - **Swiss_Prot_2**: Another valid protein with two valid GO classes and one invalid.
-        - **Swiss_Prot_3**: Contains valid GO classes but has a sequence length > 1002.
-        - **Swiss_Prot_4**: Has valid GO classes but contains an invalid amino acid, 'B'.
-        - **Swiss_Prot_5**: Has a sequence but no GO classes associated.
-        - **Swiss_Prot_6**: Has GO classes without any associated evidence codes.
-        - **Swiss_Prot_7**: Has a GO class with an invalid evidence code.
-        - **Swiss_Prot_8**: Has a sequence length > 1002 and only an invalid GO class.
-        - **Swiss_Prot_9**: Has no GO classes but contains an invalid amino acid, 'B', in its sequence.
-        - **Swiss_Prot_10**: Has a valid GO class but lacks a sequence.
-        - **Swiss_Prot_11**: Has only an invalid GO class and lacks a sequence.
-
-        Note:
-            A valid GO label is one that has one of the evidence codes specified in
-            go_uniprot.py->`EXPERIMENTAL_EVIDENCE_CODES`.
-            Invalid amino acids are specified in go_uniprot.py->`AMBIGUOUS_AMINO_ACIDS`.
-
-        Returns:
-            str: The raw UniProt data in string format.
-        """
-        protein_sq_1 = GOUniProtMockData.protein_sequences()["Swiss_Prot_1"]
-        protein_sq_2 = GOUniProtMockData.protein_sequences()["Swiss_Prot_2"]
-        raw_str = (
-            # Below protein with 3 valid associated GO classes and one invalid GO class
-            f"ID   Swiss_Prot_1              Reviewed;          {len(protein_sq_1)} AA. 
\n" - "AC Q6GZX4;\n" - "DR GO; GO:0000002; C:membrane; EXP:UniProtKB-KW.\n" - "DR GO; GO:0000003; C:membrane; IDA:UniProtKB-KW.\n" - "DR GO; GO:0000005; P:regulation of viral transcription; IPI:InterPro.\n" - "DR GO; GO:0000004; P:regulation of viral transcription; IEA:SGD.\n" - f"SQ SEQUENCE {len(protein_sq_1)} AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - f" {protein_sq_1}\n" - "//\n" - # Below protein with 2 valid associated GO class and one invalid GO class - f"ID Swiss_Prot_2 Reviewed; {len(protein_sq_2)} AA.\n" - "AC DCGZX4;\n" - "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" - "DR GO; GO:0000002; P:regulation of viral transcription; IMP:InterPro.\n" - "DR GO; GO:0000005; P:regulation of viral transcription; IGI:InterPro.\n" - "DR GO; GO:0000006; P:regulation of viral transcription; IEA:PomBase.\n" - f"SQ SEQUENCE {len(protein_sq_2)} AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - f" {protein_sq_2}\n" - "//\n" - # Below protein with all valid associated GO class but sequence length greater than 1002 - f"ID Swiss_Prot_3 Reviewed; {len(protein_sq_1 * 25)} AA.\n" - "AC Q6GZX4;\n" - "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" - "DR GO; GO:0000002; P:regulation of viral transcription; IEP:InterPro.\n" - "DR GO; GO:0000005; P:regulation of viral transcription; TAS:InterPro.\n" - "DR GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n" - f"SQ SEQUENCE {len(protein_sq_1 * 25)} AA; 129118 MW; FE2984658CED53A8 CRC64;\n" - f" {protein_sq_1 * 25}\n" - "//\n" - # Below protein has valid go class association but invalid amino acid `X` in its sequence - "ID Swiss_Prot_4 Reviewed; 60 AA.\n" - "AC Q6GZX4;\n" - "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" - "DR GO; GO:0000002; P:regulation of viral transcription; EXP:InterPro.\n" - "DR GO; GO:0000005; P:regulation of viral transcription; IEA:InterPro.\n" - "DR GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n" - "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - " BAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" - "//\n" - # Below protein with sequence string but has no GO class - "ID Swiss_Prot_5 Reviewed; 60 AA.\n" - "AC Q6GZX4;\n" - "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" - "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - " MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" - "//\n" - # Below protein with sequence string and with NO `valid` associated GO class (no evidence code) - "ID Swiss_Prot_6 Reviewed; 60 AA.\n" - "AC Q6GZX4;\n" - "DR GO; GO:0000023; P:regulation of viral transcription;\n" - "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - " MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" - "//\n" - # Below protein with sequence string and with NO `valid` associated GO class (invalid evidence code) - "ID Swiss_Prot_7 Reviewed; 60 AA.\n" - "AC Q6GZX4;\n" - "DR GO; GO:0000024; P:regulation of viral transcription; IEA:SGD.\n" - "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - " MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" - "//\n" - # Below protein with sequence length greater than 1002 but with `Invalid` associated GO class - f"ID Swiss_Prot_8 Reviewed; {len(protein_sq_2 * 25)} AA.\n" - "AC Q6GZX4;\n" - "DR GO; GO:0000025; P:regulation of viral transcription; IC:Inferred.\n" - f"SQ SEQUENCE {len(protein_sq_2 * 25)} AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - f" {protein_sq_2 * 25}\n" - "//\n" - # Below protein with sequence string but invalid amino acid `X` in its sequence - 
"ID Swiss_Prot_9 Reviewed; 60 AA.\n" - "AC Q6GZX4;\n" - "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - " BAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" - "//\n" - # Below protein with a `valid` associated GO class but without sequence string - "ID Swiss_Prot_10 Reviewed; 60 AA.\n" - "AC Q6GZX4;\n" - "DR GO; GO:0000027; P:regulation of viral transcription; EXP:InterPro.\n" - "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - " \n" - "//\n" - # Below protein with a `Invalid` associated GO class but without sequence string - "ID Swiss_Prot_11 Reviewed; 60 AA.\n" - "AC Q6GZX4;\n" - "DR GO; GO:0000028; P:regulation of viral transcription; ND:NoData.\n" - "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" - " \n" - "//\n" - ) - - return raw_str - - @staticmethod - def get_data_in_dataframe() -> pd.DataFrame: - """ - Get a mock DataFrame representing UniProt data. - - The DataFrame contains Swiss-Prot protein data, including identifiers, accessions, GO terms, sequences, - and binary label columns representing whether each protein is associated with certain GO classes. - - Returns: - pd.DataFrame: A DataFrame containing mock UniProt data with columns for 'swiss_id', 'accession', 'go_ids', 'sequence', - and binary labels for GO classes. - """ - expected_data = OrderedDict( - swiss_id=["Swiss_Prot_1", "Swiss_Prot_2"], - accession=["Q6GZX4", "DCGZX4"], - go_ids=[[1, 2, 3, 5], [1, 2, 5]], - sequence=list(GOUniProtMockData.protein_sequences().values()), - **{ - # SP_1, SP_2 - 1: [True, True], - 2: [True, True], - 3: [True, False], - 4: [False, False], - 5: [True, True], - 6: [False, False], - }, - ) - return pd.DataFrame(expected_data) - - @staticmethod - def get_transitively_closed_graph() -> nx.DiGraph: - """ - Get the transitive closure of the ontology graph. - - Returns: - nx.DiGraph: A directed graph representing the transitive closure of the ontology graph. - """ - g = nx.DiGraph() - g.add_nodes_from(node for node in ChebiMockOntology.get_nodes()) - g.add_edges_from(GOUniProtMockData.get_edges_of_transitive_closure_graph()) - return g diff --git a/tests/unit/readers/testProteinDataReader.py b/tests/unit/readers/testProteinDataReader.py deleted file mode 100644 index c5bc5e9a..00000000 --- a/tests/unit/readers/testProteinDataReader.py +++ /dev/null @@ -1,139 +0,0 @@ -import unittest -from typing import List -from unittest.mock import mock_open, patch - -from chebai.preprocessing.reader import EMBEDDING_OFFSET, ProteinDataReader - - -class TestProteinDataReader(unittest.TestCase): - """ - Unit tests for the ProteinDataReader class. - """ - - @classmethod - @patch( - "chebai.preprocessing.reader.open", - new_callable=mock_open, - read_data="M\nK\nT\nF\nR\nN", - ) - def setUpClass(cls, mock_file: mock_open) -> None: - """ - Set up the test environment by initializing a ProteinDataReader instance with a mocked token file. - - Args: - mock_file: Mock object for file operations. - """ - cls.reader = ProteinDataReader(token_path="/mock/path") - # After initializing, cls.reader.cache should now be set to ['M', 'K', 'T', 'F', 'R', 'N'] - assert cls.reader.cache == [ - "M", - "K", - "T", - "F", - "R", - "N", - ], "Cache initialization did not match expected tokens." - - def test_read_data(self) -> None: - """ - Test the _read_data method with a protein sequence to ensure it correctly tokenizes the sequence. 
- """ - raw_data = "MKTFFRN" - - # Expected output based on the cached tokens - expected_output: List[int] = [ - EMBEDDING_OFFSET + 0, # M - EMBEDDING_OFFSET + 1, # K - EMBEDDING_OFFSET + 2, # T - EMBEDDING_OFFSET + 3, # F - EMBEDDING_OFFSET + 3, # F (repeated token) - EMBEDDING_OFFSET + 4, # R - EMBEDDING_OFFSET + 5, # N - ] - result = self.reader._read_data(raw_data) - self.assertEqual( - result, - expected_output, - "The _read_data method did not produce the expected tokenized output.", - ) - - def test_read_data_with_new_token(self) -> None: - """ - Test the _read_data method with a protein sequence that includes a new token. - Ensure that the new token is added to the cache and processed correctly. - """ - raw_data = "MKTFY" - - # 'Y' is not in the initial cache and should be added. - expected_output: List[int] = [ - EMBEDDING_OFFSET + 0, # M - EMBEDDING_OFFSET + 1, # K - EMBEDDING_OFFSET + 2, # T - EMBEDDING_OFFSET + 3, # F - EMBEDDING_OFFSET + len(self.reader.cache), # Y (new token) - ] - - result = self.reader._read_data(raw_data) - self.assertEqual( - result, - expected_output, - "The _read_data method did not correctly handle a new token.", - ) - - # Verify that 'Y' was added to the cache - self.assertIn( - "Y", self.reader.cache, "The new token 'Y' was not added to the cache." - ) - # Ensure it's at the correct index - self.assertEqual( - self.reader.cache.index("Y"), - len(self.reader.cache) - 1, - "The new token 'Y' was not added at the correct index in the cache.", - ) - - def test_read_data_with_invalid_token(self) -> None: - """ - Test the _read_data method with an invalid amino acid token to ensure it raises a KeyError. - """ - raw_data = "MKTFZ" # 'Z' is not a valid amino acid token - - with self.assertRaises(KeyError) as context: - self.reader._read_data(raw_data) - - self.assertIn( - "Invalid token 'Z' encountered", - str(context.exception), - "The KeyError did not contain the expected message for an invalid token.", - ) - - def test_read_data_with_empty_sequence(self) -> None: - """ - Test the _read_data method with an empty protein sequence to ensure it returns an empty list. - """ - raw_data = "" - - result = self.reader._read_data(raw_data) - self.assertEqual( - result, - [], - "The _read_data method did not return an empty list for an empty input sequence.", - ) - - def test_read_data_with_repeated_tokens(self) -> None: - """ - Test the _read_data method with repeated amino acid tokens to ensure it handles them correctly. - """ - raw_data = "MMMMM" - - expected_output: List[int] = [EMBEDDING_OFFSET + 0] * 5 # All tokens are 'M' - - result = self.reader._read_data(raw_data) - self.assertEqual( - result, - expected_output, - "The _read_data method did not correctly handle repeated tokens.", - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb deleted file mode 100644 index 6f67c82b..00000000 --- a/tutorials/data_exploration_go.ipynb +++ /dev/null @@ -1,1341 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "da687d32ba48b188", - "metadata": {}, - "source": [ - "# Introduction\n", - "\n", - "This notebook serves as a guide for new developers using the `chebai` package. If you just want to run the experiments, you can refer to the [README.md](https://github.com/ChEB-AI/python-chebai/blob/dev/README.md) and the [wiki](https://github.com/ChEB-AI/python-chebai/wiki) for the basic commands. This notebook explains what happens under the hood for the GO-UniProt dataset. 
It covers\n",
-    "- how to instantiate a data class and generate data\n",
-    "- how the data is processed and stored\n",
-    "- and how to work with different protein sequence encodings.\n",
-    "\n",
-    "The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. This means that you do not have to input any data manually; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly. You can, however, provide your own data files, for instance if you want to replicate a specific experiment.\n",
-    "\n",
-    "---\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "0bd07c91-bb02-48d4-b759-aa35ecb224bd",
-   "metadata": {},
-   "source": [
-    "# 1. Instantiation of a Data Class\n",
-    "\n",
-    "To start working with `chebai`, you first need to instantiate a GO-UniProt data class. This class is responsible for managing, interacting with, and preprocessing the GO and UniProt data."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "a4d590fb-9a83-456e-9cb4-303caa8203e8",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Already in the project root directory: G:\\github-aditya0by0\\python-chebai\n"
-     ]
-    }
-   ],
-   "source": [
-    "# To run this notebook, you need to change the working directory of the jupyter notebook to the root directory of the project.\n",
-    "import os\n",
-    "\n",
-    "# Root directory name of the project\n",
-    "expected_root_dir = \"python-chebai\"\n",
-    "\n",
-    "# Check if the current directory ends with the expected root directory name\n",
-    "if not os.getcwd().endswith(expected_root_dir):\n",
-    "    os.chdir(\"..\")  # Move up one directory level\n",
-    "    if os.getcwd().endswith(expected_root_dir):\n",
-    "        print(\"Changed to project root directory:\", os.getcwd())\n",
-    "    else:\n",
-    "        print(\"Warning: Directory change unsuccessful. Current directory:\", os.getcwd())\n",
-    "else:\n",
-    "    print(\"Already in the project root directory:\", os.getcwd())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "440f203ceaf7e4b7",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-09-30T21:25:03.920610Z",
-     "start_time": "2024-09-30T21:25:03.622407Z"
-    }
-   },
-   "outputs": [],
-   "source": "from chebai.preprocessing.datasets.deepGO.go_uniprot import GOUniProtOver250"
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "a648346d81d0dc5e",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-09-30T21:25:08.863132Z",
-     "start_time": "2024-09-30T21:25:08.387739Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "go_class = GOUniProtOver250(go_branch=\"BP\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "64585012b0d7f66f",
-   "metadata": {},
-   "source": [
-    "### Inheritance Hierarchy\n",
-    "\n",
-    "GO_UniProt data classes inherit from [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), which in turn inherits from [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22). Specifically:\n",
-    "\n",
-    "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for datasets that require dynamic behavior. 
It inherits from `XYBaseDataModule`, which provides the core methods for data loading and processing.\n",
-    "\n",
-    "- **`XYBaseDataModule`**: This is the base class for data modules, providing foundational properties and methods for handling and processing datasets, including data splitting, loading, and preprocessing.\n",
-    "\n",
-    "In summary, GO_UniProt data classes are designed to manage and preprocess protein data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n",
-    "\n",
-    "\n",
-    "### Configuration Parameters\n",
-    "\n",
-    "Data classes related to proteins can be configured using the following main parameters:\n",
-    "\n",
-    "- **`go_branch (str)`**: The Gene Ontology (GO) branch. The default value is `\"all\"`, which includes all branches of GO in the dataset. Restricting the branch allows for more specific datasets focused on a particular aspect of gene function.\n",
-    "  - **`\"BP\"`**: Biological Process branch.\n",
-    "  - **`\"MF\"`**: Molecular Function branch.\n",
-    "  - **`\"CC\"`**: Cellular Component branch.\n",
-    "\n",
-    "- **`max_sequence_length (int)`**: Specifies the maximum allowed sequence length for a protein, with a default of `1002`. During data preprocessing, any proteins exceeding this length will be excluded from further processing.\n",
-    "\n",
-    "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. The default is `None`.\n",
-    "\n",
-    "### Additional Input Parameters\n",
-    "\n",
-    "For more control over various aspects of data loading, processing, and splitting, refer to the documentation of additional parameters in the docstrings of the respective classes: [`_GOUniProtDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py#L33), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n",
-    "\n",
-    "\n",
-    "# Available Data Classes\n",
-    "\n",
-    "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py).\n",
-    "\n",
-    "There is a range of available dataset classes for GO-UniProt data. Usually, you want to use `GOUniProtOver250` or `GOUniProtOver50`. Both inherit from `_GOUniProtOverX`. The number indicates the threshold for selecting label classes. The selection is based on the annotations of each GO term, together with those of its ancestors, across the dataset. For instance, `GOUniProtOver50` will only select labels that have at least 50 samples in the dataset.\n",
-    "\n",
-    "Refer to the `select_classes` method of `_GOUniProtOverX` for more details on the selection process.\n",
-    "\n",
-    "If you need a different threshold, you can create your own subclass; a minimal sketch follows in the next section."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "651ab5c39833bd2c",
-   "metadata": {},
-   "source": [
-    "---"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a52b4363-7398-44aa-a4cc-8bba14bdd966",
-   "metadata": {},
-   "source": [
-    "# 2. Preparation / Setup Methods\n",
-    "\n",
-    "Once a GOUniProt data class instance is created, it typically requires preparation before use. This step generates the actual dataset.\n",
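-    "\n",
-    "If you need a dataset class with a different label-selection threshold (as mentioned in the previous section), a subclass is usually enough. The following is a minimal sketch, not an official API: it assumes only that `_GOUniProtOverX` exposes a `THRESHOLD` class attribute, which the unit tests in this repository also rely on.\n",
-    "\n",
-    "```python\n",
-    "from chebai.preprocessing.datasets.deepGO.go_uniprot import _GOUniProtOverX\n",
-    "\n",
-    "\n",
-    "class GOUniProtOver100(_GOUniProtOverX):\n",
-    "    # Hypothetical subclass: keep only GO terms that annotate at least\n",
-    "    # 100 proteins (counting ancestor annotations) as label classes.\n",
-    "    THRESHOLD = 100\n",
-    "```"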
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "9f77351090560bc4", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Checking for processed data in data\\GO_UniProt\\GO250_BP_1002\\processed\n", - "Missing processed data file (`data.pkl` file)\n", - "Downloading Swiss UniProt data....\n", - "Downloading to temporary file C:\\Users\\HP\\AppData\\Local\\Temp\\tmp7pp677ik\n", - "Downloaded to C:\\Users\\HP\\AppData\\Local\\Temp\\tmp7pp677ik\n", - "Unzipping the file....\n", - "Unpacked and saved to data\\GO_UniProt\\raw\\uniprot_sprot.dat\n", - "Removed temporary file C:\\Users\\HP\\AppData\\Local\\Temp\\tmp7pp677ik\n", - "Missing Gene Ontology raw data\n", - "Downloading Gene Ontology data....\n", - "Extracting class hierarchy...\n", - "Compute transitive closure\n", - "Processing graph\n", - "Parsing swiss uniprot raw data....\n", - "Selecting GO terms based on given threshold: 250 ...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Check for processed data in data\\GO_UniProt\\GO250_BP_1002\\processed\\protein_token\n", - "Cross-validation enabled: False\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Missing transformed data (`data.pt` file). Transforming data.... \n", - "Processing 53604 lines...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|███████████████████████████████████████████████████████████████████████████| 53604/53604 [01:18<00:00, 678.84it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Saving 20 tokens to G:\\github-aditya0by0\\python-chebai\\chebai\\preprocessing\\bin\\protein_token\\tokens.txt...\n", - "First 10 tokens: ['M', 'S', 'I', 'G', 'A', 'T', 'R', 'L', 'Q', 'N']\n" - ] - } - ], - "source": [ - "go_class.prepare_data()\n", - "go_class.setup()" - ] - }, - { - "cell_type": "markdown", - "id": "2328e824c4dafb2d", - "metadata": {}, - "source": [ - "### Automatic Execution: \n", - "These methods are executed automatically within the data class instance. Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n", - "\n", - "\n", - "### Why is Preparation Needed?\n", - "\n", - "- **Data Availability**: The preparation step ensures that the required GOUniProt data files are downloaded or loaded, which are essential for analysis.\n", - "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n", - "\n", - "### Main Methods for Data Preprocessing\n", - "\n", - "The data preprocessing in a data class involves two main methods:\n", - "\n", - "1. **`prepare_data` Method**:\n", - " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n", - " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n", - "\n", - "2. **`setup` Method**:\n", - " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. 
It also handles cross-validation settings if enabled.\n",
-    "   - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible with the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a data reader subclass (for proteins, `ProteinDataReader`) to perform the transformation.\n",
-    "   - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n",
-    "\n",
-    "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "db5b58f2d96823fc",
-   "metadata": {},
-   "source": [
-    "---"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ee174b61b36c71aa",
-   "metadata": {},
-   "source": [
-    "# 3. Overview of the 3 preprocessing stages\n",
-    "\n",
-    "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\n",
-    "\n",
-    "1. **Raw Data Stage**:\n",
-    "   - **File**: `go-basic.obo` and `uniprot_sprot.dat`\n",
-    "   - **Description**: This stage contains the raw GO ontology data and raw Swiss-UniProt data, serving as the initial input for further processing.\n",
-    "   - **File Paths**:\n",
-    "     - `data/GO_UniProt/raw/go-basic.obo`\n",
-    "     - `data/GO_UniProt/raw/uniprot_sprot.dat`\n",
-    "\n",
-    "2. **Processed Data Stage 1**:\n",
-    "   - **File**: `data.pkl`\n",
-    "   - **Description**: This stage includes the data after initial processing. It contains sequence strings, class columns, and metadata but lacks data splits.\n",
-    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\n",
-    "   - **Additional File**: `classes.txt` - A file listing the relevant GO classes.\n",
-    "\n",
-    "3. **Processed Data Stage 2**:\n",
-    "   - **File**: `data.pt`\n",
-    "   - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n",
-    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\n",
-    "   - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n",
-    "\n",
-    "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. Otherwise, it will just be `${dataset_name}`.\n",
-    "\n",
-    "### Summary of File Paths\n",
-    "\n",
-    "- **Raw Data**: `data/GO_UniProt/raw`\n",
-    "- **Processed Data 1**: `data/GO_UniProt/${dataset_name}/processed`\n",
-    "- **Processed Data 2**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}`\n",
-    "\n",
-    "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. 
It also facilitates reproducibility and traceability across different experiments.\n", - "\n", - "### Data Splits\n", - "\n", - "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n", - "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n" - ] - }, - { - "cell_type": "markdown", - "id": "a927ad484c930960", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "3f92b58e460c08fd", - "metadata": {}, - "source": [ - "# 4. Data Files and their structure\n", - "\n", - "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their content.\n" - ] - }, - { - "cell_type": "markdown", - "id": "cca75d881cb8bade", - "metadata": {}, - "source": [ - "## go-basic.obo File\n", - "\n", - "**Description**: The `go-basic.obo` file is a key resource in the Gene Ontology (GO) dataset, containing the ontology data that defines various biological processes, molecular functions, and cellular components, as well as their relationships. This file is downloaded directly from the Gene Ontology Consortium and serves as the foundational raw data for further processing in GO-based applications.\n", - "\n", - "#### Example of a Term Document\n", - "\n", - "```plaintext\n", - "[Term]\n", - "id: GO:0000032\n", - "name: cell wall mannoprotein biosynthetic process\n", - "namespace: biological_process\n", - "def: \"The chemical reactions and pathways resulting in the formation of cell wall mannoproteins, any cell wall protein that contains covalently bound mannose residues.\" [GOC:ai]\n", - "synonym: \"cell wall mannoprotein anabolism\" EXACT []\n", - "is_a: GO:0006057 ! mannoprotein biosynthetic process\n", - "is_a: GO:0031506 ! cell wall glycoprotein biosynthetic process\n", - "```\n", - "\n", - "**File Path**: `data/GO_UniProt/raw/go-basic.obo`\n", - "\n", - "### Structure of `go-basic.obo`\n", - "\n", - "The `go-basic.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific biological process, molecular function, or cellular component within the GO ontology. These attributes include identifiers, names, relationships to other terms, and more.\n", - "\n", - "\n", - "\n", - "### Breakdown of Attributes\n", - "\n", - "Each term document in the `go-basic.obo` file consists of the following key attributes:\n", - "\n", - "- **`[Term]`**: \n", - " - **Description**: Indicates the beginning of a new term in the ontology. 
Each term represents a distinct biological process, molecular function, or cellular component.\n", - "\n", - "- **`id: GO:0000032`**: \n", - " - **Description**: A unique identifier for the biological term within the GO ontology.\n", - " - **Example**: `GO:0000032` refers to the term \"cell wall mannoprotein biosynthetic process.\"\n", - "\n", - "- **`name: cell wall mannoprotein biosynthetic process`**: \n", - " - **Description**: The name of the biological process, molecular function, or cellular component being described.\n", - " - **Example**: The name \"cell wall mannoprotein biosynthetic process\" is a descriptive label for the GO term with the identifier `GO:0000032`.\n", - "\n", - "- **`namespace: biological_process`**: \n", - " - **Description**: Specifies which ontology the term belongs to. The main namespaces are `biological_process`, `molecular_function`, and `cellular_component`.\n", - "\n", - "- **`is_a: GO:0006057`**: \n", - " - **Description**: Defines hierarchical relationships to other terms within the ontology. The `is_a` attribute indicates that the current term is a subclass or specific instance of the referenced term.\n", - " - **Example**: The term `GO:0000032` (\"cell wall mannoprotein biosynthetic process\") is a subclass of `GO:0006057` and subclass of `GO:0031506`.\n" - ] - }, - { - "cell_type": "markdown", - "id": "87c841de7d80beef", - "metadata": {}, - "source": [ - "## uniprot_sprot.dat File\n", - "\n", - "**Description**: The `uniprot_sprot.dat` file is a key component of the UniProtKB/Swiss-Prot dataset. It contains curated protein sequences with detailed annotations. Each entry in the file corresponds to a reviewed protein sequence, complete with metadata about its biological function, taxonomy, gene name, cross-references to other databases, and more. 
Below is a breakdown of the structure and key attributes in the file, using the provided example.\n", - "\n", - "\n", - "### Example of a Protein Entry\n", - "\n", - "```plaintext\n", - "ID 002L_FRG3G Reviewed; 320 AA.\n", - "AC Q6GZX3;\n", - "DT 28-JUN-2011, integrated into UniProtKB/Swiss-Prot.\n", - "DT 19-JUL-2004, sequence version 1.\n", - "DT 08-NOV-2023, entry version 46.\n", - "DE RecName: Full=Uncharacterized protein 002L;\n", - "GN ORFNames=FV3-002L;\n", - "OS Frog virus 3 (isolate Goorha) (FV-3).\n", - "OC Viruses; Varidnaviria; Bamfordvirae; Nucleocytoviricota; Megaviricetes;\n", - "OX NCBI_TaxID=654924;\n", - "OH NCBI_TaxID=8404; Lithobates pipiens (Northern leopard frog) (Rana pipiens).\n", - "RN [1]\n", - "RP NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].\n", - "RX PubMed=15165820; DOI=10.1016/j.virol.2004.02.019;\n", - "RA Tan W.G., Barkman T.J., Gregory Chinchar V., Essani K.;\n", - "RT \"Comparative genomic analyses of frog virus 3, type species of the genus\n", - "RT Ranavirus (family Iridoviridae).\";\n", - "RL Virology 323:70-84(2004).\n", - "CC -!- SUBCELLULAR LOCATION: Host membrane {ECO:0000305}; Single-pass membrane\n", - "CC protein {ECO:0000305}.\n", - "DR EMBL; AY548484; AAT09661.1; -; Genomic_DNA.\n", - "DR RefSeq; YP_031580.1; NC_005946.1.\n", - "DR GeneID; 2947774; -.\n", - "DR KEGG; vg:2947774; -.\n", - "DR Proteomes; UP000008770; Segment.\n", - "DR GO; GO:0033644; C:host cell membrane; IEA:UniProtKB-SubCell.\n", - "DR GO; GO:0016020; C:membrane; IEA:UniProtKB-KW.\n", - "PE 4: Predicted;\n", - "KW Host membrane; Membrane; Reference proteome; Transmembrane;\n", - "KW Transmembrane helix.\n", - "FT CHAIN 1..320\n", - "FT /note=\"Uncharacterized protein 002L\"\n", - "FT /id=\"PRO_0000410509\"\n", - "SQ SEQUENCE 320 AA; 34642 MW; 9E110808B6E328E0 CRC64;\n", - " MSIIGATRLQ NDKSDTYSAG PCYAGGCSAF TPRGTCGKDW DLGEQTCASG FCTSQPLCAR\n", - " IKKTQVCGLR YSSKGKDPLV SAEWDSRGAP YVRCTYDADL IDTQAQVDQF VSMFGESPSL\n", - " AERYCMRGVK NTAGELVSRV SSDADPAGGW CRKWYSAHRG PDQDAALGSF CIKNPGAADC\n", - " KCINRASDPV YQKVKTLHAY PDQCWYVPCA ADVGELKMGT QRDTPTNCPT QVCQIVFNML\n", - " DDGSVTMDDV KNTINCDFSK YVPPPPPPKP TPPTPPTPPT PPTPPTPPTP PTPRPVHNRK\n", - " VMFFVAGAVL VAILISTVRW\n", - "//\n", - "```\n", - "\n", - "**File Path**: `data/GO_UniProt/raw/uniprot_sprot.dat`\n", - "\n", - "\n", - "## Structure of `uniprot_sprot.dat`\n", - "\n", - "The `uniprot_sprot.dat` file is organized into blocks of text, each representing a single protein entry. These blocks contain specific tags and fields that describe different aspects of the protein, including its sequence, function, taxonomy, and cross-references to external databases.\n", - "\n", - "### Breakdown of Attributes\n", - "\n", - "Each protein entry in the `uniprot_sprot.dat` file is structured with specific tags and sections that describe the protein in detail. 
Here's a breakdown of the key attributes:\n", - "\n", - "- **`ID`**: \n", - " - **Description**: Contains the unique identifier for the protein and its status (e.g., `Reviewed` indicates the sequence has been manually curated).\n", - " - **Example**: `002L_FRG3G` is the identifier for the protein from Frog virus 3.\n", - "\n", - "- **`AC`**: \n", - " - **Description**: Accession number, a unique identifier for the protein sequence.\n", - " - **Example**: `Q6GZX3` is the accession number for this entry.\n", - "\n", - "- **`DR`**: \n", - " - **Description**: Cross-references to other databases like EMBL, RefSeq, KEGG, and GeneID.\n", - " - **Example**: This entry is cross-referenced with the EMBL database, RefSeq, GO, etc.\n", - "\n", - "- **`GO`**: \n", - " - **Description**: Gene Ontology annotations that describe the cellular component, biological process, or molecular function associated with the protein.\n", - " - **Example**: The protein is associated with the GO terms `GO:0033644` (host cell membrane) and `GO:0016020` (membrane).\n", - "\n", - "- **`SQ`**: \n", - " - **Description**: The amino acid sequence of the protein.\n", - " - **Example**: The sequence consists of 320 amino acids.\n", - "\n", - "__Note__: For more detailed information refer [here](https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/keywlist.txt\n", - "). \n", - "\n", - "Consider the below line from above example: \n", - "```plaintext\n", - "DR GO; GO:0033644; C:host cell membrane; IEA:UniProtKB-SubCell.\n", - "```\n", - "\n", - "The line contains a **Gene Ontology (GO) annotation** describing the protein's subcellular location. Here's a detailed breakdown:\n", - "\n", - "- **`GO:0033644`**: This is the specific **GO term** identifier for \"host cell membrane,\" which indicates that the protein is associated with or located at the membrane of the host cell.\n", - "\n", - "- **`IEA`**: This stands for **Inferred from Electronic Annotation**, which is part of the **GO Evidence Codes**. **IEA** indicates that the annotation was automatically generated based on computational methods rather than direct experimental evidence. While **IEA** annotations are useful, they are generally considered less reliable than manually curated or experimentally verified evidence codes.\n", - "\n", - "__Note__: For more details on evidence codes check section 5.2" - ] - }, - { - "cell_type": "markdown", - "id": "b7687078-f6b8-4fbf-afa7-dfda89061a5e", - "metadata": {}, - "source": [ - "## data.pkl File\n", - "\n", - "**Description**: This file is generated by the `prepare_data` method and contains the processed GO data in a dataframe format. It includes protein IDs, data representations (such as sequence strings), and class columns with boolean values." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "b4da7e73e251e1d1", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-30T14:08:33.990378Z", - "start_time": "2024-09-30T14:08:33.959459Z" - } - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "b66fbb9b720d053c", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-30T14:10:12.796911Z", - "start_time": "2024-09-30T14:10:06.052276Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Size of the data (rows x columns): (53604, 902)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
swiss_idaccessiongo_idssequence4175122165209226...1990778200002620001452000146200014720002412000243200114120012332001234
111S1_CARILB5KVH4[3006, 8150, 9791, 10431, 21700, 22414, 32501,...MAKPILLSIYLCLIIVALFNGCLAQSGGRQQHKFGQCQLNRLDALE...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
311S2_SESINQ9XHP0[3006, 8150, 10431, 21700, 22414, 32502, 48609]MVAFKFLLALSLSLLVSAAIAQTREPRLTQGQQCRFQRISGAQPSL...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
614310_ARATHP48347,Q9LME5[7165, 8150, 9742, 9755, 9987, 43401, 50789, 5...MENEREKQVYLAKLSEQTERYDEMVEAMKKVAQLDVELTVEERNLV...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
814331_ARATHP42643,Q945M2,Q9M0S7[8150, 19222, 50789, 65007]MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
914331_CAEELP41932,Q21537[132, 226, 1708, 6611, 6810, 6886, 6913, 6950,...MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...FalseFalseFalseFalseFalseTrue...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
\n", - "

5 rows × 902 columns

\n", - "
" - ], - "text/plain": [ - " swiss_id accession \\\n", - "1 11S1_CARIL B5KVH4 \n", - "3 11S2_SESIN Q9XHP0 \n", - "6 14310_ARATH P48347,Q9LME5 \n", - "8 14331_ARATH P42643,Q945M2,Q9M0S7 \n", - "9 14331_CAEEL P41932,Q21537 \n", - "\n", - " go_ids \\\n", - "1 [3006, 8150, 9791, 10431, 21700, 22414, 32501,... \n", - "3 [3006, 8150, 10431, 21700, 22414, 32502, 48609] \n", - "6 [7165, 8150, 9742, 9755, 9987, 43401, 50789, 5... \n", - "8 [8150, 19222, 50789, 65007] \n", - "9 [132, 226, 1708, 6611, 6810, 6886, 6913, 6950,... \n", - "\n", - " sequence 41 75 122 \\\n", - "1 MAKPILLSIYLCLIIVALFNGCLAQSGGRQQHKFGQCQLNRLDALE... False False False \n", - "3 MVAFKFLLALSLSLLVSAAIAQTREPRLTQGQQCRFQRISGAQPSL... False False False \n", - "6 MENEREKQVYLAKLSEQTERYDEMVEAMKKVAQLDVELTVEERNLV... False False False \n", - "8 MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT... False False False \n", - "9 MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL... False False False \n", - "\n", - " 165 209 226 ... 1990778 2000026 2000145 2000146 2000147 \\\n", - "1 False False False ... False False False False False \n", - "3 False False False ... False False False False False \n", - "6 False False False ... False False False False False \n", - "8 False False False ... False False False False False \n", - "9 False False True ... False False False False False \n", - "\n", - " 2000241 2000243 2001141 2001233 2001234 \n", - "1 False False False False False \n", - "3 False False False False False \n", - "6 False False False False False \n", - "8 False False False False False \n", - "9 False False False False False \n", - "\n", - "[5 rows x 902 columns]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pkl_df = pd.DataFrame(\n", - " pd.read_pickle(\n", - " os.path.join(\n", - " go_class.processed_dir_main,\n", - " go_class.processed_dir_main_file_names_dict[\"data\"],\n", - " )\n", - " )\n", - ")\n", - "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", - "pkl_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "735844f0b2474ad6", - "metadata": {}, - "source": [ - "**File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\n", - "\n", - "\n", - "### Structure of `data.pkl`\n", - "`data.pkl` as following structure: \n", - "- **Column 0**: Contains the Identifier from Swiss-UniProt Dataset for each Swiss Protein data instance.\n", - "- **Column 1**: Contains the accession of each Protein data instance.\n", - "- **Column 2**: Contains the list of GO-IDs (Identifiers from Gene Ontology) which maps each Swiss Protein to the Gene Ontology instance.\n", - "- **Column 3**: Contains the sequence representation for the Swiss Protein using Amino Acid notation.\n", - "- **Column 4 and onwards**: Contains the labels, starting from column 4.\n", - "\n", - "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n" - ] - }, - { - "cell_type": "markdown", - "id": "2c9b17f6-93bd-4cc3-8967-7ab1d2e06e51", - "metadata": {}, - "source": [ - "## data.pt File\n", - "\n", - "**Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input." 
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "85b097601fb242d6", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-30T14:10:35.034002Z", - "start_time": "2024-09-30T14:10:35.018342Z" - } - }, - "outputs": [], - "source": [ - "import torch" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "289a54a71dec20fb", - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-30T14:11:36.443693Z", - "start_time": "2024-09-30T14:11:34.199285Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Type of loaded data: \n", - "Content of the data file: \n", - " {'features': [10, 14, 21, 23, 12, 17, 17, 11, 12, 22, 17, 24, 17, 12, 12, 28, 14, 17, 25, 19, 13, 24, 17, 14, 18, 11, 13, 13, 16, 18, 18, 29, 21, 25, 13, 18, 24, 18, 17, 19, 16, 17, 20, 14, 17, 27, 23, 15, 19, 16, 12, 27, 14, 27, 14, 13, 28, 12, 27, 11, 26, 20, 23, 19, 29, 18, 18, 17, 18, 24, 14, 13, 28, 14, 28, 28, 16, 16, 15, 12, 27, 23, 19, 13, 17, 17, 17, 23, 29, 22, 11, 19, 14, 23, 18, 17, 28, 22, 12, 14, 16, 13, 16, 13, 12, 15, 13, 28, 17, 25, 23, 13, 24, 23, 27, 15, 25, 27, 27, 11, 18, 16, 18, 11, 18, 18, 13, 18, 16, 16, 27, 25, 18, 18, 20, 16, 29, 18, 21, 12, 16, 29, 25, 16, 27, 13, 20, 12, 12, 14, 25, 23, 14, 13, 28, 14, 29, 26, 24, 22, 19, 20, 13, 11, 11, 23, 28, 28, 14, 12, 25, 17, 17, 20, 15, 29, 19, 19, 14, 19, 18, 17, 20, 18, 19, 23, 16, 19, 25, 22, 17, 14, 13, 19, 23, 20, 20, 27, 25, 16, 23, 18, 13, 18, 18, 27, 22, 27, 18, 29, 16, 16, 18, 18, 18, 29, 18, 18, 16, 16, 13, 27, 29, 13, 27, 18, 18, 16, 20, 17, 13, 19, 19, 28, 25, 11, 13, 25, 20, 14, 27, 25, 17, 14, 20, 14, 25, 19, 28, 20, 15, 27, 15, 14, 16, 16, 17, 18, 11, 27, 19, 20, 29, 16, 13, 11, 12, 28, 16, 28, 27, 13, 16, 18, 17, 18, 28, 12, 16, 23, 16, 26, 11, 16, 27, 27, 18, 27, 29, 27, 27, 16, 21, 27, 16, 27, 16, 27, 16, 27, 11, 27, 11, 27, 16, 16, 18, 11, 16, 16, 13, 13, 16, 20, 20, 19, 13, 17, 27, 27, 15, 12, 24, 15, 17, 11, 17, 16, 27, 19, 12, 13, 20, 23, 11, 16, 14, 20, 12, 22, 15, 27, 27, 14, 13, 16, 12, 11, 15, 28, 19, 11, 29, 19, 17, 23, 12, 17, 16, 26, 17, 18, 17, 11, 14, 27, 16, 13, 14, 17, 22, 11, 20, 14, 17, 22, 28, 23, 29, 26, 19, 17, 19, 14, 29, 11, 28, 28, 22, 14, 17, 16, 13, 16, 14, 27, 28, 18, 28, 28, 20, 19, 25, 13, 18, 15, 28, 25, 20, 20, 27, 17, 16, 27, 13, 18, 17, 17, 15, 12, 23, 18, 19, 25, 14, 28, 28, 21, 16, 14, 16, 20, 27, 13, 25, 27, 26, 28, 11, 25, 21, 15, 19, 27, 19, 14, 10, 28, 11, 23, 17, 14, 13, 16, 15, 11, 14, 12, 16, 14, 17, 23, 27, 27, 28, 17, 28, 19, 14, 25, 18, 12, 23, 16, 27, 20, 14, 16, 16, 17, 21, 25, 19, 16, 18, 27, 11, 15, 17, 28, 16, 11, 16, 11, 16, 11, 11, 16, 11, 27, 16, 16, 14, 27, 28], 'labels': array([False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, True, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, 
False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, True, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, True, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, True, False, False, False, False, False, True,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, True, True, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, 
False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, True,\n", - " True, False, False, False, False, False, False, False, False,\n", - " True, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False, False, False,\n", - " False, False, False, False, False, False, False]), 'ident': '11S1_CARIL', 'group': None}\n" - ] - } - ], - "source": [ - "data_pt = torch.load(\n", - " os.path.join(go_class.processed_dir, go_class.processed_file_names_dict[\"data\"]),\n", - " weights_only=False,\n", - ")\n", - "print(\"Type of loaded data:\", type(data_pt))\n", - "print(\"Content of the data file: \\n\", data_pt[0])" - ] - }, - { - "cell_type": "markdown", - "id": "2c9f23883c66b48d", - "metadata": {}, - "source": [ - "**File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\n", - "\n", - "The `data.pt` file is a list where each element is a dictionary with the following keys:\n", - "\n", - "- **`features`**: \n", - " - **Description**: This key holds the input features for the model. 
The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n",
- "\n",
- "- **`labels`**: \n",
- " - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n",
- "\n",
- "- **`ident`**: \n",
- " - **Description**: This key holds identifiers for each data instance. These identifiers help track and reference the individual samples in the dataset.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "36aed0b8-ab05-428d-8833-2a24deebacc3",
- "metadata": {},
- "source": [
- "## classes.txt File\n",
- "\n",
- "**Description**: This file lists the GO classes that are used as labels. It can be used to match labels in `data.pt` with GO classes: For position `i` in the label-tensor, the GO-ID is in line `i` of `classes.txt`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "19200f7ff9a6ebba",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2024-09-30T21:30:34.344202Z",
- "start_time": "2024-09-30T21:30:34.328318Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "41\n",
- "75\n",
- "122\n",
- "165\n",
- "209\n"
- ]
- }
- ],
- "source": [
- "with open(os.path.join(go_class.processed_dir_main, \"classes.txt\"), \"r\") as file:\n",
- " for i in range(5):\n",
- " line = file.readline()\n",
- " print(line.strip())"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f69012b3540fd1b6",
- "metadata": {},
- "source": [
- "**File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\n",
- "\n",
- "The `classes.txt` file lists selected GO classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique GO class ID, identifying a specific class from the Gene Ontology."
- ]
- },
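- {
- "cell_type": "markdown",
- "id": "7a1e4f2c9b3d5e60",
- "metadata": {},
- "source": [
- "As described above, position `i` in the label tensor corresponds to line `i` of `classes.txt`. The cell below is a small illustrative sketch (not part of the original pipeline): it decodes the boolean label vector of one sample back into GO class IDs, reusing the `go_class` and `data_pt` objects from the cells above."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8b2f5d3ea4c6071f",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Illustrative sketch: map label positions back to GO class IDs via classes.txt.\n",
- "with open(os.path.join(go_class.processed_dir_main, \"classes.txt\"), \"r\") as f:\n",
- " go_ids = [line.strip() for line in f]\n",
- "\n",
- "sample = data_pt[0] # loaded from data.pt above\n",
- "active = [go_ids[i] for i, flag in enumerate(sample[\"labels\"]) if flag]\n",
- "print(sample[\"ident\"], \"is annotated with\", len(active), \"GO classes:\", active)"
- ]
- },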
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idsplit
014331_ARATHtrain
114331_CAEELtrain
214331_MAIZEtrain
314332_MAIZEtrain
414333_ARATHtrain
\n", - "
" - ], - "text/plain": [ - " id split\n", - "0 14331_ARATH train\n", - "1 14331_CAEEL train\n", - "2 14331_MAIZE train\n", - "3 14332_MAIZE train\n", - "4 14333_ARATH train" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "csv_df = pd.read_csv(os.path.join(go_class.processed_dir_main, \"splits.csv\"))\n", - "csv_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "6661dc11247e9753", - "metadata": {}, - "source": [ - "**File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\n", - "\n", - "To reuse an existing split, you can use the `splits_file_path` argument. This way, you can reuse the same datasplit across several runs." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "2b02d8b4-c2de-4b8e-b680-ec67b40d9a30", - "metadata": {}, - "outputs": [], - "source": [ - "# You can specify a literal path for the `splits_file_path`, or if another `go_class` instance is already defined,\n", - "# you can use its existing `splits_file_path` attribute for consistency.\n", - "go_class_with_splits = GOUniProtOver250(\n", - " go_branch=\"BP\",\n", - " # splits_file_path=\"data/GO_UniProt/GO250_BP_1002/processed/splits.csv\", # Literal path option\n", - " splits_file_path=go_class.splits_file_path, # Use path from an existing `go_class` instance\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "e6b1f184a5091b83", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "481b8c0271ec9636", - "metadata": {}, - "source": [ - "## 5.1 Protein Representation Using Amino Acid Sequence Notation\n", - "\n", - "Proteins are composed of chains of amino acids, and these sequences can be represented using a one-letter notation for each amino acid. This notation provides a concise way to describe the primary structure of a protein.\n", - "\n", - "### Example Protein Sequence\n", - "\n", - "Protein: **Lysozyme C** from **Gallus gallus** (Chicken). \n", - "[Lysozyme C - UniProtKB P00698](https://www.uniprot.org/uniprotkb/P00698/entry#function)\n", - "\n", - "- **Sequence**: `MRSLLILVLCFLPLAALGKVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL`\n", - "- **Sequence Length**: 147\n", - "\n", - "In this sequence, each letter corresponds to a specific amino acid. This notation is widely used in bioinformatics and molecular biology to represent protein sequences.\n", - "\n", - "### Tokenization and Encoding\n", - "\n", - "To tokenize and numerically encode this protein sequence, the `ProteinDataReader` class is used. This class allows for n-gram tokenization, where the `n_gram` parameter defines the size of the tokenized units. If `n_gram` is not provided (default is `None`), each amino acid letter is treated as a single token.\n", - "\n", - "For more details, you can explore the implementation of the `ProteinDataReader` class in the source code [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/reader.py)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "e0cf4fb6-2ca4-4b85-a4e7-0cfbac5cd6c1", - "metadata": {}, - "outputs": [], - "source": [ - "from chebai.preprocessing.reader import ProteinDataReader" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "e8343d83-0be3-44df-9224-bba8d5c32336", - "metadata": {}, - "outputs": [], - "source": [ - "protein_dr_3gram = ProteinDataReader(n_gram=3)\n", - "protein_dr = ProteinDataReader()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "8a18dc27-f308-4dde-b1ae-b03a20fb0d45", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[10, 16, 11, 17, 17, 12, 17, 28, 17, 24, 25, 17, 23, 17, 14, 14, 17, 13, 21]\n", - "[30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]\n" - ] - } - ], - "source": [ - "protein = \"MRSLLILVLCFLPLAALGK\"\n", - "print(protein_dr._read_data(protein))\n", - "print(protein_dr_3gram._read_data(protein))" - ] - }, - { - "cell_type": "markdown", - "id": "7e95738a-0b2d-4c56-ac97-f3b24c1de18f", - "metadata": {}, - "source": [ - "The numbers mentioned above refer to the index of each individual token from the [`tokens.txt`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/bin/protein_token/tokens.txt) file, which is used by the `ProteinDataReader` class. \n", - "\n", - "Each token in the `tokens.txt` file corresponds to a specific amino-acid letter, and these tokens are referenced by their index. Additionally, the index values are offset by the `EMBEDDING_OFFSET`, ensuring that the token embeddings are adjusted appropriately during processing." - ] - }, - { - "cell_type": "markdown", - "id": "fd54ca4a-743c-496e-9e89-cff2d8226eb2", - "metadata": {}, - "source": [ - "### The 20 Amino Acids and Their One-Letter Notations\n", - "\n", - "Here is a list of the 20 standard amino acids, along with their one-letter notations and descriptions:\n", - "\n", - "| One-Letter Notation | Amino Acid Name | Description |\n", - "|---------------------|----------------------|---------------------------------------------------------|\n", - "| **A** | Alanine | Non-polar, aliphatic amino acid. |\n", - "| **C** | Cysteine | Polar, contains a thiol group, forms disulfide bonds. |\n", - "| **D** | Aspartic Acid | Acidic, negatively charged at physiological pH. |\n", - "| **E** | Glutamic Acid | Acidic, negatively charged at physiological pH. |\n", - "| **F** | Phenylalanine | Aromatic, non-polar. |\n", - "| **G** | Glycine | Smallest amino acid, non-polar. |\n", - "| **H** | Histidine | Polar, positively charged, can participate in enzyme active sites. |\n", - "| **I** | Isoleucine | Non-polar, aliphatic. |\n", - "| **K** | Lysine | Basic, positively charged at physiological pH. |\n", - "| **L** | Leucine | Non-polar, aliphatic. |\n", - "| **M** | Methionine | Non-polar, contains sulfur, start codon in mRNA translation. |\n", - "| **N** | Asparagine | Polar, uncharged. |\n", - "| **P** | Proline | Non-polar, introduces kinks in protein chains. |\n", - "| **Q** | Glutamine | Polar, uncharged. |\n", - "| **R** | Arginine | Basic, positively charged, involved in binding phosphate groups. |\n", - "| **S** | Serine | Polar, can be phosphorylated. |\n", - "| **T** | Threonine | Polar, can be phosphorylated. |\n", - "| **V** | Valine | Non-polar, aliphatic. |\n", - "| **W** | Tryptophan | Aromatic, non-polar, largest amino acid. |\n", - "| **Y** | Tyrosine | Aromatic, polar, can be phosphorylated. 
|\n", - "\n", - "### Understanding Protein Sequences\n", - "\n", - "In the example sequence, each letter represents one of the above amino acids. The sequence reflects the specific order of amino acids in the protein, which is critical for its structure and function.\n", - "\n", - "This notation is used extensively in various bioinformatics tools and databases to study protein structure, function, and interactions.\n", - "\n", - "\n", - "_Note_: Refer for amino acid sequence: https://en.wikipedia.org/wiki/Protein_primary_structure" - ] - }, - { - "cell_type": "markdown", - "id": "db6d7f2cc446e6f9", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "7f42b928364e5cd1", - "metadata": {}, - "source": [ - "## 5.2 More on GO Evidence Codes\n", - "\n", - "The **Gene Ontology (GO) Evidence Codes** provide a way to indicate the level of evidence supporting a GO annotation. Here's a list of the GO evidence codes with brief descriptions:\n", - "\n", - "| **Evidence Code** | **Description** |\n", - "|-----------------------|-----------------|\n", - "| **EXP** | [Inferred from Experiment (EXP)](http://wiki.geneontology.org/index.php/Inferred_from_Experiment_(EXP)) |\n", - "| **IDA** | [Inferred from Direct Assay (IDA)](http://wiki.geneontology.org/index.php/Inferred_from_Direct_Assay_(IDA)) |\n", - "| **IPI** | [Inferred from Physical Interaction (IPI)](http://wiki.geneontology.org/index.php/Inferred_from_Physical_Interaction_(IPI)) |\n", - "| **IMP** | [Inferred from Mutant Phenotype (IMP)](http://wiki.geneontology.org/index.php/Inferred_from_Mutant_Phenotype_(IMP)) |\n", - "| **IGI** | [Inferred from Genetic Interaction (IGI)](http://wiki.geneontology.org/index.php/Inferred_from_Genetic_Interaction_(IGI)) |\n", - "| **IEP** | [Inferred from Expression Pattern (IEP)](http://wiki.geneontology.org/index.php/Inferred_from_Expression_Pattern_(IEP)) |\n", - "| **HTP** | [Inferred from High Throughput Experiment (HTP)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Experiment_(HTP) ) |\n", - "| **HDA** | [Inferred from High Throughput Direct Assay (HDA)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Direct_Assay_(HDA)) |\n", - "| **HMP** | [Inferred from High Throughput Mutant Phenotype (HMP)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Mutant_Phenotype_(HMP)) |\n", - "| **HGI** | [Inferred from High Throughput Genetic Interaction (HGI)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Genetic_Interaction_(HGI)) |\n", - "| **HEP** | [Inferred from High Throughput Expression Pattern (HEP)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Expression_Pattern_(HEP)) |\n", - "| **IBA** | [Inferred from Biological aspect of Ancestor (IBA)](http://wiki.geneontology.org/index.php/Inferred_from_Biological_aspect_of_Ancestor_(IBA)) |\n", - "| **IBD** | [Inferred from Biological aspect of Descendant (IBD)](http://wiki.geneontology.org/index.php/Inferred_from_Biological_aspect_of_Descendant_(IBD)) |\n", - "| **IKR** | [Inferred from Key Residues (IKR)](http://wiki.geneontology.org/index.php/Inferred_from_Key_Residues_(IKR)) |\n", - "| **IRD** | [Inferred from Rapid Divergence (IRD)](http://wiki.geneontology.org/index.php/Inferred_from_Rapid_Divergence(IRD)) |\n", - "| **ISS** | [Inferred from Sequence or Structural Similarity (ISS)](http://wiki.geneontology.org/index.php/Inferred_from_Sequence_or_structural_Similarity_(ISS)) |\n", - "| **ISO** | [Inferred from Sequence 
Orthology (ISO)](http://wiki.geneontology.org/index.php/Inferred_from_Sequence_Orthology_(ISO)) |\n", - "| **ISA** | [Inferred from Sequence Alignment (ISA)](http://wiki.geneontology.org/index.php/Inferred_from_Sequence_Alignment_(ISA)) |\n", - "| **ISM** | [Inferred from Sequence Model (ISM)](http://wiki.geneontology.org/index.php/Inferred_from_Sequence_Model_(ISM)) |\n", - "| **RCA** | [Inferred from Reviewed Computational Analysis (RCA)](http://wiki.geneontology.org/index.php/Inferred_from_Reviewed_Computational_Analysis_(RCA)) |\n", - "| **IEA** | [Inferred from Electronic Annotation (IEA)](http://wiki.geneontology.org/index.php/Inferred_from_Electronic_Annotation_(IEA)) |\n", - "| **TAS** | [Traceable Author Statement (TAS)](http://wiki.geneontology.org/index.php/Traceable_Author_Statement_(TAS)) |\n", - "| **NAS** | [Non-traceable Author Statement (NAS)](http://wiki.geneontology.org/index.php/Non-traceable_Author_Statement_(NAS)) |\n", - "| **IC** | [Inferred by Curator (IC)](http://wiki.geneontology.org/index.php/Inferred_by_Curator_(IC)) |\n", - "| **ND** | [No Biological Data Available (ND)](http://wiki.geneontology.org/index.php/No_biological_Data_available_(ND)_evidence_code) |\n", - "| **NR** | Not Recorded |\n", - "\n", - "\n", - "### **Grouping of Codes**:\n", - "\n", - "- **Experimental Evidence Codes**:\n", - " - **EXP**, **IDA**, **IPI**, **IMP**, **IGI**, **IEP**\n", - " \n", - "- **High-Throughput Experimental Codes**:\n", - " - **HTP**, **HDA**, **HMP**, **HGI**, **HEP**\n", - "\n", - "- **Phylogenetically-Inferred Codes**:\n", - " - **IBA**, **IBD**, **IKR**, **IRD**\n", - "\n", - "- **Author/Curator Inferred Codes**:\n", - " - **TAS**, **IC**, **NAS**\n", - "\n", - "- **Computational Evidence Codes**:\n", - " - **IEA**, **ISS**, **ISA**, **ISM**, **ISO**, **RCA**\n", - "\n", - "- **Others**:\n", - " - **ND** (No Biological Data Available), **NR** (Not Recorded)\n", - "\n", - "\n", - "These evidence codes ensure transparency and give researchers an understanding of how confident they can be in a particular GO annotation.\n", - "\n", - "__Note__ : For more information on GO evidence codes please check [here](https://geneontology.org/docs/guide-go-evidence-codes/) " - ] - }, - { - "cell_type": "markdown", - "id": "1c11d6f520b02434", - "metadata": {}, - "source": [ - "---" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tutorials/data_exploration_scope.ipynb b/tutorials/data_exploration_scope.ipynb deleted file mode 100644 index c14046ac..00000000 --- a/tutorials/data_exploration_scope.ipynb +++ /dev/null @@ -1,1182 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b", - "metadata": {}, - "source": [ - "# Introduction\n", - "\n", - "This notebook serves as a guide for new developers using the `chebai` package. If you just want to run the experiments, you can refer to the [README.md](https://github.com/ChEB-AI/python-chebai/blob/dev/README.md) and the [wiki](https://github.com/ChEB-AI/python-chebai/wiki) for the basic commands. This notebook explains what happens under the hood for the SCOPe dataset. 
It covers\n",
- "- how to instantiate a data class and generate data\n",
- "- how the data is processed and stored\n",
- "- and how to work with different protein encodings.\n",
- "\n",
- "The `chebai` package simplifies the handling of these datasets by **automatically downloading and processing** them as needed. This means that you do not have to input any data manually; the package will generate and organize the data files based on the parameters and encodings selected. You can, however, provide your own data files, for instance if you want to replicate a specific experiment.\n",
- "\n",
- "---\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "cca637ce-d4ea-4365-acd9-657418e0640f",
- "metadata": {},
- "source": [
- "### Overview of SCOPe Data and its Usage in Protein-Related Tasks\n",
- "\n",
- "#### **What is SCOPe?**\n",
- "\n",
- "The **Structural Classification of Proteins — extended (SCOPe)** is a comprehensive database that extends the original SCOP (Structural Classification of Proteins) database. SCOPe offers a detailed classification of protein domains based on their structural and evolutionary relationships.\n",
- "\n",
- "The SCOPe database, like SCOP, organizes proteins into a hierarchy of domains based on structural similarities, which is crucial for understanding evolutionary patterns and functional aspects of proteins. This hierarchical structure is comparable to taxonomy in biology, where species are classified based on shared characteristics.\n",
- "\n",
- "#### **SCOPe Hierarchy:**\n",
- "By analogy with taxonomy, SCOP was created as a hierarchy of several levels where the fundamental unit of classification is a **domain** in the experimentally determined protein structure. Starting at the bottom, the hierarchy of SCOP domains comprises the following levels:\n",
- "\n",
- "1. **Species**: Representing distinct protein sequences and their naturally occurring or artificially created variants.\n",
- "2. **Protein**: Groups together similar sequences with essentially the same functions. These can originate from different biological species or represent isoforms within the same species.\n",
- "3. **Family**: Contains proteins with similar sequences but typically distinct functions.\n",
- "4. **Superfamily**: Bridges protein families with common functional and structural features, often inferred from a shared evolutionary ancestor.\n",
- "5. **Fold**: Groups structurally similar superfamilies. \n",
- "6. **Class**: Based on secondary structure content and organization. This level classifies proteins based on their secondary structure properties, such as alpha-helices and beta-sheets.\n",
- "\n",
- "\n",
- "\n",
- "For more details, you can refer to the [SCOPe documentation](https://scop.berkeley.edu/help/ver=2.08).\n",
- "\n",
- "---\n",
- "\n",
- "#### **Why are We Using SCOPe?**\n",
- "\n",
- "We are integrating the SCOPe data into our pipeline as part of an ontology pretraining task for protein-related models. SCOPe is a great fit for our goal because it is primarily **structure-based**, unlike other protein-related databases like Gene Ontology (GO), which focuses more on functional classes.\n",
- "\n",
- "Our primary objective is to reproduce **ontology pretraining** on a protein-related task, and SCOPe provides the structural ontology that we need for this. 
The steps in our pipeline are aligned as follows:\n", - "\n", - "| **Stage** | **Chemistry Task** | **Proteins Task** |\n", - "|--------------------------|-------------------------------------|------------------------------------------------|\n", - "| **Unsupervised Pretraining** | Mask pretraining (ELECTRA) | Mask pretraining (ESM2, optional) |\n", - "| **Ontology Pretraining** | ChEBI | SCOPe |\n", - "| **Finetuning Task** | Toxicity, Solubility, etc. | GO (MF, BP, CC branches) |\n", - "\n", - " \n", - "This integration will allow us to use **SCOPe** for tasks such as **protein classification** and will contribute to the success of **pretraining models** for protein structures. The data will be processed with the same approach as the GO data, with **different labels** corresponding to the SCOPe classification system.\n", - "\n", - "---\n", - "\n", - "#### **Why SCOPe is Suitable for Our Task**\n", - "\n", - "1. **Structure-Based Classification**: SCOPe is primarily concerned with the structural characteristics of proteins, making it ideal for protein structure pretraining tasks. This contrasts with other ontology databases like **GO**, which categorize proteins based on more complex functional relationships.\n", - " \n", - "2. **Manageable Size**: SCOPe contains around **140,000 entries**, making it a manageable dataset for training models. This is similar in size to **ChEBI**, which is used in the chemical domain, and ensures we can work with it effectively for pretraining." - ] - }, - { - "cell_type": "markdown", - "id": "338e452f-426c-493d-bec2-5bd51e24e4aa", - "metadata": {}, - "source": [ - "\n", - "### Protein Data Bank (PDB)\n", - "\n", - "The **Protein Data Bank (PDB)** is a global repository that stores 3D structural data of biological macromolecules like proteins and nucleic acids. It contains information obtained through experimental methods such as **X-ray crystallography**, **NMR spectroscopy**, and **cryo-EM**. The data includes atomic coordinates, secondary structure details, and experimental conditions.\n", - "\n", - "The PDB is an essential resource for **structural biology**, **bioinformatics**, and **drug discovery**, enabling scientists to understand protein functions, interactions, and mechanisms at the molecular level.\n", - "\n", - "For more details, visit the [RCSB PDB website](https://www.rcsb.org/).\n" - ] - }, - { - "cell_type": "markdown", - "id": "f6c25706-251c-438c-9915-e8002647eb94", - "metadata": {}, - "source": [ - "### Understanding [SCOPe](https://scop.berkeley.edu/) and [PDB](https://www.rcsb.org/) \n", - "\n", - "\n", - "1. **Protein domains form chains.** \n", - "2. **Chains form complexes** (protein complexes or structures). \n", - "3. These **complexes are the entries in PDB**, represented by unique identifiers like `\"1A3N\"`. \n", - "\n", - "---\n", - "\n", - "#### **Protein Domain** \n", - "A **protein domain** is a **structural and functional unit** of a protein. \n", - "\n", - "\n", - "##### Key Characteristics:\n", - "- **Domains are part of a protein chain.** \n", - "- A domain can span: \n", - " 1. **The entire chain** (single-domain protein): \n", - " - In this case, the protein domain is equivalent to the chain itself. \n", - " - Example: \n", - " - All chains of the **PDB structure \"1A3N\"** are single-domain proteins. \n", - " - Each chain has a SCOPe domain identifier. \n", - " - For example, Chain **A**: \n", - " - Domain identifier: `d1a3na_` \n", - " - Breakdown of the identifier: \n", - " - `d`: Denotes domain. 
\n", - " - `1a3n`: Refers to the PDB protein structure identifier. \n", - " - `a`: Specifies the chain within the structure. (`_` for None and `.` for multiple chains)\n", - " - `_`: Indicates the domain spans the entire chain (single-domain protein). \n", - " - Example: [PDB Structure 1A3N - Chain A](https://www.rcsb.org/sequence/1A3N#A)\n", - " 2. **A specific portion of the chain** (multi-domain protein): \n", - " - Here, a single chain contains multiple domains. \n", - " - Example: Chain **A** of the **PDB structure \"1PKN\"** contains three domains: `d1pkna1`, `d1pkna2`, `d1pkna3`. \n", - " - Example: [PDB Structure 1PKN - Chain A](https://www.rcsb.org/annotations/1PKN). \n", - "\n", - "---\n", - "\n", - "#### **Protein Chain** \n", - "A **protein chain** refers to the entire **polypeptide chain** observed in a protein's 3D structure (as described in PDB files). \n", - "\n", - "##### Key Points:\n", - "- A chain can consist of **one or multiple domains**:\n", - " - **Single-domain chain**: The chain and domain are identical. \n", - " - Example: Myoglobin. \n", - " - **Multi-domain chain**: Contains several domains, each with distinct structural and functional roles. \n", - "- Chains assemble to form **protein complexes** or **structures**. \n", - "\n", - "\n", - "---\n", - "\n", - "#### **Key Observations About SCOPe** \n", - "- The **fundamental classification unit** in SCOPe is the **protein domain**, not the entire protein. \n", - "- _**The taxonomy in SCOPe is not for the entire protein (i.e., the full-length amino acid sequence as encoded by a gene) but for protein domains, which are smaller, structurally and functionally distinct regions of the protein.**_\n", - "\n", - "\n", - "--- \n", - "\n", - "**SCOPe 2.08 Data Analysis:**\n", - "\n", - "The current SCOPe version (2.08) includes the following statistics based on analysis for relevant data:\n", - "\n", - "- **Classes**: 12\n", - "- **Folds**: 1485\n", - "- **Superfamilies**: 2368\n", - "- **Families**: 5431\n", - "- **Proteins**: 13,514\n", - "- **Species**: 30,294\n", - "- **Domains**: 344,851\n", - "\n", - "For more detailed statistics, please refer to the official SCOPe website:\n", - "\n", - "- [SCOPe 2.08 Statistics](https://scop.berkeley.edu/statistics/ver=2.08)\n", - "- [SCOPe 2.08 Release](https://scop.berkeley.edu/ver=2.08)\n", - "\n", - "---\n", - "\n", - "## SCOPe Labeling \n", - "\n", - "- Use SCOPe labels for protein domains.\n", - "- Map them back to their **protein-chain** sequences (protein sequence label = sum of all domain labels).\n", - "- Train on protein sequences.\n", - "- This pretraining task would be comparable to GO-based training.\n", - "\n", - "--- " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "990cc6f2-6b4a-4fa7-905f-dda183c3ec4c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Changed to project root directory: G:\\github-aditya0by0\\python-chebai\n" - ] - } - ], - "source": [ - "# To run this notebook, you need to change the working directory of the jupyter notebook to root dir of the project.\n", - "import os\n", - "\n", - "# Root directory name of the project\n", - "expected_root_dir = \"python-chebai\"\n", - "\n", - "# Check if the current directory ends with the expected root directory name\n", - "if not os.getcwd().endswith(expected_root_dir):\n", - " os.chdir(\"..\") # Move up one directory level\n", - " if os.getcwd().endswith(expected_root_dir):\n", - " print(\"Changed to project root directory:\", 
os.getcwd())\n",
- " else:\n",
- " print(\"Warning: Directory change unsuccessful. Current directory:\", os.getcwd())\n",
- "else:\n",
- " print(\"Already in the project root directory:\", os.getcwd())"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4550d01fc7af5ae4",
- "metadata": {},
- "source": [
- "# 1. Instantiation of a Data Class\n",
- "\n",
- "To start working with `chebai`, you first need to instantiate a SCOPe data class. This class is responsible for managing, interacting with, and preprocessing the SCOPe protein data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22",
- "metadata": {},
- "outputs": [],
- "source": [
- "from chebai.preprocessing.datasets.scope.scope import SCOPeOver50"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "a71b7301-6195-4155-a439-f5eb3183d0f3",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2024-10-05T21:07:26.371796Z",
- "start_time": "2024-10-05T21:07:26.058728Z"
- }
- },
- "outputs": [],
- "source": [
- "scope_class = SCOPeOver50(scope_version=\"2.08\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d",
- "metadata": {},
- "source": [
- "\n",
- "### Inheritance Hierarchy\n",
- "\n",
- "SCOPe data classes inherit from [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L598), which in turn inherits from [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L23). Specifically:\n",
- "\n",
- "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for datasets that require dynamic behavior. It inherits from `XYBaseDataModule`, which provides the core methods for data loading and processing.\n",
- "\n",
- "- **`XYBaseDataModule`**: This is the base class for data modules, providing foundational properties and methods for handling and processing datasets, including data splitting, loading, and preprocessing.\n",
- "\n",
- "In summary, SCOPe data classes are designed to manage and preprocess protein data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n",
- "\n",
- "\n",
- "### Input parameters\n",
- "A SCOPe data class can be configured with a range of parameters, including:\n",
- "\n",
- "- **scope_version (str)**: Specifies the version of the SCOPe database to be used. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\n",
- "\n",
- "- **scope_version_train (str, optional)**: The version of SCOPe to use specifically for training and validation. If not set, the `scope_version` specified will be used for all data splits, including training, validation, and test. Defaults to `None`.\n",
- "\n",
- "- **splits_file_path (str, optional)**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. 
Defaults to `None`.\n",
- "\n",
- "### Additional Input Parameters\n",
- "\n",
- "To get more control over various aspects of data loading, processing, and splitting, you can refer to the documentation of additional parameters in the docstrings of the respective classes: [`_SCOPeDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/scope/scope.py#L31), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a",
- "metadata": {},
- "source": [
- "# Available SCOPe Data Classes\n",
- "\n",
- "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/scope/scope.py):\n",
- "\n",
- "There is a range of available dataset classes for SCOPe. Usually, you want to use `SCOPeOver2000` or `SCOPeOver50`. The number indicates the threshold for selecting label classes: SCOPe classes which have at least 2000 / 50 subclasses will be used as labels.\n",
- "\n",
- "Both inherit from `SCOPeOverX`. If you need a different threshold, you can create your own subclass. By default, `SCOPeOverX` uses the Protein encoding (see Section 5).\n",
- "\n",
- "Finally, `SCOPeOver2000Partial` extracts a part of SCOPe based on a given top class, with a threshold of 2000 for selecting labels.\n",
- "This class inherits from `SCOPeOverXPartial`.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8456b545-88c5-401d-baa5-47e8ae710f04",
- "metadata": {},
- "source": [
- "---"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ed973fb59df11849",
- "metadata": {},
- "source": [
- "# 2. Preparation / Setup Methods\n",
- "\n",
- "Now we have a SCOPe data class with all the relevant parameters. Next, we need to generate the actual dataset."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "11f2208e-fa40-44c9-bfe7-576ca23ad366",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Checking for processed data in data\SCOPe\version_2.08\SCOPe50\processed\n",
- "Missing processed data file (`data.pkl` file)\n",
- "Missing PDB raw data, Downloading PDB sequence data....\n",
- "Downloading to temporary file C:\Users\HP\AppData\Local\Temp\tmpsif7r129\n",
- "Downloaded to C:\Users\HP\AppData\Local\Temp\tmpsif7r129\n",
- "Unzipping the file....\n",
- "Unpacked and saved to data\SCOPe\pdb_sequences.txt\n",
- "Removed temporary file C:\Users\HP\AppData\Local\Temp\tmpsif7r129\n",
- "Missing Scope: cla.txt raw data, Downloading...\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "G:\anaconda3\envs\env_chebai\lib\site-packages\urllib3\connectionpool.py:1099: InsecureRequestWarning: Unverified HTTPS request is being made to host 'scop.berkeley.edu'. Adding certificate verification is strongly advised. 
See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings\n", - "warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Missing Scope: hie.txt raw data, Downloading...\n", - "Missing Scope: des.txt raw data, Downloading...\n", - "Extracting class hierarchy...\n", - "Computing transitive closure\n", - "Process graph\n", - "101 labels has been selected for specified threshold, \n", - "Constructing data.pkl file .....\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Check for processed data in data\\SCOPe\\version_2.08\\SCOPe50\\processed\\protein_token\n", - "Cross-validation enabled: False\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Missing transformed data (`data.pt` file). Transforming data.... \n", - "Processing 60298 lines...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|█████████████████████████████████████████████████████████████████████████| 60298/60298 [00:53<00:00, 1119.10it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Saving 21 tokens to G:\\github-aditya0by0\\python-chebai\\chebai\\preprocessing\\bin\\protein_token\\tokens.txt...\n", - "First 10 tokens: ['M', 'S', 'I', 'G', 'A', 'T', 'R', 'L', 'Q', 'N']\n" - ] - } - ], - "source": [ - "scope_class.prepare_data()\n", - "scope_class.setup()" - ] - }, - { - "cell_type": "markdown", - "id": "1655d489-25fe-46de-9feb-eeca5d36936f", - "metadata": {}, - "source": [ - "\n", - "### Automatic Execution: \n", - "These methods are executed automatically when using the training command `chebai fit`. Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n", - "\n", - "### Why is Preparation Needed?\n", - "\n", - "- **Data Availability**: The preparation step ensures that the required SCOPe data files are downloaded or loaded, which are essential for analysis.\n", - "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n", - "\n", - "### Main Methods for Data Preprocessing\n", - "\n", - "The data preprocessing in a data class involves two main methods:\n", - "\n", - "1. **`prepare_data` Method**:\n", - " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels. This step is independent of input encodings.\n", - " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n", - "\n", - "2. **`setup` Method**:\n", - " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n", - " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), tokenizing the input according to the specified encoding. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. 
This method uses a subclass of the `DataReader` class to perform the tokenization.\n",
- " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n",
- "\n",
- "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f5aaa12d-5f01-4b74-8b59-72562af953bf",
- "metadata": {},
- "source": [
- "---"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "bb6e9a81554368f7",
- "metadata": {},
- "source": [
- "# 3. Overview of the 3 preprocessing stages\n",
- "\n",
- "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\n",
- "\n",
- "1. **Raw Data Stage**:\n",
- " - **Files**: `cla.txt`, `des.txt` and `hie.txt`. Please find a description of each file [here](https://scop.berkeley.edu/help/ver=2.08#parseablefiles-2.08).\n",
- " - **Description**: This stage contains the raw SCOPe data in txt format, serving as the initial input for further processing.\n",
- " - **File Path**: `data/SCOPe/version_${scope_version}/raw/${filename}.txt`\n",
- "\n",
- "2. **Processed Data Stage 1**:\n",
- " - **File**: `data.pkl`\n",
- " - **Description**: This stage includes the data after initial processing. It contains protein sequence strings, class columns, and metadata but lacks data splits.\n",
- " - **File Path**: `data/SCOPe/version_${scope_version}/${dataset_name}/processed/data.pkl`\n",
- " - **Additional File**: `classes.txt` - A file listing the relevant SCOPe classes.\n",
- "\n",
- "3. **Processed Data Stage 2**:\n",
- " - **File**: `data.pt`\n",
- " - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n",
- " - **File Path**: `data/SCOPe/version_${scope_version}/${dataset_name}/processed/${reader_name}/data.pt`\n",
- " - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n",
- "\n",
- "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. It also facilitates reproducibility and traceability across different experiments.\n",
- "\n",
- "### Data Splits\n",
- "\n",
- "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n",
- "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7e172c0d1e8bb93f",
- "metadata": {},
- "source": [
- "# 4. Data Files and their structure\n",
- "\n",
- "`chebai` creates and manages several data files during its operation. These files store various protein data and metadata essential for different tasks. Let’s explore these files and their content.\n"
- ]
- },
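- {
- "cell_type": "markdown",
- "id": "1f2a3b4c5d6e7f80",
- "metadata": {},
- "source": [
- "As a quick orientation, the cell below resolves where stage 1 (`data.pkl`) and stage 2 (`data.pt`) live on disk for the `scope_class` instantiated above. This is an illustrative sketch that only reuses attributes which also appear in later cells of this notebook."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2a3b4c5d6e7f8091",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Illustrative: resolve the stage-1 and stage-2 artifact paths for scope_class.\n",
- "print(\"data.pkl:\", os.path.join(scope_class.processed_dir_main, scope_class.processed_main_file_names_dict[\"data\"]))\n",
- "print(\"data.pt: \", os.path.join(scope_class.processed_dir, scope_class.processed_file_names_dict[\"data\"]))"
- ]
- },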
- {
- "cell_type": "markdown",
- "id": "43329709-5134-4ce5-88e7-edd2176bf84d",
- "metadata": {},
- "source": [
- "## raw files\n",
- "- cla.txt, des.txt and hie.txt\n",
- "\n",
- "For a detailed description of the raw files and their structures, please refer to the official website [here](https://scop.berkeley.edu/help/ver=2.08#parseablefiles-2.08).\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "558295e5a7ded456",
- "metadata": {},
- "source": [
- "## data.pkl File\n",
- "\n",
- "**Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes the ids, the sids used to label the corresponding sequence, the protein-chain sequence, and one boolean column per label."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "fd490270-59b8-4c1c-8b09-204defddf592",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2024-10-05T21:09:01.622317Z",
- "start_time": "2024-10-05T21:09:01.606698Z"
- }
- },
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import os"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "d7d16247-092c-4e8d-96c2-ab23931cf766",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2024-10-05T21:11:51.296162Z",
- "start_time": "2024-10-05T21:11:44.559304Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Size of the data (rows x columns): (60424, 1035)\n"
- ]
- },
- {
- "data": {
- "text/html": [
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idsidssequenceclass_46456class_48724class_51349class_53931class_56572class_56835class_56992...species_187294species_56257species_186882species_56690species_161316species_57962species_58067species_267696species_311502species_311501
01[d4oq9a_, d4oq9b_, d4oq9c_, d4oq9d_, d4niaa_, ...AAAAAAAAAAFalseTrueFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
12[d7dxhc_]AAAAAAAAAAAAAAAAAAAAAAAFalseFalseFalseFalseFalseTrueFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
23[d1gkub1, d1gkub2, d1gkub3, d1gkub4]AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAASLCLFPEDFLLKEF...FalseFalseTrueFalseTrueFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
34[d3c9wa2, d3c9wb2, d3c9wa3, d3c9wb3]AAAAAAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNLNKV...FalseFalseFalseTrueFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
45[d1xwaa1, d1xwab_, d1xwac_, d1xwad_, d1xwaa2]AAAAAMVYQVKDKADLDGQLTKASGKLVVLDFFATWCGPCKMISPK...FalseFalseTrueFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
\n", - "

5 rows × 1035 columns

\n", - "
" - ], - "text/plain": [ - " id sids \\\n", - "0 1 [d4oq9a_, d4oq9b_, d4oq9c_, d4oq9d_, d4niaa_, ... \n", - "1 2 [d7dxhc_] \n", - "2 3 [d1gkub1, d1gkub2, d1gkub3, d1gkub4] \n", - "3 4 [d3c9wa2, d3c9wb2, d3c9wa3, d3c9wb3] \n", - "4 5 [d1xwaa1, d1xwab_, d1xwac_, d1xwad_, d1xwaa2] \n", - "\n", - " sequence class_46456 \\\n", - "0 AAAAAAAAAA False \n", - "1 AAAAAAAAAAAAAAAAAAAAAAA False \n", - "2 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAASLCLFPEDFLLKEF... False \n", - "3 AAAAAAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNLNKV... False \n", - "4 AAAAAMVYQVKDKADLDGQLTKASGKLVVLDFFATWCGPCKMISPK... False \n", - "\n", - " class_48724 class_51349 class_53931 class_56572 class_56835 \\\n", - "0 True False False False False \n", - "1 False False False False True \n", - "2 False True False True False \n", - "3 False False True False False \n", - "4 False True False False False \n", - "\n", - " class_56992 ... species_187294 species_56257 species_186882 \\\n", - "0 False ... False False False \n", - "1 False ... False False False \n", - "2 False ... False False False \n", - "3 False ... False False False \n", - "4 False ... False False False \n", - "\n", - " species_56690 species_161316 species_57962 species_58067 \\\n", - "0 False False False False \n", - "1 False False False False \n", - "2 False False False False \n", - "3 False False False False \n", - "4 False False False False \n", - "\n", - " species_267696 species_311502 species_311501 \n", - "0 False False False \n", - "1 False False False \n", - "2 False False True \n", - "3 False False True \n", - "4 False False True \n", - "\n", - "[5 rows x 1035 columns]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pkl_df = pd.DataFrame(\n", - " pd.read_pickle(\n", - " os.path.join(\n", - " scope_class.processed_dir_main,\n", - " scope_class.processed_main_file_names_dict[\"data\"],\n", - " )\n", - " )\n", - ")\n", - "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", - "pkl_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "322bc926-69ff-4b93-9e95-5e8b85869c38", - "metadata": {}, - "source": [ - "**File Path**: `data/SCOPe/version_${scope_version}/${dataset_name}/processed/data.pkl`\n", - "\n", - "\n", - "### Structure of `data.pkl`\n", - "`data.pkl` as following structure: \n", - "- **Column 0**: Contains the ID of eachdata instance.\n", - "- **Column 1**: Contains the `sids` which are associated with corresponding protein-chain sequence.\n", - "- **Column 2**: Contains the protein-chain sequence.\n", - "- **Column 3 and onwards**: Contains the labels, starting from column 3.\n", - "\n", - "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n" - ] - }, - { - "cell_type": "markdown", - "id": "ba019d2d4324bd0b", - "metadata": {}, - "source": [ - "## data.pt File\n", - "\n", - "\n", - "**Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library, specifically as a list of dictionaries. Each dictionary in this list includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "977ddd83-b469-4b58-ab1a-8574fb8769b4", - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-05T21:12:49.338943Z", - "start_time": "2024-10-05T21:12:49.323319Z" - } - }, - "outputs": [], - "source": [ - "import torch" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "3266ade9-efdc-49fe-ae07-ed52b2eb52d0", - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-05T21:14:12.892845Z", - "start_time": "2024-10-05T21:13:59.859953Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Type of loaded data: \n" - ] - } - ], - "source": [ - "data_pt = torch.load(\n", - " os.path.join(\n", - " scope_class.processed_dir, scope_class.processed_file_names_dict[\"data\"]\n", - " ),\n", - " weights_only=False,\n", - ")\n", - "print(\"Type of loaded data:\", type(data_pt))" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "84cfa3e6-f60d-47c0-9f82-db3d5673d1e7", - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-05T21:14:21.185027Z", - "start_time": "2024-10-05T21:14:21.169358Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'features': [14, 14, 14, 14, 20, 15, 15, 28, 15, 18, 25, 17, 18, 11, 25, 21, 27, 19, 14, 27, 19, 13, 14, 17, 16, 21, 25, 22, 27, 28, 12, 10, 20, 19, 13, 13, 14, 28, 17, 20, 20, 12, 19, 11, 17, 15, 27, 28, 15, 12, 17, 14, 23, 11, 19, 27, 14, 26, 19, 11, 11, 19, 12, 19, 19, 28, 17, 16, 20, 16, 19, 21, 10, 16, 18, 12, 17, 19, 10, 29, 12, 12, 21, 20, 16, 17, 19, 28, 20, 21, 12, 16, 18, 21, 19, 14, 19, 17, 12, 14, 18, 28, 23, 15, 28, 19, 19, 19, 15, 25, 17, 22, 25, 19, 28, 16, 13, 27, 13, 11, 20, 15, 28, 12, 15, 28, 27, 13, 13, 13, 28, 19, 14, 15, 28, 12, 18, 14, 20, 28, 14, 18, 15, 19, 13, 22, 28, 29, 12, 12, 20, 29, 28, 17, 13, 28, 23, 22, 15, 15, 28, 17, 13, 21, 17, 27, 11, 20, 23, 10, 10, 11, 20, 15, 22, 21, 10, 13, 21, 25, 11, 29, 25, 19, 20, 18, 17, 19, 19, 15, 18, 16, 16, 25, 15, 22, 25, 28, 23, 16, 20, 21, 13, 26, 18, 21, 15, 27, 17, 20, 22, 23, 11, 14, 29, 21, 21, 17, 25, 10, 14, 20, 25, 11, 22, 29, 11, 21, 11, 12, 17, 27, 16, 29, 17, 14, 12, 11, 20, 21, 27, 22, 15, 10, 21, 20, 17, 28, 21, 25, 11, 18, 27, 11, 13, 11, 28, 12, 17, 23, 15, 25, 16, 20, 11, 17, 11, 12, 16, 28, 27, 27, 27, 14, 13, 16, 22, 28, 12, 12, 26, 19, 22, 21, 21, 12, 19, 28, 22, 16, 23, 20, 28, 27, 24, 15, 19, 13, 12, 12, 29, 28, 12, 20, 22, 23, 17, 17, 27, 27, 21, 20, 28, 28, 28, 14, 13, 13, 11, 14, 14, 14, 14, 14], 'labels': array([False, True, False, ..., False, False, False]), 'ident': 6, 'group': None}\n" - ] - } - ], - "source": [ - "for i in range(5, 6):\n", - " print(data_pt[i])" - ] - }, - { - "cell_type": "markdown", - "id": "0d80ffbb-5f1e-4489-9bc8-d688c9be1d07", - "metadata": {}, - "source": [ - "**File Path**: `data/SCOPe/version_${scope_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", - "\n", - "\n", - "### Structure of `data.pt`\n", - "\n", - "The `data.pt` file is a list where each element is a dictionary with the following keys:\n", - "\n", - "- **`features`**: \n", - " - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n", - "\n", - "- **`labels`**: \n", - " - **Description**: This key contains the labels or target values associated with each instance. 
Labels are also stored as tensors and are used by the model to learn and make predictions.\n",
- "\n",
- "- **`ident`**: \n",
- " - **Description**: This key holds identifiers for each data instance. These identifiers help track and reference the individual samples in the dataset.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "186ec6f0eed6ecf7",
- "metadata": {},
- "source": [
- "## classes.txt File\n",
- "\n",
- "**Description**: A file containing the list of selected SCOPe **labels** based on the specified threshold. This file is crucial for ensuring that only relevant **labels** are included in the dataset."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "8d1fbe6c-beb8-4038-93d4-c56bc7628716",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2024-10-05T21:15:19.146285Z",
- "start_time": "2024-10-05T21:15:18.503284Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "class_48724\n",
- "class_53931\n",
- "class_310555\n",
- "fold_48725\n",
- "fold_56111\n",
- "fold_56234\n",
- "fold_310573\n",
- "superfamily_48726\n",
- "superfamily_56112\n",
- "superfamily_56235\n",
- "superfamily_310607\n",
- "family_48942\n",
- "family_56251\n",
- "family_191359\n",
- "family_191470\n"
- ]
- }
- ],
- "source": [
- "with open(os.path.join(scope_class.processed_dir_main, \"classes.txt\"), \"r\") as file:\n",
- " for i in range(15):\n",
- " line = file.readline()\n",
- " print(line.strip())"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "861da1c3-0401-49f0-a22f-109814ed95d5",
- "metadata": {},
- "source": [
- "\n",
- "**File Path**: `data/SCOPe/version_${scope_version}/${dataset_name}/processed/classes.txt`\n",
- "\n",
- "The `classes.txt` file lists selected SCOPe classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique SCOPe class ID, identifying a specific class within the SCOPe ontology along with its hierarchy level.\n",
- "\n",
- "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n"
- ]
- },
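- {
- "cell_type": "markdown",
- "id": "3b4c5d6e7f809122",
- "metadata": {},
- "source": [
- "Tying the two files together: the cell below is an illustrative sketch (not part of the original tutorial) that combines `classes.txt` with the boolean label vector of the sample printed above, listing which SCOPe labels are `True` for that protein chain."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "4c5d6e7f80912233",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Illustrative sketch: map the boolean labels of one sample to label names.\n",
- "with open(os.path.join(scope_class.processed_dir_main, \"classes.txt\"), \"r\") as f:\n",
- " label_names = [line.strip() for line in f]\n",
- "\n",
- "sample = data_pt[5] # the sample printed above\n",
- "active = [name for name, flag in zip(label_names, sample[\"labels\"]) if flag]\n",
- "print(sample[\"ident\"], \"->\", active)"
- ]
- },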
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idsplit
01train
13train
24train
36train
49train
\n", - "
" - ], - "text/plain": [ - " id split\n", - "0 1 train\n", - "1 3 train\n", - "2 4 train\n", - "3 6 train\n", - "4 9 train" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "csv_df = pd.read_csv(os.path.join(scope_class.processed_dir_main, \"splits.csv\"))\n", - "csv_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "b058714f-e434-4367-89b9-74c129ac727f", - "metadata": {}, - "source": [ - "\n", - "\n", - "**File Path**: `data/SCOPe/version_${scope_version}/${dataset_name}/processed/splits.csv`\n", - "\n", - "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "6dc3fd6c-7cf6-47ef-812f-54319a0cdeb9", - "metadata": {}, - "outputs": [], - "source": [ - "# You can specify a literal path for the `splits_file_path`, or if another `scope_class` instance is already defined,\n", - "# you can use its existing `splits_file_path` attribute for consistency.\n", - "scope_class_with_splits = SCOPeOver2000(\n", - " scope_version=\"2.08\",\n", - " # splits_file_path=\"data/chebi_v231/ChEBI50/processed/splits.csv\", # Literal path option\n", - " splits_file_path=scope_class.splits_file_path, # Use path from an existing `chebi_class` instance\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d", - "metadata": {}, - "source": [ - "## 5.1 Protein Representation Using Amino Acid Sequence Notation\n", - "\n", - "Proteins are composed of chains of amino acids, and these sequences can be represented using a one-letter notation for each amino acid. This notation provides a concise way to describe the primary structure of a protein.\n", - "\n", - "### Example Protein Sequence\n", - "\n", - "Protein-Chain: PDB ID:**1cph** Chain ID:**B** mol:protein length:30 INSULIN (PH 10)\n", - "
Refer - [1cph_B](https://www.rcsb.org/sequence/1CPH)\n", - "\n", - "- **Sequence**: `FVNQHLCGSHLVEALYLVCGERGFFYTPKA`\n", - "- **Sequence Length**: 30\n", - "\n", - "In this sequence, each letter corresponds to a specific amino acid. This notation is widely used in bioinformatics and molecular biology to represent protein sequences.\n", - "\n", - "### Tokenization and Encoding\n", - "\n", - "To tokenize and numerically encode this protein sequence, the `ProteinDataReader` class is used. This class allows for n-gram tokenization, where the `n_gram` parameter defines the size of the tokenized units. If `n_gram` is not provided (default is `None`), each amino acid letter is treated as a single token.\n", - "\n", - "For more details, you can explore the implementation of the `ProteinDataReader` class in the source code [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/reader.py)." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "da47d47e-4560-46af-b246-235596f27d82", - "metadata": {}, - "outputs": [], - "source": [ - "from chebai.preprocessing.reader import ProteinDataReader" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "8bdbf309-29ec-4aab-a6dc-9e09bc6961a2", - "metadata": {}, - "outputs": [], - "source": [ - "protein_dr_3gram = ProteinDataReader(n_gram=3)\n", - "protein_dr = ProteinDataReader()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "68e5c87c-79c3-4d5f-91e6-635399a84d3d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[25, 28, 19, 18, 29, 17, 24, 13, 11, 29, 17, 28, 27, 14, 17, 22, 17, 28, 24, 13, 27, 16, 13, 25, 25, 22, 15, 23, 21, 14]\n", - "[5023, 2218, 3799, 2290, 6139, 2208, 6917, 4674, 484, 439, 2737, 851, 365, 2624, 3240, 4655, 1904, 3737, 1453, 2659, 5160, 3027, 2355, 7163, 4328, 3115, 6207, 1234]\n" - ] - } - ], - "source": [ - "protein = \"FVNQHLCGSHLVEALYLVCGERGFFYTPKA\"\n", - "print(protein_dr._read_data(protein))\n", - "print(protein_dr_3gram._read_data(protein))" - ] - }, - { - "cell_type": "markdown", - "id": "5b7211ee-2ccc-46d3-8e8f-790f344726ba", - "metadata": {}, - "source": [ - "The numbers mentioned above refer to the index of each individual token from the [`tokens.txt`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/bin/protein_token/tokens.txt) file, which is used by the `ProteinDataReader` class. \n", - "\n", - "Each token in the `tokens.txt` file corresponds to a specific amino-acid letter, and these tokens are referenced by their index. Additionally, the index values are offset by the `EMBEDDING_OFFSET`, ensuring that the token embeddings are adjusted appropriately during processing." 
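-    "\n",
-    "As a quick sanity check (derived from the output above rather than stated in the source code): `F` is the 16th entry in `tokens.txt`, i.e. index 15 when counting from 0, and the first value printed by the single-letter reader is 25, which is consistent with an `EMBEDDING_OFFSET` of 10 (15 + 10 = 25). Likewise, the 3-gram reader emits `30 - 3 + 1 = 28` tokens for this 30-residue sequence, one per overlapping triplet."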
- ] - }, - { - "cell_type": "markdown", - "id": "93e328cf-09f9-4694-b175-28320590937d", - "metadata": {}, - "source": [ - "---" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 9575d553bff7a6ecf8d7bb2aac88b4cf73fcd7f1 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 15 Apr 2025 14:49:30 +0200 Subject: [PATCH 2/4] remove protein-related libs --- setup.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.py b/setup.py index 8a6d3e0c..21ddfa49 100644 --- a/setup.py +++ b/setup.py @@ -50,8 +50,6 @@ "chardet", "pyyaml", "torchmetrics", - "biopython", - "fair-esm", ], extras_require={"dev": ["black", "isort", "pre-commit"]}, ) From d1ca287de89f4eab2f855bb4592abf6a3cc27c40 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 15 Apr 2025 14:51:13 +0200 Subject: [PATCH 3/4] remove proteins tokens and action workflow --- .github/workflows/token_consistency.yaml | 6 - .../bin/protein_token/tokens.txt | 21 - .../bin/protein_token_3_gram/tokens.txt | 8359 ----------------- 3 files changed, 8386 deletions(-) delete mode 100644 chebai/preprocessing/bin/protein_token/tokens.txt delete mode 100644 chebai/preprocessing/bin/protein_token_3_gram/tokens.txt diff --git a/.github/workflows/token_consistency.yaml b/.github/workflows/token_consistency.yaml index 06c3a42e..5261bf52 100644 --- a/.github/workflows/token_consistency.yaml +++ b/.github/workflows/token_consistency.yaml @@ -13,21 +13,17 @@ on: - "chebai/preprocessing/bin/smiles_token/tokens.txt" - "chebai/preprocessing/bin/smiles_token_unlabeled/tokens.txt" - "chebai/preprocessing/bin/selfies/tokens.txt" - - "chebai/preprocessing/bin/protein_token/tokens.txt" - "chebai/preprocessing/bin/graph_properties/tokens.txt" - "chebai/preprocessing/bin/graph/tokens.txt" - "chebai/preprocessing/bin/deepsmiles_token/tokens.txt" - - "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt" pull_request: paths: - "chebai/preprocessing/bin/smiles_token/tokens.txt" - "chebai/preprocessing/bin/smiles_token_unlabeled/tokens.txt" - "chebai/preprocessing/bin/selfies/tokens.txt" - - "chebai/preprocessing/bin/protein_token/tokens.txt" - "chebai/preprocessing/bin/graph_properties/tokens.txt" - "chebai/preprocessing/bin/graph/tokens.txt" - "chebai/preprocessing/bin/deepsmiles_token/tokens.txt" - - "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt" jobs: check_tokens: @@ -58,11 +54,9 @@ jobs: "chebai/preprocessing/bin/smiles_token/tokens.txt" "chebai/preprocessing/bin/smiles_token_unlabeled/tokens.txt" "chebai/preprocessing/bin/selfies/tokens.txt" - "chebai/preprocessing/bin/protein_token/tokens.txt" "chebai/preprocessing/bin/graph_properties/tokens.txt" "chebai/preprocessing/bin/graph/tokens.txt" "chebai/preprocessing/bin/deepsmiles_token/tokens.txt" - "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt" ) echo "TOKENS_FILES=${TOKENS_FILES[*]}" >> $GITHUB_ENV diff --git a/chebai/preprocessing/bin/protein_token/tokens.txt b/chebai/preprocessing/bin/protein_token/tokens.txt deleted file mode 100644 index c31c5b72..00000000 --- a/chebai/preprocessing/bin/protein_token/tokens.txt +++ /dev/null @@ -1,21 +0,0 @@ -M -S -I -G -A -T -R -L -Q -N -D -K -Y -P -C -F -W -E -V 
-H -X diff --git a/chebai/preprocessing/bin/protein_token_3_gram/tokens.txt b/chebai/preprocessing/bin/protein_token_3_gram/tokens.txt deleted file mode 100644 index 534e5db1..00000000 --- a/chebai/preprocessing/bin/protein_token_3_gram/tokens.txt +++ /dev/null @@ -1,8359 +0,0 @@ -MAT -ATP -TPG -PGA -GAS -ASS -SSA -SAR -ARD -RDE -DEF -EFV -FVY -VYM -YMA -MAK -AKL -KLA -LAE -AEQ -EQA -QAE -AER -ERY -RYE -YEE -EEM -EMV -MVE -VEF -EFM -FME -MEK -EKV -KVA -VAK -AKA -KAV -AVD -VDK -DKD -KDE -DEL -ELT -LTV -TVE -VEE -EER -ERN -RNL -NLL -LLS -LSV -SVA -VAY -AYK -YKN -KNV -NVI -VIG -IGA -GAR -ARR -RRA -RAS -ASW -SWR -WRI -RII -IIS -ISS -SSI -SIE -IEQ -EQK -QKE -KEE -EES -ESR -SRG -RGN -GND -NDD -DDH -DHV -HVS -VSL -SLI -LIR -IRD -RDY -DYR -YRS -RSK -SKI -KIE -IET -ETE -TEL -ELS -LSD -SDI -DIC -ICD -CDG -DGI -GIL -ILK -LKL -KLL -LLD -LDT -DTI -TIL -ILV -LVP -VPA -PAA -AAA -AAS -ASG -SGD -GDS -DSK -SKV -KVF -VFY -FYL -YLK -LKM -KMK -MKG -KGD -GDY -DYH -YHR -HRY -RYL -YLA -AEF -EFK -FKS -KSG -SGQ -GQE -QER -ERK -RKD -KDA -DAA -AAE -AEH -EHT -HTL -TLT -LTA -TAY -YKA -KAA -AAQ -AQD -QDI -DIA -IAN -ANS -NSE -SEL -ELA -LAP -APT -PTH -THP -HPI -PIR -IRL -RLG -LGL -GLA -LAL -ALN -LNF -NFS -FSV -SVF -FYY -YYE -YEI -EIL -ILN -LNS -NSP -SPD -PDR -DRA -RAC -ACN -CNL -NLA -LAK -AKQ -KQA -QAF -AFD -FDE -DEA -EAI -AIA -IAE -AEL -ELD -DTL -TLG -LGE -GEE -ESY -SYK -YKD -KDS -DST -STL -TLI -LIM -IMQ -MQL -QLL -LLR -LRD -RDN -DNL -NLT -LTL -TLW -LWT -WTS -TSD -SDM -DMQ -MQD -QDD -DDV -DVA -VAD -ADD -DDI -DIK -IKE -KEA -EAA -AAP -APA -AAK -AKP -KPA -PAD -ADE -DEQ -EQQ -QQS -MSD -SDT -DTV -EEL -ELV -LVQ -VQR -QRA -RAK -RYD -YDD -DDM -DMA -MAA -AAM -AMK -MKK -KKV -KVT -VTE -TEQ -EQG -QGQ -QEL -LSN -SNE -NEE -NVV -VVG -VGA -RRS -RSS -SSW -WRV -RVI -VIS -QKT -KTE -TEG -EGS -GSE -SEK -EKK -KKQ -KQQ -QQL -QLA -AKE -KEY -EYR -YRV -RVK -VKV -KVE -VEQ -EQE -ELN -LND -NDI -ICQ -CQD -QDV -DVL -VLK -LDE -EFL -FLI -LIV -IVK -VKA -KAG -AGA -GAA -AES -ESK -DYY -YYR -YRY -AEV -EVA -VAS -ASE -SED -EDR -RAA -AAV -AVV -VVE -VEK -EKS -KSQ -SQK -QKA -KAY -AYQ -YQE -QEA -EAL -ALD -LDI -IAK -AKD -KDK -DKM -KMQ -MQP -QPT -LNT -NTP -TPE -PEH -EHA -HAC -ACQ -CQL -FDD -DDA -DAI -TLN -LNE -NED -EDS -DSY -SDV -DVG -GAE -AED -EDQ -DQE -QEQ -QEG -EGN -GNQ -NQE -EAG -AGN -MAS -ASA -SAE -LSR -SRE -REE -EEN -ENV -NVY -AKT -KTV -TVD -VDS -DSE -SEE -EEG -EGR -GRG -GNE -DRV -RVT -VTL -LIK -IKD -KDY -YRG -RGK -GKI -LTK -TKI -KIC -LLE -LET -ETH -THL -HLV -VPS -PSS -SST -STA -TAP -APE -PES -FKT -KTG -TGA -AEN -ENT -NTM -TMV -MVA -IAL -ALA -ACS -CSL -SLA -AIS -ISE -TLS -LSE -DIS -EDP -DPA -PAE -AEE -EEI -EIR -IRE -REA -EAP -APK -PKR -KRD -RDS -DSS -SSE -SEG -EGQ -LES -ESH -SHL -LLH -LHD -HDN -PKH -KHD -HDL -DLS -MST -STR -TRE -VDV -DVE -SVE -SKG -KGN -EDH -HVA -VAI -AII -IIK -IES -ESE -LSK -LNV -NVL -VLE -LEA -EAH -AHL -HLI -LIP -IPS -PSA -SAS -ASP -SPA -FKA -RKE -EST -TLV -LVA -YKS -KSA -ASD -IAT -ATA -TAE -DMT -MTD -TDE -AGD -GDE -DEI -EIK -EAS -ASK -SKP -KPD -PDG -DGA -MAE -RED -EDC -DCV -CVF -VFL -FLS -SKL -EQS -QSE -SER -YDE -DEM -MVQ -VQY -QYM -YMK -MKQ -KQV -QVA -VAA -AAL -NTE -IGS -GSR -SRR -IIT -ITS -TSL -SLE -LEQ -KEQ -QAK -AKG -NDK -DKH -KHV -HVE -VEI -EII -IKG -KGY -GYR -YRA -AKI -IED -EDE -AKY -KYC -YCD -CDD -LKV -KVI -VIK -KEN -ENL -LLP -LPN -PNA -NAS -AST -STS -TSE -SES -FYK -YKK -KKM -KME -MEG -EGD -RYY -YYA -YAE -EFT -FTV -VDE -DEK -EKR -KRQ -RQE -QEV -ADK -DKS -KSL -LAA -AAY -AYT -YTE -TEA -EAT -ATE -TEI -EIS -ISN -SNA -NAD -ADL -DLA -EIM -IMN -MND -NDA -DAD -DKA -KAC -DDS -DSI -SIA -KLD -DEV -EVP -VPE -ESS -SSY -DTA -TAD -DEE -AAT -ATL 
-LGR -GRD -RDQ -DQY -QYV -YVY -VQF -QFM -MEQ -EQL -QLV -LVT -VTG -GAT -TPA -GSL -SLR -LRA -AAW -AWR -RIV -IVS -VSS -SRK -RKN -KND -NDE -DEH -EHV -SLV -LVK -VKD -VES -LSS -SSV -SVC -VCS -CSG -SGI -LDS -DSH -SAG -RYM -DER -RKT -KTA -TAA -EDT -DTM -TML -MLA -LAY -IAA -AAD -ADM -MAP -NSS -SSD -SDK -CNM -NMA -AFE -FEE -EEA -MQE -EQM -QMD -MDE -ATT -TTL -SRD -LVS -VSG -SGA -PAG -AGE -GEL -KNE -EEH -VET -SIC -ICS -ILR -LRL -RLL -SAT -TAS -TMI -MIA -IAY -VAV -AVA -EKA -CSM -SMA -MTM -TMD -MDK -KSE -VQK -KAK -MKA -AVT -QGH -GHE -HEL -TER -RNE -NEK -QQM -QMG -MGK -GKE -YRE -REK -EKI -IEA -EAE -ELQ -LQD -ICN -CND -NDV -LEL -ELL -LDK -DKY -KYL -YLI -IPN -NAT -ATQ -TQP -QPE -DYF -YFR -FRY -YLS -SEV -GDN -DNK -NKQ -KQT -QTT -TTV -TVS -VSN -SNS -NSQ -SQQ -QQA -QAY -EAF -FEI -ISK -SKK -KKE -KEM -EMQ -SPE -PEK -TAF -SEN -ENQ -NQG -QGD -DEG -GDA -DAG -GEG -EGE -GEN -LIL -LNA -TQA -SGE -ENK -CSD -ATH -THA -HAE -MTE -ERE -REN -ENN -NNV -VYK -VEA -EAM -ASM -SMD -MDV -VEL -TSI -NKG -KGA -EEK -EKL -KLE -LEM -EMI -MIK -IKT -KTY -TYR -RGQ -GQV -QVE -EKE -KEL -ELR -RDI -DIL -LEK -EKH -KHL -IPC -PCA -CAT -ATS -TSG -GES -YYK -YKM -EFA -FAT -ATG -TGS -GSD -SDR -DRK -ENS -NSL -LIA -IAM -AMN -NDL -DLP -LPP -PPT -ACR -CRL -RLA -AAF -MQA -EEV -EVD -VDP -DPN -NAG -GDG -DGE -GEP -EPK -PKE -EQI -QIQ -IQD -VED -DQD -DVS -MDD -DDR -DRE -EDL -DLV -LVY -VYQ -YQA -ESM -SMK -VAG -AGM -GMD -KGG -GGE -GED -EDK -DKL -KLK -KMI -MIR -REY -YRQ -RQM -QMV -ELK -KLI -LIC -ICC -CCD -CDI -ILD -LDV -VLD -IPA -AAN -ANT -NTG -TGE -TGN -NDR -AMT -ELP -MQG -EEQ -EQN -QNK -NKE -ALQ -DEN -MGD -GDR -REQ -LLQ -LQR -RAR -ARL -SAM -NEP -EPL -PLS -DRN -KTM -TMA -MAD -ADG -DGN -KKL -KVK -AYR -IEK -ELE -ETV -TVC -VCN -VLS -LSL -SLL -DKF -KFL -IKN -KNC -NCN -NDF -DFQ -FQY -QYE -YES -GEK -KKN -KNS -NSV -SVV -SEA -YKE -SKE -QMQ -EIQ -IQN -QNA -NAP -PEQ -QAC -ACL -CLL -LLA -SDQ -DQQ -QQD -QDE -VLA -ALL -KEH -EHM -HMQ -MVD -VDR -KAR -MKN -NVT -KTS -TSA -SAD -KKI -IEM -MVR -VRA -RAY -EAV -AVC -VCQ -LDN -DNY -NYL -NCS -CSE -SET -ETQ -TQY -VAT -KRA -RAT -ATV -TVV -AYS -YSE -AHE -HEI -LNY -NYS -YSV -ACH -CHL -HLA -DDD -DDG -DGG -GNN -MER -ERA -ASL -LIQ -IQK -YED -EDM -AFM -FMK -MKS -SAV -AVE -EKG -KGE -LSC -SCE -CEE -VGG -GGQ -GQR -RVL -QKS -KSN -KGP -GPE -PEV -EVK -VKE -LRG -RGV -GVC -VCD -CDT -TVL -VLG -GLL -GAG -DAE -SRV -RVF -TGD -GDD -DDK -DKK -KKR -KRI -IID -IDS -DSA -ARS -RSA -SAY -AMD -MDI -EMP -MPP -PTN -TNP -NPI -VFH -FHY -HYE -EIA -PEE -ISL -KTT -TTF -TFD -AMA -DLH -LHT -WTA -ADS -EGG -GEA -EEP -EPQ -PQS -EKT -ELI -ATC -TCM -CMK -QGA -GGR -GRR -SAW -KTD -TDT -DTS -KLQ -LQL -QLI -LRS -RSI -ICT -CTT -ANA -ATN -NPE -VAC -ACG -CGD -RKQ -QTI -TID -IDN -DNS -SQG -GAY -FDI -LNN -NNP -PEL -LAC -ACT -CTL -TLA -SDS -EEC -ECD -CDA -AEG -EGA -TIE -IEN -STV -DKE -MAQ -AQA -QAM -KSV -SVT -TET -ETG -TGV -GVE -ARK -LAR -ARE -RER -ERV -RVE -LRE -REI -EIC -ICY -CYE -YEV -EVL -IPK -PKA -KAS -ASN -SNP -DAR -ARN -RNT -NTV -VVD -VDD -DSQ -SQT -QTA -YQD -QDA -DAF -KGK -GKM -PDK -DTQ -TQG -AEP -PQE -GGD -DKN -NEL -AAC -ACM -RVV -VVS -AEK -QMA -MAR -EKF -ASQ -SQA -AAG -KKG -KGI -GIV -IVD -VDQ -DQS -QSQ -AEA -SQP -MPA -PAS -ASR -DSV -SVY -VYL -VEN -ENM -NMK -SSG -EAK -NES -ESQ -SQV -VAL -ALI -ICE -CED -EDI -ILS -SVL -SDH -DHL -LIT -SAQ -AQT -QTG -FAI -KRK -EAY -DAV -DLE -ETL -WTD -TDL -TEE -QQQ -QSS -SSQ -QAP -AQP -PTE -EGK -GKA -KAD -ADQ -MTR -VAE -NEN -ENH -NHV -HVK -VKK -KIK -EYK -YKC -KCK -CKV -LTD -TDI -ILE -LEV -GNP -NPR -PRK -SSL -IAV -DVH -VHN -HNM -NME -EKN -KNQ -NQD -QDG -DGD -DDQ -DQN -QNE -EPG -PGM -AFT -FTR -EDY -DYV -YVF -VFM -FMA -AQL -QLN -ENA -NAE -ETM -TMR -MRK 
-RKI -KIS -ISG -SGM -GME -KER -IGP -GPR -PRR -KEK -KGR -GRQ -RQK -QKP -KPN -NAK -AKR -RIE -QIR -IRV -RVY -VYR -QKI -LQE -EQF -QFV -FVP -VPR -PRS -RST -STN -TNA -ADA -DAK -AKV -AEY -EYS -YSS -KIA -IAG -AGS -GSA -SAL -NAY -AYN -YNS -NSA -SAF -ISQ -QLP -ILA -LAS -ACE -CEL -RKA -KAF -FDA -AAI -AIT -ITD -DLD -KLT -LTE -NLN -LNL -NLW -LWV -WVT -VTD -TDS -DDN -DNA -NEA -ALS -VLN -DNF -NFL -NCG -CGE -GET -TQH -QHE -HES -KSY -SYS -DDE -MVS -VSQ -QVV -VVA -EKP -KPQ -PQL -KKA -AGC -GCN -CNS -NSH -SHG -HGQ -GQD -QDS -SYF -YFL -FLG -LGW -GWQ -WQE -QEY -EYE -YEK -KNP -NPF -PFD -FDP -DPV -PVS -NPS -PSG -GII -IIQ -IQM -MGL -NQL -QLS -LSF -SFD -FDL -DLL -LEE -EEW -EWL -WLE -NPH -PHA -HAL -ALG -GLR -LRR -RRE -REG -GGG -GGA -ASV -VFR -FRE -REL -ALF -LFQ -FQD -QDY -YHG -HGL -GLP -LPA -PAF -AFK -FKN -KNA -NAL -ARF -RFM -FMS -MSE -SEQ -EQR -QRG -RGY -GYK -YKV -KVV -VVF -VFD -DPS -PSN -SNI -NIV -IVL -VLT -TAG -SAN -ANE -ALM -LMF -MFC -FCL -CLA -LAD -ADH -DHG -HGD -AFL -IPT -PTP -TPY -PYY -YYP -YPG -PGF -GFD -FDR -DRD -RDL -DLK -LKW -KWR -WRT -RTG -AEI -EIV -IVP -VPV -PVH -VHC -HCA -CAS -ANG -NGF -GFR -FRV -VTR -TRP -RPA -PAL -LDD -DAY -YRR -RAQ -AQK -QKR -KRR -RRL -RLR -LRV -VKG -KGV -GVL -VLI -ITN -NPL -PLG -LGT -GTA -SPR -PRA -RAD -ETI -TIV -VDF -DFV -FVA -GIH -IHL -LIS -ISD -SDE -EIY -IYA -YAG -AGT -AFA -FAE -EPP -PPA -AGF -GFV -FVS -VSA -ALE -EVV -AGR -RDG -GAD -ADV -VSD -RVH -VHV -HVV -VVY -VYS -YSL -SLS -SKD -KDL -DLG -LPG -RVG -GAI -AIY -IYS -YSA -NAA -SAA -ATK -TKM -KMS -MSS -SSF -SFG -FGL -GLV -QTQ -QYL -YLL -LLG -LGD -RDF -DFT -TRS -RSY -SYV -YVA -NKR -RRI -RIK -ERH -RHD -HDQ -DQL -LVD -VDG -DGL -EIG -IGI -GIG -IGC -GCL -CLP -LPS -AGL -GLF -LFC -FCW -CWV -WVD -VDM -DMS -MSH -HLM -LMR -MRS -RSR -SRS -RSF -SFA -FAG -GEM -EME -MEL -ELW -LWK -WKK -VFE -FEV -EVG -VGL -GLN -LNI -NIS -ISP -SPG -PGS -GSS -SSC -SCH -CHC -HCR -CRE -REP -PGW -GWF -WFR -RVC -VCF -CFA -FAN -ANM -NMS -MSA -SAK -KTL -TLD -VAM -AMQ -MQR -QRL -SFV -FVD -TGG -ALR -AVP -PVR -VRS -RSV -SVS -VSC -SCP -CPL -PLA -LAI -AIK -IKW -KWA -WAL -RLT -LTP -TPS -PSI -IAD -ADR -KAE -MAY -YQG -QGI -GID -IDL -LST -STK -TKA -HGE -YFD -FDG -DGW -GWK -WKA -AYD -YDT -DTN -DLR -LRH -RHN -HNR -NRG -RGG -GGV -GVI -VIQ -SLD -LDL -DLI -LIE -IEE -EWS -WSK -SKN -KNH -NHP -HPE -PEA -ASI -CTP -PEG -EGV -GVS -SQF -QFK -FKR -RIA -ANF -NFQ -LPE -PEF -EFR -FRK -KAM -AQF -FMG -MGQ -QVR -VRG -GGK -KAT -ATF -DPD -VVM -VMS -MSG -SGG -GAQ -AQE -QET -LAF -AFC -LAN -ANP -NPG -PGE -FLV -VPT -YPA -RDC -DCC -CCW -CWR -WRS -RSG -GIK -IKL -LPI -PIE -IEC -ECH -CHS -HSF -SFN -FND -DFR -FRL -TKE -ALV -YDG -RRQ -RQG -GIS -ISV -SVK -ILI -GTI -TIT -TDR -RDT -LAM -AML -LAT -TFA -TEH -EHR -HRV -VHL -LVC -CDE -GSV -VFA -PEY -EYV -YVS -VSI -EVI -VIE -IER -ERD -RDV -DVP -VPW -PWC -WCN -CNR -NRD -LIH -IHV -KDF -DFG -VGI -IIY -YSY -SYN -YND -AAR -RRM -RMS -QYF -FLA -ARM -RML -MLS -EEF -EFI -FIG -IGR -GRF -RFL -FLQ -QES -SKC -KCR -RLV -VAR -ARH -RHE -HER -ERF -RFT -FTS -SGL -REV -CLR -GNA -LFS -FSW -SWM -WMD -MDL -MLR -LWR -VIV -IVH -VHQ -HQV -QVK -VKL -KLN -NVS -VSP -PGT -GTS -TSF -SFH -FHC -VCH -CHA -HAN -NMD -DET -TME -MEV -GRI -RIH -IHD -HDF -FVR -VRQ -RQH -QHQ -HQQ -QQR -QRR -RRV -ERW -RWA -WAA -ANR -NRQ -RQL -QLR -RLS -SLP -LPH -PHH -HHH -HHL -HLS -LSP -PAH -SSP -SPL -SPQ -QSP -SPM -PMV -KQL -TKV -VTS -TSN -SNG -NGH -GHG -GWE -WEE -EEY -NPY -PYD -NPN -PNG -NGM -GMI -MIQ -QLC -LCF -CFD -ESW -SWL -WLT -TKN -NPD -PDA -SLK -LKR -KRN -RNG -NGQ -GQS -QSI -SIF -IFR -HGM -GMP -MPE -FKK -MEE -IRG -GNR -NRV -VTF -DPK -PKK -KIV -GST -NET -TLM -PGD -FLL -LPT -VPI -PIH -IHC -HCS -CSS -SSS -SSN -GFQ -FQI 
-QIT -ITE -TES -ESA -LQQ -YQQ -QAQ -QKL -VLV -VTN -TAL -ALT -LTR -TRR -LLV -DFI -FIT -TSK -KNI -NIH -YSG -SGT -GTM -TMF -MFG -FGF -GFE -FEQ -QFI -FIS -SVM -VMD -LKD -LED -DTE -TEV -EVS -VSK -SKR -KRV -YSN -SND -MIV -LSA -KKF -KFT -TSQ -SQY -YLE -NQK -KRL -RLK -LKS -KSR -SRQ -RQR -GLE -AGI -GIT -ITC -TCL -RSN -DMR -MRH -RHL -HLL -TNT -NTF -TFE -FEA -DLW -IVY -VYN -YNV -NVK -HCT -CTE -TEP -ALK -LKT -KTF -TFV -FVE -STD -TDC -DCG -CGR -GRM -RMI -MIS -ISR -SSH -SHE -ERL -LRK -RKK -KKT -SNW -NWV -WVF -RVS -VSW -SWT -RVP -VPD -PDE -VAF -TEK -KQD -QDL -DLN -IAS -DGH -AYE -ENP -PFH -FHP -PID -IDR -DRP -RPD -DGV -LCG -GDL -DLM -RKW -KWV -WVL -LKH -KHP -CTS -GVN -VNQ -NQF -QFS -FSD -IAI -AIF -IFQ -FRQ -RQA -QAV -AKF -KFM -KTR -TRN -RNN -NNK -NKV -VKF -KFD -DRI -IVM -GAH -HET -TVA -DGF -GFL -LRW -RWR -VNL -NLV -PVT -VTC -TCH -HSS -GFK -FKI -KIT -ITV -YEN -NAR -RKS -NIP -IPV -PVK -KGL -GTT -LDR -REC -ECL -CLK -LVN -VNF -NFT -FTN -TND -DKG -YAA -TFG -FGQ -SEF -EIE -DCN -IHI -HIV -KDM -DMG -PGL -VVQ -VQI -QIA -IAR -RKM -QHL -AKM -KML -FIR -RES -KLR -RHA -EIT -ITT -TTG -TGL -GLD -LDG -GLG -LGI -IGW -GWL -WLK -LKA -LFL -FLW -LWM -LRN -LLK -TAT -FDS -PGG -GGS -GSF -HCH -CHE -HEP -MDH -DHK -HKT -MET -ETA -LER -ERI -RIR -VFT -SQL -QLE -EEE -EET -ETK -TKP -KPM -PMA -TTM -TMM -MMA -AKK -KKK -KKC -KCW -CWQ -WQS -QSN -SNL -NLR -SFS -DTR -RRF -RFD -GFF -FFS -FSP -SPH -PHS -HSP -SPV -PVP -VPP -PPS -PSP -PLV -LVR -RKV -NAH -AHG -NGI -ETW -TWL -WLA -AKN -GLK -LKK -KKD -KDG -DGQ -IFK -FKE -KAL -PSK -MLT -GTV -TVF -VFG -VSV -KNL -NLE -LEN -VHI -MVV -TST -STY -TYL -YLD -LKI -KIR -IRQ -QKK -KLV -VYD -YDV -DVK -MKR -LKE -YVE -DSR -SKS -KSS -SHD -HDR -IKS -RKR -KRT -RTV -MHG -HGS -GSG -SGH -GHS -HSL -SLT -LTG -GAP -APH -PHQ -HQI -QIP -IPP -PPP -PPR -PRT -RTQ -GQQ -TAN -ANQ -DKI -KID -IDP -DPF -FHN -HNK -KRG -RGT -TSR -LRI -RIN -INN -NNS -SSR -SRY -RYN -NVD -VQL -KDT -NEQ -EQP -QPA -LVI -VQC -QCQ -CQH -QHV -HVF -FDF -DFY -FYD -YDP -PVA -VAQ -QLK -LKC -CKE -KEI -IKR -LID -IDH -DHI -HIT -TKG -AIV -IVE -TIY -IYP -PAV -AVI -IKM -KMV -NIF -VLP -PSE -ENC -NCE -CEF -EFD -DPE -EED -DEP -EPT -PTL -TLE -SWP -WPH -PHL -HLQ -VYE -YEL -ELF -FLR -LRF -FLE -ESP -PDF -FQA -QAS -SIG -IGK -GKK -KKY -KYI -YID -IDQ -DQR -QRF -RFV -FVL -DLF -LFD -DPR -PRE -DFL -FLK -VLH -LHR -HRI -RIY -IYG -YGK -GKF -RAF -AFI -IRK -RKH -KHI -HIN -NNM -NMF -MFL -YET -ETD -DSF -FNG -NGV -GVG -VGE -LEI -ILG -LGS -GSI -SII -IIN -ING -GFA -FAL -ALP -LPL -PLK -LKQ -KQE -QEH -EHK -HKV -KVL -VLL -PLH -LHK -HKP -KPK -PKC -KCL -CLS -SLY -LYH -YHA -HAQ -AYC -YCV -CVV -FIE -EKD -TPQ -PQV -QVF -LKF -KFW -FWP -WPR -RTC -TCS -SSK -KEV -EVM -VMF -GEV -EVE -DII -IIE -IEP -EPE -KII -DPL -PLF -LFR -AKC -KCV -CVS -PHF -HFQ -FQV -RAL -ALY -LYF -YFW -FWN -WNN -NNE -NEY -EYI -YIL -TSS -LVM -VMP -MPI -PIM -IMF -MFP -FPA -LYR -YRI -RIS -EHW -HWN -WNQ -NQT -IVA -TFM -MEM -EMN -MNG -NGK -GKL -KLF -LTS -TYK -YKG -GER -EKQ -KQR -QRE -KDR -RDA -AFW -FWK -MEA -LNP -NPP -EVT -VTP -PSL -SLF -LFP -FPE -TDY -DYL -DGP -GPN -PNM -NMT -MTP -TPL -PLP -LPV -AGG -GDK -KSP -SPS -PSV -VVK -KKS -STG -ETT -TTT -TTP -PAK -TKL -KLP -STP -TPT -PTS -TSP -GLS -PPD -DKV -KVD -GFS -FSR -RSL -ARP -RPR -RSH -SHS -QFR -RYQ -YQS -SNQ -NQQ -QQE -PLL -KDV -ELH -LHE -RKL -LAQ -AQC -QCG -CGV -GVM -MFD -FLD -LDC -CVA -LKG -VKR -LVE -VEC -ECV -CVG -VGS -TRG -EPV -PVY -VYP -YPD -PDI -IIR -IRM -SVN -VNI -FRT -RTL -TLP -EPN -PNL -LEP -EPS -PSW -YEF -EFF -FFL -FQP -QPS -KRY -RYV -YVD -DQK -QKF -KFV -VLM -LML -MLL -EYL -KTI -ILH -VYG -AYI -YIR -KQC -QCN -CNH -NHI -HIF -IFL -RFI -FIY -IYE -LEH -EHF -HFN -GVA -HKQ -KQF -QFL -VRV 
-IPL -LHS -HSV -VKS -FHA -DAT -HVI -VIR -RGL -LKY -KYW -YWP -WPK -PKT -KTC -TCT -CTQ -TQK -DVI -PSQ -FVK -VKI -KIQ -IQE -QEP -LFK -FKQ -ARC -RCV -EDN -DNC -NCH -CHT -HTV -AVF -FGT -GTL -TLY -LYQ -YQV -QVS -LIY -IYN -ASY -YKL -QQK -KAQ -ERQ -WRG -RLQ -LQG -QGT -GTQ -GAK -APV -PRP -RPT -MPY -PYK -KEP -PPK -PKV -KCT -CTA -TAK -KPS -SGK -GKD -EAQ -QPQ -PQP -PQA -AQS -QPP -SNK -KRP -RPS -NST -TPP -PTQ -TQL -IKY -KYS -GGP -GPQ -PQI -QIV -ERR -RQS -SRF -RFN -FNL -NLS -KNR -NRE -LQK -DSP -SPT -TQE -LFI -FIQ -LRQ -RQC -QCC -CCV -CVL -VLF -SDP -SDL -KFK -RAG -NEM -VEY -YIT -ITH -THS -HSR -DVV -VVT -YPE -VTM -MFS -NLF -NPT -PTG -AWP -QPN -PNI -NIA -IRR -RQI -QIN -INH -IFY -FYR -YRF -EHH -HHN -HNG -GIA -HKM -KMF -VYH -YHP -HPQ -KES -PVI -IVG -KTH -SPK -FLN -EFS -FSK -KVM -VME -MEP -LYY -YYW -YWN -YIM -IMS -MSL -SDN -ARV -YRN -RNS -NSK -KSH -SHW -WNK -NKT -TIH -IHG -GLI -YNA -LFM -MNQ -DDC -DCT -TQQ -QQY -QYK -KQK -QKG -RFR -FRM -RMK -MKE -EMW -MWQ -WQK -RLN -NPQ -PQY -QYP -YPM -PMF -MFR -FRA -RAP -APP -PPL -PPV -YSM -SME -ETP -PTA -DIQ -IQL -AVQ -VQM -QML -MLK -KDI -IKK -RRK -LPQ -PQD -DVY -VYT -YTI -TIK -IKA -AHK -HKR -RAE -FLT -SQE -MMR -MRG -RGF -RLI -STT -TTS -KKP -HGT -TTH -GSK -KST -TTE -GKQ -KQS -QSG -SGS -SVP -QGK -GKH -KHH -HHS -SKT -KTK -TKT -VSR -TKK -RKG -KGQ -QSK -SKQ -QQP -SQS -QKQ -KQG -QGS -AIM -MNP -TPV -PVL -TVT -VTK -TKD -KDD -DHA -HAH -AHP -HPT -TLL -LGA -GAV -AVS -SPI -PIS -TAV -ENG -NGN -GNS -NSN -SNN -NNN -NMN -MNI -NIN -INT -NTS -SNT -NTQ -TQD -DAN -ANH -NHA -HAS -SID -IDI -DIP -IPR -SFE -FER -RLP -PTK -PDT -DTD -KTP -PQR -QRH -RHS -RFE -FEP -PSR -RYT -YTP -PLT -PNF -NFN -FNE -NEV -RIP -FIA -DQC -CNT -DFN -NDP -PSF -IQG -KRS -IEF -TNR -NRF -FTY -TYT -YTN -TNE -EMY -MYA -YAH -AHV -VVN -VNM -MFK -KIN -INL -FRP -RPI -PIP -PVN -VNP -NPV -PVG -VGD -GDI -DIY -IYD -DED -VNE -LAW -PHM -AVY -FNH -NHQ -KQY -QYI -QDF -FIL -DIR -DCL -TLH -SFI -RSM -SMN -MNN -NNI -LQF -KFN -VRI -RIL -KVR -VRC -RCL -YCI -CIV -IVQ -KDP -LLT -VMG -LRY -RYW -PKI -INS -NEI -DIF -IFE -PLE -LEF -FIK -IKV -VEV -VPL -LFV -FVQ -KCI -CIS -LSY -SYW -EYF -NLC -LCI -CIE -VIL -ILP -PII -IIF -IFP -LYE -NGE -SIS -DPY -PYM -YML -MLV -QAI -AIN -NSG -GSW -SWN -WNR -NRA -RAI -AIH -IHA -HAM -MAF -KIF -ETN -VLY -CNA -LYL -KET -QRK -KVQ -ENW -NWS -YVK -VKN -NND -KDQ -QYT -NSF -FNT -NTA -NNT -NTL -ENE -END -NDC -DCD -CDS -SEI -IKQ -KQI -QIF -IFG -FGK -LPR -RKP -SHN -HND -NDS -DSN -VNS -NSY -SYY -YYI -YIP -PNS -NGA -GAN -NGT -TVI -VIA -IAP -APS -SNR -NRT -RTN -TNQ -NQV -QVN -VNG -GVY -YEA -SFR -FRD -KLS -LSM -SMC -MCC -RQT -QTL -VDY -DYI -YIA -VST -SDA -QEI -RTF -TFP -FPS -NHE -KIL -DVD -EPA -PAW -LQV -LLL -PMT -TDA -RYI -DHS -FMV -MVH -VHR -HRP -RPF -PFI -KAI -FIF -FET -KHN -HKL -IRA -RPK -KCA -AYH -YHQ -SYC -DFK -FKL -ADT -WPV -TNS -QAA -EFQ -FQR -QRC -RCM -CMV -MVP -CLN -SHF -LWN -NDH -HIR -IRN -NLI -ITQ -TQN -QNH -NHK -VIM -IMP -PIV -IVF -VFP -PAM -AME -NTR -RGH -GHW -NQA -VQS -QSL -NVR -VRK -VMA -AET -TDQ -DQI -QIL -ILF -DEC -KFQ -FQE -QED -EAN -KRE -ATW -TWK -WKL -AVL -PRF -RFS -FSS -TGK -GKT -LTC -TCN -CNK -NKA -SRM -RMV -VDA -NGP -GPF -PFQ -QPV -PVV -VVL -LHI -QEK -KWK -WKE -SEM -THN -NRN -RNV -VIT -EPI -PIY -VVH -VHM -HMF -MFA -FAV -AVN -VLQ -HKI -MAL -KIM -IME -THW -QQF -EAW -AWV -WVK -KAN -YTV -TVY -YSQ -STM -TMS -MSI -SIP -TDG -GPL -LFE -FED -EDV -DVQ -TVK -AHQ -HQA -QKD -RPL -QDP -DPH -PHT -HTK -AHC -CRA -SQD -DGR -MSV -ATD -TDD -DAL -LYP -YPI -PIA -IDE -DVT -TLR -NSI -SIR -STI -TIA -LGV -VER -ERT -RTR -IQF -LVL -QLG -LGN -GNF -FTP -LVG -GPD -PDH -HVH -HCL -VVR -VRD -RDK -ESL -KHS -HFV -VPM -PML -GDW -DWF -WFT -SRT -RTS 
-SAC -CGL -YPR -PRV -PAI -KSM -SMF -TLC -LCR -CRD -RDD -DDT -DTP -TPM -VRR -KLG -GEF -FAK -FEK -IEG -EGL -GLH -LHV -HVD -EQD -SVR -VRL -SAI -IAF -AFG -ANK -NKK -PIL -IEL -KSW -RVR -VRY -YMV -IEI -QNV -DMD -MDT -DTT -NMY -MYT -TNL -EVR -RCA -CAA -TQR -QEF -NLP -PED -DKR -RQN -QNI -NII -IIC -LLN -NVA -LAG -AGV -IMG -APL -PLI -LIG -EQT -QTV -VSE -IYM -YMQ -NDQ -DQT -QTP -KVN -EDG -DGK -GKW -FMP -MPL -LGQ -FFD -PLC -LCL -LNW -NWL -TDH -VFS -FSI -IMK -LTQ -KFG -FGG -GQW -QWA -WAS -TNI -VPK -PKM -MQK -TNY -YLQ -QRM -RMT -MTC -CLF -MTQ -EDD -VPN -PNV -VRF -FNA -AKS -RIG -GKN -PST -VKP -KPL -LGK -DSD -SDF -DFD -FDV -DVR -RYF -YFS -FSE -SLG -SVD -DSL -LKN -SIK -RSE -IPF -PFL -FAM -AMY -MYL -LRT -EHS -HSA -EIH -VVP -TLQ -VCY -CYP -VTQ -RAN -NFR -KLC -LCQ -NKL -TEY -KSD -NFV -LAV -EAC -ACV -IAQ -VEH -EHL -QCA -VDL -DLQ -AVG -VGP -PEI -ITR -TRV -RVD -AFQ -DFC -FCA -CAN -ANL -NLD -QVQ -QII -IIL -SIL -LPY -PYV -YVR -PNP -PHV -SVI -MLG -YQT -ECP -CPE -CVN -VND -GIQ -IQQ -LSQ -SKW -IEY -EYM -YMP -AGQ -GQL -FDQ -GLC -LCM -CMG -MGW -WLN -HVY -VYA -YAI -AIR -LNM -QFG -FGA -APW -PWA -WAE -IIP -IPM -PMI -MIL -MSR -SRN -RNK -NKN -KNY -YLH -HRM -EVC -VCG -CGT -GTD -DIT -TTK -PTV -ADP -VAN -ANV -FNV -SPF -VID -IDA -DAQ -AQV -KPT -NTD -TDV -VKH -KHF -HFA -FAA -LPF -GTF -TFT -FTT -YVH -ISH -HEH -PSD -AHF -AVK -RQY -FRN -LCS -SDD -DNV -FSN -MPT -FTE -ITK -FQN -QNL -NLM -LMK -MKD -KDC -DCE -CEA -ASH -SHK -KEF -EFC -FCE -CEN -ADC -DCR -MSQ -SQI -LPC -PCI -CIK -NQH -KDN -DNT -NTI -IEH -GIR -EDA -AKW -SLC -CMA -MAW -AWL -WLV -VDH -NLK -KEW -EWA -WAH -AHA -HAT -ATI -TII -AMS -GDP -PNY -MTT -TLF -FCI -CIN -INV -CGQ -TKH -KHM -HML -MLP -VLR -LRM -RMA -MAG -SLQ -KIG -GPI -LQS -KPI -QDQ -VKY -KYF -YFA -FAQ -TTA -YPL -LLM -LMD -HDD -LGP -PER -EVF -VPY -PYI -YIG -IGG -QYA -YAT -ILL -VRE -SLN -QLF -ADW -WFS -KVS -IVR -NIL -MVK -RAV -VGK -NLG -EDW -DWD -WDY -YIS -FQK -IND -NDN -DNQ -VDC -CLI -ISI -KFF -FFN -DES -SHT -HTQ -IGD -DRF -VQP -QPF -LCE -DNE -NEG -GDV -SGF -LNK -NKI -VQN -TVR -NKD -DQV -QVI -VIN -NNF -FLP -NML -EFP -FPD -PDV -IIA -GIE -DVN -VNW -NWR -VRM -MAI -IPI -LGM -GMQ -MQF -QFF -DLC -LSW -WLW -LWD -WDT -YSI -VNN -NNL -EIF -FGS -SDW -DWC -WCR -SRL -ENF -FTI -LTT -GVP -NIR -IRF -SYA -YAV -KYD -YDA -KNT -LQT -AEC -ECQ -CQE -MVM -SQN -QNQ -NQP -AND -FDM -EGP -ETF -PVD -INW -NWK -WKF -FNQ -GNI -NID -VHT -HTE -EAD -ISC -SCV -CVE -FSH -HDG -GEY -GRV -VVI -VIF -QRD -GKY -KYV -GVR -EYN -YST -STF -TFQ -FQS -QSH -FDY -EID -INQ -NQI -IRW -RWL -NFI -DKT -KLW -WKI -DAW -AWN -WNL -NRI -FRG -RGR -GRL -LQI -SIV -PME -YGN -AHT -HTY -TYH -YHV -HVN -NSD -TFL -DDL -RVN -ESF -FNI -VDI -IKP -PAN -ITA -EFH -TQC -CNW -NWF -WFV -KGS -RLC -LCD -CDM -MRD -RDR -ALC -AYA -YAK -DPQ -QSR -SFF -KFS -NGR -GRY -TRD -YLT -KVW -VWD -WDL -MES -PVE -ETY -TYP -YPV -HNY -YLR -RTK -LCA -CAL -IFD -FDK -KFE -FEC -CDW -DWS -WSG -HIL -ILT -GSY -SYH -YHN -HNL -FRS -YAR -ARG -NNQ -KTW -TWE -WEA -EAR -RPQ -EPH -HSQ -FVV -QLQ -QFD -HTA -TAW -AWH -WHP -HPK -PKD -DNI -TNN -NLY -LYI -YIF -IFS -MGR -GRW -RWG -WGR -PDP -PQM -MQT -FMR -MRQ -SIT -IGN -GNM -MLN -TAI -INI -SWC -WCF -CFS -FSQ -QIK -GAL -ADI -EFN -NHD -RDP -SKA -RRG -RGE -INK -WLQ -QKN -VHF -HFL -WKV -KSF -GGY -GYN -YNT -NTK -NGL -PQN -VTA -VKQ -RRT -YHI -LWH -WHL -HLE -NQS -QSY -YNI -TNM -TEC -ECN -CNV -NVF -VFV -KGT -TIR -CDR -DRH -HSK -QFE -PEN -NRS -SGR -YMI -LSI -LHM -HME -VHE -HEY -DCI -CIF -ECC -CWN -WNG -SIM -IMT -MTG -YNN -NFF -FFR -LKP -KPR -KVC -VCT -CTG -GKR -CLD -LDF -FNK -ENI -QDK -DID -IDT -TRK -SFL -RDH -HSY -IST -NHT -HTG -QVH -HRR -WLP -PQQ -QQN -AYF -RPE -EGY -YNL -PAT -LRP -RPM -PMD -LMV 
-TPR -SDY -DYE -TYM -YMS -WNF -NFE -QSF -HPH -HHC -HCN -MRA -RHT -TKF -FFE -HSG -MEN -ENR -NRP -RPV -TYQ -VHD -HDY -CVW -VWN -NGS -RMF -TKR -AIL -VCV -DFS -HPS -MRF -RFC -FCV -AWF -WFF -FFP -FPN -NTT -TTR -VFW -FWD -WDA -AFS -SNF -FTG -TGC -GCH -CHH -HHG -GQN -GLY -YFQ -RFG -FGY -GYI -IPE -PET -TFS -FSG -SGN -FTD -DDF -ELY -QTN -TNF -LDA -LTI -TIQ -IQH -QHI -IVI -VIP -PRC -RCG -CGN -SLM -LMH -HGG -EVN -RTH -HLH -LHA -HAV -YTL -FPG -EPR -PRW -RWP -PRN -RNR -NRR -RRD -DLT -LTY -TYA -YAF -PKN -SRA -FGR -RWS -WSD -FTL -FST -ITI -TIG -IGF -GFY -FYT -YTG -GDH -EPF -LAH -HAF -SPP -KFH -FHL -HLD -WVV -ESV -AVH -IGH -GHL -LGH -ESI -IMY -MYP -YPT -PTI -LTN -VEG -EGI -IQY -YLY -LYG -YGA -KHQ -HQR -DTG -GGF -FSA -RID -IDG -DGS -TVG -VLW -LWF -WFL -MGS -PLR -KPG -TSW -WNS -VRT -TQV -EYG -YGC -GCF -CFE -KGH -LNG -GNK -NKP -KPE -EYD -GFT -EGM -GMG -MGV -VGR -RIT -LMW -MWP -WPE -CET -SYG -KRM -KMM -MMV -MVF -FES -FGM -HFD -SFC -CES -LHF -HFM -MRY -QPG -PGK -GRS -RSP -SLH -HKD -KSI -IVN -NQN -QND -EFE -GEW -EWI -WIL -ADN -DNH -GDC -DCF -CFM -AWS -WSN -RLH -QAR -FSF -SFP -FPK -EHP -HPL -LLF -LFN -FNP -PFE -YCF -CFT -FTK -KEG -CDL -PAQ -PFR -FRI -QGP -ERP -RQQ -QQC -QCS -CSQ -SQR -QRI -RIQ -QGE -NQC -QCR -CRS -RSQ -SQM -QSC -SCC -CCQ -LQN -NVE -EQC -CQC -MPG -GWS -WSC -SCL -CLV -FVG -VGQ -VQE -QTK -MLE -LEG -AQY -CQG -VIH -IHT -IDV -VSH -SHV -HVL -PRQ -IYC -YCS -CST -AGP -HEE -HHE -STW -TWS -AYP -YPY -PYS -YSK -KNG -NGG -GGT -HTC -TCA -PMY -MYI -YIY -YGE -ERS -VMI -KNK -VYV -YVG -VGN -GNV -VAW -AWA -AHI -NVQ -VQG -GQF -QFY -TPH -HQS -SYD -LNC -NCT -EWG -WGL -RLD -SWS -WSL -LLY -LYW -YWL -VSF -PFY -FYN -YNY -NYR -YRP -RPP -PPF -PFN -FNC -SKF -FTF -FSY -AQR -LGY -GYV -YVP -SWE -SEW -WIG -IGT -EQH -QHR -HRE -RET -DTK -TKS -GGL -AFR -QNR -TAC -ACI -CII -DVF -FGV -GVT -VTH -THR -MNV -NVN -VNV -CVQ -VQA -PVF -VFI -IYT -YTS -IEV -QNG -NTW -TWP -WPT -PYP -NGW -GWN -NGD -GDT -LYT -YTC -PTY -TYI -SIN -INE -NNG -SVG -TVN -KAP -YDN -NYI -EFG -SRW -LMY -MYW -YWI -SYQ -YQP -FNR -NRH -YKP -PLY -LYS -YSW -VEW -EWV -WVG -RHK -HKE -TLK -KSK -KTQ -YRT -KHK -VTV -RGD -DIV -QGM -GMS -VII -IIH -DAC -TFH -FHT -MVN -VNR -KNN -KRH -SIQ -NYT -WGF -GFC -MVT -VTI -TIS -ISY -GYE -YEP -QVP -YLV -GGC -GCG -CGF -GEH -EHI -LEW -EWE -WEP -PRL -LHL -TGP -GPV -PVQ -VQV -QVT -AIQ -QAH -HEV -GSH -IHK -VQT -TGT -GTR -TRL -SSM -GHP -HPF -PYE -IHR -HRH -RHP -HPY -YPC -PCS -CSK -GRK -RLF -AIP -EHG -HGR -AWM -WMH -MHI -LMG -MGG -QVY -VYF -YFC -FCY -CYD -YDK -SPY -SYE -EDF -FNM -MEF -SPC -PCG -GTH -PYW -WLL -LQW -QWL -PYT -TNK -RHF -HFG -ART -RTI -IHW -HWV -WVQ -RMG -DAS -ELG -VTT -DRG -WVR -DVC -VCA -TIF -IFH -ELM -DEY -QRS -NVG -GTE -TEN -HAG -GVQ -YTD -DLY -AQN -GVD -DGM -GML -CAI -IRP -GIW -IWG -WGN -GNG -GDQ -QTM -GHV -HGF -GFI -AAH -DGT -APG -PGQ -GQA -YFI -FIN -PIN -INM -MFE -FEF -FAR -QRW -KMR -MRI -SGP -GPA -AVR -VRW -RWV -WVM -VMT -TGW -WQR -HFR -FRF -GFP -PAP -RLY -NYF -LFT -TTQ -QAL -YYV -QMK -ARA -MMK -QLH -RMR -GRT -RTP -RLE -AHN -HNI -LQA -CLQ -PLM -LMA -SFK -LDP -PDS -SMG -EMS -MSC -SCA -ARI -FEM -EMT -MTL -LQP -QPL -HKK -DWN -WNT -QAT -QGL -LGG -GSP -HSH -HTT -MAN -YHF -FVT -KED -YAN -ANY -IQA -QAD -ADY -NHG -PSM -SMT -MTA -THF -HFP -FPR -YGV -GRE -CVM -VMM -MML -GMK -FCS -SYL -PEP -LMT -MTF -LYD -DDW -DWM -WMR -CSR -PPE -YLM -MKF -VNK -NKM -KMT -LLW -LWP -WPP -DQA -QLD -IQV -VGV -GVV -IQS -QSA -DIN -INF -QDT -DRL -RTE -PAR -PTM -TMP -PPQ -PPG -GTP -TVP -PGP -NPA -QVD -SGV -QPR -HNV -NVH -VHK -TAM -PLN -LNR -NRL -HTH -THM -HMA -QCK -CKD -HFS -YFT -FTH -HRK -NHS -APF -PFS -QEE -MTS -ALH -HDV -QEN -FNN -GIF -APQ -QQV -MTV -LPK -PKP -PTD 
-VGT -PCP -CPA -SNM -NMP -DQG -TED -GGH -HPP -PRG -EMH -MHW -HWP -PMK -AIG -LTM -AGY -GYL -KWP -WPL -FVI -KRC -CVY -VYY -YYF -YFK -PQG -GAF -FSL -LSG -SGY -YNR -RVM -VMR -FPF -PFK -HIS -KKH -KHR -HRT -RTW -TWF -WMA -GHF -HFH -FHE -HEK -PLD -SFY -FYG -TDN -YEH -EHD -EPD -PGR -MHP -PAY -YPP -DMP -MPR -RAH -AHS -SFT -GPG -KHG -LPD -LCP -CPR -EPC -DPP -KPP -PPC -PCF -CFR -EPW -PWT -WTP -PGH -HGA -GAC -IMA -RNC -NCD -CDK -RGP -GPP -SEP -PKF -AMP -VAP -APR -RQP -KVP -FVN -VNT -ESC -CEV -LYC -CIR -GKV -LVV -VVW -WDE -ETS -VRN -RNY -RIF -KFY -GSM -SMV -EHY -HYH -YHT -THV -PSH -SHQ -PYG -YGY -GYT -IQI -QIE -EIN -TFR -GNC -NCI -RPY -AQI -CQK -HAA -MSN -HEW -EWQ -WQF -FDN -NAW -AWQ -QEM -EML -LNH -QKV -MDA -DCH -EHQ -FRR -NKS -SRP -PYF -YFE -QVC -TYS -DIH -HRQ -GDF -DFP -FPT -PGV -FQL -EKC -KCD -CDY -DYP -YPS -GSQ -QMS -ACD -DYD -VRP -DVW -VWE -WEH -EHE -LDH -LMM -QQT -STE -QRP -RHC -HCD -CDV -TSC -HHQ -HQL -NHL -TPI -PIK -VSM -SMR -MRE -DRS -RRR -PRI -LNQ -QST -INR -ARQ -KFR -KPY -YWE -RVA -RQF -QRV -LVH -ARY -AMG -FEL -KYY -YVQ -KMA -IHE -MGP -RGC -TSV -DSC -SCS -CSN -TQS -QSV -GPT -MPD -PDQ -DQF -QFP -RPG -GMM -MMF -FPV -SEC -ECS -PEC -ECE -ERG -ANN -NNR -NRM -LQC -QIG -ISA -REH -HKA -LQM -GKS -TRM -GCD -GVK -YHS -HSN -WDD -YGD -HAD -IGE -IFN -FNS -QLW -WMV -VDN -FQT -QTE -YWS -WSE -LGF -LHG -HGY -FEH -HFK -FKD -DQM -QFT -FTA -NDT -QTR -VFN -AFP -KFA -AYL -YRW -RWH -WHS -SYI -TPD -FHS -QCL -CLW -WRW -RWW -WWK -WGC -GCP -LTF -TFI -IRH -RHR -EFY -IDM -DMV -VKT -DMY -MYD -DTF -KRW -RWD -WDP -MVL -EMA -QGR -AEW -WIA -TGY -PTF -FEN -GHR -QPI -PFP -FPH -HHI -ILQ -IDF -NDY -DYA -YAC -CSI -TRC -RCY -CYK -ASC -SCT -SCY -CYM -STQ -MIE -NWE -WEF -PDN -DNN -NNA -API -KHA -AFN -LHH -HHF -HFY -YRD -DGY -GYS -LDY -QFA -SVQ -VQQ -CVK -AQW -QWI -SCI -DNP -DMI -YMR -LIN -CLG -GSC -SCN -DFA -CGY -GYA -IVC -CFW -HSD -GQK -III -GGI -RGA -YER -GLQ -GPH -PHG -HGW -GWR -WRM -SWG -LDQ -IVV -YLP -FQQ -QQH -QHY -HYG -YGG -HRS -RSD -KLH -LHN -DIE -IHS -DAP -AEM -EMK -IGY -HFI -QRY -RTA -DWG -YNH -NHC -CDP -QDR -WRN -NNW -NWW -WWQ -WQM -HAP -PLQ -LQY -AVM -MAM -MED -LFA -GNL -LDW -DWE -RRP -RCS -SRI -IQT -RFW -FWG -WGE -WHV -EGT -TAR -WFI -YAD -DWL -LWG -WGY -GYD -HIA -MPQ -EWR -WRY -RYA -YAL -NWQ -WQP -PPY -YDW -WSW -WML -IPD -CNP -PGC -GCV -CVD -QGV -QLY -YIC -ICF -CFP -LPM -MTI -TIP -IPG -MKT -QTF -PGI -RWT -RGW -WQA -PDD -DDY -RFP -GMT -RRY -RWK -WKP -KPW -PWR -HIW -IWY -WYT -EGW -QPD -RIC -ICV -LFF -FFA -FAP -RNA -NPW -PWN -AGK -LYM -FQH -QHF -NAV -VEM -MYQ -YQR -QRN -RNF -TMH -MHS -RFH -KHY -HYS -YSF -TRW -RWE -FYS -GPM -PMR -MRT -TGH -NWI -WIV -IRT -TGR -TTD -DSG -SDG -QYY -FWI -WII -FLY -YDL -ACW -CWA -WAP -LFG -IWI -WIP -NYD -YDQ -GYM -CVR -RGM -GMA -AYV -SKM -GIP -IPY -PYR -RAM -KYA -YPH -PHI -HIE -RTM -MDP -MRP -PGN -HSM -SML -GIM -IML -YPW -DRR -MWC -VQD -QRQ -QQI -INA -RNQ -EMR -YLN -PTR -NPC -QYG -DAH -AHR -HRA -QAW -GRA -AHH -HGC -GCS -SRH -GVH -VHG -AWI -ASF -QNP -NPM -PMG -LMP -VYW -YWK -WKG -RRW -KIW -IWR -WRA -EYA -GGN -DRY -YYG -FYA -YAM -AMR -MRL -RLW -WPG -GEI -GTK -FAF -MVG -GKP -MFY -FYM -YMT -TGQ -VVV -GMV -HQG -PHY -GVW -VWI -PNN -RKY -HAI -IIG -DTY -PEM -LCW -WVP -VPG -PGY -YSD -VEP -KPF -PDL -PMN -MNM -NMV -VMQ -MQQ -HPR -KVG -TWG -WGK -VGM -IGL -LYV -GIY -IYV -RHG -HGV -EHN -HNE -QMR -MRV -KYQ -PIT -TEW -EWT -WTV -LME -AWW -WWG -WGP -PWF -WFA -IIV -KRF -FMN -MNE -SMP -HHM -HMY -MYG -GQY -YGQ -GQG -WLI -LIF -QYR -IFA -KWL -ESG -DFH -FHR -HRG -YDR -DPT -IKH -HGP -RTD -LYA -PVM -MGH -GHT -TVQ -RTY -HGI -KHT -HTP -KMC -MCW -GRP -AYG -MKV -TMW -MWA -WAK -HEA -CGG -LVF -RYR -WLD -NAF -VGH -SAP -QAG -QDW 
-DWT -YTA -AQG -GLT -TTI -SIW -IWL -RQD -NIE -PDY -RMD -INP -DIG -GRC -CTK -DRM -MIG -QNF -NFA -PRY -MHA -FEG -AIW -IWS -WSM -GPS -ATR -RRN -VPQ -TSH -CSP -DNG -SFM -FMI -MIF -DCP -CPP -AQH -QHC -CRK -RCR -AFF -FFC -FCP -PPN -AIE -AID -GNT -FYP -AMV -SYR -QDM -MIC -CYN -YNQ -PTT -GQC -QCY -DHR -GCA -CAC -ACP -CPN -CCS -KCN -YKT -TCP -LCY -MFM -GCI -CID -CPK -YVC -VCC -CCN -DRC -RCN -VCL -KCY -CYV -TQT -QTC -CEK -EKY -VSY -YFH -FHD -YEC -ECT -CHR -GPY -PYN -NVC -LCN -MGE -THT -HTI -HTS -HLN -KFI -ITY -EIP -NAN -LII -DFF -FCN -TSM -TYF -LLC -LCT -CTF -FLH -HHP -LHQ -HQT -FPL -PMS -LFY -YRK -KTN -TNV -YKH -NMR -YGP -LSH -PHD -HDT -HEC -FLC -CFG -AQQ -SGC -GCR -CRF -LWL -EMD -EGF -VGF -TWV -PQK -HDA -THC -HCG -CGW -WSS -GWP -MPM -IYI -HLP -RPC -PCL -NNH -HIY -YTY -TIM -IMI -FVF -MGA -YLG -ACF -CFV -VIC -ICI -EGC -CIH -IHF -HDI -QSD -PKG -VML -LTH -THK -HKG -YMH -HSE -LMC -MCV -LFH -FHI -QFC -KYK -PFV -PPI -TVM -IKF -KFP -QGY -GYG -YGM -AMC -MCL -MKI -QIM -TRT -IDK -WLH -DND -FIV -KGF -HPN -AFV -YKR -VFF -FFV -PKS -TKQ -PNH -QIY -NSR -IQP -QPK -IVT -CHV -CLH -QAN -NEH -YIH -DVM -MLC -LCV -IQR -RYK -WLY -IDD -TFY -FID -SPN -VVC -TTN -EMM -TGM -MSK -SHR -HRN -GEQ -FIC -CTV -MFH -YGL -YGS -MHE -HEM -MMS -SMH -MHT -VLC -KYP -TGI -RYG -YGT -GQI -GPK -PKQ -GYF -WLR -CYI -IFV -GYQ -FPM -YVV -APY -IKI -MDS -HAR -PNT -HVT -VNH -HPD -NIK -ESD -FHV -TFW -WPD -PDM -DMK -KYN -TWY -IHQ -SHP -EYP -YPK -PKL -IRS -RSC -CSA -LMS -PHK -KPV -VCI -FGW -WFH -FDT -KYG -INC -NCA -CAV -FCK -CKK -FKV -DYS -TRI -RKF -FLM -MEC -ECR -CRN -PRD -PPM -HLR -GHQ -HQP -DYC -YCT -PCH -MIT -DPI -PIQ -QMP -EVY -RGS -SNV -NVP -PSC -TPF -RKC -CVP -QFQ -MDR -KCP -CPH -PHR -YTK -YDS -EKW -KWH -WHA -KDH -HRL -REF -FGD -FGE -RND -SYT -PEW -EWF -TGF -CNG -NEF -VPC -SMI -RWF -PHE -QNS -GNY -MLQ -PFM -GDM -TMK -MSP -GQP -GLM -VFQ -TRA -PIG -FQG -GMR -TAQ -AQM -NFY -FYQ -GFG -DRT -KMY -MYE -NRY -VPH -HVP -LHP -HPG -VHP -PQH -SHA -HMH -KWF -WFG -LEY -DYK -APM -PMH -NPK -QTS -THQ -MPH -SHC -HCV -SDC -CVT -KQP -QPM -MNA -GWV -LFW -FWL -WLG -QKW -KWW -WWH -WHT -HKN -QTD -QID -NFG -TPN -NSW -SWF -VDT -EFW -WQN -NIT -LLI -GTN -ESN -NRW -RWC -WCS -CSW -YQL -QLM -MLF -MLW -DPG -RHW -HWD -WDQ -NER -HEG -FPY -PYA -QMN -MNL -KLY -FAD -TKC -KCH -QKH -YKI -NDM -MVI -SHI -HIQ -ECK -CKY -KYE -RQV -KLM -MKL -YVT -VKM -DHY -HYA -DME -VFC -CIT -PIF -IFF -FFF -KIP -WFK -KSC -SCK -CKG -CAY -CKS -LQH -QHP -PWV -WVE -MRM -MLH -SHM -NSM -QGN -GYY -YYD -KGW -RYP -YSP -PND -ITP -IFC -NAC -QVL -NKW -KWT -WTL -TCD -LCC -CCT -HLC -YWA -WAI -TDP -IDY -YVN -LTW -CTI -AFY -FYI -YGR -TRH -RNW -WRL -EVH -TPC -CAP -IIM -MGT -ILC -CWL -PFF -FFI -PFC -CHM -HMP -VIY -YAY -YFN -IKC -CKF -KFC -FCR -CRQ -WNI -WRR -RRC -RCP -CPV -YQI -FGN -CVI -IFI -ITW -TWI -CRI -ILM -DTC -VHH -HHY -HYV -LHC -HCK -CKP -ETC -IQC -HNC -IYQ -LPW -PWK -ITL -TMY -CDF -DFW -WLS -TCC -IMH -MHL -TPK -LVW -VWV -FFW -FWR -WRQ -PNK -VCW -FII -PIC -ICK -CWF -FHM -FNW -YTM -AFH -FHK -RFK -FKC -PNQ -GAW -AWD -YTT -TWN -DIW -IWV -WVS -AGH -GHA -AMI -AVW -TAH -QIS -STC -TCG -CGA -ILY -ITG -ICW -ICR -SCW -CWI -WIH -IHP -HPA -FFT -FTW -TNC -EKM -MLI -ICM -CMT -YIV -DRW -EVW -VWL -CTC -NAI -LMI -TVW -VWT -WTI -ISM -SQC -QCT -QHD -HDH -IYH -YQK -FAS -CKL -TFC -TEF -IRI -DHP -SIY -ADF -IRC -MPS -NWT -CEG -KNW -WSA -MAV -DML -MPV -WIY -IYL -IHH -VFK -FIP -IMV -SIH -TMQ -MQS -ACK -FLF -VMW -WCP -CPF -NIM -CNE -FVW -YIQ -CQY -KQH -QHS -LMQ -DCS -MEI -PTC -NRK -QVG -CTD -FVH -LHW -HWA -FVM -AMW -WLF -NQY -QYN -TCV -DFM -FML -VTY -MLD -MRR -CNQ -CNY -KIY -IYF -RNP -FFK -GIN -EQV -SEH -CDH -TVH -VHW -IWP -PHN -TCE -HRD -NDG 
-RTT -VNA -VGY -NYQ -KCS -YFG -MFQ -NWA -VMV -DWP -CPI -PIW -CIA -NYP -PHP -ITF -SNH -IMM -MMI -MII -KVY -SYP -TMG -SMQ -CGP -GPC -PCD -ANI -RLM -SWV -TRY -THI -LGC -NCK -CKQ -VHS -VWQ -FKF -QNW -NWP -WPA -HNA -YDY -YVW -VWP -YLC -PVW -WIS -IVW -VWA -IGV -TTC -TYC -YCL -CLT -YVL -HGH -KCC -CCK -CKR -IMW -IYR -SNY -LRC -NYK -NIY -YRH -HTN -MQV -TFN -FQF -NCC -CCC -APN -CGK -SKY -RCD -GKG -HNS -RDW -DWR -WRK -TTY -YIW -WYR -QFW -FWT -QWN -WNP -VRH -RHQ -EVQ -QNY -YNF -NFP -QNC -SLW -WEL -FYV -YFV -VCM -PLW -QLT -MGN -GNH -MCG -ETR -LWS -WSV -SVW -VWH -WHY -QYW -YWT -VYI -FSM -NYY -IAW -CSH -HMG -DFE -WSI -IWQ -WQY -CIP -IPQ -HST -QWT -GVF -PDW -MAH -HVG -IWH -LWA -WAC -CIL -DTH -THH -YHL -QKY -KYH -YHK -YNW -WTK -VHA -VWY -WYQ -WND -IWA -APD -ENY -TFK -HDK -QFN -RRH -PNC -NCR -HFF -TIC -HDE -VWS -WSQ -YLF -ALW -WGG -TRQ -QYH -CVH -SMW -MWY -RNH -QTY -WQL -GCT -APC -CAE -FWF -WFQ -LCH -CHF -WSR -GGW -TIN -DQH -QHG -PFT -ISF -WDN -NWN -KEC -DKP -FYF -YFP -DSM -WEI -MSM -YII -MMN -PMP -RCC -CCP -CPT -SGW -GWT -NCP -CPG -GQH -IWN -WNC -NCY -CYS -YSR -HTF -HHA -RPW -PWH -WHN -HNQ -VQW -ECG -SMS -ANW -RAW -SFQ -QIH -HHR -REW -YEQ -CRP -FKM -GQM -WTR -ICL -MVW -ISW -IRY -LDM -RNI -QTH -THG -YVI -VPF -DAM -CWD -WDR -DRQ -MPF -PFG -CCI -AIC -CQP -GCW -WVI -QGW -NIG -CSV -AYY -STH -MGC -CFC -CLC -DYT -QVW -YIN -GQT -QWE -WES -QCH -CHP -VCR -WAY -EMF -MFI -TWC -WCV -FCF -SRC -RCH -HLT -TFF -IWF -WFY -NHN -PQT -QDN -TNH -FAW -RWQ -LWI -IAC -WNV -RHM -MEY -EYT -NVM -TWA -GWG -CQV -PSY -IYK -DTW -TWR -WRE -CSC -SCD -IWK -WKS -NYN -KNF -PNR -SQH -WFP -SWD -WDI -AWG -WVA -VMC -GWH -WHE -YCR -GMF -MFF -VTW -CDC -GYC -YCN -TMN -PCV -HCP -CCL -TQI -FCC -CLE -PRH -WST -MMD -YGH -SWA -PTW -TWD -INY -NYG -NCL -KWI -WIF -FGH -GKC -KCM -GWA -WAQ -FMY -MYY -YYQ -AKH -HKF -ECA -KHE -ICG -GMH -CTR -DHH -HNW -MIH -HPV -WMP -RVQ -DHC -GHD -AWE -WET -CHI -HIG -NTY -QNN -GAM -HLY -GWM -WMI -TCI -RYS -HNT -NQW -QWS -DPW -YSC -MPK -EWK -YAQ -AYW -MLY -CAR -KYM -YME -EYQ -MTN -NHY -ATM -FFQ -RPH -QMF -YGI -IFT -MMP -TDF -CHN -EWY -HSI -NFM -FMD -MDF -FKG -NKH -YMD -YDI -MCP -QYQ -TIW -VAH -HEF -FPI -YPF -HGN -SCR -NMG -MIN -HTD -MFW -WNH -MCI -WEN -SVH -CRV -KMD -FQM -QMI -HDS -CHG -WAV -PKW -PQW -QWP -TSY -QWR -TCR -SEY -SHY -HYQ -SFW -FWA -WAM -GFN -VMK -DYW -YWY -WYS -HWQ -WQT -VMH -MHF -PKY -VKW -WWS -HMM -MMT -MTY -KWD -TKW -KWS -HPM -MKP -KPH -HNH -QHA -MVY -YYT -EYY -WDF -GYH -WVY -YGF -GFH -HDP -NEW -WYE -YKY -YIE -EWN -GTG -YAS -CSY -GMN -RIW -IMD -TYE -DHF -HIK -PCC -CCE -CEW -GHY -DIM -TDK -KWN -QGF -GFM -ATY -TYG -YGW -MWR -NKF -WTG -PYL -YQN -QNT -RGI -GHI -HID -REM -YRC -QYS -DYG -CRY -GWD -GMC -MCA -CAF -EMC -MCK -DMM -MID -MFV -VGC -FGP -KAW -HLK -DWV -KYR -WTC -TCF -DSW -GTW -WVH -TNW -ICA -CAK -HIM -YCA -MLM -FRW -WGI -QFH -FHH -HHT -SCF -CFL -NHH -IDW -DWA -WAR -ARW -WHF -FWV -NTN -KKW -SAH -CGS -VWF -LWW -WWR -WRP -HRF -SHH -HGK -NYE -YEG -FSC -NAM -MKM -KMG -IGM -TWT -WTM -SWQ -WQD -PMM -FPP -FIH -HHD -RFA -NQM -CIG -IGQ -YSH -IIW -IPW -PWY -WYL -HKY -LWE -WEG -YDF -FGI -KGM -HSC -GTC -WPF -WWL -HQD -WGD -FKP -NYV -YMM -WGH -HQF -GGM -YDM -QGG -QDC -CDN -FDC -EQY -MQM -GYP -WLM -FEW -FAY -HLG -DGC -HLF -AHD -GFW -FWW -YTH -THY -HYK -TQF -FMH -DMF -YFF -VWG -WGV -TEM -RMP -FNF -RHY -HYC -YCE -MQI -FFG -YCK -CKH -YYN -PPH -CPS -GHH -KQN -FTC -GCK -CRG -QHH -QYC -MGM -DPM -MGI -EPY -QCM -CMQ -NIC -EYC -CRT -YPN -TRF -RDM -WPY -LYK -MNC -NCV -SCQ -CQA -GHN -HNN -AMF -LYN -NIW -ECF -FAC -ACA -RHV -EWM -WME -FEY -DDP -TMT -MTW -IPH -CNI -NIQ -NVW -VWK -DHE -RWN -FCT -PCW -DMW -YWF -FHG -YMG -DHW -HWK -VCK -SWK 
-RFF -CAG -MFT -VMY -MYN -PRM -RMC -CPM -THD -CKI -FWQ -WQV -MDY -RME -WGA -AYM -YYL -FFM -KNM -YCP -CPD -WDG -HNF -MGY -LEC -CLY -SWI -NLQ -GSN -PQC -QCV -PNW -IYY -NMI -EGH -HIP -WQG -MWS -HII -YNG -RMH -WKR -RHI -YNK -HQE -RWM -DHT -WEV -KCE -IWT -YRL -KCG -WSP -KGC -CKA -HDM -MPN -DKC -DNR -LCK -YRM -RFQ -FMM -WMS -QIW -HVC -NWD -MDG -FIW -DPC -HED -KWQ -WQQ -PVC -HLW -TYN -GHK -EWD -WDS -PHC -HCI -CIQ -IWD -WDV -AWC -IDC -DCA -CRR -WPM -QYD -HYR -HCW -CWS -WHI -WRD -KMP -VIW -AWK -WKH -HYP -FFY -QCI -CIY -FAH -NWG -FYE -TDW -DWK -CEP -HMV -DKW -YFY -VDW -HKS -MQH -FDH -CCM -MMM -DYN -WKQ -WCL -IHN -NNY -NYH -VNY -HTR -MNR -DMH -MHY -GTY -IFM -QRT -GCE -CEI -YEM -RMM -YTR -YAP -YMN -WGQ -WNM -GHM -WQI -NFD -WEY -FKY -HYD -HVW -AMM -RMY -QWV -MMQ -MSF -HFE -WER -HQM -VQH -YPQ -PQF -GHC -HMN -MNT -FFH -MMH -FIM -MIY -IYW -YWH -FMC -MCS -DWY -WYA -MWK -CMR -HYN -GWI -WIW -KWC -PWQ -RYC -THE -YQF -ERM -EWP -SWY -WYN -WKY -WEC -ECM -CME -RVW -VWC -WCK -RFY -NHM -KHC -GWC -HRW -RWI -WIK -FRC -HIH -RCW -CWP -CGI -FVC -VMN -KDW -AMH -MHQ -NQR -TCY -CYT -YTQ -HHV -AHY -QSM -LMN -FMT -MTH -GRN -NMQ -NGY -AWT -FCG -NMH -MHM -YCQ -NWC -CKT -VCE -HWH -NLH -VWW -PCR -RWY -WYF -YCM -QVM -QHT -HVR -RMN -QPW -FRH -HQK -YKF -MQN -KWE -TYV -HMR -ICH -KYT -TDM -CEY -CVC -PAC -NFK -KCF -YNC -QSW -WEW -WPW -YQH -NFH -MSY -YNP -DQP -HKH -MTK -KAH -VKC -YKW -GWW -WWP -MWG -VYC -YCG -HSW -WNE -CFI -CLM -CHK -RCQ -TCQ -PFA -NNC -QGC -MNY -NYM -KQM -QME -NCF -PDC -WAN -RPN -VCP -WIN -PPW -PWL -CRH -PWD -SYM -FGC -YIK -VNC -YTF -SNC -QHM -MEH -CQT -ITM -EYH -CQF -DYM -SMM -QMH -CYA -MAC -WVN -WAT -FWM -WMT -CCG -CYG -WAF -EPM -MVC -HWG -ELC -RCI -WQH -FWH -QWQ -AGW -NWY -WYC -CRW -CQS -LIW -CAQ -QMW -MWT -CER -ERC -VGW -IAH -NAQ -WIM -MKC -FQC -MWE -TQM -YHW -HWS -NYA -WMM -MMW -MWN -WNW -NWM -YEY -PCQ -HFW -FNY -NHR -NSC -TNG -HVM -HQW -EYW -IWE -HCE -PYH -YHD -YKQ -SWH -HAY -QMY -KIH -WFN -CSF -RCE -YCH -GRH -YNE -HQN -QPH -HYL -MHV -WIT -SCG -SPW -FHF -CIW -WAG -CTW -YAW -RHH -NFW -MNK -GEC -AHM -CYY -HEQ -MWV -IMR -FCD -HQC -CYF -MHC -PMC -HQY -WTH -QKC -HRC -HYF -CYL -HKC -WPS -WDC -FMQ -QHK -CFK -NEC -DNM -CQM -QMT -MDN -DCK -WDW -LHY -TKY -FPC -MDM -QWF -MDW -DWW -WWE -GLW -TWM -MSW -WEQ -WKN -PMQ -WAW -WMQ -DCY -CYR -CFH -HMS -IWW -WWI -PFW -WVC -ACY -MNS -CGC -GCM -TYY -YYS -MIM -MKW -HMI -FWE -MKH -MEW -SMY -MYH -HYI -CKN -NMM -RIM -SKH -YEW -CQR -RYH -HTM -WKT -KMN -FKH -TCK -WYI -HNP -NGC -MRN -FHW -EIW -KVH -WFE -YCY -AHW -TYW -YWR -WNA -EMG -CFF -HYT -FHQ -NKY -HHK -PCE -FCM -CMY -DHM -QQW -QWY -WYM -MRW -FPQ -MME -MYR -LWQ -GWY -WYD -HPW -YWD -CAH -EQW -QWK -WSH -NMC -PNE -FYH -QKM -HWE -WHD -RQW -SWW -WWA -MYS -KQW -WWT -CPQ -WIE -ACC -CCH -WEK -GMY -HFT -WTY -MMG -WTN -YYM -NTH -YCC -CCF -DYQ -WEM -WGT -NHF -CMS -WGS -MIW -YQM -IHM -QDH -TWQ -CAD -GNW -NWH -YYH -YYY -YFM -TPW -WED -MCR -YNM -WWD -MYV -YWM -SCM -CMM -NRC -RCT -CTN -YHM -QWC -WCT -TTW -TWW -WWY -WMG -YYC -WID -YVM -WIR -FYC -FWS -FYW -WTW -RCF -QQG -HMD -HEN -CKM -MKY -HCF -SQW -TYD -GIC -FQW -IFW -YQY -CCY -WAD -WSF -MYK -NDW -MIP -QWG -TCW -CWW -YLW -TQW -IHY -MQC -QCD -WTQ -MWW -VWM -WMK -GMW -MQW -NCQ -CQI -MRC -PWP -WTF -HVQ -HMC -DWQ -ILW -PWS -YHH -CPC -YHE -HAK -RNM -CEH -CMF -QHN -QCE -MDQ -DHQ -YTW -WLC -MCF -WFC -CFQ -YCW -CWE -MPW -WYK -MGF -FTM -CWK -HWF -PCT -MHN -HKW -WYV -DCW -CYQ -CAW -HWC -HWR -RSW -PYC -FKW -WFW -FMF -YMY -DCM -YDH -LWY -WKD -WRF -DKQ -QEC -WTE -CEM -GCY -MNH -CEQ -HYY -PYQ -QIC -GPW -PWW -MCD -WHR -NYW -QWM -CQQ -YHC -FCH -CHQ -QCF -NFC -PCN -PWG -CMI -CTM -QCP -WWN -TMC -CYW -EHC -CCR -FTQ -CNF 
-FDW -DWI -PWM -YWG -KMH -PWE -KWG -WGM -WHM -WPQ -CHY -VWR -WRH -CYC -AWY -DHN -CIC -CPW -ICP -QWD -CQW -CTY -WRC -WYW -MWL -CGH -HPC -PCY -EWH -QNM -PCM -QMM -WMY -WPN -WCE -HQH -CNN -CMW -PCK -QWH -NTC -HIC -CMC -MCQ -KHW -KCQ -MHK -CWG -HMT -WFM -IWC -CML -HWT -MHR -DQW -IQW -WVW -WPC -WHG -WYH -IEW -VHY -YQW -WDH -CHD -QPY -WKC -YDC -NHW -WDM -QPC -CKW -KWY -NCM -CQN -MYF -YMW -MMC -KMW -MWI -MHD -ECI -CMD -WCI -CGM -GCQ -MCE -WWF -WTT -HDC -FCQ -DMN -PWI -RMQ -WGW -WYP -MYM -HCC -CDQ -MNW -CMP -RCK -MWD -FPW -QTW -WNY -MCT -MHH -IWM -CFY -HYW -PHW -HWW -CFN -MWF -HCM -MWH -GYW -HAW -DWH -YWV -NMW -QEW -CNC -WDK -NKC -GCC -MPC -MCN -CCA -KWM -MCM -HWL -WSY -CKC -WMF -CWY -HCQ -WCA -HMK -DHD -YHY -DNW -WCD -WPI -WFD -WHW -WHC -HCY -WHQ -IMC -KPC -YMC -CRC -MCY -ECY -MCH -HWI -DCQ -PMW -LWC -CRM -DMC -MNF -HWY -YWW -YWC -WYY -EWC -FWC -FWY -WMN -WWV -EWW -WCM -CAM -WKM -WHH -YMF -WCQ -WIQ -MFN -ANC -ECW -WCG -CIM -WQC -CMH -MYC -CTH -HHW -QWW -WIC -CPY -MDC -NYC -CMN -WHK -MMY -DEW -QHW -WQW -CEC -TWH -HFC -WKW -HWM -MQY -HDW -WYG -CWM -CYH -HYM -QMC -QCW -NCW -YQC -FMW -WMC -WWW -HMW -RMW -CHW -WCW -HTW -CWC -WCY -YWQ -WMW -CWT -CWH -MWM -WWC -WCC -WCH -WWM -TAX -AXD -XDR -IEX -EXV -QAX -AXX -XXE -XES -MXN -XNF -NRX -RXX -XXX -XXR -XRI -SAX -AXG -XGG -PRX -RXR -XRX -RXE -XEF -QEX -EXQ -XQR -REX -EXR -RXQ -XQQ -DRX -RXP -XPG -QMX -MXT -XTX -TXR -XRM -APX -PXX -XXG -XGI -NLX -LXX -XXM -XMA -LNX -NXE -XEA -GTX -TXN -XND -LIX -IXI -XIM -MVX -VXX -XXK -XKT -GLX -LXP -XPP -QGX -GXD -XDL -XAP -QNX -NXM -XMN -VAX -XGV -IKX -KXY -KEX -EXL -XLY -GQX -QXE -XEP -PLX -XKC -PVX -XKE -RXI -XIR -AXL -XLN -LLX -LXD -XDA -AXE -XEL -GGX -GXG -KAX -XXA -XAG -XWS -SPX -PXC -XCD -GWX -WXH -XHF -MPX -ESX -SXN -XNK -DLX -LXN -XNS -QXG -XGD -ITX -XRG -NEX -EXA -XAL -LDX -DXI -XII -TPX -PXM -XMR -NXG -XGY -ASX -SXV -XVE -TKX -KXA -KRX -XXT -XTL -IDX -DXX -XXL -XLV -AKX -KXX -QHX -HXV -XVN -NSX -SXX -XKX -XDP -DAX -AXK -XKQ -PIX -IXX -XXF -VLX -XDI -DIX -IXL -XLK -LKX -KXV -XVA -DNX -NXD -ILX -LXK -XKV -VYX -YXE -XEI -RXS -XSH -KGX -XGF -AVX -VXY -XYG -HVX -XXI -XID -TVX -XXS -XSA -ENX -NXX -XMD -IIX -XMQ -AEX -EXX -XME -PGX -GXP -XPR -SKX -KXF -XFT -HRX -XSW -PQX -XGR -QQX -VTX -XRP -PSX -SXP -XPL -VGX -GXY -RSX -SXS -XSL -VSX -XST -AXV -XVL -AGX -GXX -XTK -KLX -LXR -XRV -AHX -HXC -XCS -LVX -VXN -XNR -NGX -GXL -TSX -SXQ -XQN -KXL -XLL -VIX -IXG -XGA -GFX -FXG -XGL -PTX -TXT -XTS -EMX -MXQ -SXY -XYA -IQX -QXY -XYR -TXK -IGX -XPS -PXT -XTG -NXQ -VKX -KXS -XSN -GVX -VXE -GRX -XRE -YKX -KXE -XEE -EEX -EXT -XTI -EHX -HXN -XNL -NDX -DXD -IAX -KSX -SXL -RRX -XRK -DDX -DXE -RXG -VXL -XLS -DTX -TXG -VXF -XFA -XIG -VXT -XTA -ISX -SXR -XRY -VQX -QXP -XPC -LGX -GXS -HGX -XGH -XXD -XDD -KKX -XXV -PKX -XLT -XSP -XLD -RAX -AXS -XSI -IYX -YXX -XXP -XPI -MSX -SXT -GEX -XHP -LFX -FXX -VXI -XIW -QTX -TXX -XXQ -XQA -FLX -DXN -XNC -MXS -XSR -YLX -EQX -QXS -TMX -MXC -XCY -NXA -XAV -EXE -XEQ -HPX -PXP -LMX -MXX -KTX -XKK -XXH -XHS -MKX -XIH -WRX -XKS -EXY -XYQ -QKX From ab470b929b6cad0606db4da55c022748ded48732 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 15 Apr 2025 15:00:06 +0200 Subject: [PATCH 4/4] remove protein readers --- chebai/preprocessing/reader.py | 397 --------------------------------- 1 file changed, 397 deletions(-) diff --git a/chebai/preprocessing/reader.py b/chebai/preprocessing/reader.py index 7e943eb5..345b2567 100644 --- a/chebai/preprocessing/reader.py +++ b/chebai/preprocessing/reader.py @@ -1,18 +1,8 @@ import os -from pathlib import Path from typing import Any, Dict, List, Optional, Tuple -from 
urllib.error import HTTPError

 import deepsmiles
 import selfies as sf
-import torch
-from esm import Alphabet
-from esm.model.esm2 import ESM2
-from esm.pretrained import (
-    _has_regression_weights,
-    load_model_and_alphabet_core,
-    load_model_and_alphabet_local,
-)
 from pysmiles.read_smiles import _tokenize
 from transformers import RobertaTokenizerFast
 
@@ -340,390 +330,3 @@ def name(cls) -> str:
     def _read_data(self, raw_data: str) -> List[int]:
         """Convert characters in raw data to their ordinal values."""
         return [ord(s) for s in raw_data]
-
-
-class ProteinDataReader(DataReader):
-    """
-    Data reader for protein sequences using amino acid tokens. This class processes raw protein sequences into a format
-    suitable for model input by tokenizing them and assigning unique indices to each token.
-
-    Note:
-        For amino acid sequence notation, see: https://en.wikipedia.org/wiki/Protein_primary_structure
-
-    Args:
-        collator_kwargs (Optional[Dict[str, Any]]): Optional dictionary of keyword arguments for configuring the collator.
-        token_path (Optional[str]): Path to the token file. If not provided, it will be created automatically.
-        kwargs: Additional keyword arguments.
-    """
-
-    COLLATOR = RaggedCollator
-
-    # The 20 natural amino acids plus "X"
-    AA_LETTER = [
-        "A",
-        "R",
-        "N",
-        "D",
-        "C",
-        "Q",
-        "E",
-        "G",
-        "H",
-        "I",
-        "L",
-        "K",
-        "M",
-        "F",
-        "P",
-        "S",
-        "T",
-        "W",
-        "Y",
-        "V",
-        # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L3-L5
-        "X",  # Considered valid in the latest (2024) paper; see Reference 3 in go_uniprot.py
-    ]
-
-    def name(self) -> str:
-        """
-        Returns the name of the data reader. This method identifies the specific type of data reader.
-
-        Returns:
-            str: The name of the data reader: "protein_token_{n}_gram" if an n-gram size is set,
-                otherwise "protein_token".
-        """
-        if self.n_gram is not None:
-            return f"protein_token_{self.n_gram}_gram"
-
-        return "protein_token"
-
-    def __init__(self, *args, n_gram: Optional[int] = None, **kwargs):
-        """
-        Initializes the ProteinDataReader, loading existing tokens from the specified token file.
-
-        Args:
-            *args: Additional positional arguments passed to the base class.
-            n_gram (Optional[int]): Size of the n-gram tokens; must be at least 2 if provided.
-                If None, each single amino acid letter is treated as a token.
-            **kwargs: Additional keyword arguments passed to the base class.
-        """
-        if n_gram is not None:
-            assert (
-                int(n_gram) >= 2
-            ), "Ngrams must be greater than or equal to 2 if provided."
-            self.n_gram = int(n_gram)
-        else:
-            self.n_gram = None
-
-        super().__init__(*args, **kwargs)
-
-        # Load the existing tokens from the token file into a cache
-        with open(self.token_path, "r") as pk:
-            self.cache = [x.strip() for x in pk]
-
-    def _get_token_index(self, token: str) -> int:
-        """
-        Returns a unique index for each token (amino acid). If the token is not already in the cache, it is added.
-
-        Args:
-            token (str): The amino acid token to retrieve or add.
-
-        Returns:
-            int: The index of the token, offset by the predefined EMBEDDING_OFFSET.
-        """
-        error_str = (
-            f"Please ensure that the input only contains valid amino acids. "
-            f"The {len(self.AA_LETTER)} valid amino acid letters are: {self.AA_LETTER}. "
-            f"Refer to the amino acid sequence details here: "
-            f"https://en.wikipedia.org/wiki/Protein_primary_structure"
-        )
-
-        if self.n_gram is None:
-            # Single-letter amino acid token check
-            if str(token) not in self.AA_LETTER:
-                raise KeyError(f"Invalid token '{token}' encountered. " + error_str)
-        else:
-            # n-gram token validation, ensure that each component of the n-gram is valid
-            for aa in token:
-                if aa not in self.AA_LETTER:
-                    raise KeyError(
-                        f"Invalid token '{token}' encountered as part of n-gram {self.n_gram}. "
-                        + error_str
-                    )
-
-        if str(token) not in self.cache:
-            self.cache.append(str(token))
-        return self.cache.index(str(token)) + EMBEDDING_OFFSET
-
-    def _read_data(self, raw_data: str) -> List[int]:
-        """
-        Reads and tokenizes raw protein sequence data into a list of token indices.
-
-        Args:
-            raw_data (str): The raw protein sequence to be tokenized (e.g., "MKTFF...").
-
-        Returns:
-            List[int]: A list of integers representing the indices of the amino acid tokens.
-        """
-        if self.n_gram is not None:
-            # Tokenize the sequence into n-grams
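-            # For example, with n_gram=3 a (hypothetical) sequence "MKTFF" yields the
-            # overlapping tokens ["MKT", "KTF", "TFF"].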
" + error_str) - else: - # n-gram token validation, ensure that each component of the n-gram is valid - for aa in token: - if aa not in self.AA_LETTER: - raise KeyError( - f"Invalid token '{token}' encountered as part of n-gram {self.n_gram}. " - + error_str - ) - - if str(token) not in self.cache: - self.cache.append(str(token)) - return self.cache.index(str(token)) + EMBEDDING_OFFSET - - def _read_data(self, raw_data: str) -> List[int]: - """ - Reads and tokenizes raw protein sequence data into a list of token indices. - - Args: - raw_data (str): The raw protein sequence to be tokenized (e.g., "MKTFF..."). - - Returns: - List[int]: A list of integers representing the indices of the amino acid tokens. - """ - if self.n_gram is not None: - # Tokenize the sequence into n-grams - tokens = [ - raw_data[i : i + self.n_gram] - for i in range(len(raw_data) - self.n_gram + 1) - ] - return [self._get_token_index(gram) for gram in tokens] - - # If n_gram is None, tokenize the sequence at the amino acid level (single-letter representation) - return [self._get_token_index(aa) for aa in raw_data] - - def on_finish(self) -> None: - """ - Saves the current cache of tokens to the token file. This method is called after all data processing is complete. - """ - with open(self.token_path, "w") as pk: - print(f"Saving {len(self.cache)} tokens to {self.token_path}...") - print(f"First 10 tokens: {self.cache[:10]}") - pk.writelines([f"{c}\n" for c in self.cache]) - - -class ESM2EmbeddingReader(DataReader): - """ - A data reader to process protein sequences using the ESM2 model for embeddings. - - References: - https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/extract_esm.py - - Note: - For layer availability by model, Please check below link: - https://github.com/facebookresearch/esm?tab=readme-ov-file#pre-trained-models- - - To test this reader, try lighter models: - esm2_t6_8M_UR50D: 6 layers (valid layers: 1–6), (~28 Mb) - A tiny 8M parameter model. - esm2_t12_35M_UR50D: 12 layers (valid layers: 1–12), (~128 Mb) - A slightly larger, 35M parameter model. - These smaller models are good for testing and debugging purposes. - - """ - - # https://github.com/facebookresearch/esm/blob/main/esm/pretrained.py#L53 - _MODELS_URL = "https://dl.fbaipublicfiles.com/fair-esm/models/{}.pt" - _REGRESSION_URL = ( - "https://dl.fbaipublicfiles.com/fair-esm/regression/{}-contact-regression.pt" - ) - - def __init__( - self, - save_model_dir: str = os.path.join("data", "esm2_reader"), - model_name: str = "esm2_t36_3B_UR50D", - device: Optional[torch.device] = None, - truncation_length: int = 1022, - toks_per_batch: int = 4096, - return_contacts: bool = False, - repr_layer: int = 36, - *args, - **kwargs, - ): - """ - Initialize the ESM2EmbeddingReader class. - - Args: - save_model_dir (str): Directory to save/load the pretrained ESM model. - model_name (str): Name of the pretrained model. Defaults to "esm2_t36_3B_UR50D". - device (torch.device or str, optional): Device for computation (e.g., 'cpu', 'cuda'). - truncation_length (int): Maximum sequence length for truncation. Defaults to 1022. - toks_per_batch (int): Tokens per batch for data processing. Defaults to 4096. - return_contacts (bool): Whether to return contact maps. Defaults to False. - repr_layers (int): Layer number to extract representations from. Defaults to 36. 
- """ - self.save_model_dir = save_model_dir - if not os.path.exists(self.save_model_dir): - os.makedirs((os.path.dirname(self.save_model_dir)), exist_ok=True) - self.model_name = model_name - self.device = device - self.truncation_length = truncation_length - self.toks_per_batch = toks_per_batch - self.return_contacts = return_contacts - self.repr_layer = repr_layer - - self._model: Optional[ESM2] = None - self._alphabet: Optional[Alphabet] = None - - self._model, self._alphabet = self.load_model_and_alphabet() - self._model.eval() - - if self.device: - self._model = self._model.to(device) - - super().__init__(*args, **kwargs) - - def load_model_and_alphabet(self) -> Tuple[ESM2, Alphabet]: - """ - Load the ESM2 model and its alphabet. - - References: - https://github.com/facebookresearch/esm/blob/main/esm/pretrained.py#L24-L28 - - Returns: - Tuple[ESM2, Alphabet]: Loaded model and alphabet. - """ - model_location = os.path.join(self.save_model_dir, f"{self.model_name}.pt") - if os.path.exists(model_location): - return load_model_and_alphabet_local(model_location) - else: - return self.load_model_and_alphabet_hub() - - def load_model_and_alphabet_hub(self) -> Tuple[ESM2, Alphabet]: - """ - Load the model and alphabet from the hub URL. - - References: - https://github.com/facebookresearch/esm/blob/main/esm/pretrained.py#L62-L64 - - Returns: - Tuple[ESM2, Alphabet]: Loaded model and alphabet. - """ - model_url = self._MODELS_URL.format(self.model_name) - model_data = self.load_hub_workaround(model_url) - regression_data = None - if _has_regression_weights(self.model_name): - regression_url = self._REGRESSION_URL.format(self.model_name) - regression_data = self.load_hub_workaround(regression_url) - return load_model_and_alphabet_core( - self.model_name, model_data, regression_data - ) - - def load_hub_workaround(self, url) -> torch.Tensor: - """ - Workaround to load models from the PyTorch Hub. - - References: - https://github.com/facebookresearch/esm/blob/main/esm/pretrained.py#L31-L43 - - Returns: - torch.Tensor: Loaded model state dictionary. - """ - try: - data = torch.hub.load_state_dict_from_url( - url, self.save_model_dir, progress=True, map_location=self.device - ) - - except RuntimeError: - # Handle PyTorch version issues - fn = Path(url).name - data = torch.load( - f"{torch.hub.get_dir()}/checkpoints/{fn}", - map_location="cpu", - ) - except HTTPError as e: - raise Exception( - f"Could not load {url}. Did you specify the correct model name?" - ) - return data - - @staticmethod - def name() -> str: - """ - Returns the name of the data reader. This method identifies the specific type of data reader. - - Returns: - str: The name of the data reader, which is "protein_token". - """ - return "esm2_embedding" - - @property - def token_path(self) -> None: - """ - Not used as no token file is not created for this reader. - - Returns: - str: Empty string since this method is not implemented. - """ - return - - def _read_data(self, raw_data: str) -> List[int]: - """ - Reads protein sequence data and generates embeddings. - - Args: - raw_data (str): The protein sequence. - - Returns: - List[int]: Embeddings generated for the sequence. - """ - alp_tokens_idx = self._sequence_to_alphabet_tokens_idx(raw_data) - return self._alphabet_tokens_to_esm_embedding(alp_tokens_idx).tolist() - - def _sequence_to_alphabet_tokens_idx(self, sequence: str) -> torch.Tensor: - """ - Converts a protein sequence into ESM alphabet token indices. - - Args: - sequence (str): Protein sequence. 
-
-    def _sequence_to_alphabet_tokens_idx(self, sequence: str) -> torch.Tensor:
-        """
-        Converts a protein sequence into ESM alphabet token indices.
-
-        Args:
-            sequence (str): Protein sequence.
-
-        References:
-            https://github.com/facebookresearch/esm/blob/2b369911bb5b4b0dda914521b9475cad1656b2ac/esm/data.py#L249-L250
-            https://github.com/facebookresearch/esm/blob/2b369911bb5b4b0dda914521b9475cad1656b2ac/esm/data.py#L262-L297
-
-        Returns:
-            torch.Tensor: Tokenized sequence with special tokens (BOS/EOS) included.
-        """
-        seq_encoded = self._alphabet.encode(sequence)
-        tokens = []
-
-        # Add BOS token if configured
-        if self._alphabet.prepend_bos:
-            tokens.append(self._alphabet.cls_idx)
-
-        # Add the main sequence
-        tokens.extend(seq_encoded)
-
-        # Add EOS token if configured
-        if self._alphabet.append_eos:
-            tokens.append(self._alphabet.eos_idx)
-
-        # Convert to a PyTorch tensor and return
-        return torch.tensor([tokens], dtype=torch.int64)
-
-    def _alphabet_tokens_to_esm_embedding(self, tokens: torch.Tensor) -> torch.Tensor:
-        """
-        Converts alphabet tokens into an ESM embedding.
-
-        Args:
-            tokens (torch.Tensor): Tokenized protein sequences.
-
-        References:
-            https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/extract_esm.py#L82-L107
-
-        Returns:
-            torch.Tensor: Protein embedding from the specified representation layer.
-        """
-        if self.device:
-            tokens = tokens.to(self.device, non_blocking=True)
-
-        with torch.no_grad():
-            out = self._model(
-                tokens,
-                repr_layers=[self.repr_layer],
-                return_contacts=self.return_contacts,
-            )
-
-        # Mean-pool the per-residue representations of the requested layer,
-        # skipping the BOS token at position 0.
-        representation = out["representations"][self.repr_layer]
-        truncate_len = min(self.truncation_length, tokens.size(1))
-        return representation[0, 1 : truncate_len + 1].mean(0).clone()
-
-    def on_finish(self) -> None:
-        """
-        Not used here, as no token file exists for this reader.
-
-        Returns:
-            None
-        """
-        pass
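
The truncation-and-pooling step above collapses a (1, seq_len, dim) representation into a single vector. A small sketch on dummy data, with illustrative shapes, shows the slice semantics:

    import torch

    truncation_length = 1022
    reprs = torch.randn(1, 12, 320)  # (batch, BOS + 10 residues + EOS, dim)
    truncate_len = min(truncation_length, reprs.size(1))
    # Skip position 0 (the BOS token) and average the remaining positions; for
    # short sequences this window also includes the EOS position, mirroring the code above.
    embedding = reprs[0, 1 : truncate_len + 1].mean(0)
    print(embedding.shape)  # torch.Size([320])
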