diff --git a/examples/clone.py b/examples/clone.py new file mode 100644 index 0000000..5349514 --- /dev/null +++ b/examples/clone.py @@ -0,0 +1,26 @@ +import sys + +from mavetools.client.client import Client + +#This example shows how to download the whole MaveDB and create a local clone + +#Provide the URL of MaveDB +base_url = 'https://www.mavedb.org/#/api/' + +# Generate a new auth_token in your profile and post it here + +auth_token = "" +# if the base url exists, the client object is instantiated with that value +# otherwise the client object is instantiated with default value which points to localhost +client = ( + Client(base_url, auth_token=auth_token) + if base_url + else Client(auth_token=auth_token) +) + +#Provide a path where the local clone should be stored. +local_instance_path = f'../../localMaveDB_Feb_2023' + +#Download MaveDB +experiment_dict = client.clone(local_instance_path) + diff --git a/examples/create_ml_dataset.py b/examples/create_ml_dataset.py new file mode 100644 index 0000000..fbcfb85 --- /dev/null +++ b/examples/create_ml_dataset.py @@ -0,0 +1,38 @@ +from mavetools.client.client import LocalClient +from mavetools.models.ml_tools import MlDataset + +#This example shows how to create a scaled dataset of all SAV effect values contained in the MaveDB + +#We use a locally cloned version here, if you haven't cloned it yet, look into clone.py + +#Provide the path to the local clone +local_instance_path = f'../../localMaveDB' + +#Provide paths to where the dataset will be written. 
+outfile = 'mave_db_scaled_savs.fasta' +statfile = 'mave_db_scaled_savs_statistics.tsv' +seq_file = 'mave_db_scaled_savs_only_sequences.fasta' + +#Create a local client object +client = LocalClient(local_instance_path) + +#Search the database without any filters to retrieve the whole database +experiment_dict = client.search_database() + +#Create the ML dataset object +ml_dataset = MlDataset(experiment_dict) + +#Retrieve the scoretables +ml_dataset.retrieve_data(client) + +#Aggregate scoresets from same experiments +ml_dataset.aggregate_scoresetdata() + +ml_dataset.write_dataset_statistics(statfile) + +#Scale all SAV effect scores +ml_dataset.scale_all_savs() + +#Write the output +ml_dataset.write_scaled_sav_fasta(outfile, sequences_only_file = seq_file) + diff --git a/examples/make_gold_standard.py b/examples/make_gold_standard.py new file mode 100644 index 0000000..ee6c491 --- /dev/null +++ b/examples/make_gold_standard.py @@ -0,0 +1,47 @@ +import sys + +from mavetools.client.client import LocalClient +from mavetools.models.ml_tools import MlDataset + +if len(sys.argv) < 3: + print('Usage: python make_gold_standard.py [path to local instance of MaveDB (https://zenodo.org/records/11201737)] [path to the ProteinGym Substitutions reference file (https://marks.hms.harvard.edu/proteingym/DMS_ProteinGym_substitutions.zip)]') + sys.exit(1) +#This example shows how to create a scaled dataset of all SAV effect values contained in the MaveDB and ProteinGym + +#Provide the path to the local clone (download via https://zenodo.org/records/11201737) +local_instance_path = sys.argv[1] + +#Provide the path to the ProteinGym Substitutions reference file (download via https://marks.hms.harvard.edu/proteingym/DMS_ProteinGym_substitutions.zip) +path_to_reference_file = sys.argv[2] + +#Provide paths to where the dataset will be written.
+outfile = 'mave_db_gold_standard.fasta' +seq_file = 'mave_db_gold_standard_only_sequences.fasta' + +#Create a local client object +client = LocalClient(local_instance_path) + +#Search the database without any filters to retrieve the whole database +experiment_dict = client.search_database() + +#Create the ML dataset object +ml_dataset = MlDataset(experiment_dict) + +#Retrieve the scoretables +ml_dataset.retrieve_data(client, verbosity = 1) + +#Add the ProteinGym database +ml_dataset.load_protein_gym(path_to_reference_file) + +#Aggregate scoresets from same experiments +filtered_list = ml_dataset.aggregate_scoresetdata(min_prot_size = 50, min_len_coverage = 0.4, std_filter = 0.25, verbosity = 1) + +#ml_dataset.write_filtered_entries(filtered_list, 'filtered_entries_mave_db_gold_standard.tsv') + +#Scale all SAV effect scores +ml_dataset.scale_all_savs(verbosity = 1) + +#ml_dataset.plot_sav_score_distribution('mave_db_gold_standard_score_distribution.png') + +#Write the output +ml_dataset.write_scaled_sav_fasta(outfile, sequences_only_file = seq_file) diff --git a/examples/scale_dataset.py b/examples/scale_dataset.py new file mode 100644 index 0000000..ca2e607 --- /dev/null +++ b/examples/scale_dataset.py @@ -0,0 +1,48 @@ +from mavetools.client.client import LocalClient +from mavetools.models.ml_tools import MlDataset + +#This example shows how to create a scaled for one particular scoreset + +#We use a locally cloned version here, if you haven't cloned it yet, look into clone.py + +#Give the path to local clone +local_instance_path = f'../../localMaveDB' + +#Create the local client object +client = LocalClient(local_instance_path) + +#Provide a MaveDB urn identifier +particular_experiment_id = 'urn:mavedb:00000005-a' + +#Create ML dataset object and aggregate all scoresets +experiment_dict = client.get_experiment_dict([particular_experiment_id]) + +print('=== Experiment object created ===') + +ml_dataset = MlDataset(experiment_dict) + +print('=== ML object created 
===') + +ml_dataset.retrieve_data(client, verbosity = 1) + +print('=== Data retrieval done ===') + +ml_dataset.aggregate_scoresetdata(verbosity = 1) + +print('=== Data aggregation done ===') + +#Uncomment to plot the SAV score histogram +ml_dataset.experiments[particular_experiment_id].experiment_scoresetdata.plot_sav_score_distribution(f'score_distribution_{particular_experiment_id}.png') +ml_dataset.experiments[particular_experiment_id].experiment_scoresetdata.write_nonsense_tsv(f'Nonsense_scores_{particular_experiment_id}.tsv') + +#Scale all SAV effect scores +ml_dataset.scale_all_savs(verbosity = 1) + +print('=== Data scaling finished ===') + +#Uncomment to plot the scaled SAV score histogram +ml_dataset.experiments[particular_experiment_id].experiment_scoresetdata.plot_sav_score_distribution(f'scaled_score_distribution_{particular_experiment_id}.png', specific_scores = ml_dataset.experiments[particular_experiment_id].experiment_scoresetdata.scaled_sav_scores) + +#Write the scaled the dataset to the specialized fasta file +outfile = f'{particular_experiment_id}_scaled_savs.fasta' +ml_dataset.write_scaled_sav_fasta(outfile) diff --git a/mavetools/client/client.py b/mavetools/client/client.py index 477dc81..bf17029 100644 --- a/mavetools/client/client.py +++ b/mavetools/client/client.py @@ -2,10 +2,412 @@ import logging import requests import sys +import os +from mavetools.models.scoreset import ScoreSet +from mavetools.models.ml_tools import MlExperiment -class Client: - def __init__(self, base_url="http://127.0.0.1:8000/api/", auth_token=""): + +FUNCTION_TYPES = set(['Activity', 'Binding', 'Expression', 'OrganismalFitness', 'Stability']) + +def extract_function_type(scoreset): + function_type = None + sd = scoreset['shortDescription'] + methodText = scoreset['methodText'] + abstract = scoreset['abstractText'] + if abstract is None or abstract == '': + try: + abstract = scoreset['primaryPublicationIdentifiers'][0]['abstract'] + if abstract is None: + abstract = 
'' + except IndexError: + abstract = '' + second_abstract = '' + else: + try: + second_abstract = scoreset['primaryPublicationIdentifiers'][0]['abstract'] + if second_abstract is None: + second_abstract = '' + except IndexError: + second_abstract = '' + + prio_binding_keywords = ['transcription ability', 'protein-protein interaction'] + prio_fitness_keywords = [] + prio_act_keywords = ['readout of activity', 'fluorescence of Aequorea victoria GFP', 'loss-of-function', 'Functional scores', 'functional scores'] + prio_exp_keywords = ['variant abundance', 'protein abundance'] + prio_stabi_keywords = [] + + binding_keywords = ['binding', 'Binding', 'Y2H assay'] + fitness_keywords = ['fitness', 'Fitness', 'pathogenicity', 'saturation prime editing', 'Growth', 'growth', 'toxicity assay', 'TileSeq', 'viral replication', 'complementation', 'based on survival'] + act_keywords = ['activity', 'Activity'] + exp_keywords = ['expression', 'Expression', ] + stabi_keywords = ['stability', 'Stability', 'folding free energy', 'dG value', 'Protein folding'] + + keyword_tuples = [ + ('OrganismalFitness', fitness_keywords, prio_fitness_keywords), + ('Binding', binding_keywords, prio_binding_keywords), + ('Activity', act_keywords, prio_act_keywords), + ('Expression', exp_keywords, prio_exp_keywords), + ('Stability', stabi_keywords, prio_stabi_keywords) + ] + + for ft, keywords, prio_keywords in keyword_tuples: + for keyword in prio_keywords: + if sd.count(keyword) > 0: + function_type = ft + elif methodText.count(keyword) > 0: + function_type = ft + if function_type is not None: + break + if function_type is not None: + break + + if function_type is None: + for ft, keywords, prio_keywords in keyword_tuples: + for keyword in prio_keywords: + if abstract.count(keyword) > 0: + function_type = ft + break + elif second_abstract.count(keyword) > 0: + function_type = ft + break + if function_type is not None: + break + + if function_type is None: + for ft, keywords, prio_keywords in 
keyword_tuples: + for keyword in keywords: + if sd.count(keyword) > 0: + function_type = ft + elif methodText.count(keyword) > 0: + function_type = ft + if function_type is not None: + break + if function_type is not None: + break + + if function_type is None: + for ft, keywords, prio_keywords in keyword_tuples: + for keyword in keywords: + if abstract.count(keyword) > 0: + function_type = ft + break + elif second_abstract.count(keyword) > 0: + function_type = ft + break + if function_type is not None: + break + + + if function_type is not None: + return function_type + else: + return 'Activity' + +class ClientTemplate: + """ + Parent class for client classes to inheir. + """ + + def parse_json_scoreset_list( + self, + scoreset_list, + keywords=None, + organisms=None, + retrieve_json_only=False, + experiment_types=None, + verbose=False, + ): + """ + Parses a list of scoreset metadatas in json format. + Creates classes and datastructures that are required to use ML tools features. + + Parameters + ---------- + + scoreset_list + A list of scoreset metadata in json format. + + keywords + List of keywords. If not None, filters all scoresets that keywords do not contain any of the given keywords. + + organisms + List of organisms. If not None, filters all scoresets that organism is not any of the given organisms. + + retrieve_json_only + When True, the function does not create the ML datastructures, but applies all given filters and creatures a ordered experiment-scoreset datastructure of the json objects. + + experiment_types + List of experiment types. If not None, filters all scoresets that experiment type is not any of the given experiment types. + + Returns + ------- + + experiment_dict + A dictionary mapping experiment urns to their corresponding MLExperiment objects. 
+ """ + + if verbose: + print(f"Call of parse_json_scoreset_list: {experiment_types=}") + + if keywords is not None: + keywords = set(keywords) + + if organisms is not None: + organisms = set(organisms) + + if experiment_types is not None: + experiment_types = set(experiment_types) + + experiment_dict = {} + filter_0 = 0 + filter_1 = 0 + filter_2 = 0 + filter_3 = 0 + filter_4 = 0 + for scoreset in scoreset_list: + if keywords is not None: + keyword_match = False + for keyword in scoreset["keywords"]: + if keyword["text"] in keywords: + keyword_match = True + if not keyword_match: + filter_0 += 1 + continue + + if organisms is not None: + if scoreset["target"]["reference_maps"][0]["genome"]["organism_name"] not in organisms: + filter_1 += 1 + continue + + if len(scoreset["targetGenes"]) == 0: + filter_2 += 1 + continue + + if experiment_types is not None: + if scoreset["targetGenes"][0]["category"] not in experiment_types: + # print(scoreset['targetGenes'][0]['category']) + filter_3 += 1 + continue + + urn = scoreset["urn"] + + if retrieve_json_only: + experiment_dict[urn] = scoreset + filter_4 += 1 + continue + + ft = extract_function_type(scoreset) + + scoreset_obj = ScoreSet.deserialize(scoreset) + + experiment_urn = scoreset_obj.urn + + if experiment_urn not in experiment_dict: + experiment_dict[experiment_urn] = MlExperiment( + experiment_urn, {}, scoreset_obj, urn=experiment_urn, function_type = ft + ) + + experiment_dict[experiment_urn].scoreset_dict[urn] = scoreset_obj + + if verbose: + print( + f"Filtered: {filter_0=} {filter_1=} {filter_2=} {filter_3=} {filter_4=}" + ) + + return experiment_dict + + +class LocalClient(ClientTemplate): + """ + A client class that imitates the original client class to use a local clone of the MaveDB. + """ + + def __init__(self, local_instance_path): + """ + Initializes the client instance. + + Parameters + ---------- + + local_instance_path + path the locally stored MaveDB. 
+ """ + + self.local_instance_path = local_instance_path + self.meta_data_folder = f"{local_instance_path}/main.json" + self.main_meta_data = self.load_meta_data(self.meta_data_folder) + self.scoreset_data_folder = f"{local_instance_path}/csv/" + + def get_meta_file_path(self, urn): + """ + Getter for filepath of the stored meta data file in the locally cloned MaveDB. + + Parameters + ---------- + + urn + MaveDB urn identifier of the scoreset. + + Returns + ------- + + path + Path to the metadata json-formatted file. + """ + return f"{self.meta_data_folder}/{urn}.json" + + def load_meta_data(self, filepath): + """ + Wrapper function for loading a json-formatted metadata file. + + Parameters + ---------- + + filepath + Path to a json-formatted metadata file. + + Returns + ------- + + meta_data + json object of the metadata. + """ + + f = open(filepath, "r") + meta_data = json.load(f) + f.close() + return meta_data + + def get_meta_data(self, urn): + """ + Getter for metadata. + + Parameters + ---------- + + urn + MaveDB urn identifier of a scoreset. + + Returns + ------- + + meta_data + json object of the metadata. + """ + return self.load_meta_data(self.get_meta_file_path(urn)) + + def search_database( + self, + keywords=None, + organisms=None, + experiment_types=["protein_coding"], + verbose=False, + ): + """ + Searches all scoresets in MaveDB and applies some filters. + Parameters + ---------- + + keywords + List of keywords. If not None, filters all scoresets that keywords do not contain any of the given keywords. + + organisms + List of organisms. If not None, filters all scoresets that organism is not any of the given organisms. + + experiment_types + List of experiment types. If not None, filters all scoresets that experiment type is not any of the given experiment types. + + Returns + ------- + + experiment_dict + A dictionary mapping experiment urns to their corresponding MLExperiment objects. 
+ """ + + experiment_sets = self.main_meta_data["experimentSets"] + scoreset_list = [] + for experiment_set in experiment_sets: + for experiment in experiment_set["experiments"]: + for scoreSet in experiment["scoreSets"]: + scoreset_list.append(scoreSet) + + if verbose: + print(f"Searching MaveDB: {len(experiment_sets)=} {len(scoreset_list)=}") + + experiment_dict = self.parse_json_scoreset_list( + scoreset_list, + keywords=keywords, + organisms=organisms, + experiment_types=experiment_types, + verbose=verbose, + ) + + if verbose: + print(f"{len(experiment_dict)=}") + + return experiment_dict + + def get_experiment_dict(self, urns): + """ + Generates a experiment_dict containing MLExperiment objects for a list of given urns. + + Parameters + ---------- + + urns + A list of MaveDB urn identifiers. + + Returns + ------- + + experiment_dict + A dictionary mapping experiment urns to their corresponding MLExperiment objects. + """ + + scoreset_list = [] + for urn in urns: + try: + scoreset_list.append(self.get_meta_data(urn)) + except: + n = 1 + while True: + try: + scoreset_urn = f"{urn}-{n}" + scoreset_list.append(self.get_meta_data(scoreset_urn)) + except: + break + n += 1 + return self.parse_json_scoreset_list(scoreset_list) + + def retrieve_score_table(self, urn): + """ + Retrieves the score table for an urn. + + Parameters + ---------- + + urn + MaveDB urn identifier of a scoreset. + + Returns + ------- + + text + Scoreset table as a string. 
+ """ + + fixed_urn = urn.replace(":", "-") + + score_table_file = f"{self.scoreset_data_folder}/{fixed_urn}.scores.csv" + f = open(score_table_file, "r") + text = f.read() + f.close() + return text + + +class Client(ClientTemplate): + def __init__(self, base_url="https://www.mavedb.org/api/", auth_token=""): """ Instantiates the Client object and sets the values for base_url and auth_token @@ -24,6 +426,114 @@ def __init__(self, base_url="http://127.0.0.1:8000/api/", auth_token=""): class AuthTokenMissingException(Exception): pass + def clone(self, local_instance_path): + """ + Downloads the whole MaveDB and creates a local clone. + + Parameters + ---------- + + local_instance_path + Path to where the clone should be stored. + """ + + if not os.path.exists(local_instance_path): + os.mkdir(local_instance_path) + meta_data_folder = f"{local_instance_path}/meta_data/" + if not os.path.exists(meta_data_folder): + os.mkdir(meta_data_folder) + scoreset_data_folder = f"{local_instance_path}/scoreset_data/" + if not os.path.exists(scoreset_data_folder): + os.mkdir(scoreset_data_folder) + + entry_dict = self.search_database(retrieve_json_only=True) + for urn in entry_dict: + meta_file = f"{meta_data_folder}/{urn}.json" + + f = open(meta_file, "w") + json.dump(entry_dict[urn], f) + f.close() + + score_table_file = f"{scoreset_data_folder}/{urn}.csv" + + f = open(score_table_file, "w") + f.write(self.retrieve_score_table(urn)) + f.close() + + def search_database( + self, + keywords=None, + organisms=None, + retrieve_json_only=False, + experiment_types=["protein_coding"], + ): + """ + Searches all scoresets in MaveDB and applies some filters. + Parameters + ---------- + + keywords + List of keywords. If not None, filters all scoresets that keywords do not contain any of the given keywords. + + organisms + List of organisms. If not None, filters all scoresets that organism is not any of the given organisms. + + experiment_types + List of experiment types. 
If not None, filters all scoresets that experiment type is not any of the given experiment types. + + Returns + ------- + + experiment_dict + A dictionary mapping experiment urns to their corresponding MLExperiment objects. + """ + + search_page_url = f"{self.base_url}/scoresets" + try: + r = requests.get(search_page_url) + r.raise_for_status() + except requests.exceptions.HTTPError as e: + logging.error(r.json()) + raise SystemExit(e) + + scoreset_list = r.json() + + experiment_dict = self.parse_json_scoreset_list( + scoreset_list, + keywords=keywords, + organisms=organisms, + retrieve_json_only=retrieve_json_only, + ) + + return experiment_dict + + def retrieve_score_table(self, urn): + """ + Retrieves the score table for an urn. + + Parameters + ---------- + + urn + MaveDB urn identifier of a scoreset. + + Returns + ------- + + text + Scoreset table as a string. + """ + + base_parent = self.base_url.replace("api/", "") + score_table_url = f"{base_parent}scoreset/{urn}/scores/" + try: + r = requests.get(score_table_url) + r.raise_for_status() + except requests.exceptions.HTTPError as e: + logging.error(r.json()) + raise SystemExit(e) + return r.text + def get_model_instance(self, model_class, instance_id): """ Using a GET, hit an API endpoint to get info on a particular instance diff --git a/mavetools/models/dataset.py b/mavetools/models/dataset.py index 4a286ea..66a3b37 100644 --- a/mavetools/models/dataset.py +++ b/mavetools/models/dataset.py @@ -28,8 +28,8 @@ class TimeStamped: Instantiates TimeStamped object and declares attributes """ - creation_date: str = attr.ib(kw_only=True) - modification_date: str = attr.ib(kw_only=True) + creationDate: str = attr.ib(kw_only=True) + modificationDate: str = attr.ib(kw_only=True) @attr.s @@ -38,43 +38,27 @@ class Dataset(TimeStamped, Urn): Instantiates Dataset object and declares attributes """ # record keeping attributes - publish_date: str = attr.ib(kw_only=True) - created_by: str = attr.ib(kw_only=True) - modified_by: 
str = attr.ib(kw_only=True) + publishedDate: str = attr.ib(kw_only=True) + createdBy: str = attr.ib(kw_only=True) + modifiedBy: str = attr.ib(kw_only=True) # required attributes - short_description: str = attr.ib(kw_only=True) + shortDescription: str = attr.ib(kw_only=True) title: str = attr.ib(kw_only=True) - contributors: List[str] = attr.ib(kw_only=True) + # optional attribute - abstract_text: str = attr.ib(kw_only=True) - method_text: str = attr.ib(kw_only=True) + contributors: Optional[List[str]] = attr.ib(kw_only=True, default=None) + abstractText: str = attr.ib(kw_only=True) + methodText: str = attr.ib(kw_only=True) keywords: List[Keyword] = attr.ib(kw_only=True) - sra_ids: Optional[List[ExternalIdentifier]] = attr.ib(kw_only=True, default=None) - doi_ids: Optional[List[ExternalIdentifier]] = attr.ib(kw_only=True, default=None) - pubmed_ids: Optional[List[ExternalIdentifier]] = attr.ib(kw_only=True, default=None) - extra_metadata: Optional[Dict[str, str]] = attr.ib(kw_only=True, default=None) + secondaryPublicationIdentifiers: Optional[List[ExternalIdentifier]] = attr.ib(kw_only=True, default=None) + doiIdentifiers: Optional[List[ExternalIdentifier]] = attr.ib(kw_only=True, default=None) + primaryPublicationIdentifiers: Optional[List[ExternalIdentifier]] = attr.ib(kw_only=True, default=None) + extraMetadata: Optional[Dict[str, str]] = attr.ib(kw_only=True, default=None) + private: Optional[bool] = attr.ib(kw_only=True, default=None) + processingState: Optional[str] = attr.ib(kw_only=True, default=None) + processingErrors: Optional[str] = attr.ib(kw_only=True, default=None) def deserialize(): raise NotImplementedError() - - -@attr.s -class NewDataset: - """ - Instantiates NewDataset object and declares object attributes - """ - - # required attributes - title: str = attr.ib(kw_only=True) - short_description: str = attr.ib(kw_only=True) - - # optional attributes - abstract_text: Optional[str] = attr.ib(kw_only=True, default=None) - method_text: Optional[str] = 
attr.ib(kw_only=True, default=None) - keywords: Optional[List[Keyword]] = attr.ib(kw_only=True, default=None) - # TODO: change this once you know what this is supposed to be - doi_ids: Optional[List[str]] = attr.ib(kw_only=True, default=None) - sra_ids: Optional[List[str]] = attr.ib(kw_only=True, default=None) - pubmed_ids: Optional[List[str]] = attr.ib(kw_only=True, default=None) diff --git a/mavetools/models/external_identifier.py b/mavetools/models/external_identifier.py index 7a6e3ed..4e98385 100644 --- a/mavetools/models/external_identifier.py +++ b/mavetools/models/external_identifier.py @@ -11,5 +11,6 @@ class ExternalIdentifier: identifier: str = attr.ib(kw_only=True) # optional attributes url: Optional[str] = attr.ib(kw_only=True, default=None) - dbversion: Optional[str] = attr.ib(kw_only=True, default=None) - dbname: Optional[str] = attr.ib(kw_only=True, default=None) + dbVersion: Optional[str] = attr.ib(kw_only=True, default=None) + dbName: Optional[str] = attr.ib(kw_only=True, default=None) + referenceHtml: Optional[str] = attr.ib(kw_only=True, default=None) diff --git a/mavetools/models/ml_tools.py b/mavetools/models/ml_tools.py new file mode 100644 index 0000000..a965a9a --- /dev/null +++ b/mavetools/models/ml_tools.py @@ -0,0 +1,683 @@ +from mavetools.models.scoreset import ScoreSetData, ProteinGymScoreset, load_protein_gym_scores, DomainomeScoreset +from mavetools.models.utils import apply_offset_to_hgvs_pro, check_offset, median, aac_to_hgvs_pro + +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import numpy as np +import metapub +import csv +import os + +class MlExperiment: + + """ + Class that represents one experiment. + """ + + def __init__(self, primary_id, scoreset_dict, scoreset, proteinGym_id = None, urn = None, domainome_id = None, function_type = None): + + """ + Initializes an instance of the ML experiment class.
+ + Parameters + ---------- + + urn + MaveDB urn identifier of the experiment + + scoreset_dict + A dictionary of all scoreset metdata objects belonging to the experiment. + + scoreset + A representative metadata object for the experiment. + """ + + self.primary_id = primary_id + self.urn = urn + self.proteinGym_id = proteinGym_id + self.domainome_id = domainome_id + self.scoreset_dict = scoreset_dict + self.representative_scoreset_metadata = scoreset + self.function_type = function_type + + + def get_name(self): + if self.representative_scoreset_metadata.current_version == 'ProteinGym' or self.representative_scoreset_metadata.current_version == 'Domainome': + return self.representative_scoreset_metadata.target['name'] + return self.representative_scoreset_metadata.targetGenes[0]['name'].replace(" ","_").replace('β','beta').replace(';','') + +class MlDataset: + + """ + A class that represents a dataset of experiments determined for the usage in ML tools. + """ + + def __init__(self, experiments): + + """ + Initializes an instance of the ML dataset class. + + Parameters + ---------- + + experiments + A dictionary mapping MaveDB urn identifiers to their corresponding ML experiment objects. + """ + + self.experiments = experiments + + def retrieve_data(self, client, verbosity = 0): + + """ + Uses a client object to download (or look up) the scoretables for all the scoresets in the dataset. + + Parameters + ---------- + + client + A client object, see client.py for more information. 
+ """ + + for experiment_urn in self.experiments: + for scoreset_urn in self.experiments[experiment_urn].scoreset_dict: + if verbosity >= 1: + print(f'Retrieving scoreset table {scoreset_urn} for experiment {experiment_urn}') + csv_table = client.retrieve_score_table(scoreset_urn) + hgvs_pro_pos, hgvs_nt_pos, score_pos = self.experiments[experiment_urn].scoreset_dict[scoreset_urn].get_score_table_positions() + self.experiments[experiment_urn].scoreset_dict[scoreset_urn].scoresetdata = ScoreSetData(scoreset_urn, csv_table, hgvs_pro_pos = hgvs_pro_pos, score_pos = score_pos, hgvs_nt_pos = hgvs_nt_pos) + + + def load_domainome(self, path_to_domainome): + f = open(path_to_domainome, 'r') + lines = f.readlines() + f.close() + + data_dict = {} + seq_dict = {} + + three_seq_dict = {} + + for line in lines[1:]: + words = line[:-1].split('\t') + domain_id = words[0] + + uni_mut_id = words[2] + mut_seq = words[3] + + uniprot_id, aac = uni_mut_id.split('_') + wt_aa = aac[0] + try: + mutpos = int(aac[1:-1]) + except: + continue + if domain_id not in three_seq_dict: + three_seq_dict[domain_id] = [] + + if len(three_seq_dict[domain_id]) < 3: + three_seq_dict[domain_id].append((wt_aa, mut_seq)) + if len(three_seq_dict[domain_id]) == 3: + wt_seq = '' + pos_count: list[dict[str, int]] = [] + for mut_aa in three_seq_dict[domain_id][0][1]: + pos_count.append({mut_aa:1}) + for pos, mut_aa in enumerate(three_seq_dict[domain_id][1][1]): + if mut_aa in pos_count[pos]: + pos_count[pos][mut_aa] += 1 + else: + pos_count[pos][mut_aa] = 1 + for pos, mut_aa in enumerate(three_seq_dict[domain_id][2][1]): + if mut_aa in pos_count[pos]: + wt_seq += mut_aa + elif len(pos_count[pos]) == 1: + wt_seq += pos_count[pos].popitem()[0] + else: + wt_seq += wt_aa + seq_dict[domain_id] = wt_seq + + for line in lines[1:]: + words = line[:-1].split('\t') + domain_id = words[0] + uni_mut_id = words[2] + mut_seq = words[3] + + uniprot_id, aac = uni_mut_id.split('_') + if aac[-1] == '*': + continue + + try: + 
scaled_effect = float(words[6]) + except: + continue + + wt_aa = aac[0] + try: + mutpos = int(aac[1:-1]) + except: + continue + mut_aa = aac[-1] + + wt_seq = seq_dict[domain_id] + + offset_mut_pos = None + + for pos, aa in enumerate(mut_seq): + if aa != wt_seq[pos]: + offset_mut_pos = pos + 1 + + offset_aac = f'{wt_aa}{offset_mut_pos}{mut_aa}' + + if domain_id not in data_dict: + data_dict[domain_id] = ({}, uniprot_id) + + hgvs_pro = aac_to_hgvs_pro(offset_aac) + + data_dict[domain_id][0][hgvs_pro] = scaled_effect + + for domain_id in data_dict: + target_name = domain_id.split('_')[1] + ssd_object = ScoreSetData(domain_id, '', score_dict = data_dict[domain_id][0]) + + scoreset_obj = DomainomeScoreset(target_name, data_dict[domain_id][1], seq_dict[domain_id], ssd_object) + self.experiments[domain_id] = MlExperiment(domain_id, {domain_id:scoreset_obj}, scoreset_obj, domainome_id = domain_id, function_type='Stability') + + + def load_protein_gym(self, path_to_reference_file): + + proteingym_folder = path_to_reference_file.rsplit('/',1)[0] + proteingym_folder = f'{proteingym_folder}/DMS_ProteinGym_substitutions' + + f = open(path_to_reference_file, 'r') + lines = f.readlines() + f.close() + + fetch = metapub.PubMedFetcher() + doi_dict = {} + for exp_urn in self.experiments: + scoreset = self.experiments[exp_urn].representative_scoreset_metadata + for publication_entry in scoreset.primaryPublicationIdentifiers: + + if publication_entry['doi'] is not None: + doi_dict[publication_entry['doi']] = exp_urn + elif publication_entry['dbName'] == 'PubMed': + pubmed_id = publication_entry['identifier'] + article = fetch.article_by_pmid(pubmed_id) + doi_dict[article.doi] = exp_urn + + for line in lines[1:]: + words = [ '{}'.format(x) for x in list(csv.reader([line], delimiter=',', quotechar='"'))[0] ] + + dms_id = words[0] + dms_filename = words[1] + uniprot = words[2] + wt_sequence = words[5] + doi = words[16] + function_type = words[44] + if doi in doi_dict: + print(f'Skipped 
ProteinGym entry: DOI already found in MaveDB: {dms_id} {doi_dict[doi]}') + self.experiments[doi_dict[doi]].proteinGym_id = dms_id + continue + #else: + #print(f'{dms_id} {doi}') + + path_to_dms_file = f'{proteingym_folder}/{dms_filename}' + scoresetdata = load_protein_gym_scores(dms_id, path_to_dms_file) + scoreset_obj = ProteinGymScoreset(uniprot, uniprot, wt_sequence, scoresetdata) + self.experiments[dms_id] = MlExperiment(dms_id, {dms_id:scoreset_obj}, scoreset_obj, proteinGym_id = dms_id, function_type=function_type) + + + def store_raw_score_distributions(self, outfolder): + for experiment_urn in self.experiments: + for scoreset_urn in self.experiments[experiment_urn].scoreset_dict: + scoreset = self.experiments[experiment_urn].scoreset_dict[scoreset_urn] + scoresetdata = scoreset.scoresetdata + outfile = f'{outfolder}/{scoreset_urn}_raw_score_distribution.png' + if not os.path.isfile(outfile): + scoresetdata.plot_sav_score_distribution(outfile) + + def aggregate_scoresetdata(self, min_prot_size = None, min_coverage = None, min_len_coverage = None, reasonable_len_coverage = 100, std_filter = None, nonsense_std_filter = False, verbosity = 0): + + """ + For all experiments aggregates multiple scoresets corresponding to one experiment. + Retrieves the full target protein sequence of the experiment. + Checks all the offset values given in the metadata and maps all variations to the full target sequence. + Effect values occupying the same position gets averaged. 
+ """ + + del_list = [] + filtered_list = [] + results = [] + + for experiment_urn in self.experiments: + if verbosity >= 1: + print(f'Start aggregation of {experiment_urn}') + experiment_scoresetdata = ScoreSetData(experiment_urn, '') + experiment_scoresetdata.pro_baseline = [] + experiment_scoresetdata.ala_baseline = [] + + broken = 0 + reason = None + + for scoreset_urn in self.experiments[experiment_urn].scoreset_dict: + scoreset = self.experiments[experiment_urn].scoreset_dict[scoreset_urn] + if verbosity >= 1: + print(f'Processing data from scoreset {scoreset_urn}\n{scoreset.targetGenes=}') + scoresetdata = scoreset.scoresetdata + + sav_ids = list(scoresetdata.sav_scores.keys()) + if len(sav_ids) > 0: + example_sav_id = sav_ids[0] + if example_sav_id.count(':') > 0: + prot_id = example_sav_id.split(':')[0] + else: + prot_id = None + else: + reason = f'Filtered {scoreset_urn} due to empty scores list' + if verbosity >= 1: + print(reason) + broken += 1 + continue + + seq, unchecked_offset = scoreset.get_full_sequence_info(prot_id) + fallback_seq = scoreset.get_protein_sequence() + if seq is None: + seq = fallback_seq + unchecked_offset = 0 + if seq is None: + reason = f'Filtered {scoreset_urn} due to no sequence available' + if verbosity >= 1: + print(reason) + print(f'{scoreset.targetGenes=}') + broken += 1 + continue + + if verbosity >= 2: + print(seq) + + if min_prot_size is not None: + if len(seq) < min_prot_size: + reason = f'Filtered {scoreset_urn} due to small prot size: {len(seq)} < {min_prot_size}' + if verbosity >= 1: + print(reason) + broken += 1 + continue + + print(f'Checking offset: {scoreset_urn=} {unchecked_offset=}') + + offset, fall_back, hit_rate, seq = check_offset(seq, unchecked_offset, list(scoresetdata.sav_scores.keys()), scoreset_urn, fallback_seq, verbosity=verbosity) + + print(f'After Checking offset: {scoreset_urn=} {offset=} {hit_rate=} {fall_back=}') + + if fall_back: + seq = fallback_seq + scoreset.corrected_seq = seq + + if offset 
is None: + reason = f'Offset {unchecked_offset=} seems to be incorrect and could not be corrected for: {scoreset_urn}' + print(reason) + print(f'{hit_rate=}') + print(f'{scoreset.targetGenes=}') + print(f'{seq=}') + print(f'{fall_back=} {fallback_seq=}') + print(f'{list(scoresetdata.sav_scores.keys())[:10]} {len(scoresetdata.sav_scores)=}') + broken += 1 + continue + + for hgvs_pro in scoresetdata.score_dict: + try: + corrected_hgvs_pro = apply_offset_to_hgvs_pro(hgvs_pro, offset) + except: + print(f'Couldnt apply offset to hgvs: {hgvs_pro} {offset}, {experiment_urn}') + corrected_hgvs_pro = apply_offset_to_hgvs_pro(hgvs_pro, offset) + if corrected_hgvs_pro not in experiment_scoresetdata.score_dict: + experiment_scoresetdata.score_dict[corrected_hgvs_pro] = [] + experiment_scoresetdata.score_dict[corrected_hgvs_pro].append(scoresetdata.score_dict[hgvs_pro]) + + for hgvs_pro in scoresetdata.sav_scores: + try: + corrected_hgvs_pro = apply_offset_to_hgvs_pro(hgvs_pro, offset) + except: + print(f'Couldnt apply offset to hgvs: {hgvs_pro} {offset}, {experiment_urn}') + if corrected_hgvs_pro not in experiment_scoresetdata.sav_scores: + experiment_scoresetdata.sav_scores[corrected_hgvs_pro] = [] + score = scoresetdata.sav_scores[hgvs_pro] + experiment_scoresetdata.sav_scores[corrected_hgvs_pro].append(score) + + if hgvs_pro[-3:] == 'Pro' and hgvs_pro[2:5] != 'Pro': + experiment_scoresetdata.pro_baseline.append(score) + + if hgvs_pro[-3:] == 'Ala' and hgvs_pro[2:5] != 'Ala': + experiment_scoresetdata.ala_baseline.append(score) + + for hgvs_pro in scoresetdata.nonsense_scores: + try: + corrected_hgvs_pro = apply_offset_to_hgvs_pro(hgvs_pro, offset) + except: + print(f'Couldnt apply offset to hgvs: {hgvs_pro} {offset}, {experiment_urn}') + if not corrected_hgvs_pro in experiment_scoresetdata.nonsense_scores: + experiment_scoresetdata.nonsense_scores[corrected_hgvs_pro] = [] + 
experiment_scoresetdata.nonsense_scores[corrected_hgvs_pro].append(scoresetdata.nonsense_scores[hgvs_pro]) + + for hgvs_pro in scoresetdata.synonymous_scores: + try: + corrected_hgvs_pro = apply_offset_to_hgvs_pro(hgvs_pro, offset) + except: + corrected_hgvs_pro = hgvs_pro + if not corrected_hgvs_pro in experiment_scoresetdata.sav_scores: + experiment_scoresetdata.synonymous_scores[corrected_hgvs_pro] = [] + experiment_scoresetdata.synonymous_scores[corrected_hgvs_pro].append(scoresetdata.synonymous_scores[hgvs_pro]) + + if broken == len(self.experiments[experiment_urn].scoreset_dict): + del_list.append(experiment_urn) + filtered_list.append((self.experiments[experiment_urn].get_name(), experiment_urn, reason)) + continue + + if len(experiment_scoresetdata.pro_baseline) > 0: + experiment_scoresetdata.pro_baseline = sum(experiment_scoresetdata.pro_baseline)/len(experiment_scoresetdata.pro_baseline) + else: + experiment_scoresetdata.pro_baseline = None + + if len(experiment_scoresetdata.ala_baseline) > 0: + experiment_scoresetdata.ala_baseline = sum(experiment_scoresetdata.ala_baseline)/len(experiment_scoresetdata.ala_baseline) + else: + experiment_scoresetdata.ala_baseline = None + + for hgvs_pro in experiment_scoresetdata.score_dict: + experiment_scoresetdata.score_dict[hgvs_pro] = sum(experiment_scoresetdata.score_dict[hgvs_pro])/len(experiment_scoresetdata.score_dict[hgvs_pro]) + + for hgvs_pro in experiment_scoresetdata.sav_scores: + experiment_scoresetdata.sav_scores[hgvs_pro] = sum(experiment_scoresetdata.sav_scores[hgvs_pro])/len(experiment_scoresetdata.sav_scores[hgvs_pro]) + + if len(experiment_scoresetdata.sav_scores) > 0: + experiment_scoresetdata.set_std_sav() + + for hgvs_pro in experiment_scoresetdata.synonymous_scores: + experiment_scoresetdata.synonymous_scores[hgvs_pro] = sum(experiment_scoresetdata.synonymous_scores[hgvs_pro])/len(experiment_scoresetdata.synonymous_scores[hgvs_pro]) + + for hgvs_pro in experiment_scoresetdata.nonsense_scores: + 
experiment_scoresetdata.nonsense_scores[hgvs_pro] = sum(experiment_scoresetdata.nonsense_scores[hgvs_pro])/len(experiment_scoresetdata.nonsense_scores[hgvs_pro]) + + if len(experiment_scoresetdata.nonsense_scores) > 0: + nsl = list(experiment_scoresetdata.nonsense_scores.values()) + experiment_scoresetdata.median_nonsense = median(nsl) + experiment_scoresetdata.mean_nonsense = sum(nsl)/len(nsl) + experiment_scoresetdata.std_nonsense = np.std(nsl) + + experiment_scoresetdata.mean_score = None + if len(experiment_scoresetdata.score_dict) > 0: + experiment_scoresetdata.min_score = min(experiment_scoresetdata.score_dict.values()) + experiment_scoresetdata.max_score = max(experiment_scoresetdata.score_dict.values()) + experiment_scoresetdata.mean_sav = None + if len(experiment_scoresetdata.sav_scores) > 0: + experiment_scoresetdata.min_sav = min(experiment_scoresetdata.sav_scores.values()) + experiment_scoresetdata.max_sav = max(experiment_scoresetdata.sav_scores.values()) + experiment_scoresetdata.cardinality = len(experiment_scoresetdata.score_dict) + experiment_scoresetdata.sav_cardinality = len(experiment_scoresetdata.sav_scores) + + if min_coverage is not None: + thresh = len(seq)*19*min_coverage + if experiment_scoresetdata.sav_cardinality < thresh: + reason = f'Filtered {experiment_urn} due to variant coverage: {experiment_scoresetdata.sav_cardinality} ({experiment_scoresetdata.sav_cardinality/(len(seq)*19)}%) < {thresh} ({min_coverage}%)' + if verbosity >= 1: + print(reason) + + filtered_list.append((self.experiments[experiment_urn].get_name(), experiment_urn, reason)) + del_list.append(experiment_urn) + continue + else: + if verbosity >= 1: + print(f'{experiment_urn} variant coverage: {experiment_scoresetdata.sav_cardinality} ({experiment_scoresetdata.sav_cardinality/(len(seq)*19)}%)') + + number_of_covered_positions = experiment_scoresetdata.get_number_of_covered_positions() + if min_len_coverage is not None: + thresh = len(seq)*min_len_coverage + if 
number_of_covered_positions < thresh and number_of_covered_positions < reasonable_len_coverage: + reason = f'Filtered {experiment_urn} due to length coverage: {number_of_covered_positions} ({number_of_covered_positions/(len(seq))}%) < {thresh} ({min_len_coverage}%)' + if verbosity >= 1: + print(reason) + del_list.append(experiment_urn) + filtered_list.append((self.experiments[experiment_urn].get_name(), experiment_urn, reason)) + continue + else: + if verbosity >= 1: + print(f'{experiment_urn} length coverage: {number_of_covered_positions} ({number_of_covered_positions/(len(seq))}%)') + + if nonsense_std_filter: + if experiment_scoresetdata.std_nonsense is not None: + if verbosity >= 1: + if len(experiment_scoresetdata.nonsense_scores) == 1: + print("Can't calculate Pearson's correlation coefficient between nonsense mutation effect scores and corresponding position number, due there is only one value") + else: + pearson_corr, pearson_corr_p_value = experiment_scoresetdata.calculate_nonsense_position_correlation() + print(f"{experiment_urn}: Pearson's correlation coefficient between nonsense mutation effect scores and corresponding position number: {pearson_corr} (p-value: {pearson_corr_p_value})") + if experiment_scoresetdata.std_sav is not None: + if experiment_scoresetdata.std_nonsense*5 > experiment_scoresetdata.std_sav: + reason = f'Filtered {experiment_urn} due to high nonsense deviation: {experiment_scoresetdata.std_nonsense} STD nonsense score versus {experiment_scoresetdata.std_sav} STD missense score' + if verbosity >= 1: + print(reason) + del_list.append(experiment_urn) + filtered_list.append((self.experiments[experiment_urn].get_name(), experiment_urn, reason)) + continue + + if std_filter is not None: + normalizer = experiment_scoresetdata.get_normalizer() + std_missense = experiment_scoresetdata.std_sav + normalized_std_missense = std_missense/normalizer + + if normalized_std_missense < std_filter: + reason = f'Filtered {experiment_urn} due to low 
{normalized_std_missense=} ({std_missense=} {normalizer=}) normalized missense STD versus < {std_filter}' + if verbosity >= 1: + print(reason) + del_list.append(experiment_urn) + filtered_list.append((self.experiments[experiment_urn].get_name(), experiment_urn, reason)) + continue + + if experiment_scoresetdata.cardinality > 0: + experiment_scoresetdata.mean_score = sum(experiment_scoresetdata.score_dict.values()) / experiment_scoresetdata.cardinality + + if experiment_scoresetdata.sav_cardinality > 0: + experiment_scoresetdata.mean_sav = sum(experiment_scoresetdata.sav_scores.values()) / experiment_scoresetdata.sav_cardinality + experiment_scoresetdata.median_sav = median(experiment_scoresetdata.sav_scores.values()) + + self.experiments[experiment_urn].experiment_scoresetdata = experiment_scoresetdata + + for experiment_urn in del_list: + del self.experiments[experiment_urn] + + return filtered_list + + def write_filtered_entries(self, filtered_list, outfile): + + outlines = [] + + for name, experiment_urn, reason in filtered_list: + + outlines.append(f'{experiment_urn}\t{name}\t{reason}\n') + + f = open(outfile,'w') + f.write(''.join(outlines)) + f.close() + + + def scale_all_savs(self, verbosity = 0, outfolder = None, distribution_filtering = False): + + """ + Applies to effect value scaling to all SAV effect scores in the experiment. + + Parameters + ---------- + + verbosity + Verbosity level of the function. 
+ """ + + for experiment_urn in self.experiments: + if len(self.experiments[experiment_urn].experiment_scoresetdata.sav_scores) == 0: + continue + if self.experiments[experiment_urn].representative_scoreset_metadata.current_version == 'Domainome': + self.experiments[experiment_urn].experiment_scoresetdata.scale_domainome() + successful = True + else: + successful = self.experiments[experiment_urn].experiment_scoresetdata.scale_sav_data(verbosity = verbosity, distribution_filtering = distribution_filtering) + + if outfolder is not None and successful: + outfile = f'{outfolder}/{experiment_urn}_scaled_score_distribution.png' + if not os.path.isfile(outfile): + self.experiments[experiment_urn].experiment_scoresetdata.plot_sav_score_distribution(outfile, specific_scores = self.experiments[experiment_urn].experiment_scoresetdata.scaled_sav_scores) + + def plot_sav_score_distribution(self, outfile): + all_scores = [] + for experiment_urn in self.experiments: + all_scores += self.experiments[experiment_urn].experiment_scoresetdata.scaled_sav_scores.values() + + plt.rcParams.update({'figure.figsize':(7,5), 'figure.dpi':100}) + + data = np.array(all_scores) + + plt.hist(data, bins=50) + plt.gca().set(title=f'Score distribution', ylabel='Frequency', xlabel = 'Effect score'); + plt.savefig(outfile) + + + def write_dataset_statistics(self, statfile): + headers = [ + 'Experiment URN', + 'Protein Length', + 'Dataset Cardinality', + 'Coverage', + 'Length Coverage', + 'Value Range', + 'Normalizer', + 'Missense Mean', + 'Missense Median', + 'Nonsense Mean', + 'Nonsense Median', + 'Missense STD', + 'Normalized Missense STD', + 'Length Normalized Missense STD' + ] + header = "\t".join(headers) + outlines = [f'{header}\n'] + + for experiment_urn in self.experiments: + + name = self.experiments[experiment_urn].representative_scoreset_metadata.target['name'].replace(" ","_").replace('β','beta') + + seq, offset = 
self.experiments[experiment_urn].representative_scoreset_metadata.get_full_sequence_info() + prot_len = len(seq) + data_cardinality = self.experiments[experiment_urn].experiment_scoresetdata.sav_cardinality + coverage = data_cardinality/(prot_len*19) + number_of_covered_positions = self.experiments[experiment_urn].experiment_scoresetdata.get_number_of_covered_positions() + length_coverage = number_of_covered_positions/prot_len + + min_val = self.experiments[experiment_urn].experiment_scoresetdata.min_sav + max_val = self.experiments[experiment_urn].experiment_scoresetdata.max_sav + + if min_val == max_val: + print(f'Min val equals to max val for {experiment_urn}') + continue + + normalizer = self.experiments[experiment_urn].experiment_scoresetdata.get_normalizer() + + std_missense = self.experiments[experiment_urn].experiment_scoresetdata.std_sav + + normalized_std_missense = std_missense/normalizer + + length_normalized_std_missense = std_missense/(max_val-min_val) + + val_range = f'[{min_val} : {max_val}]' + + missense_mean = self.experiments[experiment_urn].experiment_scoresetdata.mean_sav + missense_median = self.experiments[experiment_urn].experiment_scoresetdata.median_sav + nonsense_mean = self.experiments[experiment_urn].experiment_scoresetdata.mean_nonsense + nonsense_median = self.experiments[experiment_urn].experiment_scoresetdata.median_nonsense + + line = '\t'.join([ + f'{name}_{experiment_urn}', + f'{prot_len}', + f'{data_cardinality}', + f'{coverage}', + f'{length_coverage}', + f'{val_range}', + f'{normalizer}', + f'{missense_mean}', + f'{missense_median}', + f'{nonsense_mean}', + f'{nonsense_median}', + f'{std_missense}', + f'{normalized_std_missense}', + f'{length_normalized_std_missense}' + ]) + + outlines.append(f'{line}\n') + + f = open(statfile, 'w') + f.write(''.join(outlines)) + f.close() + + + def write_scaled_sav_fasta(self, outfile, sequences_only_file = None, write_unscaled = False): + + """ + Writes scaled SAV effect scores together with 
their full target protein sequences to a fasta file. + The fasta file has a non-standard format: + + >[Sequence identifier] + <[hgvs_pro identifier] #scaled_effect:[scaled effect value] + <[hgvs_pro identifier] #scaled_effect:[scaled effect value] + <[hgvs_pro identifier] #scaled_effect:[scaled effect value] + ... + [The sequence] + + Parameters + ---------- + + outfile + Path to where the fasta file should be written. + + sequences_only_file + When not None, gives a path where a classical fasta gets written omitting the variation information. + """ + + fasta_lines = [] + seq_only_lines = [] + for experiment_urn in self.experiments: + seq, offset = self.experiments[experiment_urn].representative_scoreset_metadata.get_full_sequence_info() + + name = self.experiments[experiment_urn].get_name() + function_type = self.experiments[experiment_urn].function_type + + try: + scaled_sav_scores = self.experiments[experiment_urn].experiment_scoresetdata.scaled_sav_scores + except: + print(f'Couldnt find scaled sav scores for {experiment_urn}') + continue + if write_unscaled: + scaled_sav_scores = self.experiments[experiment_urn].experiment_scoresetdata.sav_scores + if len(scaled_sav_scores) == 0: + print(f'Scaled sav scores empty for {experiment_urn}') + continue + + headline = f'>{name}_{experiment_urn}\t<{self.experiments[experiment_urn].urn},{self.experiments[experiment_urn].proteinGym_id},function_type={function_type}\n' + fasta_lines.append(headline) + if sequences_only_file is not None: + seq_only_lines.append(headline) + + for hgvs_pro in scaled_sav_scores: + + variant_line = f'<{hgvs_pro} #scaled_effect:{scaled_sav_scores[hgvs_pro]}\n' + fasta_lines.append(variant_line) + + fasta_lines.append(f'{seq}\n') + if sequences_only_file is not None: + seq_only_lines.append(f'{seq}\n') + + f = open(outfile, 'w') + f.write(''.join(fasta_lines)) + f.close() + + if sequences_only_file is not None: + f = open(sequences_only_file, 'w') + f.write(''.join(seq_only_lines)) + f.close() 
+ diff --git a/mavetools/models/scoreset.py b/mavetools/models/scoreset.py index 6b6d75b..9c0632b 100644 --- a/mavetools/models/scoreset.py +++ b/mavetools/models/scoreset.py @@ -1,14 +1,30 @@ import attr import json import os +import sys +import traceback from typing import Any, BinaryIO, Dict, List, Optional, Union +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import numpy as np + +from scipy import stats from .base import APIObject -from .dataset import Dataset, NewDataset +from .dataset import Dataset from .licence import Licence -from .target import NewTarget, ReferenceMap, SequenceOffset, Target -from .utils import attrs_filter, attrs_serializer, prepare_for_encoding +from .target import Target +from .utils import attrs_filter, attrs_serializer, prepare_for_encoding, get_variant_type, median, score_scale_function, second_scale, disect_hgvs_single_pos +from fqfa.util.translate import translate_dna + +from mavetools.models.sequence_retrieval import getUniprotSequence, get_refseq_sequences, retrieve_transcript_sequences +from mavetools.models.utils import aac_to_hgvs_pro +@attr.s +class DatasetColumns: + countColumns: List[str] = attr.ib(kw_only=True) + scoreColumns: List[str] = attr.ib(kw_only=True) @attr.s class ScoreSet(APIObject, Dataset): @@ -17,21 +33,36 @@ class ScoreSet(APIObject, Dataset): It inherits the attributes of APIObject and Dataset """ - experiment: str = attr.ib(kw_only=True) - licence: Licence = attr.ib(kw_only=True) - target: Target = attr.ib(kw_only=True) + urn: str = attr.ib(kw_only=True) + license: Licence = attr.ib(kw_only=True) + targetGenes: Target = attr.ib(kw_only=True) # optional attributes - dataset_columns: Optional[Any] = attr.ib(kw_only=True, default=None) + datasetColumns: Optional[DatasetColumns] = attr.ib(kw_only=True, default=None) replaces: Optional[str] = attr.ib(kw_only=True, default=None) - score_columns: List[str] = attr.ib(kw_only=True) - count_columns: List[str] = attr.ib(kw_only=True) + 
#score_columns: List[str] = attr.ib(kw_only=True) + #count_columns: List[str] = attr.ib(kw_only=True) # optional attributes previous_version: Optional[str] = attr.ib(kw_only=True, default=None) next_version: Optional[str] = attr.ib(kw_only=True, default=None) - current_version: str = attr.ib(kw_only=True) - variant_count: int = attr.ib(kw_only=True) - data_usage_policy: str = attr.ib(kw_only=True) - is_meta_analysis: bool = attr.ib(kw_only=True) + current_version: Optional[str] = attr.ib(kw_only=True, default=None) + numVariants: int = attr.ib(kw_only=True, default=None) + data_usage_policy: Optional[str] = attr.ib(kw_only=True, default=None) + is_meta_analysis: Optional[bool] = attr.ib(kw_only=True, default=None) + + dataUsagePolicy: Optional[str] = attr.ib(kw_only=True, default=None) + supersededScoreSet: Optional[str] = attr.ib(kw_only=True, default=None) + supersedingScoreSet: Optional[str] = attr.ib(kw_only=True, default=None) + metaAnalyzesScoreSetUrns: Optional[List[str]] = attr.ib(kw_only=True, default=None) + metaAnalyzedByScoreSetUrns: Optional[List[str]] = attr.ib(kw_only=True, default=None) + + externalLinks = attr.ib(kw_only=True, default=None) + scoreRanges = attr.ib(kw_only=True, default=None) + mappingState = attr.ib(kw_only=True, default=None) + mappingErrors = attr.ib(kw_only=True, default=None) + keywords = attr.ib(kw_only=True, default=None) + recordType = attr.ib(kw_only=True, default=None) + scoreCalibrations = attr.ib(kw_only=True, default=None) + corrected_seq = None def api_url() -> str: """ @@ -51,62 +82,711 @@ def deserialize(json_dict): """ return ScoreSet(**json_dict) + def get_protein_sequence(self): + if self.current_version == 'ProteinGym' or self.current_version == 'Domainome': + return self.target['reference_sequence']['sequence'] + try: + raw_seq = self.targetGenes[0]['targetSequence']['sequence'] + except TypeError: + name = self.targetGenes[0]['name'] + if name[:3] == 'ENS': + ensembl_id = name.split('.')[0] + ens_seq_map, 
empty_seqs = retrieve_transcript_sequences([ensembl_id]) + if ensembl_id in ens_seq_map: + seq = ens_seq_map[ensembl_id] + self.targetGenes[0]['targetSequence'] = {'sequence':seq, 'sequenceType':'protein'} + return seq + elif name[:3] == 'NC_' or name[:3] == 'NM_': + nc_id = name.split('.')[0] + nc_seq_map = get_refseq_sequences([nc_id], seq_type='nucleotide') + if nc_id in nc_seq_map: + seq = nc_seq_map[nc_id] + self.targetGenes[0]['targetSequence'] = {'sequence':seq, 'sequenceType':'protein'} + return seq + print(f'TypeError, get_protein_sequence failed for {self.urn=}: {self.targetGenes=}') + return None + except IndexError: + print(f'IndexError, get_protein_sequence failed for {self.urn=}: {self.targetGenes=}') + return None + if self.targetGenes[0]['targetSequence']['sequenceType'] == 'protein': + return raw_seq + else: + return translate_dna(raw_seq)[0] -@attr.s -class NewScoreSet(NewDataset): - """ - This Class instantiates NewScoreSet and declares the variables present in NewScoreSet object. 
- Attributes are set before posting a model instance - """ - experiment: str = attr.ib(kw_only=True) + def get_full_sequence_info(self, prot_id_from_scores = None): + if self.current_version == 'ProteinGym' or self.current_version == 'Domainome': + full_seq = self.target['reference_sequence']['sequence'] + return full_seq, 0 + elif prot_id_from_scores is not None: + if prot_id_from_scores[:3] == 'ENS': + ensembl_id = prot_id_from_scores.split('.')[0] + ens_seq_map, empty_seqs = retrieve_transcript_sequences([ensembl_id]) + if ensembl_id in ens_seq_map: + seq = ens_seq_map[ensembl_id] + self.targetGenes[0]['targetSequence'] = {'sequence':seq, 'sequenceType':'protein'} + return seq, 0 + elif prot_id_from_scores[:3] == 'NC_' or prot_id_from_scores[:3] == 'NM_' or prot_id_from_scores[:3] == 'NP_': + nc_id = prot_id_from_scores.split('.')[0] + if prot_id_from_scores[:3] == 'NP_': + seq_type = 'protein' + else: + seq_type = 'nucleotide' + nc_seq_map = get_refseq_sequences([nc_id], seq_type=seq_type) + if nc_id in nc_seq_map: + seq = nc_seq_map[nc_id] + self.targetGenes[0]['targetSequence'] = {'sequence':seq, 'sequenceType':'protein'} + return seq, 0 + else: + print(f'Uncatched {prot_id_from_scores=}') + - # the following fields are optional - meta_analysis_for: Optional[str] = attr.ib(kw_only=True, default=None) - replaces: Optional[str] = attr.ib(kw_only=True, default=None) - licence: Optional[str] = attr.ib(kw_only=True, default=None) - data_usage_policy: Optional[str] = attr.ib(kw_only=True, default=None) + if self.corrected_seq is not None: + return self.corrected_seq, 0 - # These can be strings or open filepaths - score_data: Union[str, BinaryIO] = attr.ib(kw_only=True) - count_data: Union[str, BinaryIO] = attr.ib(kw_only=True) - meta_data: Union[str, BinaryIO] = attr.ib(kw_only=True) + id_dict = {} + for identifier_struct in self.targetGenes[0]['externalIdentifiers']: + dbname = identifier_struct['identifier']['dbName'] + offset = identifier_struct['offset'] + if 
'targetSequence' in self.targetGenes[0]: + if 'sequenceType' in self.targetGenes[0]['targetSequence']: + if self.targetGenes[0]['targetSequence']['sequenceType'] == 'dna': + offset = (offset - 1)//3 + + identifier = identifier_struct['identifier']['identifier'] + if dbname == 'UniProt': + id_dict['uniprot'] = offset, identifier + elif dbname == 'Ensembl': + id_dict['ensembl'] = offset, identifier + elif dbname == 'RefSeq': + id_dict['refseq'] = offset, identifier + + if 'uniprot' in id_dict: + offset, identifier = id_dict['uniprot'] + full_seq = getUniprotSequence(identifier) + #elif 'refseq' in id_dict: + # offset, identifier = id_dict['refseq'] + # full_seq = get_refseq_sequences(identifier) + #elif 'ensembl' in id_dict: + # print(f'No Uniprot, no RefSeq, but Ensembl ID: {self.urn}') + # full_seq = self.get_protein_sequence() + # offset = 0 + else: + full_seq = self.get_protein_sequence() + offset = 0 + return full_seq, offset + def get_score_table_positions(self): + exp_score_pos = None + score_pos = None + hgvs_pro_pos = 3 + hgvs_nt_pos = 1 + + for pos, score_table_header in enumerate(self.datasetColumns['scoreColumns']): + + if score_table_header == 'score': + score_pos = pos + 4 + if score_table_header == 'exp.score': + exp_score_pos = pos + 4 + if exp_score_pos is not None: + score_pos = exp_score_pos + return hgvs_pro_pos, hgvs_nt_pos, score_pos + +class ProteinGymScoreset(ScoreSet): + def __init__(self, target_name, uniprot, seq, scoresetdata, offset = 0): + self.targetGenes = [] + self.target = {} + self.target['uniprot'] = {} + self.target['uniprot']['identifier'] = uniprot + self.target['reference_sequence'] = {} + self.target['reference_sequence']['sequence'] = seq + self.target['reference_sequence']['sequence_type'] = 'protein' + self.target['uniprot']['offset'] = offset + self.target['name'] = target_name + + self.current_version = 'ProteinGym' + self.scoresetdata = scoresetdata + +class DomainomeScoreset(ScoreSet): + def __init__(self, target_name, 
uniprot, seq, scoresetdata): + self.targetGenes = [] + self.target = {} + self.target['uniprot'] = {} + self.target['uniprot']['identifier'] = uniprot + self.target['reference_sequence'] = {} + self.target['reference_sequence']['sequence'] = seq + self.target['reference_sequence']['sequence_type'] = 'protein' + self.target['name'] = target_name + + self.current_version = 'Domainome' + self.scoresetdata = scoresetdata + +def load_protein_gym_scores(dms_id, path_to_dms_file): + + f = open(path_to_dms_file) + lines = f.readlines() + f.close() + + score_dict = {} + + for line in lines[1:]: + words = line.split(',') + aac = words[0] + if aac.count(':') > 0: #Consider only SAVs for the moment + continue + hgvs_pro = aac_to_hgvs_pro(aac) + score = float(words[2]) + score_dict[hgvs_pro] = score + + ssd_object = ScoreSetData(dms_id, '', score_dict = score_dict) + return ssd_object + +class ScoreSetData: -@attr.s -class NewScoreSetRequest(APIObject): """ - This class instantiates NewScoreSetRequest and sets the fields for NewScoreSetRequest. - Attributes are set before posting a model instance + This class represents the score tables for a corresponding scoresheet. + Can also represent score tables for a corresponding experiment, + in that case the scoretables from all scoresheets belonging to the experiment + got aggregated (see aggregate_scoresetdata function of MLdataset class in ml_tools.py). """ - scoreset: NewScoreSet = attr.ib(kw_only=True) - target: NewTarget = attr.ib(kw_only=True) + def __init__(self, urn, csv_formatted_data, hgvs_pro_pos = None, score_pos = None, hgvs_nt_pos = None, score_dict = None, verbosity = 1): + + """ + Initialization routine that builds a class instance from the content of a MaveDB scoreset table. 
- # These only need the ExternalIdentifier identifier field - uniprot: SequenceOffset = attr.ib(kw_only=True, default=None) - ensembl: SequenceOffset = attr.ib(kw_only=True, default=None) - refseq: SequenceOffset = attr.ib(kw_only=True, default=None) + Parameters + ---------- - reference_maps: List[ReferenceMap] = attr.ib(kw_only=True) + urn + MaveDB urn identifier - def api_url() -> str: + csv_formatted_data + Content of a MaveDB scoreset table. + + hgvs_pro_pos + Column number of the column that contains the hgvs_pro identifiers. + + score_pos + Column number of the column that contains the experimental effect values. + + hgvs_pro_pos + Column number of the column that contains the hgvs_nt identifiers. + + Returns + ------- + + class_instance + Instance of ScoreSetData class. """ - Returns API endpoint + + #Initialize several class attributes + if verbosity >= 2: + print(f'Initialising ScoreSetData Object for {urn} {score_dict is None}') + self.urn = urn + if score_dict is None: + self.score_dict = {} + else: + self.score_dict = score_dict + + self.nt_score_dict = {} + self.sav_scores = {} + self.synonymous_scores = {} + self.nonsense_scores = {} + self.mean_score = None + self.min_score = None + self.max_score = None + self.mean_sav = None + self.median_sav = None + self.min_sav = None + self.max_sav = None + self.cardinality = 0 + self.sav_cardinality = 0 + self.sorted_sav_values = None + self.mean_nonsense = None + self.median_nonsense = None + self.std_nonsense = None + self.std_sav = None + self.number_of_covered_positions = None + self.normalizer = None + + #Parse the table + + lines = csv_formatted_data.split('\n') + for line in lines: + if line == '': + continue + if line[0] == '#': + continue + words = line.split(',') + + #In case their is a header row, parse the hgvs_pro, hgvs_nt, and effect column numbers + if words[0] == 'accession': + exp_score_pos = None + for pos, word in enumerate(words): + if word == 'hgvs_pro': + hgvs_pro_pos = pos + if word == 
'score': + score_pos = pos + if word == 'hgvs_nt': + hgvs_nt_pos = pos + if word == 'exp.score': + exp_score_pos = pos + if exp_score_pos is not None: + score_pos = exp_score_pos + else: + if words[score_pos] == 'NA': + continue + try: + score = float(words[score_pos]) + except: + #Debugging help, can be removed, when better sanity checks are implemented + print(score_pos) + print(words[score_pos]) + print(lines[10:]) + sys.exit() + + #Retrieve the column values and write them to their corresponding class attribute datastructures + hgvs_pro = words[hgvs_pro_pos] + hgvs_nt = words[hgvs_nt_pos] + + if hgvs_pro != 'NA': + self.score_dict[hgvs_pro] = score + elif hgvs_nt != 'NA': + self.nt_score_dict[hgvs_nt] = score + else: + continue + + #print(f'Parsed {len(self.score_dict)} number of scores for {self.urn}') + + #Get some basic information of the data + + + self.pro_baseline = [] + self.ala_baseline = [] + + for hgvs_pro in self.score_dict: + score = self.score_dict[hgvs_pro] + self.cardinality += 1 + + #Dependant on the variant type, we need to do different things later + #For now, store separately synonymous variants and single amino acid variations + #There is perhaps the need to implement similar datastructures for indels and non-coding variations. 
+ variant_type = get_variant_type(hgvs_pro) + + if self.min_score is None: + self.min_score = score + elif score < self.min_score: + self.min_score = score + + if self.max_score is None: + self.max_score = score + elif score > self.max_score: + self.max_score = score + + if variant_type == 'synonymous': + self.synonymous_scores[hgvs_pro] = score + elif variant_type == 'nonsense': + self.nonsense_scores[hgvs_pro] = score + elif variant_type == 'sav': + if self.min_sav is None: + self.min_sav = score + elif score < self.min_sav: + self.min_sav = score + + if self.max_sav is None: + self.max_sav = score + elif score > self.max_sav: + self.max_sav = score + + self.sav_scores[hgvs_pro] = score + self.sav_cardinality += 1 + if hgvs_pro[-3:] == 'Pro' and hgvs_pro[2:5] != 'Pro': + self.pro_baseline.append(score) + + if hgvs_pro[-3:] == 'Ala' and hgvs_pro[2:5] != 'Ala': + self.ala_baseline.append(score) + + #print(f'{len(self.sav_scores)} number of sav scores for {self.urn}') + + if len(self.pro_baseline) > 0: + self.pro_baseline = sum(self.pro_baseline)/len(self.pro_baseline) + else: + self.pro_baseline = None + + if len(self.ala_baseline) > 0: + self.ala_baseline = sum(self.ala_baseline)/len(self.ala_baseline) + else: + self.ala_baseline = None + + if self.cardinality > 0: + self.mean_score = sum(self.score_dict.values()) / self.cardinality + + if self.sav_cardinality > 0: + ssl = list(self.sav_scores.values()) + self.mean_sav = sum(ssl) / self.sav_cardinality + self.std_sav = np.std(ssl) + + if len(self.nonsense_scores) > 0: + nsl = list(self.nonsense_scores.values()) + self.median_nonsense = median(nsl) + self.std_nonsense = np.std(nsl) + + + def get_sorted_sav_values(self): + + """ + Sorts the SAV effect values and stores them, so if called again, the sorting don't has to be recalculated again. + + Returns + ------- + + self.sorted_sav_values + Pointer to the sorted sav value datastructure. 
+ """ + + if self.sorted_sav_values is not None: + return self.sorted_sav_values + else: + self.sorted_sav_values = sorted(self.sav_scores.values()) + return self.sorted_sav_values + + + def get_normalizer(self): + if self.normalizer is not None: + return self.normalizer + sorted_scores = self.get_sorted_sav_values() + bound = max([1,round(self.sav_cardinality/100)]) + self.normalizer = sorted_scores[self.sav_cardinality - bound] - sorted_scores[bound] + return self.normalizer + + def set_std_sav(self): + sorted_scores = self.get_sorted_sav_values() + bound = max([1,round(self.sav_cardinality/100)]) + + self.std_sav = np.std(list(self.sav_scores.values())[bound:-bound]) + return self.std_sav + + def get_number_of_covered_positions(self): + if self.number_of_covered_positions is not None: + return self.number_of_covered_positions + covered_positions = set() + for hgvs_pro in self.sav_scores: + + l, r, pos = disect_hgvs_single_pos(hgvs_pro[2:]) + covered_positions.add(pos) + + self.number_of_covered_positions = len(covered_positions) + return self.number_of_covered_positions + + def sav_binning(self): + + """ + Creates a bin histogram of all sav scores. 
+ + Returns + ------- + + bins + The calculated histogram + """ + + sorted_scores = self.get_sorted_sav_values() + n_of_bins = min([max([len(sorted_scores) // 100, 10]), len(sorted_scores)]) + bin_size = (self.max_sav - self.min_sav)/n_of_bins + + bins = [] + for i in range(n_of_bins): + bins.append([]) + current_bin_number = 0 + for score in sorted_scores: + if score > (self.min_sav + bin_size*(current_bin_number+1)): + current_bin_number += 1 + try: + bins[current_bin_number].append(score) + except: + bins[current_bin_number-1].append(score) + + return bins + + + def scale_domainome(self): + self.scaled_sav_scores = {} + for hgvs_pro in self.sav_scores: + reported_score = self.sav_scores[hgvs_pro] + scaled_score = reported_score + 1.0 + self.scaled_sav_scores[hgvs_pro] = scaled_score + + + def check_scaled_scores(self): + count_very_strong = 0 + count_strong = 0 + count_moderate = 0 + count_wt_like = 0 + count_increasing = 0 + + for sav in self.scaled_sav_scores: + scaled_score = self.scaled_sav_scores[sav] + match scaled_score: + case x if x < 0: + count_very_strong += 1 + case x if x < 0.25: + count_strong += 1 + case x if x < 0.8: + count_moderate += 1 + case x if x < 1.2: + count_wt_like += 1 + case _: + count_increasing += 1 + + count_all_strong = count_very_strong + count_strong + moderate_and_above = count_moderate + count_wt_like + count_increasing + all_with_effect = count_all_strong + count_moderate + if count_all_strong > moderate_and_above: + return False, f'Too many strong effects: {count_all_strong=} versus too few moderate effects {moderate_and_above=}' + if count_all_strong < count_moderate*0.26: + return False, f'Too few strong effects: {count_all_strong=} versus too many moderate effects {count_moderate=}' + if count_increasing > (all_with_effect*0.5): + return False, f'Too many increasing effects {count_increasing=} versus too few with effects {all_with_effect=}' + return True, None + + + def scale_sav_data(self, verbosity = 0, 
                       distribution_filtering = False):

        """
        Scales all sav scores. Scaling is based on a typical neutral effect score and a typical strong effect value.
        For more details of the scaling process, look into the score_scale_function in utils.py.


        Parameters
        ---------

        verbosity
            Verbosity level of the function

        distribution_filtering
            When True, the scaled distribution is sanity-checked with check_scaled_scores() and the scaling is rejected if it looks implausible.

        Returns
        -------

        None
            if the typical neutral effect and the strong effect value are equal, the scaling would be non-sensical
        """

        # NOTE(review): the None-check further below implies `median` is a
        # project helper that returns None for empty input (statistics.median
        # would raise) — confirm against utils.py.
        median_synonymous = median(self.synonymous_scores.values())
        sorted_scores = self.get_sorted_sav_values()
        # ~1% of the scores (at least one) define the strong-effect percentile.
        perc_size: int = max([1,round(self.sav_cardinality/100)])

        self.scaled_sav_scores = {}


        # Histogram the scores to locate the main peak(s) of the distribution.
        bins = self.sav_binning()
        largest_bin = None
        highest_peak = None
        second_peak = None
        max_bin_size = 0
        highest_peak_size = 0
        second_peak_size = 0
        for bin_number, score_bin in enumerate(bins):
            if len(score_bin) > max_bin_size:
                max_bin_size = len(score_bin)
                largest_bin = bin_number

            #Is this a peak? (a bin strictly larger than its immediate neighbours)
            peak = False
            if len(bins) == 1:
                peak = True
            elif bin_number == 0:
                if len(bins[1]) < len(score_bin):
                    peak = True
            elif bin_number == (len(bins) -1):
                if len(bins[bin_number - 1]) < len(score_bin):
                    peak = True
            else:
                if len(bins[bin_number - 1]) < len(score_bin) and len(bins[bin_number + 1]) < len(score_bin):
                    peak = True

            if peak:
                # Track the two tallest peaks for the bi-modality test below.
                if len(score_bin) > highest_peak_size:
                    highest_peak_size = len(score_bin)
                    highest_peak = bin_number

                elif len(score_bin) > second_peak_size:
                    second_peak_size = len(score_bin)
                    second_peak = bin_number

        bi_modal = False
        if second_peak is not None:
            left_peak = min([highest_peak, second_peak])
            right_peak = max([highest_peak, second_peak])

            if second_peak_size < (0.4 * highest_peak_size): #The two peaks need to be of similar size
                bi_modal = False
            elif (right_peak - left_peak) > (len(bins) * 0.3): #The two peaks need to be close together
                bi_modal = False
            else: #There need to be a steep valley between the peaks
                for bin_between_peaks in bins[(left_peak+1):right_peak]:
                    if (2*len(bin_between_peaks)) < second_peak_size:
                        bi_modal = True

        if not bi_modal:
            median_neutral_bin = median(bins[largest_bin])
            binned_median_synonymous = median_neutral_bin

        else: #Currently we assume for a bimodal distribution that the neutral mutations got removed from the experiment (see urn:mavedb:00000071-a)
            median_highest_peak_bin = median(bins[highest_peak])
            median_second_peak_bin = median(bins[second_peak])
            binned_median_synonymous = 0.5 * (median_highest_peak_bin + median_second_peak_bin)

        if median_synonymous is None:
            # No synonymous variants reported: fall back to the binning estimate.
            calculated_by_binning = True
            median_synonymous = binned_median_synonymous
            if bi_modal:
                print(f'Bi-modal distribution for {self.urn}')
        else:
            calculated_by_binning = False
            bi_modal = None

        if self.mean_sav > median_synonymous: #larger score, stronger effect ...
            if self.pro_baseline is not None and self.ala_baseline is not None:
                if self.pro_baseline < self.ala_baseline:
                    # Proline substitutions are expected to be more damaging than
                    # alanine; if not, trust the baselines over the mean/median test.
                    print(f'Direction does not add up: {self.urn} {self.pro_baseline=} {self.ala_baseline=}')
                    bottom_perc_median = median(sorted_scores[:perc_size])
                    top_perc_median = median(sorted_scores[-perc_size:])
                else:
                    bottom_perc_median = median(sorted_scores[-perc_size:])
                    top_perc_median = median(sorted_scores[:perc_size])
            else:
                bottom_perc_median = median(sorted_scores[-perc_size:]) #...means, we need to take the right percentile of the sorted scores for a selection of scores with strong non-neutral effect
                top_perc_median = median(sorted_scores[:perc_size])
        else: #smaller score, stronger effect ...
            if self.pro_baseline is not None and self.ala_baseline is not None:
                if self.pro_baseline > self.ala_baseline:
                    print(f'Direction does not add up 2: {self.urn} {self.pro_baseline=} {self.ala_baseline=}')
                    bottom_perc_median = median(sorted_scores[-perc_size:])
                    top_perc_median = median(sorted_scores[:perc_size])
                else:
                    bottom_perc_median = median(sorted_scores[:perc_size])
                    top_perc_median = median(sorted_scores[-perc_size:])
            else:
                bottom_perc_median = median(sorted_scores[:perc_size]) #... means, we need to the left percentile
                top_perc_median = median(sorted_scores[-perc_size:])

        if verbosity >= 2:
            print(f'Scaling SAV scores for {self.urn}')
            print(f'Min/Max score: {self.min_sav} / {self.max_sav}')
            print(f'Mean score: {self.mean_sav}')
            print(f'WT-like score: {median_synonymous} (got calculated by binning: {calculated_by_binning}, binned median synonymous: {binned_median_synonymous}, bi_modal was {bi_modal})')
            print(f'Strong effect score: {bottom_perc_median}, Median nonsense: {self.median_nonsense}')
            print(f'Strong increasing effect score: {top_perc_median}')
            print(f'Pro baseline: {self.pro_baseline}')
            print(f'Ala baseline: {self.ala_baseline}')
            print(f'STD nonsense score: {self.std_nonsense}')
            print(f'STD missense score: {self.std_sav}')

        if (bottom_perc_median - median_synonymous) == 0:
            # Degenerate distribution: neutral and strong anchors coincide,
            # scaling would divide by zero / be non-sensical.
            print(f'Bottom median equals to median synonymous: {self.urn}')
            return False

        for hgvs_pro in self.sav_scores:
            reported_score = self.sav_scores[hgvs_pro]
            scaled_score = score_scale_function(reported_score, median_synonymous, bottom_perc_median, top_perc_median)
            self.scaled_sav_scores[hgvs_pro] = scaled_score

        scaled_wt_score = score_scale_function(median_synonymous, median_synonymous, bottom_perc_median, top_perc_median)

        if distribution_filtering:
            is_alright, reason = self.check_scaled_scores()

            if not is_alright:
                print(f'Scaling failed for {self.urn} due to {reason=}')
                self.scaled_sav_scores = []
                return False

        min_value = min(self.scaled_sav_scores.values())
        max_value = max(self.scaled_sav_scores.values())
        scaled_median = median(self.scaled_sav_scores.values())

        # Disabled second-pass rescaling kept for reference.
        """
        if scaled_median < 0.4:
            shift = 0.4 / scaled_median
        else:
            shift = None

        if shift is not None:
            print(f'Needed to perform a second scaling for {self.urn} with shift: {shift} (scaled median: {scaled_median})')
            for hgvs_pro in list(self.scaled_sav_scores.keys()):
                scaled_score = self.scaled_sav_scores[hgvs_pro]
                scaled_score = second_scale(scaled_score, min_value, max_value, shift) """

        if verbosity >= 2:

            #second_scaled_wt_score = second_scale(scaled_wt_score, min_value, max_value, shift)
            scaled_strong_score = score_scale_function(bottom_perc_median, median_synonymous, bottom_perc_median, top_perc_median)
            #second_scaled_strong_score = second_scale(scaled_strong_score, min_value, max_value, shift)
            print(f'Scaled WT-like score: {scaled_wt_score}')
            print(f'Scaled strong effect score: {scaled_strong_score}')
            print(f'Scaled median effect score: {scaled_median}')
            #print(f'Second scaled WT-like score: {second_scaled_wt_score}')
            #print(f'Second scaled strong effect score: {second_scaled_strong_score}')
        return True

    def plot_sav_score_distribution(self, outfile, specific_scores = None):
        """
        Plots the histogram of the sav scores. Was majorly used for debugging purposes.
        Could be modified in the future to generate some additional output.

        Parameters
        ----------

        outfile
            Path to file, where the figure file should be written.

        specific_scores
            An optional score dictionary similar to the self.sav_scores object. Can be used for plotting other distributions, like the scaled ones. 
""" - json_dict, files = prepare_for_encoding( - attr.asdict( - self, - filter=attrs_filter, - retain_collection_types=True, - value_serializer=attrs_serializer, - ) - ) - return json_dict, files + + plt.rcParams.update({'figure.figsize':(7,5), 'figure.dpi':100}) + + if specific_scores is None: + scores = self.sav_scores + else: + scores = specific_scores + + data = np.array(list(scores.values())) + + plt.hist(data, bins=50) + plt.gca().set(title=f'Score distribution for {self.urn}', ylabel='Frequency', xlabel = 'Effect score'); + plt.savefig(outfile) + plt.clf() + plt.cla() + plt.close() + + + def write_nonsense_tsv(self, outfile): + lines = [] + for hgvs in self.nonsense_scores: + score = self.nonsense_scores[hgvs] + l, r, pos = disect_hgvs_single_pos(hgvs[2:]) + lines.append(f'{hgvs}\t{pos}\t{score}\n') + + f = open(outfile, 'w') + f.write(''.join(lines)) + f.close() + + + def calculate_nonsense_position_correlation(self): + pos_score_tuples = [] + for hgvs in self.nonsense_scores: + score = self.nonsense_scores[hgvs] + l, r, pos = disect_hgvs_single_pos(hgvs[2:]) + pos_score_tuples.append((pos,score)) + pos_score_tuples = sorted(pos_score_tuples, key = lambda x: x[0]) + positions, scores = zip(*pos_score_tuples) + pearson,pearson_p = stats.pearsonr(positions, scores) + return pearson, pearson_p + + + + diff --git a/mavetools/models/sequence_retrieval.py b/mavetools/models/sequence_retrieval.py new file mode 100644 index 0000000..da676ee --- /dev/null +++ b/mavetools/models/sequence_retrieval.py @@ -0,0 +1,206 @@ +import socket +import time +import sys +import traceback +import urllib.error +import urllib.parse +import urllib.request +import requests +import json +from Bio import Entrez + +from mavetools.models.utils import parseFasta + +def is_connected(): + + """ + Tests if there is an active internet connection. 
+ """ + + try: + # connect to the host -- tells us if the host is actually + # reachable + socket.create_connection(("1.1.1.1", 53)) + return True + except OSError: + pass + return False + + +def connection_sleep_cycle(): + + """ + A waiting routine for short internet outages. + """ + + while not is_connected(): + print('No connection, sleeping a bit and then try again') + time.sleep(30) + + +def retrieve_transcript_sequences(transcript_ids, recursed = False): + rest_url = 'https://rest.ensembl.org/sequence/id' + + headers = { "Content-Type" : "application/json", "Accept" : "application/json"} + transcript_seq_dict = {} + chunksize = 45 + transcript_ids = list(transcript_ids) + chunk = transcript_ids[:chunksize] + i = 0 + try_number = 0 + empty_seqs = {} + while len(chunk) > 0: + + data = json.dumps({'ids':chunk}) + + try: + r = requests.post(rest_url, headers = headers, data = data, params = {'type' : 'protein'}) + except: + try_number += 1 + if try_number == 4: + [e, f, g] = sys.exc_info() + g = traceback.format_exc() + print(f'POST request failed: {e}\n{f}\n{g}') + break + time.sleep(try_number*2) + continue + + if not r.ok: + try: + r.raise_for_status() + except requests.exceptions.HTTPError as errh: + print(f'Transcript Sequence Retrieval failed: {data}\nError: {errh.response.status_code}\n{errh.response.text}') + if recursed: + return {}, {transcript_ids[0]:None} + for transcript_id in chunk: + seq_sub_dict, sub_empty_seqs = retrieve_transcript_sequences([transcript_id], recursed=True) + transcript_seq_dict.update(seq_sub_dict) + empty_seqs.update(sub_empty_seqs) + i += 1 + chunk = transcript_ids[(chunksize*i):(chunksize*(i+1))] + continue + #r.raise_for_status() + #return None + + decoded = r.json() + try_number = 0 + + for entry in decoded: + transcript_id = entry['query'] + nt_seq = entry['seq'] + transcript_seq_dict[transcript_id] = nt_seq + + i += 1 + chunk = transcript_ids[(chunksize*i):(chunksize*(i+1))] + + return transcript_seq_dict, empty_seqs + 
+def getUniprotSequence(uniprot_ac, tries=0): + + """ + Fetches sequence data from Uniprot database. + + Parameters + ---------- + + uniprot_ac + Uniprot accesion identifier of the protein sequence to fetch. + + tries + A count parameter for the trial and error loop for short time internet disconnections. + + Returns + ------- + + wildtype_sequence + Amino acid sequence of the querried protein. + """ + + if uniprot_ac is None: + print(f'Uniprot Ac is None') + return None + + if not uniprot_ac[0:3] == 'UPI': + url = 'https://www.uniprot.org/uniprot/%s.fasta' % uniprot_ac + else: + url = 'https://www.uniprot.org/uniparc/%s.fasta' % uniprot_ac + connection_sleep_cycle() + try: + request = urllib.request.Request(url) + response = urllib.request.urlopen(request, timeout=(tries + 1) * 10) + page = response.read(9000000).decode('utf-8') + except: + if tries < 3: + return getSequence(uniprot_ac, tries=tries + 1) + else: + return None + + lines = page.split("\n") + + wildtype_sequences = [] + + for line in lines: + if line == '': + continue + if line[0] == '>': + continue + wildtype_sequences.append(line) + + wildtype_sequence = ("".join(wildtype_sequences)).replace(" ", "").replace("\n", "") + + if wildtype_sequence == '': + return None + + return wildtype_sequence + +def get_refseq_sequences(refseqs, seq_type='protein'): + + """ + Fetches sequence data from NCBI refseq database. + Uses the biopython library. 
+ + Parameters + ---------- + + refseqs + comma-separated string of refseq identifiers + + seq_type + type of sequence to be retrieved, either 'nucleotide' or 'protein' + + Returns + ------- + + seq_map + dictionary of {refseq_identifier:sequence} + """ + + Entrez.email = '' + + ret_type = 'fasta_cds_aa' + if seq_type == 'protein' or seq_type == 'nucleotide': + ret_type = 'fasta' + + try: + net_handle = Entrez.efetch( + db=seq_type, id=refseqs, rettype=ret_type, retmode="text" + ) + except: + [e, f, g] = sys.exc_info() + g = traceback.format_exc() + print(f'Sequence retrieval failed for {refseqs=}\n{e}\n{f}\n{g}') + return {} + page = net_handle.read() + net_handle.close() + right_split = '_prot' + left_split = '|' + if seq_type == 'protein': + right_split = ' ' + left_split = None + elif seq_type == 'nucleotide': + left_split = None + right_split = '.' + + seq_map = parseFasta(page=page, left_split=left_split, right_split=right_split) + + return seq_map diff --git a/mavetools/models/target.py b/mavetools/models/target.py index 88cd19f..fae9b98 100644 --- a/mavetools/models/target.py +++ b/mavetools/models/target.py @@ -15,32 +15,41 @@ class ReferenceSequence: @attr.s -class SequenceOffset(ExternalIdentifier): +class SequenceIdentifier(): """ - Instantiates SequenceOffset and declares attributes + Instantiates SequenceIdentifier and declares attributes """ + identifier: ExternalIdentifier = attr.ib(kw_only=True) offset: int = attr.ib(kw_only=True) @attr.s -class ReferenceGenome: +class Taxonomy: """ - Instantiates SequenceOffset and declares attributes + Instantiates Taxonomy and declares attributes """ - short_name: str = attr.ib(kw_only=True) - organism_name: str = attr.ib(kw_only=True, default=None) - assembly_identifier: ExternalIdentifier = attr.ib(kw_only=True, default=None) - + taxId: str = attr.ib(kw_only=True) + organismName: str = attr.ib(kw_only=True, default=None) + commonName: str = attr.ib(kw_only=True) + rank: str = attr.ib(kw_only=True) + 
hasDescribedSpeciesName: bool = attr.ib(kw_only=True) + articleReference: str = attr.ib(kw_only=True) + genomeId: str = attr.ib(kw_only=True) + id: int = attr.ib(kw_only=True) + url: str = attr.ib(kw_only=True) @attr.s -class ReferenceMap: +class TargetSequence: """ - Instantiates ReferenceMap object and declares genome attribute + Instantiates targetSequence object and declares attributes """ - genome: ReferenceGenome = attr.ib(kw_only=True) + sequenceType: str = attr.ib(kw_only=True) + sequence: str = attr.ib(kw_only=True) + label: str = attr.ib(kw_only=True) + taxonomy: Taxonomy = attr.ib(kw_only=True) @attr.s @@ -50,14 +59,12 @@ class Target: """ name: str = attr.ib(kw_only=True) + category: str = attr.ib(kw_only=True) + ExternalIdentifiers: List[SequenceIdentifier] = attr.ib(kw_only=True) reference_sequence: ReferenceSequence = attr.ib(kw_only=True) - uniprot: SequenceOffset = attr.ib(kw_only=True) - ensembl: SequenceOffset = attr.ib(kw_only=True) - refseq: SequenceOffset = attr.ib(kw_only=True) - - reference_maps: List[ReferenceMap] = attr.ib(kw_only=True) - scoreset: str = attr.ib(kw_only=True) - type: str = attr.ib(kw_only=True) + id: int = attr.ib(kw_only=True) + targetSequence: TargetSequence = attr.ib(kw_only=True) + targetAccession: str = attr.ib(kw_only=True) @attr.s diff --git a/mavetools/models/utils.py b/mavetools/models/utils.py index 59aee7b..32ece8c 100644 --- a/mavetools/models/utils.py +++ b/mavetools/models/utils.py @@ -1,10 +1,349 @@ import attr import collections import os +import math from typing import BinaryIO from .licence import Licence +from structman.lib.globalAlignment import ( + init_bp_aligner_class, + call_biopython_alignment, +) +ONE_TO_THREE = { + 'C': 'CYS', + 'D': 'ASP', + 'S': 'SER', + 'V': 'VAL', + 'Q': 'GLN', + 'K': 'LYS', + 'P': 'PRO', + 'T': 'THR', + 'F': 'PHE', + 'A': 'ALA', + 'H': 'HIS', + 'G': 'GLY', + 'I': 'ILE', + 'L': 'LEU', + 'R': 'ARG', + 'W': 'TRP', + 'N': 'ASN', + 'Y': 'TYR', + 'M': 'MET', + 'E': 'GLU', + 'X': 
'UNK' +} + +ONE_TO_THREE_LC = { + 'C': 'Cys', + 'D': 'Asp', + 'S': 'Ser', + 'V': 'Val', + 'Q': 'Gln', + 'K': 'Lys', + 'P': 'Pro', + 'T': 'Thr', + 'F': 'Phe', + 'A': 'Ala', + 'H': 'His', + 'G': 'Gly', + 'I': 'Ile', + 'L': 'Leu', + 'R': 'Arg', + 'W': 'Trp', + 'N': 'Asn', + 'Y': 'Tyr', + 'M': 'Met', + 'E': 'Glu', + 'X': 'Unk' +} + +THREE_TO_ONE = { + "00C": "C", "01W": "X", "02K": "A", "03Y": "C", "07O": "C", + "08P": "C", "0A0": "D", "0A1": "Y", "0A2": "K", "0A8": "C", + "0AA": "V", "0AB": "V", "0AC": "G", "0AD": "G", "0AF": "W", + "0AG": "L", "0AH": "S", "0AK": "D", "0AM": "A", "0AP": "C", + "0AU": "U", "0AV": "A", "0AZ": "P", "0BN": "F", "0C ": "C", + "0CS": "A", "0DC": "C", "0DG": "G", "0DT": "T", "0FL": "A", + "0G ": "G", "0NC": "A", "0SP": "A", "0U ": "U", "0YG": "YG", + "10C": "C", "125": "U", "126": "U", "127": "U", "128": "N", + "12A": "A", "143": "C", "175": "ASG", "193": "X", "1AP": "A", + "1MA": "A", "1MG": "G", "1PA": "F", "1PI": "A", "1PR": "N", + "1SC": "C", "1TQ": "W", "1TY": "Y", "1X6": "S", "200": "F", + "23F": "F", "23S": "X", "26B": "T", "2AD": "X", "2AG": "A", + "2AO": "X", "2AR": "A", "2AS": "X", "2AT": "T", "2AU": "U", + "2BD": "I", "2BT": "T", "2BU": "A", "2CO": "C", "2DA": "A", + "2DF": "N", "2DM": "N", "2DO": "X", "2DT": "T", "2EG": "G", + "2FE": "N", "2FI": "N", "2FM": "M", "2GT": "T", "2HF": "H", + "2LU": "L", "2MA": "A", "2MG": "G", "2ML": "L", "2MR": "R", + "2MT": "P", "2MU": "U", "2NT": "T", "2OM": "U", "2OT": "T", + "2PI": "X", "2PR": "G", "2SA": "N", "2SI": "X", "2ST": "T", + "2TL": "T", "2TY": "Y", "2VA": "V", "2XA": "C", "32S": "X", + "32T": "X", "3AH": "H", "3AR": "X", "3CF": "F", "3DA": "A", + "3DR": "N", "3GA": "A", "3MD": "D", "3ME": "U", "3NF": "Y", + "3QN": "K", "3TY": "X", "3XH": "G", "4AC": "N", "4BF": "Y", + "4CF": "F", "4CY": "M", "4DP": "W", "4F3": "GYG", "4FB": "P", + "4FW": "W", "4HT": "W", "4IN": "W", "4MF": "N", "4MM": "X", + "4OC": "C", "4PC": "C", "4PD": "C", "4PE": "C", "4PH": "F", + "4SC": "C", "4SU": "U", "4TA": 
"N", "4U7": "A", "56A": "H", + "5AA": "A", "5AB": "A", "5AT": "T", "5BU": "U", "5CG": "G", + "5CM": "C", "5CS": "C", "5FA": "A", "5FC": "C", "5FU": "U", + "5HP": "E", "5HT": "T", "5HU": "U", "5IC": "C", "5IT": "T", + "5IU": "U", "5MC": "C", "5MD": "N", "5MU": "U", "5NC": "C", + "5PC": "C", "5PY": "T", "5SE": "U", "5ZA": "TWG", "64T": "T", + "6CL": "K", "6CT": "T", "6CW": "W", "6HA": "A", "6HC": "C", + "6HG": "G", "6HN": "K", "6HT": "T", "6IA": "A", "6MA": "A", + "6MC": "A", "6MI": "N", "6MT": "A", "6MZ": "N", "6OG": "G", + "70U": "U", "7DA": "A", "7GU": "G", "7JA": "I", "7MG": "G", + "8AN": "A", "8FG": "G", "8MG": "G", "8OG": "G", "9NE": "E", + "9NF": "F", "9NR": "R", "9NV": "V", "A ": "A", "A1P": "N", + "A23": "A", "A2L": "A", "A2M": "A", "A34": "A", "A35": "A", + "A38": "A", "A39": "A", "A3A": "A", "A3P": "A", "A40": "A", + "A43": "A", "A44": "A", "A47": "A", "A5L": "A", "A5M": "C", + "A5N": "N", "A5O": "A", "A66": "X", "AA3": "A", "AA4": "A", + "AAR": "R", "AB7": "X", "ABA": "A", "ABR": "A", "ABS": "A", + "ABT": "N", "ACB": "D", "ACL": "R", "AD2": "A", "ADD": "X", + "ADX": "N", "AEA": "X", "AEI": "D", "AET": "A", "AFA": "N", + "AFF": "N", "AFG": "G", "AGM": "R", "AGT": "C", "AHB": "N", + "AHH": "X", "AHO": "A", "AHP": "A", "AHS": "X", "AHT": "X", + "AIB": "A", "AKL": "D", "AKZ": "D", "ALA": "A", "ALC": "A", + "ALM": "A", "ALN": "A", "ALO": "T", "ALQ": "X", "ALS": "A", + "ALT": "A", "ALV": "A", "ALY": "K", "AN8": "A", "AP7": "A", + "APE": "X", "APH": "A", "API": "K", "APK": "K", "APM": "X", + "APP": "X", "AR2": "R", "AR4": "E", "AR7": "R", "ARG": "R", + "ARM": "R", "ARO": "R", "ARV": "X", "AS ": "A", "AS2": "D", + "AS9": "X", "ASA": "D", "ASB": "D", "ASI": "D", "ASK": "D", + "ASL": "D", "ASM": "X", "ASN": "N", "ASP": "D", "ASQ": "D", + "ASU": "N", "ASX": "B", "ATD": "T", "ATL": "T", "ATM": "T", + "AVC": "A", "AVN": "X", "AYA": "A", "AYG": "AYG", "AZK": "K", + "AZS": "S", "AZY": "Y", "B1F": "F", "B1P": "N", "B2A": "A", + "B2F": "F", "B2I": "I", "B2V": "V", "B3A": 
"A", "B3D": "D", + "B3E": "E", "B3K": "K", "B3L": "X", "B3M": "X", "B3Q": "X", + "B3S": "S", "B3T": "X", "B3U": "H", "B3X": "N", "B3Y": "Y", + "BB6": "C", "BB7": "C", "BB8": "F", "BB9": "C", "BBC": "C", + "BCS": "C", "BE2": "X", "BFD": "D", "BG1": "S", "BGM": "G", + "BH2": "D", "BHD": "D", "BIF": "F", "BIL": "X", "BIU": "I", + "BJH": "X", "BLE": "L", "BLY": "K", "BMP": "N", "BMT": "T", + "BNN": "F", "BNO": "X", "BOE": "T", "BOR": "R", "BPE": "C", + "BRU": "U", "BSE": "S", "BT5": "N", "BTA": "L", "BTC": "C", + "BTR": "W", "BUC": "C", "BUG": "V", "BVP": "U", "BZG": "N", + "C ": "C", "C12": "TYG", "C1X": "K", "C25": "C", "C2L": "C", + "C2S": "C", "C31": "C", "C32": "C", "C34": "C", "C36": "C", + "C37": "C", "C38": "C", "C3Y": "C", "C42": "C", "C43": "C", + "C45": "C", "C46": "C", "C49": "C", "C4R": "C", "C4S": "C", + "C5C": "C", "C66": "X", "C6C": "C", "C99": "TFG", "CAF": "C", + "CAL": "X", "CAR": "C", "CAS": "C", "CAV": "X", "CAY": "C", + "CB2": "C", "CBR": "C", "CBV": "C", "CCC": "C", "CCL": "K", + "CCS": "C", "CCY": "CYG", "CDE": "X", "CDV": "X", "CDW": "C", + "CEA": "C", "CFL": "C", "CFY": "FCYG", "CG1": "G", "CGA": "E", + "CGU": "E", "CH ": "C", "CH6": "MYG", "CH7": "KYG", "CHF": "X", + "CHG": "X", "CHP": "G", "CHS": "X", "CIR": "R", "CJO": "GYG", + "CLE": "L", "CLG": "K", "CLH": "K", "CLV": "AFG", "CM0": "N", + "CME": "C", "CMH": "C", "CML": "C", "CMR": "C", "CMT": "C", + "CNU": "U", "CP1": "C", "CPC": "X", "CPI": "X", "CQR": "GYG", + "CR0": "TLG", "CR2": "GYG", "CR5": "G", "CR7": "KYG", "CR8": "HYG", + "CRF": "TWG", "CRG": "THG", "CRK": "MYG", "CRO": "GYG", "CRQ": "QYG", + "CRU": "EYG", "CRW": "ASG", "CRX": "ASG", "CS0": "C", "CS1": "C", + "CS3": "C", "CS4": "C", "CS8": "N", "CSA": "C", "CSB": "C", + "CSD": "C", "CSE": "C", "CSF": "C", "CSH": "SHG", "CSI": "G", + "CSJ": "C", "CSL": "C", "CSO": "C", "CSP": "C", "CSR": "C", + "CSS": "C", "CSU": "C", "CSW": "C", "CSX": "C", "CSY": "SYG", + "CSZ": "C", "CTE": "W", "CTG": "T", "CTH": "T", "CUC": "X", + "CWR": "S", 
"CXM": "M", "CY0": "C", "CY1": "C", "CY3": "C", + "CY4": "C", "CYA": "C", "CYD": "C", "CYF": "C", "CYG": "C", + "CYJ": "X", "CYM": "C", "CYQ": "C", "CYR": "C", "CYS": "C", + "CZ2": "C", "CZO": "GYG", "CZZ": "C", "D11": "T", "D1P": "N", + "D3 ": "N", "D33": "N", "D3P": "G", "D3T": "T", "D4M": "T", + "D4P": "X", "DA ": "A", "DA2": "X", "DAB": "A", "DAH": "F", + "DAL": "A", "DAR": "R", "DAS": "D", "DBB": "T", "DBM": "N", + "DBS": "S", "DBU": "T", "DBY": "Y", "DBZ": "A", "DC ": "C", + "DC2": "C", "DCG": "G", "DCI": "X", "DCL": "X", "DCT": "C", + "DCY": "C", "DDE": "H", "DDG": "G", "DDN": "U", "DDX": "N", + "DFC": "C", "DFG": "G", "DFI": "X", "DFO": "X", "DFT": "N", + "DG ": "G", "DGH": "G", "DGI": "G", "DGL": "E", "DGN": "Q", + "DHA": "S", "DHI": "H", "DHL": "X", "DHN": "V", "DHP": "X", + "DHU": "U", "DHV": "V", "DI ": "I", "DIL": "I", "DIR": "R", + "DIV": "V", "DLE": "L", "DLS": "K", "DLY": "K", "DM0": "K", + "DMH": "N", "DMK": "D", "DMT": "X", "DN ": "N", "DNE": "L", + "DNG": "L", "DNL": "K", "DNM": "L", "DNP": "A", "DNR": "C", + "DNS": "K", "DOA": "X", "DOC": "C", "DOH": "D", "DON": "L", + "DPB": "T", "DPH": "F", "DPL": "P", "DPP": "A", "DPQ": "Y", + "DPR": "P", "DPY": "N", "DRM": "U", "DRP": "N", "DRT": "T", + "DRZ": "N", "DSE": "S", "DSG": "N", "DSN": "S", "DSP": "D", + "DT ": "T", "DTH": "T", "DTR": "W", "DTY": "Y", "DU ": "U", + "DVA": "V", "DXD": "N", "DXN": "N", "DYG": "DYG", "DYS": "C", + "DZM": "A", "E ": "A", "E1X": "A", "ECC": "Q", "EDA": "A", + "EFC": "C", "EHP": "F", "EIT": "T", "ENP": "N", "ESB": "Y", + "ESC": "M", "EXB": "X", "EXY": "L", "EY5": "N", "EYS": "X", + "F2F": "F", "FA2": "A", "FA5": "N", "FAG": "N", "FAI": "N", + "FB5": "A", "FB6": "A", "FCL": "F", "FFD": "N", "FGA": "E", + "FGL": "G", "FGP": "S", "FHL": "X", "FHO": "K", "FHU": "U", + "FLA": "A", "FLE": "L", "FLT": "Y", "FME": "M", "FMG": "G", + "FMU": "N", "FOE": "C", "FOX": "G", "FP9": "P", "FPA": "F", + "FRD": "X", "FT6": "W", "FTR": "W", "FTY": "Y", "FVA": "V", + "FZN": "K", "G ": "G", 
"G25": "G", "G2L": "G", "G2S": "G", + "G31": "G", "G32": "G", "G33": "G", "G36": "G", "G38": "G", + "G42": "G", "G46": "G", "G47": "G", "G48": "G", "G49": "G", + "G4P": "N", "G7M": "G", "GAO": "G", "GAU": "E", "GCK": "C", + "GCM": "X", "GDP": "G", "GDR": "G", "GFL": "G", "GGL": "E", + "GH3": "G", "GHG": "Q", "GHP": "G", "GL3": "G", "GLH": "Q", + "GLJ": "E", "GLK": "E", "GLM": "X", "GLN": "Q", "GLQ": "E", + "GLU": "E", "GLX": "Z", "GLY": "G", "GLZ": "G", "GMA": "E", + "GMS": "G", "GMU": "U", "GN7": "G", "GND": "X", "GNE": "N", + "GOM": "G", "GPL": "K", "GS ": "G", "GSC": "G", "GSR": "G", + "GSS": "G", "GSU": "E", "GT9": "C", "GTP": "G", "GVL": "X", + "GYC": "CYG", "GYS": "SYG", "H2U": "U", "H5M": "P", "HAC": "A", + "HAR": "R", "HBN": "H", "HCS": "X", "HDP": "U", "HEU": "U", + "HFA": "X", "HGL": "X", "HHI": "H", "HHK": "AK", "HIA": "H", + "HIC": "H", "HIP": "H", "HIQ": "H", "HIS": "H", "HL2": "L", + "HLU": "L", "HMR": "R", "HOL": "N", "HPC": "F", "HPE": "F", + "HPH": "F", "HPQ": "F", "HQA": "A", "HRG": "R", "HRP": "W", + "HS8": "H", "HS9": "H", "HSE": "S", "HSL": "S", "HSO": "H", + "HTI": "C", "HTN": "N", "HTR": "W", "HV5": "A", "HVA": "V", + "HY3": "P", "HYP": "P", "HZP": "P", "I ": "I", "I2M": "I", + "I58": "K", "I5C": "C", "IAM": "A", "IAR": "R", "IAS": "D", + "IC ": "C", "IEL": "K", "IEY": "HYG", "IG ": "G", "IGL": "G", + "IGU": "G", "IIC": "SHG", "IIL": "I", "ILE": "I", "ILG": "E", + "ILX": "I", "IMC": "C", "IML": "I", "IOY": "F", "IPG": "G", + "IPN": "N", "IRN": "N", "IT1": "K", "IU ": "U", "IYR": "Y", + "IYT": "T", "IZO": "M", "JJJ": "C", "JJK": "C", "JJL": "C", + "JW5": "N", "K1R": "C", "KAG": "G", "KCX": "K", "KGC": "K", + "KNB": "A", "KOR": "M", "KPI": "K", "KST": "K", "KYQ": "K", + "L2A": "X", "LA2": "K", "LAA": "D", "LAL": "A", "LBY": "K", + "LC ": "C", "LCA": "A", "LCC": "N", "LCG": "G", "LCH": "N", + "LCK": "K", "LCX": "K", "LDH": "K", "LED": "L", "LEF": "L", + "LEH": "L", "LEI": "V", "LEM": "L", "LEN": "L", "LET": "X", + "LEU": "L", "LEX": "L", "LG ": 
"G", "LGP": "G", "LHC": "X", + "LHU": "U", "LKC": "N", "LLP": "K", "LLY": "K", "LME": "E", + "LMF": "K", "LMQ": "Q", "LMS": "N", "LP6": "K", "LPD": "P", + "LPG": "G", "LPL": "X", "LPS": "S", "LSO": "X", "LTA": "X", + "LTR": "W", "LVG": "G", "LVN": "V", "LYF": "K", "LYK": "K", + "LYM": "K", "LYN": "K", "LYR": "K", "LYS": "K", "LYX": "K", + "LYZ": "K", "M0H": "C", "M1G": "G", "M2G": "G", "M2L": "K", + "M2S": "M", "M30": "G", "M3L": "K", "M5M": "C", "MA ": "A", + "MA6": "A", "MA7": "A", "MAA": "A", "MAD": "A", "MAI": "R", + "MBQ": "Y", "MBZ": "N", "MC1": "S", "MCG": "X", "MCL": "K", + "MCS": "C", "MCY": "C", "MD3": "C", "MD6": "G", "MDH": "X", + "MDO": "ASG", "MDR": "N", "MEA": "F", "MED": "M", "MEG": "E", + "MEN": "N", "MEP": "U", "MEQ": "Q", "MET": "M", "MEU": "G", + "MF3": "X", "MFC": "GYG", "MG1": "G", "MGG": "R", "MGN": "Q", + "MGQ": "A", "MGV": "G", "MGY": "G", "MHL": "L", "MHO": "M", + "MHS": "H", "MIA": "A", "MIS": "S", "MK8": "L", "ML3": "K", + "MLE": "L", "MLL": "L", "MLY": "K", "MLZ": "K", "MME": "M", + "MMO": "R", "MMT": "T", "MND": "N", "MNL": "L", "MNU": "U", + "MNV": "V", "MOD": "X", "MP8": "P", "MPH": "X", "MPJ": "X", + "MPQ": "G", "MRG": "G", "MSA": "G", "MSE": "M", "MSL": "M", + "MSO": "M", "MSP": "X", "MT2": "M", "MTR": "T", "MTU": "A", + "MTY": "Y", "MVA": "V", "N ": "N", "N10": "S", "N2C": "X", + "N5I": "N", "N5M": "C", "N6G": "G", "N7P": "P", "NA8": "A", + "NAL": "A", "NAM": "A", "NB8": "N", "NBQ": "Y", "NC1": "S", + "NCB": "A", "NCX": "N", "NCY": "X", "NDF": "F", "NDN": "U", + "NEM": "H", "NEP": "H", "NF2": "N", "NFA": "F", "NHL": "E", + "NIT": "X", "NIY": "Y", "NLE": "L", "NLN": "L", "NLO": "L", + "NLP": "L", "NLQ": "Q", "NMC": "G", "NMM": "R", "NMS": "T", + "NMT": "T", "NNH": "R", "NP3": "N", "NPH": "C", "NPI": "A", + "NRP": "LYG", "NRQ": "MYG", "NSK": "X", "NTY": "Y", "NVA": "V", + "NYC": "TWG", "NYG": "NYG", "NYM": "N", "NYS": "C", "NZH": "H", + "O12": "X", "O2C": "N", "O2G": "G", "OAD": "N", "OAS": "S", + "OBF": "X", "OBS": "X", "OCS": "C", 
"OCY": "C", "ODP": "N", + "OHI": "H", "OHS": "D", "OIC": "X", "OIP": "I", "OLE": "X", + "OLT": "T", "OLZ": "S", "OMC": "C", "OMG": "G", "OMT": "M", + "OMU": "U", "ONE": "U", "ONH": "A", "ONL": "X", "OPR": "R", + "ORN": "A", "ORQ": "R", "OSE": "S", "OTB": "X", "OTH": "T", + "OTY": "Y", "OXX": "D", "P ": "G", "P1L": "C", "P1P": "N", + "P2T": "T", "P2U": "U", "P2Y": "P", "P5P": "A", "PAQ": "Y", + "PAS": "D", "PAT": "W", "PAU": "A", "PBB": "C", "PBF": "F", + "PBT": "N", "PCA": "E", "PCC": "P", "PCE": "X", "PCS": "F", + "PDL": "X", "PDU": "U", "PEC": "C", "PF5": "F", "PFF": "F", + "PFX": "X", "PG1": "S", "PG7": "G", "PG9": "G", "PGL": "X", + "PGN": "G", "PGP": "G", "PGY": "G", "PHA": "F", "PHD": "D", + "PHE": "F", "PHI": "F", "PHL": "F", "PHM": "F", "PIA": "AYG", + "PIV": "X", "PLE": "L", "PM3": "F", "PMT": "C", "POM": "P", + "PPN": "F", "PPU": "A", "PPW": "G", "PQ1": "N", "PR3": "C", + "PR5": "A", "PR9": "P", "PRN": "A", "PRO": "P", "PRS": "P", + "PSA": "F", "PSH": "H", "PST": "T", "PSU": "U", "PSW": "C", + "PTA": "X", "PTH": "Y", "PTM": "Y", "PTR": "Y", "PU ": "A", + "PUY": "N", "PVH": "H", "PVL": "X", "PYA": "A", "PYO": "U", + "PYX": "C", "PYY": "N", "QLG": "QLG", "QMM": "Q", "QPA": "C", + "QPH": "F", "QUO": "G", "R ": "A", "R1A": "C", "R4K": "W", + "RC7": "HYG", "RE0": "W", "RE3": "W", "RIA": "A", "RMP": "A", + "RON": "X", "RT ": "T", "RTP": "N", "S1H": "S", "S2C": "C", + "S2D": "A", "S2M": "T", "S2P": "A", "S4A": "A", "S4C": "C", + "S4G": "G", "S4U": "U", "S6G": "G", "SAC": "S", "SAH": "C", + "SAR": "G", "SBL": "S", "SC ": "C", "SCH": "C", "SCS": "C", + "SCY": "C", "SD2": "X", "SDG": "G", "SDP": "S", "SEB": "S", + "SEC": "A", "SEG": "A", "SEL": "S", "SEM": "S", "SEN": "S", + "SEP": "S", "SER": "S", "SET": "S", "SGB": "S", "SHC": "C", + "SHP": "G", "SHR": "K", "SIB": "C", "SIC": "DC", "SLA": "P", + "SLR": "P", "SLZ": "K", "SMC": "C", "SME": "M", "SMF": "F", + "SMP": "A", "SMT": "T", "SNC": "C", "SNN": "N", "SOC": "C", + "SOS": "N", "SOY": "S", "SPT": "T", "SRA": 
"A", "SSU": "U", + "STY": "Y", "SUB": "X", "SUI": "DG", "SUN": "S", "SUR": "U", + "SVA": "S", "SVV": "S", "SVW": "S", "SVX": "S", "SVY": "S", + "SVZ": "X", "SWG": "SWG", "SYS": "C", "T ": "T", "T11": "F", + "T23": "T", "T2S": "T", "T2T": "N", "T31": "U", "T32": "T", + "T36": "T", "T37": "T", "T38": "T", "T39": "T", "T3P": "T", + "T41": "T", "T48": "T", "T49": "T", "T4S": "T", "T5O": "U", + "T5S": "T", "T66": "X", "T6A": "A", "TA3": "T", "TA4": "X", + "TAF": "T", "TAL": "N", "TAV": "D", "TBG": "V", "TBM": "T", + "TC1": "C", "TCP": "T", "TCQ": "Y", "TCR": "W", "TCY": "A", + "TDD": "L", "TDY": "T", "TFE": "T", "TFO": "A", "TFQ": "F", + "TFT": "T", "TGP": "G", "TH6": "T", "THC": "T", "THO": "X", + "THR": "T", "THX": "N", "THZ": "R", "TIH": "A", "TLB": "N", + "TLC": "T", "TLN": "U", "TMB": "T", "TMD": "T", "TNB": "C", + "TNR": "S", "TOX": "W", "TP1": "T", "TPC": "C", "TPG": "G", + "TPH": "X", "TPL": "W", "TPO": "T", "TPQ": "Y", "TQI": "W", + "TQQ": "W", "TRF": "W", "TRG": "K", "TRN": "W", "TRO": "W", + "TRP": "W", "TRQ": "W", "TRW": "W", "TRX": "W", "TS ": "N", + "TST": "X", "TT ": "N", "TTD": "T", "TTI": "U", "TTM": "T", + "TTQ": "W", "TTS": "Y", "TY1": "Y", "TY2": "Y", "TY3": "Y", + "TY5": "Y", "TYB": "Y", "TYI": "Y", "TYJ": "Y", "TYN": "Y", + "TYO": "Y", "TYQ": "Y", "TYR": "Y", "TYS": "Y", "TYT": "Y", + "TYU": "N", "TYW": "Y", "TYX": "X", "TYY": "Y", "TZB": "X", + "TZO": "X", "U ": "U", "U25": "U", "U2L": "U", "U2N": "U", + "U2P": "U", "U31": "U", "U33": "U", "U34": "U", "U36": "U", + "U37": "U", "U8U": "U", "UAR": "U", "UCL": "U", "UD5": "U", + "UDP": "N", "UFP": "N", "UFR": "U", "UFT": "U", "UMA": "A", + "UMP": "U", "UMS": "U", "UN1": "X", "UN2": "X", "UNK": "X", + "UR3": "U", "URD": "U", "US1": "U", "US2": "U", "US3": "T", + "US5": "U", "USM": "U", "VAD": "V", "VAF": "V", "VAL": "V", + "VB1": "K", "VDL": "X", "VLL": "X", "VLM": "X", "VMS": "X", + "VOL": "X", "WCR": "GYG", "X ": "G", "X2W": "E", "X4A": "N", + "X9Q": "AFG", "XAD": "A", "XAE": "N", "XAL": "A", "XAR": 
"N", + "XCL": "C", "XCN": "C", "XCP": "X", "XCR": "C", "XCS": "N", + "XCT": "C", "XCY": "C", "XGA": "N", "XGL": "G", "XGR": "G", + "XGU": "G", "XPR": "P", "XSN": "N", "XTH": "T", "XTL": "T", + "XTR": "T", "XTS": "G", "XTY": "N", "XUA": "A", "XUG": "G", + "XX1": "K", "XXY": "THG", "XYG": "DYG", "Y ": "A", "YCM": "C", + "YG ": "G", "YOF": "Y", "YRR": "N", "YYG": "G", "Z ": "C", + "Z01": "A", "ZAD": "A", "ZAL": "A", "ZBC": "C", "ZBU": "U", + "ZCL": "F", "ZCY": "C", "ZDU": "U", "ZFB": "X", "ZGU": "G", + "ZHP": "N", "ZTH": "T", "ZU0": "T", "ZZJ": "A", "AME": "M", + "KFP": "L", "7O5": "A", "AC5": "L", "ZXW": "X", "LGY": "L", + "PRJ": "P", "G5G": "L", "CNT": "T", "69P": "X", "STA": "X", + "DNW": "A", "8MC": "X", "BBS": "X", "L3O": "L", "823": "N", + "ADS": "X", "DPN": "F", "2PP": "X", "PRR": "X", "0UO": "W", + "HOA": "X", "PLF": "X", "OCE": "X", "E6F": "X", "T8L": "T", + "JMH": "X", "34H": "X", "6NA": "X", "A5R": "X", "0MG": "X", + "VLT": "X", "8DT": "X", "81S": "X", "0EA": "Y", "0E5": "T", + "FBE": "X", "6J9": "X", "RNG": "X", "3EG": "X", "0QE": "X", + "IL0": "X", "SUJ": "X", "B8N": "X", "DKA": "X", "9MN": "X", + "ACE": "X", "BAL": "X", "7XC": "F", "OSL": "X", "BTK": "L", + "IVA": "X", "3WU": "X", "3WT": "X", "3FB": "X", "HT7": "W", + "V9M": "X" +} + +def aac_to_hgvs_pro(aac): + aa1 = aac[0] + aa2 = aac[-1] + pos = aac[1:-1] + + aa1_three_letter = ONE_TO_THREE_LC[aa1] + aa2_three_letter = ONE_TO_THREE_LC[aa2] + + hgvs_pro = f'p.{aa1_three_letter}{pos}{aa2_three_letter}' + + return hgvs_pro def attrs_filter(attr, value): """ @@ -43,6 +382,657 @@ def attrs_serializer(inst, field, value): return value +def get_variant_type(hgvs_pro): + + """ + Routine to deduce the type of variation from hgvs_pro identifier. + + Parameters + ---------- + + hgvs_pro + hgvs_pro identifier + + Returns + ------- + + variant_type + As string. If variant types are used more in a future implementation, then one should consider to create corresponding classes and datastructures. 
+ """ + + if hgvs_pro == '_wt' or hgvs_pro[-1] == '=' or hgvs_pro == 'p.(=)' or hgvs_pro == '_sy': + return 'synonymous' + + if hgvs_pro[-1] == '?' or hgvs_pro[-1] == '*': + return 'unknown' + + if hgvs_pro[-3:] == 'Ter' or hgvs_pro[:5] == 'p.Ter': + return 'nonsense' + + if hgvs_pro[-1] == ']': + return 'multi' + + if hgvs_pro.count('fs*') > 0 or hgvs_pro[-2:] == 'fs': + return 'frameshift' + + if hgvs_pro.count('delins') > 0: + return 'indel' + + if hgvs_pro.count('del') > 0: + return 'deletion' + + if hgvs_pro.count('ins') > 0: + return 'insertion' + + if hgvs_pro.count('dup') > 0: + return 'duplication' + + return 'sav' + + +def median(l): + + """ + Calculates the median of a list. + + Parameters + ---------- + + l + A list. + + med + Median value of the given list. + """ + + n = len(l) + l = sorted(l) + if n == 1: + return l[0] + if n == 0: + return None + if n % 2 == 0: + med = (l[(n // 2) - 1] + l[n // 2]) / 2.0 + else: + med = l[(n - 1) // 2] + return med + + +def score_scale_function(reported_score, median_synonymous, bottom_perc_median, top_perc_median): + """ + Scaling function for variant effect scores. + Based on 'Analysis of Large-Scale Mutagenesis Data To Assess the Impact of Single Amino Acid Substitutions' doi: 10.1534/genetics.117.300064 + Slightly improved to work on a broader spectrum of score distributions. + + Parameters + ---------- + + reported_score + Effect value reported in the scoreset table. + + median_synonymous + Typical neutral effect value of the corresponding experiment. + + bottom_perc_median + Typical strong effect value of the corresponding experiment. + + Returns + ------- + + scaled_score + Scaled effect score. + """ + + #Determine the direction of the effect scores. 
+ if bottom_perc_median > median_synonymous: + sign = 1 + else: + sign = -1 + + #Addapted scaling function + if (sign * (reported_score - median_synonymous)) > 0: + stretch = -1*abs(bottom_perc_median - median_synonymous) + else: + stretch = -1*abs(top_perc_median - median_synonymous) + + if stretch == 0: + scaled_score = 1 + else: + scaled_score = sign * ((reported_score - median_synonymous) / stretch) + 1 + + return scaled_score + +def second_scale(score, min_value, max_value, shift): + if shift is None: + return score + if shift < 0.: + return score + zero_one_score = (score - min_value) + + zero_one_scaled_score = zero_one_score ** (1 / shift) + + scaled_score = zero_one_scaled_score + min_value + + return scaled_score + + +def parseFasta(path=None, new_file=None, lines=None, page=None, left_split=None, right_split=' '): + + """ + Parses a fasta file. + + Parameters + ---------- + + path + Path to the fasta file. Can be omitted, if a page or lines is given. + + new_file + Path to where the fasta file should be written. Can be used, when not giving an actual input path. + + lines + A list of line strings representing the fasta file. Can be omitted, if path or page is given. + + page + A string representing the fasta file. Can be omitted, if path or lines is given. + + left_split + A character, which can be used to split the Identifier string given after the '>' symbol in the fasta file format. + + right_split + A character, which can be used to split the Identifier string given after the '>' symbol in the fasta file format. + + Returns + ------- + + seq_map + A dictionary of sequence identifiers mapping to sequences. 
+ """ + + if lines is None and page is None: + f = open(path, 'r') + lines = f.read().split('\n') + f.close() + elif lines is None: + lines = page.split('\n') + + seq_map = {} + n = 0 + + if new_file is not None: + new_lines = [] + + for line in lines: + if len(line) == 0: + continue + #print(line) + if line[0] == '>': + entry_id = line[1:] + if left_split is not None: + entry_id = entry_id.split(left_split, 1)[1] + if right_split is not None: + entry_id = entry_id.split(right_split, 1)[0] + seq_map[entry_id] = '' + n += 1 + if new_file is not None: + new_lines.append(line) + else: + seq_map[entry_id] += line + if new_file is not None: + new_lines.append(line) + + if new_file is not None: + f = open(new_file, 'w') + f.write('\n'.join(new_lines)) + f.close() + + return seq_map + + +def disect_hgvs(hgvs): + + """ + Routine to split a hgvs into its parts. + + Parameters + ---------- + + hgvs + hgvs identifier. + + Returns + ------- + + precursor + hgvs precursor symbol (for example "p." for protein hgvs_pro) + + parts + list with disected hgvs mutation identifiers + """ + + precursor = hgvs[:2] + hgvs_body = hgvs[2:] + + if len(hgvs_body) < 4: + muts = [('invalid', hgvs_body)] + + elif hgvs_body[0] == '[': + muts = [] + + for mut in hgvs_body[1:-1].split(';'): + muts.append(disect_hgvs_mut(mut)) + + else: + muts = [disect_hgvs_mut(hgvs_body)] + + return precursor, muts + + +def disect_hgvs_pro_sav(hgvs_pro): + + """ + Routine to split a SAV hgvs_pro into its parts. + + Parameters + ---------- + + hgvs_pro + hgvs_pro identifier. + + Returns + ------- + + precursor + hgvs precursor symbol (for example "p." for protein hgvs_pro) + + left_part + Wildtype amino acid of the SAV. + + right_part + Mutant amino acid of the SAV. + + pos + Position of the SAV in the corresponding protein sequence. 
+ """ + + precursor = hgvs_pro[:2] + hgvs_body = hgvs_pro[2:] + + left_part, right_part, pos = disect_hgvs_single_pos(hgvs_body) + return precursor, left_part, right_part, pos + + +def disect_hgvs_single_pos(hgvs_single_pos): + + if hgvs_single_pos.count(':') > 0: + hgvs_single_pos = hgvs_single_pos.rsplit(':',1)[1] + if hgvs_single_pos[:2] == 'p.': + hgvs_single_pos = hgvs_single_pos[2:] + + left_part = hgvs_single_pos[:3] + if hgvs_single_pos[-1] == '=': + right_part = hgvs_single_pos[-1] + pos = int(hgvs_single_pos[3:-1]) + else: + right_part = hgvs_single_pos[-3:] + pos = int(hgvs_single_pos[3:-3]) + return left_part, right_part, pos + +def disect_hgvs_mut(hgvs_mut): + + if len(hgvs_mut) < 4 or hgvs_mut[-1] == '?': + return 'invalid', hgvs_mut + + if hgvs_mut.count('_') > 0: + + if hgvs_mut.count('delins') > 0: + separator = 'delins' + elif hgvs_mut.count('ins') > 0: + separator = 'ins' + elif hgvs_mut.count('del') > 0: + separator = 'del' + else: + print(hgvs_mut) + sys.exit(1) + + left_tuple, right_body = hgvs_mut.split('_') + left_part = left_tuple[:3] + left_pos = int(left_tuple[3:]) + + if right_body.count('delins') > 0: + separator = 'delins' + elif right_body.count('del') > 0: + separator = 'del' + elif right_body.count('ins') > 0: + separator = 'ins' + else: + print(hgvs_mut) + sys.exit(1) + right_tuple,tail = right_body.split(separator) + right_part = right_tuple[:3] + right_pos = int(right_tuple[3:]) + + return 'indel', left_part, left_pos, right_part, right_pos, f'{separator}{tail}' + + elif hgvs_mut.count('del') > 0 or hgvs_mut.count('ins') > 0: + """TODO + if hgvs_mut.count('delins') > 0: + separator = 'delins' + elif hgvs_mut.count('ins') > 0: + separator = 'ins' + elif hgvs_mut.count('del') > 0: + separator = 'del' + """ + return 'invalid', hgvs_mut + + elif hgvs_mut.count('fs') > 0: + if hgvs_mut.count('fs*') > 0: + mut_body, fs_number = hgvs_mut.split('fs*') + left_part, right_part, pos = disect_hgvs_single_pos(mut_body) + return 'frameshift', 
left_part, right_part, pos, f'fs*{fs_number}' + else: + left_part = hgvs_mut[:3] + pos = int(hgvs_mut[3:-2]) + right_part = 'fs' + return 'frameshift', left_part, right_part, pos, '' + + else: + left_part, right_part, pos = disect_hgvs_single_pos(hgvs_mut) + return 'sav', left_part, right_part, pos + + +def apply_offset_to_hgvs_mut_parts(parts, offset): + if parts[0] == 'invalid': + return parts[1] + elif parts[0] == 'sav': + left_part, right_part, pos = parts[1:] + new_pos = pos + offset + return f'{left_part}{new_pos}{right_part}' + elif parts[0] == 'frameshift': + left_part, right_part, pos, tail = parts[1:] + new_pos = pos + offset + return f'{left_part}{new_pos}{right_part}{tail}' + else: + left_part, left_pos, right_part, right_pos, tail = parts[1:] + new_left_pos = left_pos + offset + new_right_pos = right_pos + offset + + return f'{left_part}{new_left_pos}_{right_part}{new_right_pos}{tail}' + + +def apply_offset_to_hgvs_pro(hgvs_pro, offset): + + """ + Routine to add an offset to a hgvs_pro identifier. + + Parameters + ---------- + + hgvs_pro + hgvs_pro identifier. + + offset + An integer for that to position in the identifier has to be shifted. + + Returns + ------- + + new_hgvs_pro + Shifted hgvs_pro identifier. + """ + + if offset == 0: + return hgvs_pro + + precursor, muts = disect_hgvs(hgvs_pro) + if len(muts) == 1: + new_mut = apply_offset_to_hgvs_mut_parts(muts[0], offset) + new_hgvs_pro = f'{precursor}{new_mut}' + else: + new_muts = [] + for mut in muts: + new_muts.append(apply_offset_to_hgvs_mut_parts(mut, offset)) + internal_string = ';'.join(new_muts) + new_hgvs_pro = f'{precursor}[{internal_string}]' + return new_hgvs_pro + + +def offset_loop(seq, offset, hgvs_pros, urn, verbosity = 0): + + """ + Identifies problematic offsets and calculates the rate of mismatched amino acids. + + Parameters + ---------- + + seq + Amino acid sequence of the protein. + + offset + Offset given in the scoreset metadata. 
+ + hgvs_pros + List of hgvs_pro identifiers of the corresponding scoreset. + + urn + MaveDB urn identifier of the corresponding scoreset. + + verbosity + Verbosity level of the function. + + Returns + ------- + + mismatch_found + Boolean, True if at least one mismatched amino acid got detected. + + hit_rate + Rate of correctly matched amino acids. + """ + + mismatch_found = False + correct_count = 0 + incorrect_count = 0 + + for hgvs_pro in hgvs_pros: + + precursor, left_part, right_part, old_pos = disect_hgvs_pro_sav(hgvs_pro) + wt_aa_three_letter = left_part.upper() + + new_pos = old_pos + offset + try: + seq_wt_aa_three_letter = ONE_TO_THREE[seq[new_pos-1]] + except: + incorrect_count += 1 + if incorrect_count < 2 and verbosity >= 1: + print(f'Offset failure: seq_pos too large, {urn}: {old_pos} + {offset} = {new_pos} > {len(seq)}') + mismatch_found = True + continue + + if wt_aa_three_letter != seq_wt_aa_three_letter: + incorrect_count += 1 + if incorrect_count < 2 and verbosity >= 1: + print(f'Offset failure: AA not matched, {urn}: Pos and Offset: {old_pos} + {offset} = {new_pos}; Seq AA: {seq_wt_aa_three_letter}, HGVS AA: {wt_aa_three_letter}') + mismatch_found = True + continue + correct_count += 1 + if (correct_count + incorrect_count) > 0: + hit_rate = correct_count / (correct_count + incorrect_count) + else: + hit_rate = 0 + return mismatch_found, hit_rate + + +def extract_seq(hgvs_pros): + chars = [None] + for hgvs_pro in hgvs_pros: + precursor, left_part, right_part, pos = disect_hgvs_pro_sav(hgvs_pro) + wt_aa_three_letter = left_part.upper() + wt_aa = THREE_TO_ONE[wt_aa_three_letter] + + while len(chars) <= pos: + chars.append('G') + + chars[pos] = wt_aa + + seq = ''.join(chars[1:]) + return seq + + +def get_offset_from_aligned_seq(aligned_extracted_sequence): + offset = 0 + for char in aligned_extracted_sequence: + if char != '-': + break + offset += 1 + + return offset + + + +def repair_seq(seq, offset, hgvs_pros): + pos_dict = {} + for hgvs_pro in 
hgvs_pros: + precursor, left_part, right_part, old_pos = disect_hgvs_pro_sav(hgvs_pro) + new_pos = old_pos + offset + wt_aa_three_letter = left_part.upper() + wt_aa = THREE_TO_ONE[wt_aa_three_letter] + if new_pos not in pos_dict: + pos_dict[new_pos] = wt_aa + elif wt_aa != pos_dict[new_pos]: + return False, f'Mismatching WT AA {wt_aa=} {new_pos=} {pos_dict[new_pos]=}' + repair_seq = '' + for pos, wt_aa in enumerate(seq): + seq_pos = pos+1 + if seq_pos in pos_dict: + repair_seq += pos_dict[seq_pos] + else: + repair_seq += wt_aa + return True, repair_seq + +def check_offset(seq, offset, hgvs_pros, urn, fallback_seq, verbosity = 0): + + """ + Sanity check of the offset value given in the scoreset metadata. + When the sanity check fails, tries to find the correct offset. + + Parameters + ---------- + + seq + Amino acid sequence of the target protein of the corresponding scoreset. + + offset + Offset value given in the scoreset metadata. + + hgvs_pros + List of all hgvs_pro identifiers given in the scoreset. + + urn + MaveDB urn identifier of the scoreset. + + verbosity + Verbosity level of the function. + + Returns + ------- + + best_offset + The best offset found, can be None, if none got at least 90% correctly matched amino acids. 
+ """ + + if verbosity > 0: + print(f'Call of check_offset {urn=}:\n{seq=}\n{fallback_seq=}') + + hit_rate_thresh = 0.5 + hit_rate = None + best_offset = None + + mismatch_found, hit_rate = offset_loop(seq, offset, hgvs_pros, urn, verbosity = verbosity) + + if not mismatch_found: + return offset, False, hit_rate, seq + + if hit_rate > hit_rate_thresh: + hit_rate_thresh = hit_rate + best_offset = offset + + original_offset = offset + + extracted_seq = extract_seq(hgvs_pros) + + if verbosity > 0: + print(f'In check_offset {urn=}:\n{extracted_seq=}') + + (aligned_sequence, aligned_extracted_sequence) = ( + call_biopython_alignment(seq, extracted_seq) + ) + + offset = get_offset_from_aligned_seq(aligned_extracted_sequence) + mismatch_found, hit_rate = offset_loop(seq, offset, hgvs_pros, urn, verbosity = verbosity) + + if not mismatch_found: + return offset, False, hit_rate, seq + + if hit_rate > hit_rate_thresh: + hit_rate_thresh = hit_rate + best_offset = offset + + offset = 0 + mismatch_found, hit_rate = offset_loop(seq, offset, hgvs_pros, urn, verbosity = verbosity) + + if not mismatch_found: + return offset, False, hit_rate, seq + + if hit_rate > hit_rate_thresh: + hit_rate_thresh = hit_rate + best_offset = offset + + offset = original_offset - 1 + mismatch_found, hit_rate = offset_loop(seq, offset, hgvs_pros, urn, verbosity = verbosity) + + if not mismatch_found: + return offset, False, hit_rate, seq + + if hit_rate > hit_rate_thresh: + hit_rate_thresh = hit_rate + best_offset = offset + + offset = original_offset + 1 + mismatch_found, hit_rate = offset_loop(seq, offset, hgvs_pros, urn, verbosity = verbosity) + + if not mismatch_found: + return offset, False, hit_rate, seq + + if hit_rate > hit_rate_thresh: + hit_rate_thresh = hit_rate + best_offset = offset + + offset = -1 + mismatch_found, hit_rate = offset_loop(seq, offset, hgvs_pros, urn, verbosity = verbosity) + + if not mismatch_found: + return offset, False, hit_rate, seq + + if hit_rate > hit_rate_thresh: 
+ hit_rate_thresh = hit_rate + best_offset = offset + + if best_offset is None: + mismatch_found, hit_rate = offset_loop(fallback_seq, 0, hgvs_pros, urn, verbosity = verbosity) + if hit_rate > hit_rate_thresh: + best_offset = 0 + return best_offset, True, hit_rate, fallback_seq + + elif hit_rate_thresh < 1.0: + successful, repaired_seq = repair_seq(seq, best_offset, hgvs_pros) + if not successful: + print(f'{repaired_seq=}') + return None, False, hit_rate_thresh, None + else: + return best_offset, False, hit_rate_thresh, repaired_seq + return best_offset, False, hit_rate_thresh, seq + def prepare_for_encoding(nested_dict): """ Prepares data for encoding by converting the data in the provided nested_dict into diff --git a/setup.py b/setup.py index 5c1536f..349d2b3 100644 --- a/setup.py +++ b/setup.py @@ -12,6 +12,7 @@ "numpy", "pandas", "idutils", + "metapub" ] # fqfa requires backported dataclasses in Python 3.6