26 changes: 26 additions & 0 deletions examples/clone.py
@@ -0,0 +1,26 @@
from mavetools.client.client import Client

# This example shows how to download the entire MaveDB and create a local clone.

# Provide the URL of MaveDB
base_url = 'https://www.mavedb.org/#/api/'

# Generate a new auth_token in your profile and paste it here
auth_token = ""

# If a base URL is provided, the client is instantiated with it;
# otherwise the client falls back to its default, which points to localhost.
client = (
    Client(base_url, auth_token=auth_token)
    if base_url
    else Client(auth_token=auth_token)
)

# Provide a path where the local clone should be stored.
local_instance_path = '../../localMaveDB_Feb_2023'

# Download MaveDB
experiment_dict = client.clone(local_instance_path)
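# Optional sanity check, as a minimal sketch. It rests on an assumption not
# stated above: clone() is taken to return a dict keyed by experiment URN,
# as the search_database() usage in the other examples suggests.
print(f'Cloned {len(experiment_dict)} experiments to {local_instance_path}')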

38 changes: 38 additions & 0 deletions examples/create_ml_dataset.py
@@ -0,0 +1,38 @@
from mavetools.client.client import LocalClient
from mavetools.models.ml_tools import MlDataset

# This example shows how to create a scaled dataset of all SAV effect values contained in MaveDB.

# We use a locally cloned version here; if you haven't cloned it yet, see clone.py.

# Provide the path to the local clone
local_instance_path = '../../localMaveDB'

# Provide the paths the dataset will be written to.
outfile = 'mave_db_scaled_savs.fasta'
statfile = 'mave_db_scaled_savs_statistics.tsv'
seq_file = 'mave_db_scaled_savs_only_sequences.fasta'

# Create a local client object
client = LocalClient(local_instance_path)

# Search the database without any filters to retrieve the whole database
experiment_dict = client.search_database()

# Create the ML dataset object
ml_dataset = MlDataset(experiment_dict)

# Retrieve the score tables
ml_dataset.retrieve_data(client)

# Aggregate scoresets that belong to the same experiment
ml_dataset.aggregate_scoresetdata()

# Write the dataset statistics
ml_dataset.write_dataset_statistics(statfile)

# Scale all SAV effect scores
ml_dataset.scale_all_savs()

# Write the output
ml_dataset.write_scaled_sav_fasta(outfile, sequences_only_file=seq_file)
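# Optional sanity check, as a minimal sketch: count the entries written to
# the statistics file. This assumes (not confirmed above) that the file is a
# plain tab-separated table with a single header row.
with open(statfile) as handle:
    n_rows = sum(1 for _ in handle) - 1
print(f'Wrote statistics for {n_rows} entries to {statfile}')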

47 changes: 47 additions & 0 deletions examples/make_gold_standard.py
@@ -0,0 +1,47 @@
import sys

from mavetools.client.client import LocalClient
from mavetools.models.ml_tools import MlDataset

if len(sys.argv) < 3:
    print('Usage: python make_gold_standard.py [path to local instance of MaveDB (https://zenodo.org/records/11201737)] [path to the ProteinGym substitutions reference file (https://marks.hms.harvard.edu/proteingym/DMS_ProteinGym_substitutions.zip)]')
    sys.exit(1)

# This example shows how to create a scaled dataset of all SAV effect values contained in MaveDB and ProteinGym.

# Provide the path to the local clone (download via https://zenodo.org/records/11201737)
local_instance_path = sys.argv[1]

# Provide the path to the ProteinGym substitutions reference file (download via https://marks.hms.harvard.edu/proteingym/DMS_ProteinGym_substitutions.zip)
path_to_reference_file = sys.argv[2]

# Provide the paths the dataset will be written to.
outfile = 'mave_db_gold_standard.fasta'
seq_file = 'mave_db_gold_standard_only_sequences.fasta'

# Create a local client object
client = LocalClient(local_instance_path)

# Search the database without any filters to retrieve the whole database
experiment_dict = client.search_database()

# Create the ML dataset object
ml_dataset = MlDataset(experiment_dict)

# Retrieve the score tables
ml_dataset.retrieve_data(client, verbosity=1)

# Add the ProteinGym database
ml_dataset.load_protein_gym(path_to_reference_file)

# Aggregate scoresets that belong to the same experiment
filtered_list = ml_dataset.aggregate_scoresetdata(min_prot_size=50, min_len_coverage=0.4, std_filter=0.25, verbosity=1)

# Uncomment to write the filtered entries to a TSV file
#ml_dataset.write_filtered_entries(filtered_list, 'filtered_entries_mave_db_gold_standard.tsv')

# Scale all SAV effect scores
ml_dataset.scale_all_savs(verbosity=1)

# Uncomment to plot the SAV score distribution
#ml_dataset.plot_sav_score_distribution('mave_db_gold_standard_score_distribution.png')

# Write the output
ml_dataset.write_scaled_sav_fasta(outfile, sequences_only_file=seq_file)
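# Optional sanity check, as a minimal sketch: count the records in the
# sequences-only FASTA file (this assumes standard FASTA formatting with
# one '>' header line per record).
with open(seq_file) as handle:
    n_records = sum(1 for line in handle if line.startswith('>'))
print(f'Gold standard contains {n_records} sequences')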
48 changes: 48 additions & 0 deletions examples/scale_dataset.py
@@ -0,0 +1,48 @@
from mavetools.client.client import LocalClient
from mavetools.models.ml_tools import MlDataset

# This example shows how to create a scaled dataset for one particular scoreset.

# We use a locally cloned version here; if you haven't cloned it yet, see clone.py.

# Provide the path to the local clone
local_instance_path = '../../localMaveDB'

# Create the local client object
client = LocalClient(local_instance_path)

# Provide a MaveDB urn identifier
particular_experiment_id = 'urn:mavedb:00000005-a'

# Create the ML dataset object and aggregate all scoresets
experiment_dict = client.get_experiment_dict([particular_experiment_id])

print('=== Experiment object created ===')

ml_dataset = MlDataset(experiment_dict)

print('=== ML object created ===')

ml_dataset.retrieve_data(client, verbosity=1)

print('=== Data retrieval done ===')

ml_dataset.aggregate_scoresetdata(verbosity=1)

print('=== Data aggregation done ===')

scoreset_data = ml_dataset.experiments[particular_experiment_id].experiment_scoresetdata

# Plot the SAV score histogram and write the nonsense-variant scores to a TSV file
scoreset_data.plot_sav_score_distribution(f'score_distribution_{particular_experiment_id}.png')
scoreset_data.write_nonsense_tsv(f'Nonsense_scores_{particular_experiment_id}.tsv')

# Scale all SAV effect scores
ml_dataset.scale_all_savs(verbosity=1)

print('=== Data scaling finished ===')

# Plot the scaled SAV score histogram
scoreset_data.plot_sav_score_distribution(f'scaled_score_distribution_{particular_experiment_id}.png', specific_scores=scoreset_data.scaled_sav_scores)

# Write the scaled dataset to the specialized FASTA file
outfile = f'{particular_experiment_id}_scaled_savs.fasta'
ml_dataset.write_scaled_sav_fasta(outfile)
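# Optional follow-up, as a minimal sketch: report how many scaled SAV scores
# were produced. This assumes scaled_sav_scores (used for plotting above)
# is a collection that supports len().
print(f'{len(scoreset_data.scaled_sav_scores)} scaled SAV scores written to {outfile}')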