26 changes: 26 additions & 0 deletions examples/clone.py
@@ -0,0 +1,26 @@
from mavetools.client.client import Client

# This example shows how to download the entire MaveDB and create a local clone.

# Provide the URL of MaveDB
base_url = 'https://www.mavedb.org/#/api/'

# Generate a new auth_token in your profile and paste it here
auth_token = ""

# If a base URL is provided, the client is instantiated with it;
# otherwise the client falls back to its default, which points to localhost.
client = (
    Client(base_url, auth_token=auth_token)
    if base_url
    else Client(auth_token=auth_token)
)

# Provide a path where the local clone should be stored.
local_instance_path = '../../localMaveDB_Feb_2023'

# Download MaveDB
experiment_dict = client.clone(local_instance_path)
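# Optional sanity check, as a minimal sketch. It rests on an assumption not
# stated above: clone() is taken to return a dict keyed by experiment URN,
# as the search_database() usage in the other examples suggests.
print(f'Cloned {len(experiment_dict)} experiments to {local_instance_path}')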

38 changes: 38 additions & 0 deletions examples/create_ml_dataset.py
@@ -0,0 +1,38 @@
from mavetools.client.client import LocalClient
from mavetools.models.ml_tools import MlDataset

# This example shows how to create a scaled dataset of all SAV effect values contained in MaveDB.

# We use a locally cloned version here; if you haven't cloned it yet, see clone.py.

# Provide the path to the local clone
local_instance_path = '../../localMaveDB'

# Provide the paths the dataset will be written to.
outfile = 'mave_db_scaled_savs.fasta'
statfile = 'mave_db_scaled_savs_statistics.tsv'
seq_file = 'mave_db_scaled_savs_only_sequences.fasta'

# Create a local client object
client = LocalClient(local_instance_path)

# Search the database without any filters to retrieve the whole database
experiment_dict = client.search_database()

# Create the ML dataset object
ml_dataset = MlDataset(experiment_dict)

# Retrieve the score tables
ml_dataset.retrieve_data(client)

# Aggregate scoresets that belong to the same experiment
ml_dataset.aggregate_scoresetdata()

# Write the dataset statistics
ml_dataset.write_dataset_statistics(statfile)

# Scale all SAV effect scores
ml_dataset.scale_all_savs()

# Write the output
ml_dataset.write_scaled_sav_fasta(outfile, sequences_only_file=seq_file)
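# Optional sanity check, as a minimal sketch: count the entries written to
# the statistics file. This assumes (not confirmed above) that the file is a
# plain tab-separated table with a single header row.
with open(statfile) as handle:
    n_rows = sum(1 for _ in handle) - 1
print(f'Wrote statistics for {n_rows} entries to {statfile}')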

47 changes: 47 additions & 0 deletions examples/make_gold_standard.py
@@ -0,0 +1,47 @@
import sys

from mavetools.client.client import LocalClient
from mavetools.models.ml_tools import MlDataset

if len(sys.argv) < 3:
    print('Usage: python make_gold_standard.py [path to local instance of MaveDB (https://zenodo.org/records/11201737)] [path to the ProteinGym substitutions reference file (https://marks.hms.harvard.edu/proteingym/DMS_ProteinGym_substitutions.zip)]')
    sys.exit(1)

# This example shows how to create a scaled dataset of all SAV effect values contained in MaveDB and ProteinGym.

# Provide the path to the local clone (download via https://zenodo.org/records/11201737)
local_instance_path = sys.argv[1]

# Provide the path to the ProteinGym substitutions reference file (download via https://marks.hms.harvard.edu/proteingym/DMS_ProteinGym_substitutions.zip)
path_to_reference_file = sys.argv[2]

# Provide the paths the dataset will be written to.
outfile = 'mave_db_gold_standard.fasta'
seq_file = 'mave_db_gold_standard_only_sequences.fasta'

# Create a local client object
client = LocalClient(local_instance_path)

# Search the database without any filters to retrieve the whole database
experiment_dict = client.search_database()

# Create the ML dataset object
ml_dataset = MlDataset(experiment_dict)

# Retrieve the score tables
ml_dataset.retrieve_data(client, verbosity=1)

# Add the ProteinGym database
ml_dataset.load_protein_gym(path_to_reference_file)

# Aggregate scoresets that belong to the same experiment
filtered_list = ml_dataset.aggregate_scoresetdata(min_prot_size=50, min_len_coverage=0.4, std_filter=0.25, verbosity=1)

# Uncomment to write the filtered entries to a TSV file
#ml_dataset.write_filtered_entries(filtered_list, 'filtered_entries_mave_db_gold_standard.tsv')

# Scale all SAV effect scores
ml_dataset.scale_all_savs(verbosity=1)

# Uncomment to plot the SAV score distribution
#ml_dataset.plot_sav_score_distribution('mave_db_gold_standard_score_distribution.png')

# Write the output
ml_dataset.write_scaled_sav_fasta(outfile, sequences_only_file=seq_file)
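# Optional sanity check, as a minimal sketch: count the records in the
# sequences-only FASTA file (this assumes standard FASTA formatting with
# one '>' header line per record).
with open(seq_file) as handle:
    n_records = sum(1 for line in handle if line.startswith('>'))
print(f'Gold standard contains {n_records} sequences')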
48 changes: 48 additions & 0 deletions examples/scale_dataset.py
@@ -0,0 +1,48 @@
from mavetools.client.client import LocalClient
from mavetools.models.ml_tools import MlDataset

# This example shows how to create a scaled dataset for one particular scoreset.

# We use a locally cloned version here; if you haven't cloned it yet, see clone.py.

# Provide the path to the local clone
local_instance_path = '../../localMaveDB'

# Create the local client object
client = LocalClient(local_instance_path)

# Provide a MaveDB urn identifier
particular_experiment_id = 'urn:mavedb:00000005-a'

# Create the ML dataset object and aggregate all scoresets
experiment_dict = client.get_experiment_dict([particular_experiment_id])

print('=== Experiment object created ===')

ml_dataset = MlDataset(experiment_dict)

print('=== ML object created ===')

ml_dataset.retrieve_data(client, verbosity=1)

print('=== Data retrieval done ===')

ml_dataset.aggregate_scoresetdata(verbosity=1)

print('=== Data aggregation done ===')

scoreset_data = ml_dataset.experiments[particular_experiment_id].experiment_scoresetdata

# Plot the SAV score histogram and write the nonsense-variant scores to a TSV file
scoreset_data.plot_sav_score_distribution(f'score_distribution_{particular_experiment_id}.png')
scoreset_data.write_nonsense_tsv(f'Nonsense_scores_{particular_experiment_id}.tsv')

# Scale all SAV effect scores
ml_dataset.scale_all_savs(verbosity=1)

print('=== Data scaling finished ===')

# Plot the scaled SAV score histogram
scoreset_data.plot_sav_score_distribution(f'scaled_score_distribution_{particular_experiment_id}.png', specific_scores=scoreset_data.scaled_sav_scores)

# Write the scaled dataset to the specialized FASTA file
outfile = f'{particular_experiment_id}_scaled_savs.fasta'
ml_dataset.write_scaled_sav_fasta(outfile)
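# Optional follow-up, as a minimal sketch: report how many scaled SAV scores
# were produced. This assumes scaled_sav_scores (used for plotting above)
# is a collection that supports len().
print(f'{len(scoreset_data.scaled_sav_scores)} scaled SAV scores written to {outfile}')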