Changes from all commits
104 commits
4e98cb2
Added some slurm scripts, small changes to VAE_model for file handling
May 11, 2021
fcb8d1b
defined instance variables in __init__ for clarity; extracted one_hot…
loodvn May 12, 2021
9606f5a
change focus_seq_trimmed to a string instead of list of chars; added …
loodvn May 23, 2021
fb13194
error checking in compute_evol_indices
loodvn Jun 16, 2021
09c8b8b
merged data_utils
loodvn Jun 16, 2021
c29fd56
added joint training, one-hot sequence functions
loodvn Jul 13, 2021
f0984cf
Merge remote-tracking branch 'origin/master' into master
loodvn Jul 13, 2021
ba75f74
moved optimizer.zero_grad() outside of if-else
loodvn Jul 13, 2021
bf3f1c5
had to comment out the alternating joint training for now, to switch …
loodvn Jul 15, 2021
8d54d4b
joint training script improvements:
loodvn Aug 6, 2021
a4c7be9
Joint training: parameterize lm_loss_weight
loodvn Sep 9, 2021
ab1055d
adding EVcouplings versions, some data checks, and need to check that…
loodvn Mar 16, 2022
57b7fb7
temp hehe
loodvn Mar 16, 2022
9687b7c
committing all ideas for parallelising the weights calculation for re…
loodvn Mar 18, 2022
d65d435
checking on O2 now
loodvn Mar 18, 2022
0597e02
adding mapping files
loodvn Mar 18, 2022
636f88f
running as array
loodvn Mar 18, 2022
cc30589
changed logging dir
loodvn Mar 18, 2022
b5af044
removed old training flags
loodvn Mar 18, 2022
87c30d5
some slurm script changes
loodvn Mar 18, 2022
b9ab6b1
updating conda bin and log output dir
loodvn Mar 18, 2022
7bf655f
print equality
loodvn Mar 18, 2022
d8a0951
added directory exists checking
loodvn Mar 18, 2022
7611fd8
running all proteins now
loodvn Mar 18, 2022
d235b9d
74 MSAs
loodvn Mar 18, 2022
c29ddae
oops was still debugging
loodvn Mar 18, 2022
5993736
using new MSAs
loodvn Mar 18, 2022
2b66b67
wrong MSA location
loodvn Mar 18, 2022
ad85d42
explicitly setting number of CPUs to use
loodvn Mar 18, 2022
91e4ac0
also testing only 1 cpu
loodvn Mar 18, 2022
8786baa
trying to add method as a parameter - flags are pretentious
loodvn Mar 18, 2022
92a5d8b
changed reading in calc_weights option
loodvn Mar 18, 2022
91efd61
bugfixes
loodvn Mar 18, 2022
79b7030
bugfixes
loodvn Mar 18, 2022
9a2ff26
another bug
loodvn Mar 18, 2022
a024bb4
using specific python from conda env because other jobs were lagging
loodvn Mar 18, 2022
c020bb8
skipping existing files because we'll get runtimes from logs anyway
loodvn Mar 18, 2022
c50aa89
using multiprocessing + numba now, is roughly as fast as parallel num…
loodvn Mar 21, 2022
0badb28
going to try run all the weights and check
loodvn Mar 21, 2022
cebd2db
typo
loodvn Mar 21, 2022
c915e5a
moved all the tmp EVE vs EVCouplings checks out into calc_weights.py,…
loodvn Mar 21, 2022
289e84a
going big, 40 cpus
loodvn Mar 21, 2022
6429f69
moved all the tmp EVE vs EVCouplings checks out into calc_weights.py,…
loodvn Mar 22, 2022
a3be96c
going to calc all weights with 2h timeout for 8cpus (and later 4cpus)
loodvn Mar 22, 2022
4b4ef45
moved all the weights calc into a different utils file
loodvn Mar 22, 2022
6838046
added flag option to train_VAE.py to fail if weights not found (usefu…
loodvn Mar 22, 2022
93804fe
separated the slurm calc_weights from local calc_weights.sh
loodvn Mar 22, 2022
3a11b0c
modifying some other changes I made early on
loodvn Mar 22, 2022
417865f
Merge branch 'OATML:master' into lood/speedup_weights
loodvn Mar 22, 2022
0fe130c
cleaned up one_hot_3D
loodvn Mar 22, 2022
99945e0
Merge branch 'OATML:master' into master
loodvn Mar 22, 2022
9a5484b
check - editing readme
loodvn Mar 22, 2022
3fc7a2e
Merge branch 'master' of https://github.com/loodvn/EVE
loodvn Mar 22, 2022
a125835
Committing all together, other branch corrupt
loodvn Mar 22, 2022
4312346
temporarily removing data/* because git tree corrupt
loodvn Mar 22, 2022
289c218
removed circular dependency in utils/weights
loodvn Mar 22, 2022
e902012
removing ADRB2, and old masters-project-specific files
loodvn Mar 22, 2022
b8ad7c4
Merge branch 'master' into lood/speedup_weights2
loodvn Mar 23, 2022
eccc14b
changed EVcouplings code to > insteead of >= theta
loodvn Mar 23, 2022
7b1e5d1
Merge remote-tracking branch 'origin/lood/speedup_weights2' into lood…
loodvn Mar 23, 2022
fe0e95b
moved calc_weights to top level instead of nested in gen_alignment;
loodvn Mar 23, 2022
bf45aba
removed all the timing and speed comparison stuff from calc_weights.py
loodvn Mar 23, 2022
95ca192
changed initial num_neighbors to 1 because num_neighbors = 0 didn't m…
loodvn Jul 28, 2022
1316094
merged with Marks-OATML master
loodvn Jul 28, 2022
c658d1c
Merge remote-tracking branch 'marks/master'
loodvn Jul 28, 2022
e0f3754
don't need weights for scoring
loodvn Jul 28, 2022
3efbda2
Merge branch 'lood/speedup_weights'
loodvn Jul 28, 2022
16c8c7b
grabbed some nice files from deepseq_reproduce branch
loodvn Jul 28, 2022
c980752
move constants out,
loodvn Aug 2, 2022
a153a1e
adding disorder mapping file and script
loodvn Aug 2, 2022
b3f15c7
Merge branch 'lood/speedup_weights'
loodvn Aug 2, 2022
b0f3467
got mapping from deepseq_reproduce branch
loodvn Aug 2, 2022
a0c3167
adding disorder mapping file and script
loodvn Aug 2, 2022
671669e
added threshold_focus_cols_frac_gaps to calc_weights too, also rerunn…
loodvn Aug 2, 2022
c8917f9
added overwrite_weights for my specific use case, just to be sure
loodvn Aug 2, 2022
eec4a42
Merge remote-tracking branch 'origin/master'
loodvn Aug 2, 2022
e60ee9d
added weight shape check, using threshold_focus_cols_frac_gaps = 1 si…
loodvn Aug 2, 2022
2c7338b
rerunning with longer time limit
loodvn Aug 3, 2022
15abb33
imported compute_evol_indices from deepseq_reproduce
loodvn Aug 8, 2022
534b0db
kicking off scoring, had to recheckout compute_evol_indices from deep…
loodvn Aug 8, 2022
536b207
turned DMS filename assertion into just a warning for now, need to fi…
loodvn Aug 9, 2022
2cb5cd0
using updated MSA and DMS files (v7?), rerunning training/scoring acc…
loodvn Aug 9, 2022
07d3984
added new disordered MSA using notebook in disorder_human project
loodvn Aug 10, 2022
aeade79
using new DMS and MSA mapping and new suffix
loodvn Aug 11, 2022
f761149
adpred scripts
loodvn Sep 23, 2022
af99627
allowed to pass in a MSA file directly to calc_weights instead of a m…
loodvn Oct 18, 2022
bb902f7
Merge branch 'lood/speedup_weights2'
loodvn Oct 18, 2022
f70712c
reformatted whitespace PEP8
loodvn Oct 18, 2022
6043996
some more minor whitespace formatting
loodvn Oct 18, 2022
875a625
syntax errors: added overwrite_weights to signature and fixed :: synt…
loodvn Oct 18, 2022
098ce58
added overwrite_weights option to calc_weights.py
loodvn Oct 18, 2022
cb9aabd
added overwrite_weights option to calc_weights.py
loodvn Oct 18, 2022
37375c4
Weights calc:
loodvn Sep 13, 2023
c8249f0
Training: Added some checks to input/output files
loodvn Sep 13, 2023
f9c291c
Tweaked progress bar; removed debugging statements
loodvn Sep 13, 2023
3709d7d
Streaming one-hot-encodings is working well
loodvn Sep 14, 2023
e30f784
Using a --experimental_stream_data flag, a bit cleaner
loodvn Sep 19, 2023
74238ea
Skipping synonymous mutants in the filtering, fixed tqdm bug
loodvn Feb 28, 2024
455ffaf
Using protein_name in compute_evol_indices, added some logging
loodvn Feb 28, 2024
c801b52
Removed weights calculation comparison tests, cleaned up dataloader
loodvn Feb 28, 2024
70f63e2
Computing one-hot encodings on the fly for evol_indices using dataloa…
loodvn Mar 6, 2024
e18c56f
Using dataloaders for train and validation, use multi-cpu weights by …
loodvn Mar 15, 2024
3d48173
Added files back from upstream repo to match master before PR
loodvn Mar 16, 2024
d129b81
deleted internal scripts
loodvn Mar 16, 2024
12 changes: 11 additions & 1 deletion .gitignore
@@ -1,4 +1,14 @@
 EVE/__pycache__/
 utils/__pycache__/
 results/VAE_parameters/*
-!results/VAE_parameters/.gitkeep
+!results/VAE_parameters/.gitkeep
+logs/
+.idea/
+.ipynb_checkpoints/
+notebooks/
+results/*parameters?*/
+results/evol_indices/
+slurm/
+slurm_dan/
+# Reinclude examples
+!data/mappings/example_mapping.csv
448 changes: 285 additions & 163 deletions EVE/VAE_model.py

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion README.md
@@ -11,7 +11,7 @@ EVE is a set of protein-specific models providing for any single amino acid muta
 The end to end process to compute EVE scores consists of three consecutive steps:
 1. Train the Bayesian VAE on a re-weighted multiple sequence alignment (MSA) for the protein of interest => train_VAE.py
 2. Compute the evolutionary indices for all single amino acid mutations => compute_evol_indices.py
-3. Train a GMM to cluster variants on the basis of the evol indices then output scores and uncertainties on the class assignments => train_GMM_and_compute_EVE_scores.py
+3. Train a GMM to cluster variants on the basis of the qevol indices then output scores and uncertainties on the class assignments => train_GMM_and_compute_EVE_scores.py
Review comment: revert please

We also provide all EVE scores for all single amino acid mutations for thousands of proteins at the following address: http://evemodel.org/.
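Step 3 of the pipeline above trains a GMM on the evolutionary indices and turns cluster responsibilities into scores. A minimal scikit-learn sketch of that idea, using synthetic indices and two components; this is an illustration of the technique only, not the repo's train_GMM_and_compute_EVE_scores.py, and the cluster means are made-up values:

```python
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
# Synthetic evolutionary indices: one cluster near 0 ("benign-like"),
# one near 6 ("pathogenic-like"); the values are illustrative only
evol_indices = np.concatenate([rng.normal(0.0, 1.0, 500),
                               rng.normal(6.0, 1.0, 500)]).reshape(-1, 1)

gmm = GaussianMixture(n_components=2, random_state=0).fit(evol_indices)
# Take the component with the larger mean as the "pathogenic" one
pathogenic = int(np.argmax(gmm.means_.ravel()))
# EVE-style score: responsibility of the pathogenic component for each variant
scores = gmm.predict_proba(evol_indices)[:, pathogenic]
```

The posterior probability `scores` is what gives both a class assignment and an uncertainty (values near 0.5 are ambiguous), which is why a GMM is used rather than a hard threshold.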

## Example scripts
@@ -47,6 +47,7 @@ The entire codebase is written in python. Package requirements are as follows:
 - tqdm
 - matplotlib
 - seaborn
+- numba
 
 The corresponding environment may be created via conda and the provided protein_env.yml file as follows:
 ```
123 changes: 123 additions & 0 deletions calc_weights.py
@@ -0,0 +1,123 @@
# Basically train_VAE.py but just calculating the weights
import argparse
import os
import time

import numpy as np
import pandas as pd

from utils import data_utils


def create_argparser():
    parser = argparse.ArgumentParser(description='VAE')

    # If we don't have a mapping file, just use a single MSA path
    parser.add_argument("--MSA_filepath", type=str, help="Full path to MSA")

    # If we have a mapping file with one MSA path per line
    parser.add_argument('--MSA_data_folder', type=str, help='Folder where MSAs are stored', required=True)
    parser.add_argument('--MSA_list', type=str, help='List of proteins and corresponding MSA file name', required=True)
    parser.add_argument('--protein_index', type=int, help='Row index of protein in input mapping file', required=True)
    parser.add_argument('--MSA_weights_location', type=str,
                        help='Location where weights for each sequence in the MSA will be stored', required=True)
    parser.add_argument('--theta_reweighting', type=float, help='Parameters for MSA sequence re-weighting')
    parser.add_argument("--num_cpus", type=int, help="Number of CPUs to use", default=1)
    parser.add_argument("--skip_existing", help="Will quit gracefully if weights file already exists", action="store_true", default=False)
    parser.add_argument("--overwrite", help="Will overwrite existing weights file", action="store_true", default=False)
    parser.add_argument("--calc_method", choices=["evcouplings", "eve", "both", "identity"],
                        help="Method to use for calculating weights. Note: evcouplings and eve produce the same results, as we modified the evcouplings numba code to mirror the eve calculation", default="evcouplings")
    parser.add_argument("--threshold_focus_cols_frac_gaps", type=float,
                        help="Maximum fraction of gaps allowed in focus columns - see data_utils.MSA_processing")
    return parser


def main(args):
    print("Arguments:", args)

    weights_file = None

    if args.MSA_filepath is not None:
        assert os.path.isfile(args.MSA_filepath), f"MSA filepath {args.MSA_filepath} doesn't exist"
        msa_location = args.MSA_filepath
        # Derive a protein name from the MSA filename so the weights file can be named below
        protein_name = os.path.splitext(os.path.basename(msa_location))[0]
        mapping_file = None
    else:
        # Use mapping file
        assert os.path.isfile(args.MSA_list), f"MSA file list {args.MSA_list} doesn't seem to exist"
        mapping_file = pd.read_csv(args.MSA_list)
        protein_name = mapping_file['protein_name'][args.protein_index]
        msa_location = args.MSA_data_folder + os.sep + mapping_file['msa_location'][args.protein_index]
        print("Protein name: " + str(protein_name))
        # If weight_file_name is in the mapping file, use that instead
        if "weight_file_name" in mapping_file.columns:
            weights_file = args.MSA_weights_location + os.sep + mapping_file["weight_file_name"][args.protein_index]
            print("Using weights filename from mapping file:", weights_file)

    print("MSA file: " + str(msa_location))

    if args.theta_reweighting is not None:
        theta = args.theta_reweighting
        print(f"Using custom theta value {theta} instead of loading from mapping file.")
    else:
        assert mapping_file is not None, "Please specify --theta_reweighting when passing --MSA_filepath directly"
        try:
            theta = float(mapping_file['theta'][args.protein_index])
        except KeyError as e:
            raise KeyError("Couldn't load theta from mapping file. "
                           "NOT using a default value of theta=0.2; please specify theta manually. "
                           f"Relevant mapping row: {mapping_file.iloc[args.protein_index]}") from e
        assert not np.isnan(theta), "Theta is NaN, please provide a custom theta value"

    print("Theta MSA re-weighting: " + str(theta))

    # Using data_kwargs so that options which aren't set fall back to their default values
    data_kwargs = {}
    if args.threshold_focus_cols_frac_gaps is not None:
        print("Using custom threshold_focus_cols_frac_gaps:", args.threshold_focus_cols_frac_gaps)
        data_kwargs['threshold_focus_cols_frac_gaps'] = args.threshold_focus_cols_frac_gaps

    if not os.path.isdir(args.MSA_weights_location):
        # Could create the directory automatically (os.makedirs with exist_ok=True, to avoid
        # race conditions between concurrent jobs), but for now fail loudly on a mistyped path.
        raise NotADirectoryError(f"{args.MSA_weights_location} is not a directory. "
                                 f"Could create it automatically, but at the moment raising an error.")
    else:
        print(f"MSA weights directory: {args.MSA_weights_location}")

    if weights_file is None:
        print("Weights filename not found - writing to new file")
        weights_file = args.MSA_weights_location + os.sep + protein_name + '_theta_' + str(theta) + '.npy'

    print(f"Writing to {weights_file}")
    # First check that the weights file doesn't exist
    if os.path.isfile(weights_file) and not args.overwrite:
        if args.skip_existing:
            print("Weights file already exists, skipping, since --skip_existing was specified")
            exit(0)
        else:
            raise FileExistsError(f"File {weights_file} already exists. "
                                  f"Please delete it if you want to re-calculate it. "
                                  f"If you want to skip existing files, use --skip_existing.")

    # MSA_processing has the side effect of saving a weights file
    _ = data_utils.MSA_processing(
        MSA_location=msa_location,
        theta=theta,
        use_weights=True,
        weights_location=weights_file,
        num_cpus=args.num_cpus,
        weights_calc_method=args.calc_method,
        overwrite_weights=args.overwrite,
        skip_one_hot_encodings=True,
        **data_kwargs,
    )


if __name__ == '__main__':
    start = time.perf_counter()
    parser = create_argparser()
    args = parser.parse_args()
    main(args)
    end = time.perf_counter()
    print(f"calc_weights.py took {end-start:.2f} seconds in total.")
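calc_weights.py delegates the actual computation to data_utils.MSA_processing, which implements the usual inverse-neighbor-count MSA reweighting: each sequence gets weight 1 over the number of sequences (itself included) whose fractional identity to it exceeds 1 - theta (the strict `>` is the behavior the "changed EVcouplings code to > instead of >= theta" commit above aligns on). A NumPy sketch of the core idea, with a hypothetical helper name, not the repo's numba/multiprocessing implementation:

```python
import numpy as np

def msa_sequence_weights(one_hot, theta=0.2):
    """Inverse neighbor-count reweighting: each sequence's weight is
    1 / (number of sequences, itself included, whose fractional identity
    to it is greater than 1 - theta)."""
    flat = one_hot.reshape(one_hot.shape[0], -1).astype(float)
    # Number of non-gap positions per sequence (gap columns are all-zero)
    lengths = flat.sum(axis=1)
    # Pairwise fraction of identical positions
    identity = (flat @ flat.T) / lengths[:, None]
    neighbors = (identity > 1.0 - theta).sum(axis=1)
    return 1.0 / neighbors

# Tiny example: two identical sequences split one unit of weight; a distinct one keeps 1.0
seqs = np.array([[0, 1, 2, 3], [0, 1, 2, 3], [3, 2, 1, 0]])
one_hot = np.eye(4)[seqs]  # shape (3, 4, 4): (num_seqs, length, alphabet)
weights = msa_sequence_weights(one_hot, theta=0.2)
# weights -> [0.5, 0.5, 1.0]; effective sample size weights.sum() -> 2.0
```

The sum of the weights is the effective number of sequences, which is why clusters of near-duplicate sequences stop dominating VAE training.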
64 changes: 34 additions & 30 deletions compute_evol_indices.py
@@ -1,20 +1,20 @@
-import os,sys
 import json
 import argparse
+import os
 
 import pandas as pd
 import torch
 
 from EVE import VAE_model
 from utils import data_utils
 
-if __name__=='__main__':
+if __name__ == '__main__':
 
     parser = argparse.ArgumentParser(description='Evol indices')
     parser.add_argument('--MSA_data_folder', type=str, help='Folder where MSAs are stored')
     parser.add_argument('--MSA_list', type=str, help='List of proteins and corresponding MSA file name')
     parser.add_argument('--protein_index', type=int, help='Row index of protein in input mapping file')
-    parser.add_argument('--MSA_weights_location', type=str, help='Location where weights for each sequence in the MSA will be stored')
-    parser.add_argument('--theta_reweighting', type=float, help='Parameters for MSA sequence re-weighting')
+    # parser.add_argument('--MSA_weights_location', type=str, help='Location where weights for each sequence in the MSA will be stored')
Review comment: should these arguments be deprecated instead of removed entirely?
+    # parser.add_argument('--theta_reweighting', type=float, help='Parameters for MSA sequence re-weighting')
     parser.add_argument('--VAE_checkpoint_location', type=str, help='Location where VAE model checkpoints will be stored')
     parser.add_argument('--model_name_suffix', default='Jan1', type=str, help='model checkpoint name is the protein name followed by this suffix')
     parser.add_argument('--model_parameters_location', type=str, help='Location of VAE model parameters')
@@ -27,29 +27,33 @@
     parser.add_argument('--batch_size', default=256, type=int, help='Batch size when computing evol indices')
     args = parser.parse_args()
 
+    print("Arguments=", args)

     mapping_file = pd.read_csv(args.MSA_list)
     protein_name = mapping_file['protein_name'][args.protein_index]
     msa_location = args.MSA_data_folder + os.sep + mapping_file['msa_location'][args.protein_index]
     print("Protein name: "+str(protein_name))
     print("MSA file: "+str(msa_location))
 
-    if args.theta_reweighting is not None:
-        theta = args.theta_reweighting
-    else:
-        try:
-            theta = float(mapping_file['theta'][args.protein_index])
-        except:
-            theta = 0.2
-    print("Theta MSA re-weighting: "+str(theta))
+    # Theta reweighting not necessary for computing evol indices
+    # if args.theta_reweighting is not None:
+    #     theta = args.theta_reweighting
+    # else:
+    #     try:
+    #         theta = float(mapping_file['theta'][args.protein_index])
+    #     except:
+    #         print("Theta not found in mapping file. Using default value of 0.2")
+    #         theta = 0.2
+    # print("Theta MSA re-weighting: "+str(theta))
 
     data = data_utils.MSA_processing(
         MSA_location=msa_location,
-        theta=theta,
-        use_weights=True,
-        weights_location=args.MSA_weights_location + os.sep + protein_name + '_theta_' + str(theta) + '.npy'
+        # theta=theta,
+        use_weights=False,
+        # weights_location=args.MSA_weights_location + os.sep + protein_name + '_theta_' + str(theta) + '.npy'
     )
 
-    if args.computation_mode=="all_singles":
+    if args.computation_mode == "all_singles":
         data.save_all_singles(output_filename=args.all_singles_mutations_folder + os.sep + protein_name + "_all_singles.csv")
         args.mutations_location = args.all_singles_mutations_folder + os.sep + protein_name + "_all_singles.csv"
     else:
@@ -58,25 +62,24 @@
     model_name = protein_name + "_" + args.model_name_suffix
     print("Model name: "+str(model_name))
 
-    model_params = json.load(open(args.model_parameters_location))
+    # model_params = json.load(open(args.model_parameters_location))
 
+    checkpoint_name = str(args.VAE_checkpoint_location) + os.sep + model_name + "_final"
+    assert os.path.isdir(args.VAE_checkpoint_location), "Cannot find dir"+args.VAE_checkpoint_location
+    assert os.path.isfile(checkpoint_name), "Cannot find "+checkpoint_name+".\nOther options: "+str([f for f in os.listdir('.') if os.path.isfile(f)])
+    checkpoint = torch.load(checkpoint_name)
 
     model = VAE_model.VAE_model(
         model_name=model_name,
         data=data,
-        encoder_parameters=model_params["encoder_parameters"],
-        decoder_parameters=model_params["decoder_parameters"],
+        encoder_parameters=checkpoint["encoder_parameters"],
+        decoder_parameters=checkpoint["decoder_parameters"],
         random_seed=42
     )
     model = model.to(model.device)
 
-    try:
-        checkpoint_name = str(args.VAE_checkpoint_location) + os.sep + model_name + "_final"
-        checkpoint = torch.load(checkpoint_name)
-        model.load_state_dict(checkpoint['model_state_dict'])
-        print("Initialized VAE with checkpoint '{}' ".format(checkpoint_name))
-    except:
-        print("Unable to locate VAE model checkpoint")
-        sys.exit(0)
+    model.load_state_dict(checkpoint['model_state_dict'])
+    print("Initialized VAE with checkpoint '{}' ".format(checkpoint_name))
 
     list_valid_mutations, evol_indices, _, _ = model.compute_evol_indices(msa_data=data,
                                                                           list_mutations_location=args.mutations_location,
@@ -93,5 +96,6 @@
     try:
         keep_header = os.stat(evol_indices_output_filename).st_size == 0
     except:
-        keep_header=True
-    df.to_csv(path_or_buf=evol_indices_output_filename, index=False, mode='a', header=keep_header)
+        keep_header = True
+    df.to_csv(path_or_buf=evol_indices_output_filename, index=False, mode='a', header=keep_header)
+    print("Script completed successfully.")