Merge pull request #121 from PoonLab/dev

WilliamZekaiWang · web-flow · commit 8726af9ccf01 · 2025-09-09T16:19:05.000-04:00
add data files as args

fixed issue dropping isSDRM from mutdict
diff --git a/README.md b/README.md
@@ -47,7 +47,7 @@ pip install --user .
 ## Using sierra-local
 
 ### Command-line interface (CLI)
-Before running, we recommend using the `sierralocal/updater.py` script to update the data files associated with this repository to the most updated versions available from [hivfacts](https://github.com/hivdb/hivfacts/tree/main/data). Please note that you do need the requests package stated above for the following command to run. More information regarding this script is detailed below.
+Before running, we recommend using the `sierralocal/updater.py` script to update the data files associated with this repository to the most updated versions available from [hivfacts](https://github.com/hivdb/hivfacts/tree/main/data). Please note that you do need the requests package stated above for the following command to run. More information regarding this script is detailed below. An alternative to running this script through the main function is also provided below.
 ```console
 (sierra) will@dyn172-30-75-11 sierra-local % python3 sierralocal/updater.py 
 Downloading the latest HIVDB XML File
@@ -160,6 +160,50 @@ Writing JSON to file RT_results.json
 Time elapsed: 9.3442 seconds (10.751 it/s)
 ```
 
+To specify other files for detecting the parameters, `isApobecMutation`, `isUnusual`, `isSDRM`, `primaryType`, you can use the following arguments: `-apobec_csv`, `-unusual_csv`, `-sdrms_csv`, `-mutation_csv`, respectively
+```console
+(sierra) will@Williams-MacBook-Pro sierra-local % sierralocal -apobec_csv apobecs.csv -unusual_csv rx-all_subtype-all.csv -sdrms_csv sdrms_hiv1.csv -mutation_csv mutation-type-pairs_hiv1.csv RT.fa
+searching path /Users/will/miniconda3/envs/sierra/lib/python3.9/site-packages/sierralocal/data/HIVDB*.xml
+searching path /Users/will/miniconda3/envs/sierra/lib/python3.9/site-packages/sierralocal/data/apobec_drms.json
+HIVdb version 9.8
+Using unusual file: rx-all_subtype-all.csv
+Using SDRM mutations file: sdrms_hiv1.csv
+Using mutation type file: mutation-type-pairs_hiv1.csv
+Using APOBEC file: apobecs.csv
+Aligning using post-align
+Aligned RT.fa
+100 sequences found in file RT.fa.
+Writing JSON to file RT_results.json
+Time elapsed: 9.5917 seconds (10.481 it/s)
+```
+
+To update these files while running the script, and subsequently specify an output directory, you can use the args `-forceupdate` followed by `-output_dir` and the new file path. Please note, if you run with `-forceupdate`, you must rerun the installation steps to apply the changes. If you do choose to a different output directory, you must always specifiy the new file locations for these files, otherwise they will default to the ones found in the `sierralocal/data` folder.
+```console
+(sierra) will@Williams-MacBook-Pro sierra-local % sierralocal --forceupdate -updater_outdir . RT.fa        
+Downloading the latest HIVDB XML File
+Updated HIVDB XML into ./HIVDB_9.8.xml
+Downloading the latest APOBEC DRMS File
+Updated APOBEC DRMs into ./apobec_drms.json
+Downloading the latest file to determine apobec
+Updated apobecs file to ./apobecs.csv
+Downloading the latest file to determine is unusual
+Updated is unusual file to ./rx-all_subtype-all.csv
+Downloading the latest file to determine SDRM mutations
+Updated SDRM mutations file to ./sdrms_hiv1.csv
+Downloading the latest file to determine mutation type
+Updated mutation type file to ./mutation-type-pairs_hiv1.csv
+HIVdb version 9.8
+Using unusual file: /Users/will/miniconda3/envs/sierra/lib/python3.9/site-packages/sierralocal/data/rx-all_subtype-all.csv
+Using SDRM mutations file: /Users/will/miniconda3/envs/sierra/lib/python3.9/site-packages/sierralocal/data/sdrms_hiv1.csv
+Using mutation type file: /Users/will/miniconda3/envs/sierra/lib/python3.9/site-packages/sierralocal/data/mutation-type-pairs_hiv1.csv
+Using APOBEC file: /Users/will/miniconda3/envs/sierra/lib/python3.9/site-packages/sierralocal/data/apobecs.csv
+Aligning using post-align
+Aligned RT.fa
+100 sequences found in file RT.fa.
+Writing JSON to file RT_results.json
+Time elapsed: 9.9952 seconds (10.846 it/s)
+```
+
 ### As a Python module
 If you have downloaded the package source to your computer, you can also run *sierra-local* as a Python module from the root directory of the package.  In the following example, we are calling the main function of *sierra-local* from an interactive Python session:
 ```console
diff --git a/sierralocal/hivdb.py b/sierralocal/hivdb.py
@@ -12,14 +12,19 @@ class HIVdb():
     webserver, to retrieve the rules-based prediction algorithm as ASI XML,
     and convert this information into Python objects.
     """
-    def __init__(self, asi2=None, apobec=None, forceupdate=False):
+    def __init__(self, asi2=None, apobec=None, forceupdate=False, updater_outdir=None):
         self.xml_filename = None
         self.json_filename = None
 
         if forceupdate:
             import sierralocal.updater as updater
-            self.xml_filename = updater.update_HIVDB()
-            self.json_filename = updater.update_APOBEC()
+            self.xml_filename = updater.update_hivdb(updater_outdir)
+            self.json_filename = updater.update_apobec_mutation(updater_outdir)
+            self.apobec_csv = updater.update_apobec(updater_outdir)
+            self.is_unusual_csv = updater.update_is_unusual(updater_outdir)
+            self.sdrms_csv = updater.update_sdrms(updater_outdir)
+            self.mutation_type_csv = updater.update_mutation_type(updater_outdir)
+            
         else:
             self.set_hivdb_xml(asi2)
             self.set_apobec_json(apobec)
diff --git a/sierralocal/jsonwriter.py b/sierralocal/jsonwriter.py
@@ -10,7 +10,7 @@
 
 
 class JSONWriter():
-    def __init__(self, algorithm):
+    def __init__(self, algorithm, apobec_csv, unusual_csv, sdrms_csv, mutation_csv):
         # possible alternative drug abbrvs
         self.names = {'3TC': 'LMV'}
 
@@ -39,7 +39,17 @@ def __init__(self, algorithm):
             self.rt_comments = dict(csv.reader(rt_file, delimiter='\t'))
 
         # make dictionary for isUnusual
-        dest = str(Path(os.path.dirname(__file__)) / 'data' / 'rx-all_subtype-all.csv')
+        if unusual_csv is None:
+            dest = str(Path(os.path.dirname(__file__)) / 'data' / 'rx-all_subtype-all.csv')
+        else:
+            if os.path.isfile(unusual_csv):    # Ensure is a file
+                dest = unusual_csv
+            else:
+                raise FileNotFoundError(
+                        "Path to CSV file to determine if is unusual cannot be found at user specified "
+                        "path {}".format(unusual_csv))
+        print("Using unusual file: "+dest)
+
         with open(dest, 'r', encoding='utf-8-sig') as is_unusual_file:
             is_unusual_file = csv.DictReader(is_unusual_file)
             self.is_unusual_dic = {}
@@ -54,7 +64,17 @@ def __init__(self, algorithm):
                     self.is_unusual_dic[gene].update({pos: {}})
                 self.is_unusual_dic[gene][pos].update({aa: unusual})
 
-        dest = str(Path(os.path.dirname(__file__)) / 'data' / 'sdrms_hiv1.csv')
+        if sdrms_csv is None:
+            dest = str(Path(os.path.dirname(__file__)) / 'data' / 'sdrms_hiv1.csv')
+        else:
+            if os.path.isfile(sdrms_csv):    # Ensure is a file
+                dest = sdrms_csv
+            else:
+                raise FileNotFoundError(
+                        "Path to CSV file to determine SDRM mutations cannot be found at user specified "
+                        "path {}".format(sdrms_csv))
+        print("Using SDRM mutations file: "+dest)
+
         with open(dest, 'r', encoding='utf-8-sig') as sdrm_files:
             sdrm_files = csv.DictReader(sdrm_files)
             self.sdrm_dic = {}
@@ -86,7 +106,17 @@ def __init__(self, algorithm):
                     self.apobec_drm_dic[gene][position] += aa
 
         # make dictionary for primary type
-        dest = str(Path(os.path.dirname(__file__)) / 'data' / 'mutation-type-pairs_hiv1.csv')
+        if mutation_csv is None:
+            dest = str(Path(os.path.dirname(__file__)) / 'data' / 'mutation-type-pairs_hiv1.csv')
+        else:
+            if os.path.isfile(mutation_csv):    # Ensure is a file
+                dest = mutation_csv
+            else:
+                raise FileNotFoundError(
+                        "Path to CSV file to determine mutation type cannot be found at user specified "
+                        "path {}".format(mutation_csv))
+
+        print("Using mutation type file: "+dest)
         with open(dest, 'r', encoding='utf-8-sig') as mut_type_pairs1_files:
             mut_type_pairs1_files = csv.DictReader(mut_type_pairs1_files)
             self.primary_type_dic = {}
@@ -102,7 +132,16 @@ def __init__(self, algorithm):
                 self.primary_type_dic[gene][pos].update({aa: mut})
 
         # make dictionary for apobec mutations
-        dest = str(Path(os.path.dirname(__file__)) / 'data' / 'apobecs.csv')
+        if apobec_csv is None:
+            dest = str(Path(os.path.dirname(__file__)) / 'data' / 'apobecs.csv')
+        else:
+            if os.path.isfile(apobec_csv):    # Ensure is a file
+                dest = apobec_csv
+            else:
+                raise FileNotFoundError(
+                        "Path to CSV file with APOBEC cannot be found at user specified "
+                        "path {}".format(apobec_csv))
+        print("Using APOBEC file: "+dest)
         with open(dest, 'r', encoding='utf-8-sig') as apobec_mutations:
             apobec_mutations = csv.DictReader(apobec_mutations)
             self.apobec_mutations_dic = {}
@@ -295,9 +334,11 @@ def format_aligned_gene_sequences(self, ordered_mutation_list,
                                                    mutation[3])
             check_sdrm, sdrm_aas = self.is_sdrm(gene,
                                              mutation[0],
-                                             mutation[1])
-            
-            if check_sdrm:
+                                             mutation[1],
+                                             mutation[3])
+            mutdict['isSDRM'] = check_sdrm
+
+            if check_sdrm: 
                 dic['SDRMs'].append({'text': mutation[2] + str(mutation[0]) + sdrm_aas})
 
             mutdict['hasStop'] = self.has_stop(mutation, mutation[3])
@@ -470,14 +511,16 @@ def is_apobec_drm(self, gene, consensus, position, AA):
                         return True
         return False
 
-    def is_sdrm(self, gene, position, AA):
+    def is_sdrm(self, gene, position, AA, text):
         """
         see if specific amino acid mutation is a sdrm through checking hivbd facts
         @param gene: str, RT, IN, PR
         @param position: int, position of mutation relative to POL
         @param AA: new amino acid
         @return: bool
         """
+        if text == 'X':
+            return False, ''
         position = str(position)
         all_aas = ''
         found = False
diff --git a/sierralocal/main.py b/sierralocal/main.py
@@ -3,13 +3,14 @@
 import time
 import argparse
 import json
+from pathlib import Path
+import csv
 
 from sierralocal import score_alg
 from sierralocal.hivdb import HIVdb
 from sierralocal.jsonwriter import JSONWriter
 from sierralocal.nucaminohook import NucAminoAligner
 
-
 def score(filename, xml_path=None, tsv_path=None, forceupdate=False, do_subtype=False, program='post'): # pragma: no cover
     """
     Functionality as a Python module. Can import this function from sierralocal.
@@ -123,7 +124,8 @@ def scorefile(input_file, algorithm, do_subtype=False, program='post'):
            file_genes, sequence_lengths, file_trims, subtypes, na_sequence, ambiguous, gene_order
 
 def sierralocal(fasta, outfile, xml=None, json=None, cleanup=False, forceupdate=False,
-                program='post', do_subtype=False): # pragma: no cover
+                apobec_csv=None, unusual_csv=None, sdrms_csv=None, mutation_csv=None,
+                updater_outdir=None, program='post', do_subtype=False): # pragma: no cover
     """
     Contains all initializing and processing calls.
 
@@ -134,13 +136,17 @@ def sierralocal(fasta, outfile, xml=None, json=None, cleanup=False, forceupdate=
     @param json: <optional> str, path to local copy of HIVdb algorithm APOBEC DRM file
     @param cleanup:  <optional> bool, to delete alignment file
     @param forceupdate: <optional> bool, forces sierralocal to update its local copy of the HIVdb algorithm
+    @param apobec_csv: str <optional>, Path to CSV APOBEC csv file (default: apobecs.csv)
+    @param unusual_csv: str <optional>, Path to CSV file to determine if is unusual (default: rx-all_subtype-all.csv)
+    @param sdrms_csv: str <optional>, Path to CSV file to determine SDRM mutations (default: sdrms_hiv1.csv)
+    @param mutation_csv: str <optional>, Path to CSV file to determine mutation type (default: mutation-type-pairs_hiv1.csv)
     @return: tuple, a tuple of (number of records processed, time elapsed initializing algorithm)
     """
 
     # initialize algorithm and jsonwriter
     time0 = time.time()
-    algorithm = HIVdb(asi2=xml, apobec=json, forceupdate=forceupdate)
-    writer = JSONWriter(algorithm)
+    algorithm = HIVdb(asi2=xml, apobec=json, forceupdate=forceupdate, updater_outdir=updater_outdir)
+    writer = JSONWriter(algorithm, apobec_csv, unusual_csv, sdrms_csv, mutation_csv)
     time_elapsed = time.time() - time0
 
     # accommodate single file path argument
@@ -188,7 +194,7 @@ def parse_args(): # pragma: no cover
     parser.add_argument('fasta', nargs='+', type=str, help='List of input files.')
     parser.add_argument('-o', dest='outfile', default=None, type=str, help='Output filename.')
     parser.add_argument('-xml', default=None,
-                        help='<optional> Path to HIVdb ASI2 XML file')
+                        help='<optional> Path to HIVdb ASI2 XML file (default: HIVDB_9.4.xml)')
     parser.add_argument('-json', default=None,
                         help='<optional> Path to JSON HIVdb APOBEC DRM file')
     parser.add_argument('--cleanup', action='store_true',
@@ -197,16 +203,80 @@ def parse_args(): # pragma: no cover
                         help='Forces update of HIVdb algorithm. Requires network connection.')
     parser.add_argument('-alignment', default='post', choices=['post', 'nuc'],
                         help='Alignment program to use, "post" for post align and "nuc" for nucamino')
+    parser.add_argument('-apobec_csv', default=None,
+                        help='<optional> Path to CSV APOBEC csv file (default: apobecs.csv)')
+    parser.add_argument('-unusual_csv', default=None,
+                        help='<optional> Path to CSV file to determine if is unusual (default: rx-all_subtype-all.csv)')
+    parser.add_argument('-sdrms_csv', default=None,
+                        help='<optional> Path to CSV file to determine SDRM mutations (default: sdrms_hiv1.csv)')
+    parser.add_argument('-mutation_csv', default=None,
+                        help='<optional> Path to CSV file to determine mutation type (default: mutation-type-pairs_hiv1.csv)')
+    parser.add_argument('-updater_outdir', default=None,
+                        help='<optional> Path to folder to store updated files from updater (default: sierralocal/data folder))')
+
     args = parser.parse_args()
     return args
 
 
+def check_input(apobec_path, unusual_path, sdrms_path, mutation_path):
+    """
+    Check if the input for the files are valid based on the first row of the csv.
+
+    apobec_path: path to apobec_drms.csv
+    unusual_path: path to rx-all_subtype-all.csv
+    sdrms_path: path to sdrms_hiv1.csv
+    mutation_path: path to mutation-type-pairs_hiv1.csv
+    """
+    exp = {
+        "apobec_csv": ["gene", "position", "aa"],
+        "unusual_csv": ["gene", "position", "aa", "percent", "count", "total", "reason", "isUnusual"],
+        "sdrms_csv": ["drug_class", "gene", "position", "aa"],
+        "mutation_csv": ["strain", "gene", "drugClass", "position", "aas", "mutationType", "isUnusual"],
+    }
+
+    paths = {
+        "apobec_csv": apobec_path,
+        "unusual_csv": unusual_path,
+        "sdrms_csv": sdrms_path,
+        "mutation_csv": mutation_path,
+    }
+    for key, path in paths.items():
+        if path is None:
+            continue
+        try:
+            with open(path, newline="", encoding="utf-8-sig") as f:
+                reader = csv.reader(f)
+                header = next(reader)
+        except Exception as e:
+            sys.exit(f"Could not open {key} file '{path}': {e}")
+
+        if header != exp[key]:
+            print(
+                f"Invalid header in {key} file '{path}'.\n"
+                f"Expected: {exp[key]}\nFound:    {header}"
+            )
+            sys.exit()
+
+
 def main(): # pragma: no cover
     """
     Main function called from CLI.
     """
     args = parse_args()
 
+    # check for valid file inputs
+    check_input(args.apobec_csv, args.unusual_csv, args.sdrms_csv, args.mutation_csv)
+
+    mod_path = Path(os.path.dirname(__file__))
+
+    if args.updater_outdir:
+        target_dir = args.updater_outdir
+    else:
+        target_dir = os.path.join(mod_path, "data")
+    
+    # Create directory if it doesn't exist
+    os.makedirs(target_dir, exist_ok=True)
+
     # check that FASTA files in list all exist
     for file in args.fasta:
         if not os.path.exists(file):
@@ -216,6 +286,8 @@ def main(): # pragma: no cover
     time_start = time.time()
     count, time_elapsed = sierralocal(args.fasta, args.outfile, xml=args.xml,
                                       json=args.json, cleanup=args.cleanup, forceupdate=args.forceupdate,
+                                      apobec_csv=args.apobec_csv, unusual_csv=args.unusual_csv, 
+                                      sdrms_csv=args.sdrms_csv, mutation_csv=args.mutation_csv, updater_outdir=target_dir,
                                       program=args.alignment)
     time_diff = time.time() - time_start
 
diff --git a/sierralocal/updater.py b/sierralocal/updater.py