From 6ba1f1ef9b971fdc0363dee66eda1111402b6c0f Mon Sep 17 00:00:00 2001 From: svarona Date: Thu, 4 Sep 2025 14:22:16 +0200 Subject: [PATCH 1/8] added apobec_csv, unusual_csv, sdrms_csv and mutation_csv as arguments --- sierralocal/jsonwriter.py | 47 ++++++++++++++++++++++++++++++++++----- sierralocal/main.py | 19 ++++++++++++++-- 2 files changed, 59 insertions(+), 7 deletions(-) diff --git a/sierralocal/jsonwriter.py b/sierralocal/jsonwriter.py index a6bf12d..c182561 100644 --- a/sierralocal/jsonwriter.py +++ b/sierralocal/jsonwriter.py @@ -10,7 +10,7 @@ class JSONWriter(): - def __init__(self, algorithm): + def __init__(self, algorithm, apobec_csv, unusual_csv, sdrms_csv, mutation_csv): # possible alternative drug abbrvs self.names = {'3TC': 'LMV'} @@ -39,7 +39,16 @@ def __init__(self, algorithm): self.rt_comments = dict(csv.reader(rt_file, delimiter='\t')) # make dictionary for isUnusual - dest = str(Path(os.path.dirname(__file__)) / 'data' / 'rx-all_subtype-all.csv') + if unusual_csv is None: + dest = str(Path(os.path.dirname(__file__)) / 'data' / 'rx-all_subtype-all.csv') + else: + if os.path.isfile(unusual_csv): # Ensure is a file + dest = unusual_csv + else: + raise FileNotFoundError( + "Path to CSV file to determine if is unusual cannot be found at user specified " + "path {}".format(unusual_csv)) + print("Using unusual file: "+dest) with open(dest, 'r', encoding='utf-8-sig') as is_unusual_file: is_unusual_file = csv.DictReader(is_unusual_file) self.is_unusual_dic = {} @@ -54,7 +63,16 @@ def __init__(self, algorithm): self.is_unusual_dic[gene].update({pos: {}}) self.is_unusual_dic[gene][pos].update({aa: unusual}) - dest = str(Path(os.path.dirname(__file__)) / 'data' / 'sdrms_hiv1.csv') + if sdrms_csv is None: + dest = str(Path(os.path.dirname(__file__)) / 'data' / 'sdrms_hiv1.csv') + else: + if os.path.isfile(sdrms_csv): # Ensure is a file + dest = sdrms_csv + else: + raise FileNotFoundError( + "Path to CSV file to determine SDRM mutations cannot be found at 
user specified " + "path {}".format(sdrms_csv)) + print("Using SDRM mutations file: "+dest) with open(dest, 'r', encoding='utf-8-sig') as sdrm_files: sdrm_files = csv.DictReader(sdrm_files) self.sdrm_dic = {} @@ -86,7 +104,17 @@ def __init__(self, algorithm): self.apobec_drm_dic[gene][position] += aa # make dictionary for primary type - dest = str(Path(os.path.dirname(__file__)) / 'data' / 'mutation-type-pairs_hiv1.csv') + if mutation_csv is None: + dest = str(Path(os.path.dirname(__file__)) / 'data' / 'mutation-type-pairs_hiv1.csv') + else: + if os.path.isfile(mutation_csv): # Ensure is a file + dest = mutation_csv + else: + raise FileNotFoundError( + "Path to CSV file to determine mutation type cannot be found at user specified " + "path {}".format(mutation_csv)) + + print("Using mutation type file: "+dest) with open(dest, 'r', encoding='utf-8-sig') as mut_type_pairs1_files: mut_type_pairs1_files = csv.DictReader(mut_type_pairs1_files) self.primary_type_dic = {} @@ -102,7 +130,16 @@ def __init__(self, algorithm): self.primary_type_dic[gene][pos].update({aa: mut}) # make dictionary for apobec mutations - dest = str(Path(os.path.dirname(__file__)) / 'data' / 'apobecs.csv') + if apobec_csv is None: + dest = str(Path(os.path.dirname(__file__)) / 'data' / 'apobecs.csv') + else: + if os.path.isfile(apobec_csv): # Ensure is a file + dest = apobec_csv + else: + raise FileNotFoundError( + "Path to CSV file with APOBEC cannot be found at user specified " + "path {}".format(apobec_csv)) + print("Using APOBEC file: "+dest) with open(dest, 'r', encoding='utf-8-sig') as apobec_mutations: apobec_mutations = csv.DictReader(apobec_mutations) self.apobec_mutations_dic = {} diff --git a/sierralocal/main.py b/sierralocal/main.py index a959afc..8aab2cd 100644 --- a/sierralocal/main.py +++ b/sierralocal/main.py @@ -123,6 +123,7 @@ def scorefile(input_file, algorithm, do_subtype=False, program='post'): file_genes, sequence_lengths, file_trims, subtypes, na_sequence, ambiguous, 
gene_order def sierralocal(fasta, outfile, xml=None, json=None, cleanup=False, forceupdate=False, + apobec_csv=None, unusual_csv=None, sdrms_csv=None, mutation_csv=None, program='post', do_subtype=False): # pragma: no cover """ Contains all initializing and processing calls. @@ -134,13 +135,17 @@ def sierralocal(fasta, outfile, xml=None, json=None, cleanup=False, forceupdate= @param json: str, path to local copy of HIVdb algorithm APOBEC DRM file @param cleanup: bool, to delete alignment file @param forceupdate: bool, forces sierralocal to update its local copy of the HIVdb algorithm + @param apobec_csv: str , Path to CSV APOBEC csv file (default: apobecs.csv) + @param unusual_csv: str , Path to CSV file to determine if is unusual (default: rx-all_subtype-all.csv) + @param sdrms_csv: str , Path to CSV file to determine SDRM mutations (default: sdrms_hiv1.csv) + @param mutation_csv: str , Path to CSV file to determine mutation type (default: mutation-type-pairs_hiv1.csv) @return: tuple, a tuple of (number of records processed, time elapsed initializing algorithm) """ # initialize algorithm and jsonwriter time0 = time.time() algorithm = HIVdb(asi2=xml, apobec=json, forceupdate=forceupdate) - writer = JSONWriter(algorithm) + writer = JSONWriter(algorithm, apobec_csv, unusual_csv, sdrms_csv, mutation_csv) time_elapsed = time.time() - time0 # accommodate single file path argument @@ -197,10 +202,18 @@ def parse_args(): # pragma: no cover help='Forces update of HIVdb algorithm. 
Requires network connection.') parser.add_argument('-alignment', default='post', choices=['post', 'nuc'], help='Alignment program to use, "post" for post align and "nuc" for nucamino') + parser.add_argument('-apobec_csv', default=None, + help=' Path to CSV APOBEC csv file (default: apobecs.csv)') + parser.add_argument('-unusual_csv', default=None, + help=' Path to CSV file to determine if is unusual (default: rx-all_subtype-all.csv)') + parser.add_argument('-sdrms_csv', default=None, + help=' Path to CSV file to determine SDRM mutations (default: sdrms_hiv1.csv)') + parser.add_argument('-mutation_csv', default=None, + help=' Path to CSV file to determine mutation type (default: mutation-type-pairs_hiv1.csv)') + args = parser.parse_args() return args - def main(): # pragma: no cover """ Main function called from CLI. @@ -216,6 +229,8 @@ def main(): # pragma: no cover time_start = time.time() count, time_elapsed = sierralocal(args.fasta, args.outfile, xml=args.xml, json=args.json, cleanup=args.cleanup, forceupdate=args.forceupdate, + apobec_csv=args.apobec_csv, unusual_csv=args.unusual_csv, + sdrms_csv=args.sdrms_csv, mutation_csv=args.mutation_csv, program=args.alignment) time_diff = time.time() - time_start From 043e934c362c8378bf7dba9cecc1c3318ac99ae3 Mon Sep 17 00:00:00 2001 From: svarona Date: Fri, 5 Sep 2025 14:53:06 +0200 Subject: [PATCH 2/8] added outdir param for updater --- sierralocal/hivdb.py | 6 ++-- sierralocal/main.py | 20 ++++++++--- sierralocal/updater.py | 79 +++++++++++++++++++++++------------------- 3 files changed, 63 insertions(+), 42 deletions(-) diff --git a/sierralocal/hivdb.py b/sierralocal/hivdb.py index 2a99586..739532c 100644 --- a/sierralocal/hivdb.py +++ b/sierralocal/hivdb.py @@ -12,14 +12,14 @@ class HIVdb(): webserver, to retrieve the rules-based prediction algorithm as ASI XML, and convert this information into Python objects. 
""" - def __init__(self, asi2=None, apobec=None, forceupdate=False): + def __init__(self, asi2=None, apobec=None, forceupdate=False, updater_outdir=None): self.xml_filename = None self.json_filename = None if forceupdate: import sierralocal.updater as updater - self.xml_filename = updater.update_HIVDB() - self.json_filename = updater.update_APOBEC() + self.xml_filename = updater.update_hivdb(updater_outdir) + self.json_filename = updater.update_apobec_mutation(updater_outdir) else: self.set_hivdb_xml(asi2) self.set_apobec_json(apobec) diff --git a/sierralocal/main.py b/sierralocal/main.py index 8aab2cd..2412548 100644 --- a/sierralocal/main.py +++ b/sierralocal/main.py @@ -3,13 +3,13 @@ import time import argparse import json +from pathlib import Path from sierralocal import score_alg from sierralocal.hivdb import HIVdb from sierralocal.jsonwriter import JSONWriter from sierralocal.nucaminohook import NucAminoAligner - def score(filename, xml_path=None, tsv_path=None, forceupdate=False, do_subtype=False, program='post'): # pragma: no cover """ Functionality as a Python module. Can import this function from sierralocal. @@ -124,7 +124,7 @@ def scorefile(input_file, algorithm, do_subtype=False, program='post'): def sierralocal(fasta, outfile, xml=None, json=None, cleanup=False, forceupdate=False, apobec_csv=None, unusual_csv=None, sdrms_csv=None, mutation_csv=None, - program='post', do_subtype=False): # pragma: no cover + updater_outdir=None, program='post', do_subtype=False): # pragma: no cover """ Contains all initializing and processing calls. 
@@ -144,7 +144,7 @@ def sierralocal(fasta, outfile, xml=None, json=None, cleanup=False, forceupdate= # initialize algorithm and jsonwriter time0 = time.time() - algorithm = HIVdb(asi2=xml, apobec=json, forceupdate=forceupdate) + algorithm = HIVdb(asi2=xml, apobec=json, forceupdate=forceupdate, updater_outdir=updater_outdir) writer = JSONWriter(algorithm, apobec_csv, unusual_csv, sdrms_csv, mutation_csv) time_elapsed = time.time() - time0 @@ -210,6 +210,8 @@ def parse_args(): # pragma: no cover help=' Path to CSV file to determine SDRM mutations (default: sdrms_hiv1.csv)') parser.add_argument('-mutation_csv', default=None, help=' Path to CSV file to determine mutation type (default: mutation-type-pairs_hiv1.csv)') + parser.add_argument('-updater_outdir', default=None, + help=' Path to folder to store updated files from updater (default: sierralocal/data folder))') args = parser.parse_args() return args @@ -220,6 +222,16 @@ def main(): # pragma: no cover """ args = parse_args() + mod_path = Path(os.path.dirname(__file__)) + + if args.updater_outdir: + target_dir = args.updater_outdir + else: + target_dir = os.path.join(mod_path, "data") + + # Create directory if it doesn't exist + os.makedirs(target_dir, exist_ok=True) + # check that FASTA files in list all exist for file in args.fasta: if not os.path.exists(file): @@ -230,7 +242,7 @@ def main(): # pragma: no cover count, time_elapsed = sierralocal(args.fasta, args.outfile, xml=args.xml, json=args.json, cleanup=args.cleanup, forceupdate=args.forceupdate, apobec_csv=args.apobec_csv, unusual_csv=args.unusual_csv, - sdrms_csv=args.sdrms_csv, mutation_csv=args.mutation_csv, + sdrms_csv=args.sdrms_csv, mutation_csv=args.mutation_csv, updater_outdir=target_dir, program=args.alignment) time_diff = time.time() - time_start diff --git a/sierralocal/updater.py b/sierralocal/updater.py index 1211dfc..523b43b 100644 --- a/sierralocal/updater.py +++ b/sierralocal/updater.py @@ -1,14 +1,14 @@ import requests from pathlib import 
Path import os - +import argparse mod_path = Path(os.path.dirname(__file__)) - -def update_apobec_mutation(): +def update_apobec_mutation(target_dir=None): """ Update the APOBEC DRMS file from the Github page + :param updater_outdir: optional directory to save files :return: absolute path to the apobec drms JSON file """ # UPDATE APOBEC DRMS @@ -16,7 +16,7 @@ def update_apobec_mutation(): try: url = 'https://raw.githubusercontent.com/hivdb/hivfacts/main/data/apobecs/apobec_drms.json' - filepath = os.path.join(mod_path, "data", "apobec_drms.json") + filepath = os.path.join(target_dir, "apobec_drms.json") request = requests.get(url, allow_redirects=True) with open(filepath, 'wb') as file: file.write(request.content) @@ -26,12 +26,11 @@ def update_apobec_mutation(): except: # pragma: no cover print("Unable to update APOBEC DRMs. Try manually downloading the APOBEC DRM JSON into data/apobec_drms.json") - - -def update_hivdb(): +def update_hivdb(target_dir=None): """ Query the HIVdb Github page for new ASI (algorithm specification interface) XML files. 
+ :param updater_outdir: optional directory to save files :return: absolute path to new XML file """ print('Downloading the latest HIVDB XML File') @@ -39,7 +38,7 @@ def update_hivdb(): url = requests.get('https://raw.githubusercontent.com/hivdb/hivfacts/main/data/algorithms/HIVDB_latest.xml') file = url.text - filepath = os.path.join(mod_path, "data", file) + filepath = os.path.join(target_dir, file) hivdb_latest = 'https://raw.githubusercontent.com/hivdb/hivfacts/main/data/algorithms/{}'.format(file) request = requests.get(hivdb_latest, allow_redirects=True) with open(filepath, 'wb') as file: @@ -53,26 +52,25 @@ def update_hivdb(): print(e) return None - -def update_is_unusual(): +def update_is_unusual(target_dir=None): + print('Downloading the latest file to determine is unusual') try: unusual_latest = 'https://raw.githubusercontent.com/hivdb/hivfacts/2021.3/data/aapcnt/rx-all_subtype-all.csv' request = requests.get(unusual_latest) - filepath = os.path.join(mod_path, "data", "rx-all_subtype-all.csv") + filepath = os.path.join(target_dir, "rx-all_subtype-all.csv") with open(filepath, 'wb') as file: file.write(request.content) print(f'Updated is unusual file to {filepath}') - return filepath except: print('Could not update file for is unusual (rx-all_subtype-all.csv)\n' 'Please download manually from https://hivdb.stanford.edu/page/release-notes/#data.files') -def update_sdrms(): +def update_sdrms(target_dir=None): """ Query the HIVDB facts github page to find and update SDRM mutations file @return: file path of updated file @@ -81,19 +79,18 @@ def update_sdrms(): try: latest = 'https://raw.githubusercontent.com/hivdb/hivfacts/main/data/sdrms_hiv1.csv' request = requests.get(latest) - filepath = os.path.join(mod_path, "data", "sdrms_hiv1.csv") + filepath = os.path.join(target_dir, "sdrms_hiv1.csv") with open(filepath, 'wb') as file: file.write(request.content) print(f'Updated SDRM mutations file to {filepath}') - return filepath except: print('Could not update 
file for SDRM Mutations (sdrms_hiv1.csv)\n' 'Please download manually from https://github.com/hivdb/hivfacts/tree/main/data') -def update_mutation_type(): +def update_mutation_type(target_dir=None): """ Query the HIVDB facts github page to find and update mutations type file @return: file path of updated file @@ -102,19 +99,18 @@ def update_mutation_type(): try: latest = 'https://raw.githubusercontent.com/hivdb/hivfacts/main/data/mutation-type-pairs_hiv1.csv' request = requests.get(latest) - filepath = os.path.join(mod_path, "data", "mutation-type-pairs_hiv1.csv") + filepath = os.path.join(target_dir, "mutation-type-pairs_hiv1.csv") with open(filepath, 'wb') as file: file.write(request.content) print(f'Updated mutation type file to {filepath}') - return filepath except: print('Could not update file for mutation type (mutation-type-pairs_hiv1.csv)\n' 'Please download manually from https://github.com/hivdb/hivfacts/tree/main/data') -def update_apobec(): +def update_apobec(target_dir=None): """ Query the HIVDB facts github page to find and update apobec file @return: file path of updated file @@ -123,19 +119,18 @@ def update_apobec(): try: latest = 'https://raw.githubusercontent.com/hivdb/hivfacts/main/data/apobecs/apobecs.csv' request = requests.get(latest) - filepath = os.path.join(mod_path, "data", "apobecs.csv") + filepath = os.path.join(target_dir, "apobecs.csv") with open(filepath, 'wb') as file: file.write(request.content) print(f'Updated apobecs file to {filepath}') - return filepath except: print('Could not update file for apobecs (apobecs.csv)\n' 'Please download manually from https://github.com/hivdb/hivfacts/tree/main/data') -def update_reference_fasta(): +def update_reference_fasta(target_dir=None): """ update reference fasta file for subtyper script """ @@ -143,7 +138,7 @@ def update_reference_fasta(): try: latest = "https://cms.hivdb.org/prod/downloads/hiv-genotyper/genotype-references.fasta" request = requests.get(latest) - filepath = 
os.path.join(mod_path, 'data', 'genotype-references.fasta') + filepath = os.path.join(target_dir, 'genotype-references.fasta') with open(filepath, 'wb') as file: file.write(request.content) @@ -151,7 +146,7 @@ def update_reference_fasta(): except: print("Couldn't update subtyper reference fasta, please get manually at: https://hivdb.stanford.edu/page/hiv-subtyper/") -def update_genotype_properties(): +def update_genotype_properties(target_dir=None): """ update genotype property file for subtyper script """ @@ -159,7 +154,7 @@ def update_genotype_properties(): try: latest = 'https://cms.hivdb.org/prod/downloads/hiv-genotyper/genotype-properties.tsv' request = requests.get(latest) - filepath = os.path.join(mod_path, 'data', 'genotype-properties.csv') + filepath = os.path.join(target_dir, 'genotype-properties.csv') with open(filepath, 'wb') as file: file.write(request.content) @@ -167,15 +162,29 @@ def update_genotype_properties(): except: print("Couldn't update subtyper genotype property file, please get manually at: https://hivdb.stanford.edu/page/hiv-subtyper/") -def main(): # pragma: no cover - update_hivdb() - update_apobec() - update_is_unusual() - update_sdrms() - update_mutation_type() - update_apobec_mutation() - update_genotype_properties() - update_reference_fasta() +def main(updater_outdir=None): # pragma: no cover + update_hivdb(updater_outdir) + update_apobec(updater_outdir) + update_is_unusual(updater_outdir) + update_sdrms(updater_outdir) + update_mutation_type(updater_outdir) + update_apobec_mutation(updater_outdir) + update_genotype_properties(updater_outdir) + update_reference_fasta(updater_outdir) if __name__ == '__main__': - main() + # Add argument parsing for when running updater.py directly + parser = argparse.ArgumentParser(description='Update HIVdb data files') + parser.add_argument('-updater_outdir', default=None, + help=' Path to folder to store updated files from updater (default: sierralocal/data folder))') + args = parser.parse_args() + + 
if args.updater_outdir: + target_dir = args.updater_outdir + else: + target_dir = os.path.join(mod_path, "data") + + # Create directory if it doesn't exist + os.makedirs(target_dir, exist_ok=True) + + main(updater_outdir=target_dir) \ No newline at end of file From 1fed93a57e5b2b8b13a479ae6cc436c1f12a4e17 Mon Sep 17 00:00:00 2001 From: WilliamZekaiWang Date: Fri, 5 Sep 2025 15:59:43 -0400 Subject: [PATCH 3/8] fixed path in updater.py for isUnusual --- sierralocal/updater.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sierralocal/updater.py b/sierralocal/updater.py index 1211dfc..47c7bc9 100644 --- a/sierralocal/updater.py +++ b/sierralocal/updater.py @@ -58,7 +58,7 @@ def update_is_unusual(): print('Downloading the latest file to determine is unusual') try: - unusual_latest = 'https://raw.githubusercontent.com/hivdb/hivfacts/2021.3/data/aapcnt/rx-all_subtype-all.csv' + unusual_latest = 'https://raw.githubusercontent.com/hivdb/hivfacts/refs/heads/main/data/aapcnt/rx-all_subtype-all.csv' request = requests.get(unusual_latest) filepath = os.path.join(mod_path, "data", "rx-all_subtype-all.csv") with open(filepath, 'wb') as file: From ef7d8329c96de7fcb4956f073dc6dcce15ea5be1 Mon Sep 17 00:00:00 2001 From: WilliamZekaiWang Date: Tue, 9 Sep 2025 10:27:30 -0400 Subject: [PATCH 4/8] fixes so forceupdate runs on all additional files --- sierralocal/hivdb.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sierralocal/hivdb.py b/sierralocal/hivdb.py index 739532c..5e81e15 100644 --- a/sierralocal/hivdb.py +++ b/sierralocal/hivdb.py @@ -20,6 +20,11 @@ def __init__(self, asi2=None, apobec=None, forceupdate=False, updater_outdir=Non import sierralocal.updater as updater self.xml_filename = updater.update_hivdb(updater_outdir) self.json_filename = updater.update_apobec_mutation(updater_outdir) + self.apobec_csv = updater.update_apobec(updater_outdir) + self.is_unusual_csv = updater.update_is_unusual(updater_outdir) + self.sdrms_csv = 
updater.update_sdrms(updater_outdir) + self.mutation_type_csv = updater.update_mutation_type(updater_outdir) + else: self.set_hivdb_xml(asi2) self.set_apobec_json(apobec) From f398365119a66117016271cb774a9676ab9d980c Mon Sep 17 00:00:00 2001 From: WilliamZekaiWang Date: Tue, 9 Sep 2025 10:48:30 -0400 Subject: [PATCH 5/8] added checker, added fixes for forceupdate --- README.md | 46 +++++++++++++++++++++++++++++++++++++++++++- sierralocal/main.py | 47 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 91 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 466315e..32d87c5 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ pip install --user . ## Using sierra-local ### Command-line interface (CLI) -Before running, we recommend using the `sierralocal/updater.py` script to update the data files associated with this repository to the most updated versions available from [hivfacts](https://github.com/hivdb/hivfacts/tree/main/data). Please note that you do need the requests package stated above for the following command to run. More information regarding this script is detailed below. +Before running, we recommend using the `sierralocal/updater.py` script to update the data files associated with this repository to the most updated versions available from [hivfacts](https://github.com/hivdb/hivfacts/tree/main/data). Please note that you do need the requests package stated above for the following command to run. More information regarding this script is detailed below. An alternative to running this script through the main function is also provided below. 
```console (sierra) will@dyn172-30-75-11 sierra-local % python3 sierralocal/updater.py Downloading the latest HIVDB XML File @@ -160,6 +160,50 @@ Writing JSON to file RT_results.json Time elapsed: 9.3442 seconds (10.751 it/s) ``` +To specify other files for detecting the parameters `isApobecMutation`, `isUnusual`, `isSDRM`, and `primaryType`, you can use the following arguments: `-apobec_csv`, `-unusual_csv`, `-sdrms_csv`, `-mutation_csv`, respectively. +```console +(sierra) will@Williams-MacBook-Pro sierra-local % sierralocal -apobec_csv apobecs.csv -unusual_csv rx-all_subtype-all.csv -sdrms_csv sdrms_hiv1.csv -mutation_csv mutation-type-pairs_hiv1.csv RT.fa +searching path /Users/will/miniconda3/envs/sierra/lib/python3.9/site-packages/sierralocal/data/HIVDB*.xml +searching path /Users/will/miniconda3/envs/sierra/lib/python3.9/site-packages/sierralocal/data/apobec_drms.json +HIVdb version 9.8 +Using unusual file: rx-all_subtype-all.csv +Using SDRM mutations file: sdrms_hiv1.csv +Using mutation type file: mutation-type-pairs_hiv1.csv +Using APOBEC file: apobecs.csv +Aligning using post-align +Aligned RT.fa +100 sequences found in file RT.fa. +Writing JSON to file RT_results.json +Time elapsed: 9.5917 seconds (10.481 it/s) +``` + +To update these files while running the script, and subsequently specify an output directory, you can use the `--forceupdate` flag followed by `-updater_outdir` and the new directory path. Please note, if you run with `--forceupdate`, you must rerun the installation steps to apply the changes. If you do choose a different output directory, you must always specify the new file locations for these files (using the arguments described above); otherwise they will default to the ones found in the `sierralocal/data` folder. +```console +(sierra) will@Williams-MacBook-Pro sierra-local % sierralocal --forceupdate -updater_outdir . 
RT.fa +Downloading the latest HIVDB XML File +Updated HIVDB XML into ./HIVDB_9.8.xml +Downloading the latest APOBEC DRMS File +Updated APOBEC DRMs into ./apobec_drms.json +Downloading the latest file to determine apobec +Updated apobecs file to ./apobecs.csv +Downloading the latest file to determine is unusual +Updated is unusual file to ./rx-all_subtype-all.csv +Downloading the latest file to determine SDRM mutations +Updated SDRM mutations file to ./sdrms_hiv1.csv +Downloading the latest file to determine mutation type +Updated mutation type file to ./mutation-type-pairs_hiv1.csv +HIVdb version 9.8 +Using unusual file: /Users/will/miniconda3/envs/sierra/lib/python3.9/site-packages/sierralocal/data/rx-all_subtype-all.csv +Using SDRM mutations file: /Users/will/miniconda3/envs/sierra/lib/python3.9/site-packages/sierralocal/data/sdrms_hiv1.csv +Using mutation type file: /Users/will/miniconda3/envs/sierra/lib/python3.9/site-packages/sierralocal/data/mutation-type-pairs_hiv1.csv +Using APOBEC file: /Users/will/miniconda3/envs/sierra/lib/python3.9/site-packages/sierralocal/data/apobecs.csv +Aligning using post-align +Aligned RT.fa +100 sequences found in file RT.fa. +Writing JSON to file RT_results.json +Time elapsed: 9.9952 seconds (10.846 it/s) +``` + ### As a Python module If you have downloaded the package source to your computer, you can also run *sierra-local* as a Python module from the root directory of the package. 
In the following example, we are calling the main function of *sierra-local* from an interactive Python session: ```console diff --git a/sierralocal/main.py b/sierralocal/main.py index 2412548..8224e99 100644 --- a/sierralocal/main.py +++ b/sierralocal/main.py @@ -4,6 +4,7 @@ import argparse import json from pathlib import Path +import csv from sierralocal import score_alg from sierralocal.hivdb import HIVdb @@ -193,7 +194,7 @@ def parse_args(): # pragma: no cover parser.add_argument('fasta', nargs='+', type=str, help='List of input files.') parser.add_argument('-o', dest='outfile', default=None, type=str, help='Output filename.') parser.add_argument('-xml', default=None, - help=' Path to HIVdb ASI2 XML file') + help=' Path to HIVdb ASI2 XML file (default: HIVDB_9.4.xml)') parser.add_argument('-json', default=None, help=' Path to JSON HIVdb APOBEC DRM file') parser.add_argument('--cleanup', action='store_true', @@ -216,12 +217,56 @@ def parse_args(): # pragma: no cover args = parser.parse_args() return args + +def check_input(apobec_path, unusual_path, sdrms_path, mutation_path): + """ + Check if the input for the files are valid based on the first row of the csv. 
+ + apobec_path: path to apobec_drms.csv + unusual_path: path to rx-all_subtype-all.csv + sdrms_path: path to sdrms_hiv1.csv + mutation_path: path to mutation-type-pairs_hiv1.csv + """ + exp = { + "apobec_csv": ["gene", "position", "aa"], + "unusual_csv": ["gene", "position", "aa", "percent", "count", "total", "reason", "isUnusual"], + "sdrms_csv": ["drug_class", "gene", "position", "aa"], + "mutation_csv": ["strain", "gene", "drugClass", "position", "aas", "mutationType", "isUnusual"], + } + + paths = { + "apobec_csv": apobec_path, + "unusual_csv": unusual_path, + "sdrms_csv": sdrms_path, + "mutation_csv": mutation_path, + } + + for key, path in paths.items(): + if path is None: + continue + try: + with open(path, newline="", encoding="utf-8-sig") as f: + reader = csv.reader(f) + header = next(reader) + except Exception as e: + sys.exit(f"Could not open {key} file '{path}': {e}") + + if header != exp[key]: + print( + f"Invalid header in {key} file '{path}'.\n" + f"Expected: {exp[key]}\nFound: {header}" + ) + + def main(): # pragma: no cover """ Main function called from CLI. 
""" args = parse_args() + # check for valid file inputs + check_input(args.apobec_csv, args.unusual_csv, args.sdrms_csv, args.mutation_csv) + mod_path = Path(os.path.dirname(__file__)) if args.updater_outdir: From 287a5aabf821f2c1109364010bd29d1dec9e0f88 Mon Sep 17 00:00:00 2001 From: WilliamZekaiWang Date: Tue, 9 Sep 2025 15:48:03 -0400 Subject: [PATCH 6/8] fix to accidentally dropping isSDRM from mut dict --- sierralocal/jsonwriter.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sierralocal/jsonwriter.py b/sierralocal/jsonwriter.py index c182561..66ed0f7 100644 --- a/sierralocal/jsonwriter.py +++ b/sierralocal/jsonwriter.py @@ -49,6 +49,7 @@ def __init__(self, algorithm, apobec_csv, unusual_csv, sdrms_csv, mutation_csv): "Path to CSV file to determine if is unusual cannot be found at user specified " "path {}".format(unusual_csv)) print("Using unusual file: "+dest) + with open(dest, 'r', encoding='utf-8-sig') as is_unusual_file: is_unusual_file = csv.DictReader(is_unusual_file) self.is_unusual_dic = {} @@ -73,6 +74,7 @@ def __init__(self, algorithm, apobec_csv, unusual_csv, sdrms_csv, mutation_csv): "Path to CSV file to determine SDRM mutations cannot be found at user specified " "path {}".format(sdrms_csv)) print("Using SDRM mutations file: "+dest) + with open(dest, 'r', encoding='utf-8-sig') as sdrm_files: sdrm_files = csv.DictReader(sdrm_files) self.sdrm_dic = {} @@ -333,8 +335,9 @@ def format_aligned_gene_sequences(self, ordered_mutation_list, check_sdrm, sdrm_aas = self.is_sdrm(gene, mutation[0], mutation[1]) - - if check_sdrm: + mutdict['isSDRM'] = check_sdrm + + if check_sdrm: dic['SDRMs'].append({'text': mutation[2] + str(mutation[0]) + sdrm_aas}) mutdict['hasStop'] = self.has_stop(mutation, mutation[3]) From 234567b830bf3e43735c2f2bbf4349c18cc1cdf6 Mon Sep 17 00:00:00 2001 From: WilliamZekaiWang Date: Tue, 9 Sep 2025 15:59:12 -0400 Subject: [PATCH 7/8] fix to exit script if invalid file --- sierralocal/main.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/sierralocal/main.py b/sierralocal/main.py index 8224e99..c052f71 100644 --- a/sierralocal/main.py +++ b/sierralocal/main.py @@ -240,7 +240,6 @@ def check_input(apobec_path, unusual_path, sdrms_path, mutation_path): "sdrms_csv": sdrms_path, "mutation_csv": mutation_path, } - for key, path in paths.items(): if path is None: continue @@ -256,6 +255,7 @@ def check_input(apobec_path, unusual_path, sdrms_path, mutation_path): f"Invalid header in {key} file '{path}'.\n" f"Expected: {exp[key]}\nFound: {header}" ) + sys.exit() def main(): # pragma: no cover From 4e1aa936796fe674ea9da5f851b4203d3ac9237a Mon Sep 17 00:00:00 2001 From: WilliamZekaiWang Date: Tue, 9 Sep 2025 16:11:52 -0400 Subject: [PATCH 8/8] fixes to handling X for is_sdrm --- sierralocal/jsonwriter.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sierralocal/jsonwriter.py b/sierralocal/jsonwriter.py index 66ed0f7..6e17af7 100644 --- a/sierralocal/jsonwriter.py +++ b/sierralocal/jsonwriter.py @@ -334,7 +334,8 @@ def format_aligned_gene_sequences(self, ordered_mutation_list, mutation[3]) check_sdrm, sdrm_aas = self.is_sdrm(gene, mutation[0], - mutation[1]) + mutation[1], + mutation[3]) mutdict['isSDRM'] = check_sdrm if check_sdrm: @@ -510,7 +511,7 @@ def is_apobec_drm(self, gene, consensus, position, AA): return True return False - def is_sdrm(self, gene, position, AA): + def is_sdrm(self, gene, position, AA, text): """ see if specific amino acid mutation is a sdrm through checking hivbd facts @param gene: str, RT, IN, PR @@ -518,6 +519,8 @@ def is_sdrm(self, gene, position, AA): @param AA: new amino acid @return: bool """ + if text == 'X': + return False, '' position = str(position) all_aas = '' found = False