diff --git a/sierralocal/hivdb.py b/sierralocal/hivdb.py index 2a99586..739532c 100644 --- a/sierralocal/hivdb.py +++ b/sierralocal/hivdb.py @@ -12,14 +12,14 @@ class HIVdb(): webserver, to retrieve the rules-based prediction algorithm as ASI XML, and convert this information into Python objects. """ - def __init__(self, asi2=None, apobec=None, forceupdate=False): + def __init__(self, asi2=None, apobec=None, forceupdate=False, updater_outdir=None): self.xml_filename = None self.json_filename = None if forceupdate: import sierralocal.updater as updater - self.xml_filename = updater.update_HIVDB() - self.json_filename = updater.update_APOBEC() + self.xml_filename = updater.update_hivdb(updater_outdir) + self.json_filename = updater.update_apobec_mutation(updater_outdir) else: self.set_hivdb_xml(asi2) self.set_apobec_json(apobec) diff --git a/sierralocal/jsonwriter.py b/sierralocal/jsonwriter.py index a6bf12d..c182561 100644 --- a/sierralocal/jsonwriter.py +++ b/sierralocal/jsonwriter.py @@ -10,7 +10,7 @@ class JSONWriter(): - def __init__(self, algorithm): + def __init__(self, algorithm, apobec_csv=None, unusual_csv=None, sdrms_csv=None, mutation_csv=None): # possible alternative drug abbrvs self.names = {'3TC': 'LMV'} @@ -39,7 +39,16 @@ def __init__(self, algorithm): self.rt_comments = dict(csv.reader(rt_file, delimiter='\t')) # make dictionary for isUnusual - dest = str(Path(os.path.dirname(__file__)) / 'data' / 'rx-all_subtype-all.csv') + if unusual_csv is None: + dest = str(Path(os.path.dirname(__file__)) / 'data' / 'rx-all_subtype-all.csv') + else: + if os.path.isfile(unusual_csv): # Ensure is a file + dest = unusual_csv + else: + raise FileNotFoundError( + "Path to CSV file to determine if is unusual cannot be found at user specified " + "path {}".format(unusual_csv)) + print("Using unusual file: "+dest) with open(dest, 'r', encoding='utf-8-sig') as is_unusual_file: is_unusual_file = csv.DictReader(is_unusual_file) self.is_unusual_dic = {} @@ -54,7 +63,16 @@ def __init__(self, 
algorithm): self.is_unusual_dic[gene].update({pos: {}}) self.is_unusual_dic[gene][pos].update({aa: unusual}) - dest = str(Path(os.path.dirname(__file__)) / 'data' / 'sdrms_hiv1.csv') + if sdrms_csv is None: + dest = str(Path(os.path.dirname(__file__)) / 'data' / 'sdrms_hiv1.csv') + else: + if os.path.isfile(sdrms_csv): # Ensure is a file + dest = sdrms_csv + else: + raise FileNotFoundError( + "Path to CSV file to determine SDRM mutations cannot be found at user specified " + "path {}".format(sdrms_csv)) + print("Using SDRM mutations file: "+dest) with open(dest, 'r', encoding='utf-8-sig') as sdrm_files: sdrm_files = csv.DictReader(sdrm_files) self.sdrm_dic = {} @@ -86,7 +104,17 @@ def __init__(self, algorithm): self.apobec_drm_dic[gene][position] += aa # make dictionary for primary type - dest = str(Path(os.path.dirname(__file__)) / 'data' / 'mutation-type-pairs_hiv1.csv') + if mutation_csv is None: + dest = str(Path(os.path.dirname(__file__)) / 'data' / 'mutation-type-pairs_hiv1.csv') + else: + if os.path.isfile(mutation_csv): # Ensure is a file + dest = mutation_csv + else: + raise FileNotFoundError( + "Path to CSV file to determine mutation type cannot be found at user specified " + "path {}".format(mutation_csv)) + + print("Using mutation type file: "+dest) with open(dest, 'r', encoding='utf-8-sig') as mut_type_pairs1_files: mut_type_pairs1_files = csv.DictReader(mut_type_pairs1_files) self.primary_type_dic = {} @@ -102,7 +130,16 @@ def __init__(self, algorithm): self.primary_type_dic[gene][pos].update({aa: mut}) # make dictionary for apobec mutations - dest = str(Path(os.path.dirname(__file__)) / 'data' / 'apobecs.csv') + if apobec_csv is None: + dest = str(Path(os.path.dirname(__file__)) / 'data' / 'apobecs.csv') + else: + if os.path.isfile(apobec_csv): # Ensure is a file + dest = apobec_csv + else: + raise FileNotFoundError( + "Path to CSV file with APOBEC cannot be found at user specified " + "path {}".format(apobec_csv)) + print("Using APOBEC file: "+dest) 
with open(dest, 'r', encoding='utf-8-sig') as apobec_mutations: apobec_mutations = csv.DictReader(apobec_mutations) self.apobec_mutations_dic = {} diff --git a/sierralocal/main.py b/sierralocal/main.py index a959afc..2412548 100644 --- a/sierralocal/main.py +++ b/sierralocal/main.py @@ -3,13 +3,13 @@ import time import argparse import json +from pathlib import Path from sierralocal import score_alg from sierralocal.hivdb import HIVdb from sierralocal.jsonwriter import JSONWriter from sierralocal.nucaminohook import NucAminoAligner - def score(filename, xml_path=None, tsv_path=None, forceupdate=False, do_subtype=False, program='post'): # pragma: no cover """ Functionality as a Python module. Can import this function from sierralocal. @@ -123,7 +123,8 @@ def scorefile(input_file, algorithm, do_subtype=False, program='post'): file_genes, sequence_lengths, file_trims, subtypes, na_sequence, ambiguous, gene_order def sierralocal(fasta, outfile, xml=None, json=None, cleanup=False, forceupdate=False, - program='post', do_subtype=False): # pragma: no cover + apobec_csv=None, unusual_csv=None, sdrms_csv=None, mutation_csv=None, + updater_outdir=None, program='post', do_subtype=False): # pragma: no cover """ Contains all initializing and processing calls. 
@@ -134,13 +135,17 @@ def sierralocal(fasta, outfile, xml=None, json=None, cleanup=False, forceupdate= @param json: str, path to local copy of HIVdb algorithm APOBEC DRM file @param cleanup: bool, to delete alignment file @param forceupdate: bool, forces sierralocal to update its local copy of the HIVdb algorithm + @param apobec_csv: str , Path to CSV APOBEC csv file (default: apobecs.csv) + @param unusual_csv: str , Path to CSV file to determine if is unusual (default: rx-all_subtype-all.csv) + @param sdrms_csv: str , Path to CSV file to determine SDRM mutations (default: sdrms_hiv1.csv) + @param mutation_csv: str , Path to CSV file to determine mutation type (default: mutation-type-pairs_hiv1.csv) @return: tuple, a tuple of (number of records processed, time elapsed initializing algorithm) """ # initialize algorithm and jsonwriter time0 = time.time() - algorithm = HIVdb(asi2=xml, apobec=json, forceupdate=forceupdate) - writer = JSONWriter(algorithm) + algorithm = HIVdb(asi2=xml, apobec=json, forceupdate=forceupdate, updater_outdir=updater_outdir) + writer = JSONWriter(algorithm, apobec_csv, unusual_csv, sdrms_csv, mutation_csv) time_elapsed = time.time() - time0 # accommodate single file path argument @@ -197,16 +202,36 @@ def parse_args(): # pragma: no cover help='Forces update of HIVdb algorithm. 
Requires network connection.') parser.add_argument('-alignment', default='post', choices=['post', 'nuc'], help='Alignment program to use, "post" for post align and "nuc" for nucamino') + parser.add_argument('-apobec_csv', default=None, + help=' Path to CSV APOBEC csv file (default: apobecs.csv)') + parser.add_argument('-unusual_csv', default=None, + help=' Path to CSV file to determine if is unusual (default: rx-all_subtype-all.csv)') + parser.add_argument('-sdrms_csv', default=None, + help=' Path to CSV file to determine SDRM mutations (default: sdrms_hiv1.csv)') + parser.add_argument('-mutation_csv', default=None, + help=' Path to CSV file to determine mutation type (default: mutation-type-pairs_hiv1.csv)') + parser.add_argument('-updater_outdir', default=None, + help=' Path to folder to store updated files from updater (default: sierralocal/data folder))') + args = parser.parse_args() return args - def main(): # pragma: no cover """ Main function called from CLI. """ args = parse_args() + mod_path = Path(os.path.dirname(__file__)) + + if args.updater_outdir: + target_dir = args.updater_outdir + else: + target_dir = os.path.join(mod_path, "data") + + # Create directory if it doesn't exist + os.makedirs(target_dir, exist_ok=True) + # check that FASTA files in list all exist for file in args.fasta: if not os.path.exists(file): @@ -216,6 +241,8 @@ def main(): # pragma: no cover time_start = time.time() count, time_elapsed = sierralocal(args.fasta, args.outfile, xml=args.xml, json=args.json, cleanup=args.cleanup, forceupdate=args.forceupdate, + apobec_csv=args.apobec_csv, unusual_csv=args.unusual_csv, + sdrms_csv=args.sdrms_csv, mutation_csv=args.mutation_csv, updater_outdir=target_dir, program=args.alignment) time_diff = time.time() - time_start diff --git a/sierralocal/updater.py b/sierralocal/updater.py index 1211dfc..523b43b 100644 --- a/sierralocal/updater.py +++ b/sierralocal/updater.py @@ -1,14 +1,14 @@ import requests from pathlib import Path import os - 
+import argparse mod_path = Path(os.path.dirname(__file__)) - -def update_apobec_mutation(): +def update_apobec_mutation(target_dir=None): """ Update the APOBEC DRMS file from the Github page + :param target_dir: optional directory to save files (defaults to the package data folder) :return: absolute path to the apobec drms JSON file """ # UPDATE APOBEC DRMS @@ -16,7 +16,7 @@ def update_apobec_mutation(): try: url = 'https://raw.githubusercontent.com/hivdb/hivfacts/main/data/apobecs/apobec_drms.json' - filepath = os.path.join(mod_path, "data", "apobec_drms.json") + filepath = os.path.join(target_dir or os.path.join(mod_path, "data"), "apobec_drms.json") request = requests.get(url, allow_redirects=True) with open(filepath, 'wb') as file: file.write(request.content) @@ -26,12 +26,11 @@ def update_apobec_mutation(): except: # pragma: no cover print("Unable to update APOBEC DRMs. Try manually downloading the APOBEC DRM JSON into data/apobec_drms.json") - - -def update_hivdb(): +def update_hivdb(target_dir=None): """ Query the HIVdb Github page for new ASI (algorithm specification interface) XML files. 
+ :param target_dir: optional directory to save files (defaults to the package data folder) :return: absolute path to new XML file """ print('Downloading the latest HIVDB XML File') @@ -39,7 +38,7 @@ def update_hivdb(): url = requests.get('https://raw.githubusercontent.com/hivdb/hivfacts/main/data/algorithms/HIVDB_latest.xml') file = url.text - filepath = os.path.join(mod_path, "data", file) + filepath = os.path.join(target_dir or os.path.join(mod_path, "data"), file) hivdb_latest = 'https://raw.githubusercontent.com/hivdb/hivfacts/main/data/algorithms/{}'.format(file) request = requests.get(hivdb_latest, allow_redirects=True) with open(filepath, 'wb') as file: @@ -53,26 +52,25 @@ def update_hivdb(): print(e) return None - -def update_is_unusual(): +def update_is_unusual(target_dir=None): + print('Downloading the latest file to determine is unusual') try: unusual_latest = 'https://raw.githubusercontent.com/hivdb/hivfacts/2021.3/data/aapcnt/rx-all_subtype-all.csv' request = requests.get(unusual_latest) - filepath = os.path.join(mod_path, "data", "rx-all_subtype-all.csv") + filepath = os.path.join(target_dir or os.path.join(mod_path, "data"), "rx-all_subtype-all.csv") with open(filepath, 'wb') as file: file.write(request.content) print(f'Updated is unusual file to {filepath}') - return filepath except: print('Could not update file for is unusual (rx-all_subtype-all.csv)\n' 'Please download manually from https://hivdb.stanford.edu/page/release-notes/#data.files') -def update_sdrms(): +def update_sdrms(target_dir=None): """ Query the HIVDB facts github page to find and update SDRM mutations file @return: file path of updated file @@ -81,19 +79,18 @@ def update_sdrms(): try: latest = 'https://raw.githubusercontent.com/hivdb/hivfacts/main/data/sdrms_hiv1.csv' request = requests.get(latest) - filepath = os.path.join(mod_path, "data", "sdrms_hiv1.csv") + filepath = os.path.join(target_dir or os.path.join(mod_path, "data"), "sdrms_hiv1.csv") with open(filepath, 'wb') as file: file.write(request.content) print(f'Updated SDRM mutations file to {filepath}') - return filepath except: print('Could not update 
file for SDRM Mutations (sdrms_hiv1.csv)\n' 'Please download manually from https://github.com/hivdb/hivfacts/tree/main/data') -def update_mutation_type(): +def update_mutation_type(target_dir=None): """ Query the HIVDB facts github page to find and update mutations type file @return: file path of updated file @@ -102,19 +99,18 @@ def update_mutation_type(): try: latest = 'https://raw.githubusercontent.com/hivdb/hivfacts/main/data/mutation-type-pairs_hiv1.csv' request = requests.get(latest) - filepath = os.path.join(mod_path, "data", "mutation-type-pairs_hiv1.csv") + filepath = os.path.join(target_dir or os.path.join(mod_path, "data"), "mutation-type-pairs_hiv1.csv") with open(filepath, 'wb') as file: file.write(request.content) print(f'Updated mutation type file to {filepath}') - return filepath except: print('Could not update file for mutation type (mutation-type-pairs_hiv1.csv)\n' 'Please download manually from https://github.com/hivdb/hivfacts/tree/main/data') -def update_apobec(): +def update_apobec(target_dir=None): """ Query the HIVDB facts github page to find and update apobec file @return: file path of updated file @@ -123,19 +119,18 @@ def update_apobec(): try: latest = 'https://raw.githubusercontent.com/hivdb/hivfacts/main/data/apobecs/apobecs.csv' request = requests.get(latest) - filepath = os.path.join(mod_path, "data", "apobecs.csv") + filepath = os.path.join(target_dir or os.path.join(mod_path, "data"), "apobecs.csv") with open(filepath, 'wb') as file: file.write(request.content) print(f'Updated apobecs file to {filepath}') - return filepath except: print('Could not update file for apobecs (apobecs.csv)\n' 'Please download manually from https://github.com/hivdb/hivfacts/tree/main/data') -def update_reference_fasta(): +def update_reference_fasta(target_dir=None): """ update reference fasta file for subtyper script """ @@ -143,7 +138,7 @@ def update_reference_fasta(): try: latest = "https://cms.hivdb.org/prod/downloads/hiv-genotyper/genotype-references.fasta" request = requests.get(latest) - filepath = 
os.path.join(mod_path, 'data', 'genotype-references.fasta') + filepath = os.path.join(target_dir or os.path.join(mod_path, 'data'), 'genotype-references.fasta') with open(filepath, 'wb') as file: file.write(request.content) @@ -151,7 +146,7 @@ def update_reference_fasta(): except: print("Couldn't update subtyper reference fasta, please get manually at: https://hivdb.stanford.edu/page/hiv-subtyper/") -def update_genotype_properties(): +def update_genotype_properties(target_dir=None): """ update genotype property file for subtyper script """ @@ -159,7 +154,7 @@ def update_genotype_properties(): try: latest = 'https://cms.hivdb.org/prod/downloads/hiv-genotyper/genotype-properties.tsv' request = requests.get(latest) - filepath = os.path.join(mod_path, 'data', 'genotype-properties.csv') + filepath = os.path.join(target_dir or os.path.join(mod_path, 'data'), 'genotype-properties.csv') with open(filepath, 'wb') as file: file.write(request.content) @@ -167,15 +162,29 @@ def update_genotype_properties(): except: print("Couldn't update subtyper genotype property file, please get manually at: https://hivdb.stanford.edu/page/hiv-subtyper/") -def main(): # pragma: no cover - update_hivdb() - update_apobec() - update_is_unusual() - update_sdrms() - update_mutation_type() - update_apobec_mutation() - update_genotype_properties() - update_reference_fasta() +def main(updater_outdir=None): # pragma: no cover + update_hivdb(updater_outdir) + update_apobec(updater_outdir) + update_is_unusual(updater_outdir) + update_sdrms(updater_outdir) + update_mutation_type(updater_outdir) + update_apobec_mutation(updater_outdir) + update_genotype_properties(updater_outdir) + update_reference_fasta(updater_outdir) if __name__ == '__main__': - main() + # Add argument parsing for when running updater.py directly + parser = argparse.ArgumentParser(description='Update HIVdb data files') + parser.add_argument('-updater_outdir', default=None, + help=' Path to folder to store updated files from updater (default: sierralocal/data folder))') + args = parser.parse_args() + 
if args.updater_outdir: + target_dir = args.updater_outdir + else: + target_dir = os.path.join(mod_path, "data") + + # Create directory if it doesn't exist + os.makedirs(target_dir, exist_ok=True) + + main(updater_outdir=target_dir) \ No newline at end of file