From 0a6a3c71b4f0b498a15289cf44774e451ef69a77 Mon Sep 17 00:00:00 2001
From: Bradford Powell
Date: Sun, 16 Jun 2024 20:41:53 -0400
Subject: [PATCH] remove some files that were probably committed in error

---
 pangolin/.DS_Store                    | Bin 8196 -> 0 bytes
 pangolin/.fuse_hidden0000252700000002 | 257 --------------------------
 2 files changed, 257 deletions(-)
 delete mode 100644 pangolin/.DS_Store
 delete mode 100644 pangolin/.fuse_hidden0000252700000002

diff --git a/pangolin/.DS_Store b/pangolin/.DS_Store
deleted file mode 100644
index ae7ccd48a7a3e9c09ff410227e447a1225856239..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 8196

diff --git a/pangolin/.fuse_hidden0000252700000002 b/pangolin/.fuse_hidden0000252700000002
deleted file mode 100644
--- a/pangolin/.fuse_hidden0000252700000002
+++ /dev/null
-                if len(ref)>len(alt):
-                    alt = np.concatenate([alt[0:l//2+1],np.zeros(ndiff),alt[l//2+1:]])
-                elif len(ref)<len(alt):
-                    alt = np.concatenate([alt[0:l//2],np.max(alt[l//2:l//2+ndiff+1], keepdims=True),alt[l//2+ndiff+1:]])
-                score.append(alt-ref)
-        pangolin.append(np.mean(score, axis=0))
-
-    pangolin = np.array(pangolin)
-    loss = pangolin[np.argmin(pangolin, axis=0), np.arange(pangolin.shape[1])]
-    gain = pangolin[np.argmax(pangolin, axis=0), np.arange(pangolin.shape[1])]
-    return loss, gain
-
-
-def get_genes(chr, pos, gtf):
-    genes = gtf.region((chr, pos-1, pos-1), featuretype="gene")
-    genes_pos, genes_neg = {}, {}
-
-    for gene in genes:
-        if gene[3] > pos or gene[4] < pos:
-            continue
-        gene_id = gene["gene_id"][0]
-        exons = []
-        for exon in gtf.children(gene, featuretype="exon"):
-            exons.extend([exon[3], exon[4]])
-        if gene[6] == '+':
-            genes_pos[gene_id] = exons
-        elif gene[6] == '-':
-            genes_neg[gene_id] = exons
-
-    return (genes_pos, genes_neg)
-
-
-def process_variant(lnum, chr, pos, ref, alt, gtf, models, args):
-    d = args.distance
-    cutoff = args.score_cutoff
-
-    if len(set("ACGT").intersection(set(ref))) == 0 or len(set("ACGT").intersection(set(alt))) == 0 \
-            or (len(ref) != 1 and len(alt) != 1 and len(ref) != len(alt)):
-        print("[Line %s]" % lnum, "WARNING, skipping variant: Variant format not supported.")
-        return -1
-    elif len(ref) > 2*d:
-        print("[Line %s]" % lnum, "WARNING, skipping variant: Deletion too large")
-        return -1
-
-    fasta = pyfastx.Fasta(args.reference_file)
-    # try to make vcf chromosomes compatible with reference chromosomes
-    if chr not in fasta.keys() and "chr"+chr in fasta.keys():
-        chr = "chr"+chr
-    elif chr not in fasta.keys() and chr[3:] in fasta.keys():
-        chr = chr[3:]
-
-    try:
-        seq = fasta[chr][pos-5001-d:pos+len(ref)+4999+d].seq
-    except Exception as e:
-        print(e)
-        print("[Line %s]" % lnum, "WARNING, skipping variant: Could not get sequence, possibly because the variant is too close to chromosome ends. "
-              "See error message above.")
-        return -1
-
-    if seq[5000+d:5000+d+len(ref)] != ref:
-        print("[Line %s]" % lnum, "WARNING, skipping variant: Mismatch between FASTA (ref base: %s) and variant file (ref base: %s)."
-              % (seq[5000+d:5000+d+len(ref)], ref))
-        return -1
-
-    ref_seq = seq
-    alt_seq = seq[:5000+d] + alt + seq[5000+d+len(ref):]
-
-    # get genes that intersect variant
-    genes_pos, genes_neg = get_genes(chr, pos, gtf)
-    if len(genes_pos)+len(genes_neg)==0:
-        print("[Line %s]" % lnum, "WARNING, skipping variant: Variant not contained in a gene body. Do GTF/FASTA chromosome names match?")
-        return -1
-
-    # get splice scores
-    loss_pos, gain_pos = None, None
-    if len(genes_pos) > 0:
-        loss_pos, gain_pos = compute_score(ref_seq, alt_seq, '+', d, models)
-    loss_neg, gain_neg = None, None
-    if len(genes_neg) > 0:
-        loss_neg, gain_neg = compute_score(ref_seq, alt_seq, '-', d, models)
-
-    scores = ""
-    for (genes, loss, gain) in \
-            ((genes_pos,loss_pos,gain_pos),(genes_neg,loss_neg,gain_neg)):
-        for gene, positions in genes.items():
-            warnings = "Warnings:"
-
-            if args.mask == "True" and len(positions) != 0:
-                positions = np.array(positions)
-                positions = positions - (pos - d)
-
-                positions_filt = positions[(positions>=0) & (positions<len(loss))]
-                # set splice gain at annotated sites to 0
-                gain[positions_filt] = np.minimum(gain[positions_filt], 0)
-                # set splice loss at unannotated sites to 0
-                not_positions = ~np.isin(np.arange(len(loss)), positions_filt)
-                loss[not_positions] = np.maximum(loss[not_positions], 0)
-
-            if cutoff != None:
-                scores = scores+gene+'|'
-                l, g = np.where(loss<=-cutoff)[0], np.where(gain>=cutoff)[0]
-                for p, s in zip(np.concatenate([g-d,l-d]), np.concatenate([gain[g],loss[l]])):
-                    scores += "%s:%s|" % (p, round(s,2))
-
-            else:
-                scores = scores+gene+'|'
-                l, g = np.argmin(loss), np.argmax(gain),
-                scores += "%s:%s|%s:%s|" % (g-d, round(gain[g],2), l-d, round(loss[l],2))
-
-            scores += warnings
-
-    return scores.strip('|')
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("variant_file", help="VCF or CSV file with a header (see COLUMN_IDS option).")
-    parser.add_argument("reference_file", help="FASTA file containing a reference genome sequence.")
-    parser.add_argument("annotation_file", help="gffutils database file. Can be generated using create_db.py.")
-    parser.add_argument("output_file", help="Prefix for output file. Will be a VCF/CSV if variant_file is VCF/CSV.")
-    parser.add_argument("-c", "--column_ids", default="CHROM,POS,REF,ALT", help="(If variant_file is a CSV) Column IDs for: chromosome, variant position, reference bases, and alternative bases. "
-                        "Separate IDs by commas. (Default: CHROM,POS,REF,ALT)")
-    parser.add_argument("-m", "--mask", default="True", choices=["False","True"], help="If True, splice gains (increases in score) at annotated splice sites and splice losses (decreases in score) at unannotated splice sites will be set to 0. (Default: True)")
-    parser.add_argument("-s", "--score_cutoff", type=float, help="Output all sites with absolute predicted change in score >= cutoff, instead of only the maximum loss/gain sites.")
-    parser.add_argument("-d", "--distance", type=int, default=50, help="Number of bases on either side of the variant for which splice scores should be calculated. (Default: 50)")
-    #parser.add_argument("--score_exons", default="False", choices=["False","True"], help="Output changes in score for both splice sites of annotated exons, as long as one splice site is within the considered range (specified by -d). Output will be: gene|site1_pos:score|site2_pos:score|...")
-    args = parser.parse_args()
-
-    variants = args.variant_file
-    gtf = args.annotation_file
-    try:
-        gtf = gffutils.FeatureDB(gtf)
-    except:
-        print("ERROR, annotation_file could not be opened. Is it a gffutils database file?")
-        exit()
-
-    if torch.cuda.is_available():
-        print("Using GPU")
-    else:
-        print("Using CPU")
-
-    models = []
-    for i in [0,2,4,6]:
-        for j in range(1,4):
-            model = Pangolin(L, W, AR)
-            if torch.cuda.is_available():
-                model.cuda()
-                weights = torch.load(resource_filename(__name__,"models/final.%s.%s.3.v2" % (j, i)))
-            else:
-                weights = torch.load(resource_filename(__name__,"models/final.%s.%s.3.v2" % (j, i)), map_location=torch.device('cpu'))
-            model.load_state_dict(weights)
-            model.eval()
-            models.append(model)
-
-    if variants.endswith(".vcf"):
-        lnum = 0
-        # count the number of header lines
-        for line in open(variants, 'r'):
-            lnum += 1
-            if line[0] != '#':
-                break
-
-        variants = vcf.Reader(filename=variants)
-        variants.infos["Pangolin"] = vcf.parser._Info(
-            "Pangolin",'.',"String","Pangolin splice scores. "
-            "Format: gene|pos:score_change|pos:score_change|...",'.','.')
-        fout = vcf.Writer(open(args.output_file+".vcf", 'w'), variants)
-
-        for i, variant in enumerate(variants):
-            scores = process_variant(lnum+i, str(variant.CHROM), int(variant.POS), variant.REF, str(variant.ALT[0]), gtf, models, args)
-            if scores != -1:
-                variant.INFO["Pangolin"] = scores
-            fout.write_record(variant)
-            fout.flush()
-
-        fout.close()
-
-    elif variants.endswith(".csv"):
-        col_ids = args.column_ids.split(',')
-        variants = pd.read_csv(variants, header=0)
-        fout = open(args.output_file+".csv", 'w')
-        fout.write(','.join(variants.columns)+',Pangolin\n')
-        fout.flush()
-
-        for lnum, variant in variants.iterrows():
-            chr, pos, ref, alt = variant[col_ids]
-            ref, alt = ref.upper(), alt.upper()
-            scores = process_variant(lnum+1, str(chr), int(pos), ref, alt, gtf, models, args)
-            if scores == -1:
-                fout.write(','.join(variant.to_csv(header=False, index=False).split('\n'))+'\n')
-            else:
-                fout.write(','.join(variant.to_csv(header=False, index=False).split('\n'))+scores+'\n')
-            fout.flush()
-
-        fout.close()
-
-    else:
-        print("ERROR, variant_file needs to be a CSV or VCF.")
-
-    # executionTime = (time.time() - startTime)
-    # print('Execution time in seconds: ' + str(executionTime))
-
-if __name__ == '__main__':
-    main()
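
An illustrative follow-up, not part of this patch: assuming the repository does not already ignore these artifacts (the presence and contents of a top-level .gitignore are an assumption), entries along these lines would keep macOS Finder metadata and stray FUSE placeholder files out of future commits:

    # macOS Finder metadata (hypothetical .gitignore entry)
    .DS_Store
    # placeholder files left by FUSE when an open file is unlinked (hypothetical .gitignore entry)
    .fuse_hidden*

Ignore rules only affect untracked paths, so files already in the index still have to be removed explicitly, as this patch does (or with git rm --cached), for the rules to take effect.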