From 0ef98a7f3750c407c3ced4665bfedf4b99e4b78a Mon Sep 17 00:00:00 2001 From: dariober Date: Tue, 21 Aug 2018 14:11:01 +0100 Subject: [PATCH 01/13] Edit pysam version to be compatible with python3 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 99b373c..a890ffd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ pep8==1.7.0 py==1.4.32 pylint==1.7.2 pyparsing==2.2.0 -pysam==0.7.7 +pysam==0.15.0 pytest==3.0.7 pytest-cov>=2.2.0 radon==2.0.2 From 30622439010cd1a6f3eac34aa38260c1d9170695 Mon Sep 17 00:00:00 2001 From: dariober Date: Wed, 22 Aug 2018 12:03:50 +0100 Subject: [PATCH 02/13] Edits fixing the main CAVA program * On root directory run `2to3 -w ./` which will fix most of the incompatiblities. 2to3 ships with python * Open gzip files in `rt` mode [`gzip.open(), 'rt'`] so that each line is returned as string instead of as binary * Integer division is explicitly cast to integer where an integer is expected * Major version bump to 2.0.0 --- bin/CAVA.py | 2 +- cava_/core.py | 128 +++++++++++++++++++++++++------------------------- cava_/csn.py | 12 ++--- cava_/data.py | 56 +++++++++++----------- cava_/main.py | 55 ++++++++++------------ install.sh | 2 +- setup.py | 2 +- 7 files changed, 127 insertions(+), 130 deletions(-) diff --git a/bin/CAVA.py b/bin/CAVA.py index 102b625..d1c910b 100755 --- a/bin/CAVA.py +++ b/bin/CAVA.py @@ -5,7 +5,7 @@ from cava_ import helper # Version -version = 'v1.2.3' +version = 'v2.0.0' # Read default configuration file name from the default_config_path file default_config_file = helper.defaultConfigPath() diff --git a/cava_/core.py b/cava_/core.py index 9049b54..0e2c291 100755 --- a/cava_/core.py +++ b/cava_/core.py @@ -5,7 +5,7 @@ ####################################################################################################################### -from __future__ import division + import os import logging import gzip @@ -342,7 +342,7 @@ def output(self, outformat, outfile, options, genelist, transcriptlist, snplist, # Writing record to the output file if stdout: - print '\t'.join(record) + print('\t'.join(record)) else: outfile.write('\t'.join(record) + '\n') @@ -381,7 +381,7 @@ def output(self, outformat, outfile, options, genelist, transcriptlist, snplist, # Writing record to the output file if stdout: - print record + rest + print(record + rest) else: outfile.write(record + rest + '\n') @@ -748,13 +748,13 @@ def read(self): line = line.strip() if line.startswith('@'): key = line[1:line.index('=')].strip() - if key in self.defs.keys(): + if key in list(self.defs.keys()): (typeofvar, default) = self.defs[key] if typeofvar == 'string': self.args[key] = line[line.find('=') + 1:].strip() if typeofvar == 'list': self.args[key] = line[line.find('=') + 1:].strip().split(',') if typeofvar == 'boolean': self.args[key] = (line[line.find('=') + 1:].strip().upper() == 'TRUE') - for key, (typeofvar, default) in self.defs.iteritems(): - if not key in self.args.keys(): self.args[key] = default + for key, (typeofvar, default) in self.defs.items(): + if not key in list(self.args.keys()): self.args[key] = default ####################################################################################################################### @@ -764,7 +764,7 @@ def read(self): # Reading gene, transcript or snp list from file def readSet(options, tag): ret = set() - if tag in options.args.keys() and not (options.args[tag] == '' or options.args[tag] == '.'): + if tag in list(options.args.keys()) and not (options.args[tag] == '' or options.args[tag] == '.'): for line in open(options.args[tag]): line = line.strip() if line == '' or line == '.': continue @@ -807,7 +807,7 @@ def writeHeader(options, header, outfile, stdout): if options.args['outputformat'] == 'VCF': if header == '': if stdout: - print '##fileformat=VCFv4.1\n' + dateline + '\n' + headerinfo + '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO' + print('##fileformat=VCFv4.1\n' + dateline + '\n' + headerinfo + '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO') else: outfile.write('##fileformat=VCFv4.1\n' + dateline + '\n' + headerinfo + '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n') else: @@ -821,7 +821,7 @@ def writeHeader(options, header, outfile, stdout): continue headerm.append(x) if stdout: - print startline + '\n' + dateline + '\n' + headerinfo + '\n'.join(headerm)+'\n' + print(startline + '\n' + dateline + '\n' + headerinfo + '\n'.join(headerm)+'\n') else: outfile.write(startline + '\n' + dateline + '\n' + headerinfo + '\n'.join(headerm)+'\n') @@ -845,7 +845,7 @@ def writeHeader(options, header, outfile, stdout): str += '\tDBSNP' if stdout: - print str + print(str) else: outfile.write(str + '\n') @@ -854,7 +854,7 @@ def writeHeader(options, header, outfile, stdout): def countRecords(filename): ret = 0 if filename.endswith('.gz'): - inputf = gzip.open(filename, 'r') + inputf = gzip.open(filename, 'rt') else: inputf = open(filename) for line in inputf: @@ -868,10 +868,10 @@ def checkOptions(options): # Checking if @inputformat was given correct value str = options.args['inputformat'].upper() if not (str == 'VCF' or str == 'TXT'): - print 'ERROR: incorrect value of the tag @inputformat.' - print '(Allowed values: \'VCF\' or \'TXT\')' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: incorrect value of the tag @inputformat.') + print('(Allowed values: \'VCF\' or \'TXT\')') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('Incorrect value of the tag @inputformat.') logging.info('No output file written. CAVA quit.') @@ -880,10 +880,10 @@ def checkOptions(options): # Checking if @outputformat was given correct value str = options.args['outputformat'].upper() if not (str == 'VCF' or str == 'TSV'): - print 'ERROR: incorrect value of the tag @outputformat.' - print '(Allowed values: \'VCF\' or \'TSV\')' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: incorrect value of the tag @outputformat.') + print('(Allowed values: \'VCF\' or \'TSV\')') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('Incorrect value of the tag @outputformat.') logging.info('No output file written. CAVA quit.') @@ -893,10 +893,10 @@ def checkOptions(options): str = options.args['type'].upper() if not ( str == 'ALL' or str == 'SUBSTITUTION' or str == 'INDEL' or str == 'INSERTION' or str == 'DELETION' or str == 'COMPLEX'): - print 'ERROR: incorrect value of the tag @type.' - print '(Allowed values: \'all\', \'substitution\', \'indel\', \'insertion\', \'deletion\' or \'complex\')' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: incorrect value of the tag @type.') + print('(Allowed values: \'all\', \'substitution\', \'indel\', \'insertion\', \'deletion\' or \'complex\')') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('Incorrect value of the tag @type.') logging.info('No output file written. CAVA quit.') @@ -905,10 +905,10 @@ def checkOptions(options): # Checking if @ssrange was given correct value ssrange = int(options.args['ssrange']) if not ssrange >= 6: - print 'ERROR: incorrect value of the tag @ssrange.' - print '(Minimum value allowed is 6.)' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: incorrect value of the tag @ssrange.') + print('(Minimum value allowed is 6.)') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('Incorrect value of the tag @ssrange.') logging.info('No output file written. CAVA quit.') @@ -917,10 +917,10 @@ def checkOptions(options): # Checking if @ontology was given correct value str = options.args['ontology'].upper() if not (str == 'CLASS' or str == 'SO' or str == 'BOTH'): - print 'ERROR: incorrect value of the tag @ontology.' - print '(Allowed values: \'CLASS\' or \'SO\' or \'both\')' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: incorrect value of the tag @ontology.') + print('(Allowed values: \'CLASS\' or \'SO\' or \'both\')') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('Incorrect value of the tag @ontology.') logging.info('No output file written. CAVA quit.') @@ -928,9 +928,9 @@ def checkOptions(options): # Checking if @reference file exists if not os.path.isfile(options.args['reference']): - print 'ERROR: the file given as @reference does not exist.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the file given as @reference does not exist.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The file given as @reference does not exist.') logging.info('No output file written. CAVA quit.') @@ -938,9 +938,9 @@ def checkOptions(options): # Checking if @reference index file exists if not os.path.isfile(options.args['reference'] + '.fai'): - print 'ERROR: the .fa.fai index file for @reference is not found.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the .fa.fai index file for @reference is not found.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The .fa.fai index file for @reference is not found.') logging.info('No output file written. CAVA quit.') @@ -949,9 +949,9 @@ def checkOptions(options): # Checking if @ensembl file exists if not (options.args['ensembl'] == '.' or options.args['ensembl'] == '') and not os.path.isfile( options.args['ensembl']): - print 'ERROR: the file given as @ensembl does not exist.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the file given as @ensembl does not exist.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The file given as @ensembl does not exist.') logging.info('No output file written. CAVA quit.') @@ -960,9 +960,9 @@ def checkOptions(options): # Checking if @ensembl index file exists if not (options.args['ensembl'] == '.' or options.args['ensembl'] == '') and not os.path.isfile( options.args['ensembl'] + '.tbi'): - print 'ERROR: the .gz.tbi index file for @ensembl is not found.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the .gz.tbi index file for @ensembl is not found.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The .gz.tbi index file for @ensembl is not found.') logging.info('No output file written. CAVA quit.') @@ -970,9 +970,9 @@ def checkOptions(options): # Checking if @dbsnp file exists if not (options.args['dbsnp'] == '.' or options.args['dbsnp'] == '') and not os.path.isfile(options.args['dbsnp']): - print 'ERROR: the file given as @dbsnp does not exist.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the file given as @dbsnp does not exist.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The file given as @dbsnp does not exist.') logging.info('No output file written. CAVA quit.') @@ -981,9 +981,9 @@ def checkOptions(options): # Checking if @dbsnp index file exists if not (options.args['dbsnp'] == '.' or options.args['dbsnp'] == '') and not os.path.isfile( options.args['dbsnp'] + '.tbi'): - print 'ERROR: the .gz.tbi index file for @dbsnp is not found.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the .gz.tbi index file for @dbsnp is not found.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The .gz.tbi index file for @dbsnp is not found.') logging.info('No output file written. CAVA quit.') @@ -992,9 +992,9 @@ def checkOptions(options): # Checking if @target file exists if not (options.args['target'] == '.' or options.args['target'] == '') and not os.path.isfile( options.args['target']): - print 'ERROR: the file given as @target does not exist.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the file given as @target does not exist.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The file given as @target does not exist.') logging.info('No output file written. CAVA quit.') @@ -1003,9 +1003,9 @@ def checkOptions(options): # Checking if @target index file exists if not (options.args['target'] == '.' or options.args['target'] == '') and not os.path.isfile( options.args['target'] + '.tbi'): - print 'ERROR: the .bed.tbi index file for @target is not found.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the .bed.tbi index file for @target is not found.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The .bed.tbi index file for @target is not found.') logging.info('No output file written. CAVA quit.') @@ -1014,9 +1014,9 @@ def checkOptions(options): # Checking if @genelist file exists if not (options.args['genelist'] == '.' or options.args['genelist'] == '') and not os.path.isfile( options.args['genelist']): - print 'ERROR: the file given as @genelist does not exist.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the file given as @genelist does not exist.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The file given as @genelist does not exist.') logging.info('No output file written. CAVA quit.') @@ -1025,9 +1025,9 @@ def checkOptions(options): # Checking if @transcriptlist file exists if not (options.args['transcriptlist'] == '.' or options.args['transcriptlist'] == '') and not os.path.isfile( options.args['transcriptlist']): - print 'ERROR: the file given as @transcriptlist does not exist.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the file given as @transcriptlist does not exist.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The file given as @transcriptlist does not exist.') logging.info('No output file written. CAVA quit.') @@ -1036,9 +1036,9 @@ def checkOptions(options): # Checking if @snplist file exists if not (options.args['snplist'] == '.' or options.args['snplist'] == '') and not os.path.isfile( options.args['snplist']): - print 'ERROR: the file given as @snplist does not exist.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the file given as @snplist does not exist.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The file given as @snplist does not exist.') logging.info('No output file written. CAVA quit.') diff --git a/cava_/csn.py b/cava_/csn.py index 89fc1e5..3426f7e 100755 --- a/cava_/csn.py +++ b/cava_/csn.py @@ -4,7 +4,7 @@ # CSN annotation ####################################################################################################################### -import core +from . import core # Class representing a CSN annotation @@ -175,7 +175,7 @@ def makeProteinString(variant, transcript, reference, prot, mutprot, coord1): # Checking if there was no change in protein sequence if prot == mutprot: - idx = coord1 / 3 + idx = int(coord1 / 3) if coord1 % 3 > 0: idx += 1 return '_p.=', (str(idx), prot[idx-1], prot[idx-1]) @@ -281,7 +281,7 @@ def transformToCSNCoordinate(pos, transcript): if transcript.strand == 1: # Checking if genomic position is within intron if prevExonEnd < pos < exon.start + 1: - if pos <= (exon.start + 1 - prevExonEnd) / 2 + prevExonEnd: + if pos <= int((exon.start + 1 - prevExonEnd) / 2) + prevExonEnd: x, y = transformToCSNCoordinate(prevExonEnd, transcript) return x, pos - prevExonEnd else: @@ -290,7 +290,7 @@ def transformToCSNCoordinate(pos, transcript): else: # Checking if genomic position is within intron if exon.end < pos < prevExonEnd: - if pos >= (prevExonEnd - exon.end + 1) / 2 + exon.end: + if pos >= int((prevExonEnd - exon.end + 1) / 2) + exon.end: x, y = transformToCSNCoordinate(prevExonEnd, transcript) return x, prevExonEnd - pos else: @@ -320,7 +320,7 @@ def transformToCSNCoordinate(pos, transcript): if transcript.strand == 1: # Checking if genomic position is within intron if prevExonEnd < pos < exon.start + 1: - if pos <= (exon.start + 1 - prevExonEnd) / 2 + prevExonEnd: + if pos <= int((exon.start + 1 - prevExonEnd) / 2) + prevExonEnd: x, y = transformToCSNCoordinate(prevExonEnd, transcript) return x, pos - prevExonEnd else: @@ -329,7 +329,7 @@ def transformToCSNCoordinate(pos, transcript): else: # Checking if genomic position is within intron if exon.end < pos < prevExonEnd: - if pos >= (prevExonEnd - exon.end + 1) / 2 + exon.end: + if pos >= int((prevExonEnd - exon.end + 1) / 2) + exon.end: x, y = transformToCSNCoordinate(prevExonEnd, transcript) return x, prevExonEnd - pos else: diff --git a/cava_/data.py b/cava_/data.py index 5fa95ef..ee5671a 100755 --- a/cava_/data.py +++ b/cava_/data.py @@ -7,9 +7,9 @@ import os import sys -import conseq -import core -import csn +from . import conseq +from . import core +from . import csn sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + '/pysamdir') import pysam @@ -69,22 +69,22 @@ def findTranscripts(self, variant, strand, reference): hitdict2[transcript.TRANSCRIPT] = transcript # Find transcripts with which the variant fully or partially overlaps - for key, transcript in hitdict1.iteritems(): + for key, transcript in hitdict1.items(): if len(self.genelist) > 0 and transcript.geneSymbol not in self.genelist: continue if len(self.transcriptlist) > 0 and transcript.TRANSCRIPT not in self.transcriptlist: continue - if key in hitdict2.keys(): + if key in list(hitdict2.keys()): ret[key] = transcript else: if not variant.isInsertion(): retOUT[key] = transcript if not variant.isInsertion(): - for key, transcript in hitdict2.iteritems(): + for key, transcript in hitdict2.items(): if len(self.genelist) > 0 and transcript.geneSymbol not in self.genelist: continue if len(self.transcriptlist) > 0 and transcript.TRANSCRIPT not in self.transcriptlist: continue - if not key in hitdict1.keys(): + if not key in list(hitdict1.keys()): retOUT[key] = transcript else: @@ -190,8 +190,8 @@ def annotate(self, variant, reference, impactdir): transcripts_plus, transcriptsOUT_plus = self.findTranscripts(variant_plus, 1, reference) transcripts_minus, transcriptsOUT_minus = self.findTranscripts(variant_minus, -1, reference) - transcripts = set(transcripts_plus.keys() + transcripts_minus.keys()) - transcriptsOUT = set(transcriptsOUT_plus.keys() + transcriptsOUT_minus.keys()) + transcripts = set(list(transcripts_plus.keys()) + list(transcripts_minus.keys())) + transcriptsOUT = set(list(transcriptsOUT_plus.keys()) + list(transcriptsOUT_minus.keys())) transcripts = sorted(list(transcripts)) transcriptsOUT = sorted(list(transcriptsOUT)) @@ -201,7 +201,7 @@ def annotate(self, variant, reference, impactdir): if TRANSCRIPT in transcripts: continue - if TRANSCRIPT in transcriptsOUT_plus.keys(): transcript = transcriptsOUT_plus[TRANSCRIPT] + if TRANSCRIPT in list(transcriptsOUT_plus.keys()): transcript = transcriptsOUT_plus[TRANSCRIPT] else: transcript = transcriptsOUT_minus[TRANSCRIPT] if len(TRANSCRIPTstring) > 0: @@ -228,10 +228,10 @@ def annotate(self, variant, reference, impactdir): TRINFOstring += transcript.TRINFO if transcript.strand == 1: - if TRANSCRIPT in transcriptsOUT_plus.keys(): LOCstring += 'OUT' + if TRANSCRIPT in list(transcriptsOUT_plus.keys()): LOCstring += 'OUT' else: LOCstring += '.' else: - if TRANSCRIPT in transcriptsOUT_minus.keys(): LOCstring += 'OUT' + if TRANSCRIPT in list(transcriptsOUT_minus.keys()): LOCstring += 'OUT' else: LOCstring += '.' CSNstring += '.' @@ -250,7 +250,7 @@ def annotate(self, variant, reference, impactdir): # Iterating through the list of transcripts for TRANSCRIPT in transcripts: - if TRANSCRIPT in transcripts_plus.keys(): transcript = transcripts_plus[TRANSCRIPT] + if TRANSCRIPT in list(transcripts_plus.keys()): transcript = transcripts_plus[TRANSCRIPT] else: transcript = transcripts_minus[TRANSCRIPT] # Separating annotations by different transcripts with colon @@ -279,13 +279,13 @@ def annotate(self, variant, reference, impactdir): TRINFOstring += transcript.TRINFO # Creating the LOC annotation - if TRANSCRIPT in transcripts_plus.keys(): loc_plus = transcript.whereIsThisVariant(variant_plus) - elif TRANSCRIPT in transcriptsOUT_plus.keys(): loc_plus = 'OUT' + if TRANSCRIPT in list(transcripts_plus.keys()): loc_plus = transcript.whereIsThisVariant(variant_plus) + elif TRANSCRIPT in list(transcriptsOUT_plus.keys()): loc_plus = 'OUT' else: loc_plus = '.' if difference: - if TRANSCRIPT in transcripts_minus.keys(): loc_minus = transcript.whereIsThisVariant(variant_minus) - elif TRANSCRIPT in transcriptsOUT_minus.keys(): loc_minus = 'OUT' + if TRANSCRIPT in list(transcripts_minus.keys()): loc_minus = transcript.whereIsThisVariant(variant_minus) + elif TRANSCRIPT in list(transcriptsOUT_minus.keys()): loc_minus = 'OUT' else: loc_minus = '.' else: loc_minus = loc_plus @@ -299,7 +299,7 @@ def annotate(self, variant, reference, impactdir): if notexonic_plus and notexonic_minus: protein = '' else: - if not transcript.TRANSCRIPT in self.proteinSeqs.keys(): + if not transcript.TRANSCRIPT in list(self.proteinSeqs.keys()): protein, exonseqs = transcript.getProteinSequence(reference, None, None) if len(self.proteinSeqs) > 5: @@ -327,13 +327,13 @@ def annotate(self, variant, reference, impactdir): # Creating the CSN annotations both for left and right aligned variant - if TRANSCRIPT in transcripts_plus.keys(): + if TRANSCRIPT in list(transcripts_plus.keys()): csn_plus, protchange_plus = csn.getAnnotation(variant_plus, transcript, reference, protein, mutprotein_plus) csn_plus_str = csn_plus.getAsString() else: csn_plus_str, protchange_plus = '.', ('.','.','.') if difference: - if TRANSCRIPT in transcripts_minus.keys(): + if TRANSCRIPT in list(transcripts_minus.keys()): csn_minus, protchange_minus = csn.getAnnotation(variant_minus, transcript, reference, protein, mutprotein_minus) csn_minus_str = csn_minus.getAsString() else: csn_minus_str, protchange_minus = '.', ('.','.','.') @@ -353,12 +353,12 @@ def annotate(self, variant, reference, impactdir): if not impactdir == None or self.options.args['ontology'].upper() in ['CLASS', 'BOTH']: # Creating the CLASS annotations both for left and right aligned variant - if TRANSCRIPT in transcripts_plus.keys(): + if TRANSCRIPT in list(transcripts_plus.keys()): class_plus = conseq.getClassAnnotation(variant_plus, transcript, protein, mutprotein_plus, loc_plus, int(self.options.args['ssrange'])) else: class_plus = '.' if difference: - if TRANSCRIPT in transcripts_minus.keys(): + if TRANSCRIPT in list(transcripts_minus.keys()): class_minus = conseq.getClassAnnotation(variant_minus, transcript, protein, mutprotein_minus, loc_minus, int(self.options.args['ssrange'])) else: class_minus = '.' else: class_minus = class_plus @@ -367,14 +367,14 @@ def annotate(self, variant, reference, impactdir): # Determining the IMPACT flag if not impactdir == None: - if TRANSCRIPT in transcripts_plus.keys(): - if class_plus in impactdir.keys(): impact_plus = impactdir[class_plus] + if TRANSCRIPT in list(transcripts_plus.keys()): + if class_plus in list(impactdir.keys()): impact_plus = impactdir[class_plus] else: impact_plus = 'None' else: impact_plus = '.' - if TRANSCRIPT in transcripts_minus.keys(): - if class_minus in impactdir.keys(): impact_minus = impactdir[class_minus] + if TRANSCRIPT in list(transcripts_minus.keys()): + if class_minus in list(impactdir.keys()): impact_minus = impactdir[class_minus] else: impact_minus = 'None' else: impact_minus = '.' @@ -383,12 +383,12 @@ def annotate(self, variant, reference, impactdir): if self.options.args['ontology'].upper() in ['SO', 'BOTH']: # Creating the SO annotations both for left and right aligned variant - if TRANSCRIPT in transcripts_plus.keys(): + if TRANSCRIPT in list(transcripts_plus.keys()): so_plus = conseq.getSequenceOntologyAnnotation(variant_plus, transcript, protein, mutprotein_plus, loc_plus) else: so_plus = '.' if difference: - if TRANSCRIPT in transcripts_minus.keys(): + if TRANSCRIPT in list(transcripts_minus.keys()): so_minus = conseq.getSequenceOntologyAnnotation(variant_minus, transcript, protein, mutprotein_minus, loc_minus) else: so_minus = '.' diff --git a/cava_/main.py b/cava_/main.py index 6669ce3..15b164b 100644 --- a/cava_/main.py +++ b/cava_/main.py @@ -1,6 +1,3 @@ - -from __future__ import division - import datetime import gzip import logging @@ -9,20 +6,20 @@ import sys import pysam -import core +from . import core from cava_.data import Ensembl from cava_.data import Reference from cava_.data import dbSNP -from core import Options -from core import Record +from .core import Options +from .core import Record # Printing out welcome meassage def printStartInfo(ver): starttime = datetime.datetime.now() - print "\n-----------------------------------------------------------------------" - print 'CAVA (Clinical Annotation of VAriants) ' + ver + ' is now running.' - print 'Started: ', str(starttime), '\n' + print("\n-----------------------------------------------------------------------") + print('CAVA (Clinical Annotation of VAriants) ' + ver + ' is now running.') + print('Started: ', str(starttime), '\n') return starttime @@ -32,12 +29,12 @@ def printInputFileNames(copts, options): outfn = copts.output + '.vcf' else: outfn = copts.output + '.txt' - print 'Configuration file: ' + copts.conf - print 'Input file (' + options.args['inputformat'] + '): ' + copts.input - print 'Output file (' + options.args['outputformat'] + '): ' + outfn - if options.args['logfile']: print 'Log file: ' + copts.output + '.log' + print('Configuration file: ' + copts.conf) + print('Input file (' + options.args['inputformat'] + '): ' + copts.input) + print('Output file (' + options.args['outputformat'] + '): ' + outfn) + if options.args['logfile']: print('Log file: ' + copts.output + '.log') if copts.threads > 1: - print '\nMultithreading: ' + str(copts.threads) + ' threads' + print('\nMultithreading: ' + str(copts.threads) + ' threads') if options.args['logfile']: logging.info('Configuration file - ' + copts.conf) @@ -52,7 +49,7 @@ def printInputFileNames(copts, options): # Printing out number of records in the input file def printNumOfRecords(numOfRecords): - print '\nInput file contains ' + str(numOfRecords) + ' records to annotate.\n' + print('\nInput file contains ' + str(numOfRecords) + ' records to annotate.\n') # Initializing progress information @@ -73,7 +70,7 @@ def printProgressInfo(counter, numOfRecords): def finalizeProgressInfo(): sys.stdout.write('\rAnnotating variants ... 100.0%') sys.stdout.flush() - print ' - Done.' + print(' - Done.') # Printing out goodbye message @@ -83,11 +80,11 @@ def printEndInfo(options, copts, starttime): outfn = copts.output + '.vcf' else: outfn = copts.output + '.txt' - print '\n(Size of output file: ' + str(round(os.stat(outfn).st_size / 1000, 1)) + ' Kbyte)' - print '\nCAVA (Clinical Annotation of VAriants) successfully finished.' - print 'Ended: ', str(endtime) - print 'Total runtime: '+str(endtime-starttime) - print "-----------------------------------------------------------------------\n" + print('\n(Size of output file: ' + str(round(os.stat(outfn).st_size / 1000, 1)) + ' Kbyte)') + print('\nCAVA (Clinical Annotation of VAriants) successfully finished.') + print('Ended: ', str(endtime)) + print('Total runtime: '+str(endtime-starttime)) + print("-----------------------------------------------------------------------\n") if options.args['logfile']: logging.info('100% of records annotated.') if not copts.stdout: logging.info('Output file = ' + str(round(os.stat(outfn).st_size / 1000, 1)) + ' Kbyte') @@ -100,7 +97,7 @@ def findFileBreaks(inputf, threads): started = False counter = 0 - if inputf.endswith('.gz'): infile = gzip.open(inputf, 'r') + if inputf.endswith('.gz'): infile = gzip.open(inputf, 'rt') else: infile = open(inputf) for line in infile: @@ -123,7 +120,7 @@ def findFileBreaks(inputf, threads): def readHeader(inputfn): ret = [] - if inputfn.endswith('.gz'): infile = gzip.open(inputfn, 'r') + if inputfn.endswith('.gz'): infile = gzip.open(inputfn, 'rt') else: infile = open(inputfn) for line in infile: @@ -197,7 +194,7 @@ def __init__(self, threadidx, options, copts, startline, endline, genelist, tran # Input file if copts.input.endswith('.gz'): - self.infile = gzip.open(copts.input, 'r') + self.infile = gzip.open(copts.input, 'rt') else: self.infile = open(copts.input) @@ -310,14 +307,14 @@ def run(copts, version, default_config_file): # Check if input and configuration files exist if copts.conf is None: - print '\nError: no configuration file specified.' - print 'Please use option -c or add the absolute path to the default_config_path file.\n' + print('\nError: no configuration file specified.') + print('Please use option -c or add the absolute path to the default_config_path file.\n') quit() if not os.path.isfile(copts.conf): - print '\nError: configuration file (' + copts.conf + ') cannot be found.\n' + print('\nError: configuration file (' + copts.conf + ') cannot be found.\n') quit() if not os.path.isfile(copts.input): - print '\nError: input file (' + copts.input + ') cannot be found.\n' + print('\nError: input file (' + copts.input + ') cannot be found.\n') quit() # Reading options from configuration file @@ -393,4 +390,4 @@ def run(copts, version, default_config_file): # Printing out summary information and end time if not copts.stdout: - printEndInfo(options, copts, starttime) \ No newline at end of file + printEndInfo(options, copts, starttime) diff --git a/install.sh b/install.sh index 53797aa..f1ff5a3 100755 --- a/install.sh +++ b/install.sh @@ -4,7 +4,7 @@ unset PYTHONPATH ABSOLUTE_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) PIP_ARGS='--no-cache-dir --ignore-installed --force-reinstall' -virtualenv -p python2.7 env +virtualenv -p python3 env source ${ABSOLUTE_PATH}/env/bin/activate diff --git a/setup.py b/setup.py index 44dc8d8..c384b48 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name= 'CAVA', - version = '1.2.3', + version = '2.0.0', description = 'CAVA (Clinical Annotation of VAriants)', url = 'https://github.com/RahmanTeam/CAVA', author = 'RahmanTeam', From 9d40c2d920e22b182c6598137ca66c509503cdc7 Mon Sep 17 00:00:00 2001 From: dariober Date: Wed, 22 Aug 2018 12:05:50 +0100 Subject: [PATCH 03/13] File renamed --- README.md | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..1af04f3 --- /dev/null +++ b/README.md @@ -0,0 +1,96 @@ + + +* [CAVA v1.2.3 README](#cava-v123-readme) + * [1 INTRODUCTION](#1-introduction) + * [2 PUBLICATION](#2-publication) + * [3 DEPENDENCIES](#3-dependencies) + * [4 INSTALLATION ON LINUX OR MAC](#4-installation-on-linux-or-mac) + * [5 RUNNING CAVA](#5-running-cava) + * [6 DOCUMENTATION](#6-documentation) + * [7 LICENCE](#7-licence) + + + +CAVA v1.2.3 README +================== + +1 INTRODUCTION +-------------- + +CAVA (Clinical Annotation of VAriants) is a lightweight, fast, flexible and easy-to-use Next Generation Sequencing (NGS) variant annotation tool. It implements a clinical sequencing nomenclature (CSN), a fixed variant annotation consistent with the principles of the Human Genome Variation Society (HGVS) guidelines, optimised for automated clinical variant annotation of NGS data. + +CAVA has been extensively tested on exome data and is being used in the Mainstreaming Cancer Genetics (MCG) programme which applies NGS to increase the availability and affordability of clinical testing of cancer predisposition genes. + + +2 PUBLICATION +------------- + +If you use CAVA, please cite: + +Márton Münz, Elise Ruark, Anthony Renwick, Emma Ramsay, Matthew Clarke, Shazia Mahamdallie, Victoria Cloke, Sheila Seal, Ann Strydom, Gerton Lunter, Nazneen Rahman. CSN and CAVA: variant annotation tools for rapid, robust next-generation sequencing analysis in the clinical setting. Genome Medicine 7:76, doi:10.1186/s13073-015-0195-6 (2015). + + +3 DEPENDENCIES +-------------- + +To install and run CAVA v1.2.3 you will need the following dependencies installed: +- Python 3 +- GCC and GNU make +- virtualenv + +If your system is missing GCC and GNU make, these can be installed as follows: + +On a Mac, the easiest way to set them up is to install Xcode Command Line Tools. + +On a Debian or Ubuntu Linux, they can be set up by installing the build-essential package: +sudo apt-get install build-essential + +If not already installed on your system, virtualenv can be set up by: +pip install virtualenv + + +4 INSTALLATION ON LINUX OR MAC +------------------------------ + +CAVA v1.2.3 can be downloaded from https://github.com/RahmanTeam/CAVA/releases/tag/v1.2.3 +in either .zip or .tar.gz format. + +To unpack these run one of the following commands: + +unzip CAVA-1.2.3.zip + +or: + +tar -xvzf CAVA-1.2.3.tar.gz + +and then you can install CAVA with the following command from the CAVA-1.2.3 directory: + +./install.sh + +CAVA uses virtualenv and pip to manage all its extra dependencies, which means that it will not clutter up your system by installing things globally. Everything it installs will go into a sub-directory in the CAVA-1.2.3 directory. If you delete CAVA then everything it has installed will also be deleted. + +Once the installation script has finished successfully, CAVA is ready for use. + + +5 RUNNING CAVA +-------------- + +CAVA can be run with the following simple command: + +CAVA-1.2.3/cava -c config.txt -i input.vcf -o output + +It requires three command line arguments: +the name of the configuration file (-c), the name of the input file (-i) and the prefix of the output file name (-o). + + +6 DOCUMENTATION +--------------- + +See CAVA-v1.2,3_doc.pdf for detailed documentation and examples. + + +7 LICENCE +--------- + +CAVA is released under MIT licence (see the LICENCE file). + From 54bb6b10efc9f630a195903d3745553194046a01 Mon Sep 17 00:00:00 2001 From: dariober Date: Wed, 22 Aug 2018 15:55:54 +0100 Subject: [PATCH 04/13] dbsnp_db subprogram from py2 to py3 Minor edits mostly done via `2to3`. Version bumped. Tested against original python2 version and with no differences in output --- bin/dbSNPDB.py | 52 +++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/bin/dbSNPDB.py b/bin/dbSNPDB.py index 62893fe..d7ad51a 100755 --- a/bin/dbSNPDB.py +++ b/bin/dbSNPDB.py @@ -54,7 +54,7 @@ def processData(options, datafile, IDs): sys.stdout.write('\rProcessing SNPs on all chrs ... OK') sys.stdout.flush() - print '' + print('') datafile.close() outfile.close() return counter @@ -83,16 +83,16 @@ def printMetaData(datafile): line = line.strip() if line.startswith('#'): if line.startswith('##dbSNP_BUILD_ID='): - print 'Input file build: '+line[17:] + print('Input file build: '+line[17:]) ret = int(line[17:]) - if line.startswith('##reference='): print 'Reference: '+line[12:]+'\n' + if line.startswith('##reference='): print('Reference: '+line[12:]+'\n') else: return ret ####################################################################################################################### # Version -version = '1.2.3' +version = '2.0.0' descr = 'CAVA dbsnp_db v' + version epilog = '\nExample usage: CAVA-{}/dbsnp_db -d 00-All.vcf.gz -s 138 -o out\n\n'.format(version) @@ -106,39 +106,39 @@ def printMetaData(datafile): # Check options if options.release is None: - print '\nError: no dbSNP release specified' - print 'Please use option -s to specify dbSNP release version\n' + print('\nError: no dbSNP release specified') + print('Please use option -s to specify dbSNP release version\n') quit() if options.data is None: - print '\nError: no 00-All.vcf.gz data file specified' - print 'Please use option -d to specify path to data file\n' + print('\nError: no 00-All.vcf.gz data file specified') + print('Please use option -d to specify path to data file\n') quit() if not os.path.isfile(options.data): - print '\nError: 00-All.vcf.gz file (' + options.data + ') cannot be found.\n' + print('\nError: 00-All.vcf.gz file (' + options.data + ') cannot be found.\n') quit() # Print out version information -print "\n---------------------------------------------------------------------------------------" -print 'CAVA ' + version + ' dbSNP database preparation tool (dbsnp_db) is now running.' -print 'Started: ', datetime.datetime.now(), '\n' +print("\n---------------------------------------------------------------------------------------") +print('CAVA ' + version + ' dbSNP database preparation tool (dbsnp_db) is now running.') +print('Started: ', datetime.datetime.now(), '\n') # Open data fie for reading -datafile = gzip.open(options.data, 'r') +datafile = gzip.open(options.data, 'rt') # Print out meta data build = printMetaData(datafile) datafile.seek(0) if int(options.release) > build: - print 'Error: requested release must be <=' + str(build) + '\n' + print('Error: requested release must be <=' + str(build) + '\n') quit() # Print out info -print 'Requested dbSNP release: ' + str(options.release) +print('Requested dbSNP release: ' + str(options.release)) IDs = [] if options.input is not None: IDs = readIDs(options.input) - print '\nInput file contains ' + str(len(IDs)) + ' dbSNP IDs to be included in the database' -print '' + print('\nInput file contains ' + str(len(IDs)) + ' dbSNP IDs to be included in the database') +print('') # Create compressed output file N = processData(options, datafile, IDs) @@ -150,13 +150,13 @@ def printMetaData(datafile): os.remove(options.output) # Print out summary information -print '\nA total of ' + str(N) + ' SNPs have been retrieved\n' -print '---------------------' -print 'Output files created:' -print '---------------------' -print options.output + '.gz (SNP database)' -print options.output + '.gz.tbi (index file)' -print '' -print 'CAVA dbsnp_db successfully finished: ', datetime.datetime.now() -print "---------------------------------------------------------------------------------------\n" +print('\nA total of ' + str(N) + ' SNPs have been retrieved\n') +print('---------------------') +print('Output files created:') +print('---------------------') +print(options.output + '.gz (SNP database)') +print(options.output + '.gz.tbi (index file)') +print('') +print('CAVA dbsnp_db successfully finished: ', datetime.datetime.now()) +print("---------------------------------------------------------------------------------------\n") From c06f83d096addf1aef0ed4f0c7ef5f97dbe291af Mon Sep 17 00:00:00 2001 From: dariober Date: Wed, 22 Aug 2018 15:58:29 +0100 Subject: [PATCH 05/13] Version bumped --- bin/EnsemblDB.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/EnsemblDB.py b/bin/EnsemblDB.py index 69fac2b..a73f5b9 100644 --- a/bin/EnsemblDB.py +++ b/bin/EnsemblDB.py @@ -4,7 +4,7 @@ from ensembldb import main # Version -version = '1.2.3' +version = '2.0.0' # Command line argument parsing descr = 'CAVA ensembl_db v' + version From d72868dbefca8abd666c0a62ebf7ba1fbf73009a Mon Sep 17 00:00:00 2001 From: dariober Date: Wed, 22 Aug 2018 15:59:02 +0100 Subject: [PATCH 06/13] ensembl_db ported from py2 to py3 Program working on python3 with minor differences with python2: * Rounding in python3 is different from python2 (python3 considered to "correct" one) gives slight differences in transcript lenghts * Order of transcripts in output is slightly differences but it should not matter --- ensembldb/main.py | 125 ++++++++++++++++++++++++---------------------- 1 file changed, 65 insertions(+), 60 deletions(-) diff --git a/ensembldb/main.py b/ensembldb/main.py index 29ba886..cbac673 100644 --- a/ensembldb/main.py +++ b/ensembldb/main.py @@ -1,13 +1,11 @@ -from __future__ import division import os import datetime import sys -import urllib +import urllib.request, urllib.parse, urllib.error import gzip import pysam from operator import itemgetter - # Class representing a transcript class Transcript(object): @@ -22,8 +20,8 @@ def __init__(self): self.POSEND = None self.GENETYPE = None self.TRANSTYPE = None - self.CODING_START = None - self.CODING_END = None + self.CODING_START = -1 + self.CODING_END = -1 self.CODING_START_RELATIVE = None self.CCDS = None self.EXONS = [] @@ -37,7 +35,7 @@ def getInfoString(self): if self.STRAND == '1': ret = '+/' else: ret = '-/' cdna = self.getcDNALength() - return ret+str(round((self.POSEND-self.POS+1)/1000,1))+'kb/'+str(len(self.EXONS))+'/'+str(round(cdna/1000,1))+'kb/'+str(self.getProteinLength()) + return ret+str(round((self.POSEND-self.POS+1) / 1000, 1))+'kb/'+str(len(self.EXONS))+'/'+str(round(cdna / 1000, 1))+'kb/'+str(self.getProteinLength()) # Get cDNA length of the transcript @@ -54,10 +52,14 @@ def getProteinLength(self): for exon in self.EXONS: if exon.END < self.CODING_START: continue if exon.START > self.CODING_END: continue - if exon.START <= self.CODING_START <= exon.END: start = self.CODING_START - else: start = exon.START + 1 - if exon.START <= self.CODING_END <= exon.END: end = self.CODING_END - else: end = exon.END + if exon.START <= self.CODING_START <= exon.END: + start = self.CODING_START + else: + start = exon.START + 1 + if exon.START <= self.CODING_END <= exon.END: + end = self.CODING_END + else: + end = exon.END codingdna += end - start + 1 else: for exon in self.EXONS: @@ -73,8 +75,10 @@ def getProteinLength(self): # Check if it is a candidate transcript def isCandidate(self): - if not (self.GENETYPE=='protein_coding' and self.TRANSTYPE=='protein_coding'): return False - return (self.CODING_START is not None and self.CODING_END is not None) and self.isComplete + if not (self.GENETYPE=='protein_coding' and self.TRANSTYPE=='protein_coding'): + return False + # return (self.CODING_START is not None and self.CODING_END is not None) and self.isComplete + return (self.CODING_START > -1 and self.CODING_END > -1) and self.isComplete # Output transcript @@ -138,7 +142,7 @@ def __init__(self, symbol, ensg): def selectTranscript(self): ccds_set = [] nonccds_set = [] - for enst,transcript in self.TRANSCRIPTS.iteritems(): + for enst,transcript in self.TRANSCRIPTS.items(): if transcript.CCDS: ccds_set.append(transcript) else: nonccds_set.append(transcript) @@ -157,17 +161,17 @@ def selectTranscript(self): # Output all or selected transcripts def output(self, outfile, outfile_list, select, mcg_transcripts): if select: - if self.SYMBOL in mcg_transcripts.keys(): + if self.SYMBOL in list(mcg_transcripts.keys()): ok = False - for _,transcript in self.TRANSCRIPTS.iteritems(): + for _,transcript in self.TRANSCRIPTS.items(): if transcript.ENST in mcg_transcripts[self.SYMBOL]: transcript.output(outfile,outfile_list) ok = True - if self.SYMBOL not in mcg_transcripts.keys() or not ok: + if self.SYMBOL not in list(mcg_transcripts.keys()) or not ok: transcript = self.selectTranscript() transcript.output(outfile,outfile_list) else: - for _,transcript in self.TRANSCRIPTS.iteritems(): + for _,transcript in self.TRANSCRIPTS.items(): transcript.output(outfile,outfile_list) @@ -209,11 +213,11 @@ def sortRecords(records, idx1, idx2): chroms = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', 'MT', 'X', 'Y'] for i in range(len(chroms)): chrom = chroms[i] - if chrom in records.keys(): + if chrom in list(records.keys()): records[chrom] = sorted(records[chrom], key=itemgetter(idx1,idx2)) for i in range(len(chroms)): chrom = chroms[i] - if chrom in records.keys(): + if chrom in list(records.keys()): for record in records[chrom]: ret.append(record) return ret @@ -245,7 +249,7 @@ def process_data(options, genome_build, version): line = line.strip() if line == '': continue cols = line.split('\t') - if cols[0] not in mcg_transcripts.keys(): mcg_transcripts[cols[0]] = set() + if cols[0] not in list(mcg_transcripts.keys()): mcg_transcripts[cols[0]] = set() mcg_transcripts[cols[0]].add(cols[1]) # Changing transcript for certain releases if int(options.ensembl)>=71: mcg_transcripts['BMPR1A'] = {'ENST00000372037'} @@ -261,11 +265,11 @@ def process_data(options, genome_build, version): transIDs = set() if options.input is not None: transIDs = readTranscriptIDs(options.input) - print '\nOnly ' + str(len(transIDs)) + ' transcripts read from ' + options.input + ' are considered\n' - else: print '\nAll transcripts from the Ensembl release are considered\n' + print('\nOnly ' + str(len(transIDs)) + ' transcripts read from ' + options.input + ' are considered\n') + else: print('\nAll transcripts from the Ensembl release are considered\n') # Print out info - if options.select: print 'Transcript selection switched on\n' + if options.select: print('Transcript selection switched on\n') # Load candidate and CCDS data for Ensembl <75 candidates = dict() @@ -275,7 +279,7 @@ def process_data(options, genome_build, version): line=line.strip() if line=='': continue cols = line.split('\t') - if cols[0] not in candidates.keys(): candidates[cols[0]]=dict() + if cols[0] not in list(candidates.keys()): candidates[cols[0]]=dict() candidates[cols[0]][cols[1]]=int(cols[2]) # Download Ensembl data @@ -283,9 +287,9 @@ def process_data(options, genome_build, version): sys.stdout.flush() url = 'ftp://ftp.ensembl.org/pub/release-' + options.ensembl + '/gtf/homo_sapiens/Homo_sapiens.' + genome_build + '.' + options.ensembl + '.gtf.gz' try: - urllib.urlretrieve(url, 'ensembl_data.gz') + urllib.request.urlretrieve(url, 'ensembl_data.gz') except: - print '\n\nCannot connect to Ensembl FTP site. No internet connection?\n' + print('\n\nCannot connect to Ensembl FTP site. No internet connection?\n') quit() sys.stdout.write('OK\n') @@ -296,7 +300,7 @@ def process_data(options, genome_build, version): first = True prevenst = '' transcript = None - for line in gzip.open('ensembl_data.gz', 'r'): + for line in gzip.open('ensembl_data.gz', 'rt'): line = line.strip() if line.startswith('#'): continue cols = line.split('\t') @@ -322,7 +326,8 @@ def process_data(options, genome_build, version): if not first: transcript.finalize() if transcript.isCandidate(): - if transcript.ENSG not in genesdata.keys(): genesdata[transcript.ENSG] = Gene(transcript.GENE, transcript.ENSG) + if transcript.ENSG not in list(genesdata.keys()): + genesdata[transcript.ENSG] = Gene(transcript.GENE, transcript.ENSG) genesdata[transcript.ENSG].TRANSCRIPTS[transcript.ENST] = transcript # Initialize new Transcript object @@ -358,20 +363,20 @@ def process_data(options, genome_build, version): if cols[2] == 'start_codon': if transcript.STRAND == '1': - if transcript.CODING_START is None or int(cols[3]) < transcript.CODING_START: transcript.CODING_START = int(cols[3]) + if transcript.CODING_START < 0 or int(cols[3]) < transcript.CODING_START: transcript.CODING_START = int(cols[3]) else: - if transcript.CODING_START is None or int(cols[4]) > transcript.CODING_START: transcript.CODING_START = int(cols[4]) + if transcript.CODING_START < 0 or int(cols[4]) > transcript.CODING_START: transcript.CODING_START = int(cols[4]) if cols[2] == 'stop_codon': if transcript.STRAND == '1': - if transcript.CODING_END is None or int(cols[4]) > transcript.CODING_END: transcript.CODING_END = int(cols[4]) + if transcript.CODING_END < 0 or int(cols[4]) > transcript.CODING_END: transcript.CODING_END = int(cols[4]) else: - if transcript.CODING_END is None or int(cols[3]) < transcript.CODING_END: transcript.CODING_END = int(cols[3]) + if transcript.CODING_END < 0 or int(cols[3]) < transcript.CODING_END: transcript.CODING_END = int(cols[3]) # Check if transcript is complete and is a CCDS transcript if transcript.isComplete is None: if int(options.ensembl) < 75: - if transcript.ENST in candidates[transcript.CHROM].keys(): + if transcript.ENST in list(candidates[transcript.CHROM].keys()): transcript.CCDS = (candidates[transcript.CHROM][transcript.ENST] == 1) transcript.isComplete = True else: @@ -388,14 +393,14 @@ def process_data(options, genome_build, version): if transcript is not None: transcript.finalize() if transcript.isCandidate(): - if transcript.ENSG not in genesdata.keys(): genesdata[transcript.ENSG] = Gene(transcript.GENE, transcript.ENSG) + if transcript.ENSG not in list(genesdata.keys()): genesdata[transcript.ENSG] = Gene(transcript.GENE, transcript.ENSG) genesdata[transcript.ENSG].TRANSCRIPTS[transcript.ENST] = transcript # If no transcript ID from the input file was found in the Ensembl release if len(genesdata) == 0: - print '\n\nNo transcripts from '+options.input+' found in Ensembl release.' - print '\nNo transcript database created.' - print "-----------------------------------------------------------------\n" + print('\n\nNo transcripts from '+options.input+' found in Ensembl release.') + print('\nNo transcript database created.') + print("-----------------------------------------------------------------\n") os.remove('ensembl_data.gz') quit() @@ -408,7 +413,7 @@ def process_data(options, genome_build, version): outfile_list.write('ENSG\tGENE\tENST\n') # Output transcripts of each gene - for ensg, gene in genesdata.iteritems(): gene.output(outfile,outfile_list,options.select,mcg_transcripts) + for ensg, gene in genesdata.items(): gene.output(outfile,outfile_list,options.select,mcg_transcripts) # Close temporary output files outfile.close() @@ -423,7 +428,7 @@ def process_data(options, genome_build, version): line.rstrip() record = line.split('\t') record[6] = int(record[6]) - if record[4] in data.keys(): + if record[4] in list(data.keys()): data[record[4]].append(record) else: data[record[4]] = [] @@ -472,18 +477,18 @@ def is_number(s): def run(options, version): # Checking if all required options specified if options.ensembl is None: - print '\nError: no Ensembl release specified. Use option -h to get help!\n' + print('\nError: no Ensembl release specified. Use option -h to get help!\n') quit() if not is_number(options.ensembl): - print '\nError: Ensembl release specified is not an integer. Use option -h to get help!\n' + print('\nError: Ensembl release specified is not an integer. Use option -h to get help!\n') quit() if options.output is None: - print '\nError: no output file name specified. Use option -h to get help!\n' + print('\nError: no output file name specified. Use option -h to get help!\n') quit() # Must use Ensembl release >= 70 if not (int(options.ensembl) >= 70 or int(options.ensembl) == 65): - print '\nError: This version works with Ensembl v65 or >= v70.\n' + print('\nError: This version works with Ensembl v65 or >= v70.\n') quit() # Genome build @@ -491,17 +496,17 @@ def run(options, version): genome_build = 'GRCh37' if int(options.ensembl) <= 75 else 'GRCh38' # Printing out version information - print "\n---------------------------------------------------------------------------------------" - print 'CAVA ' + version + ' transcript database preparation tool (ensembl_db) is now running' - print 'Started: ', datetime.datetime.now(), '\n' + print("\n---------------------------------------------------------------------------------------") + print('CAVA ' + version + ' transcript database preparation tool (ensembl_db) is now running') + print('Started: ', datetime.datetime.now(), '\n') # Print info - print 'Ensembl version: ' + options.ensembl - print 'Reference genome: ' + genome_build + print('Ensembl version: ' + options.ensembl) + print('Reference genome: ' + genome_build) # Creating compressed output file Nretrieved = process_data(options, genome_build, version) - print '\nA total of ' + str(Nretrieved) + ' transcripts have been retrieved\n' + print('\nA total of ' + str(Nretrieved) + ' transcripts have been retrieved\n') # Indexing output file with Tabix indexFile(options) @@ -510,14 +515,14 @@ def run(options, version): os.remove(options.output) # Printing out summary information - print '' - print '---------------------' - print 'Output files created:' - print '---------------------' - print options.output + '.gz (transcript database)' - print options.output + '.gz.tbi (index file)' - print options.output + '.txt (list of transcripts)' - - print '' - print 'CAVA ensembl_db successfully finished: ', datetime.datetime.now() - print "---------------------------------------------------------------------------------------\n" + print('') + print('---------------------') + print('Output files created:') + print('---------------------') + print(options.output + '.gz (transcript database)') + print(options.output + '.gz.tbi (index file)') + print(options.output + '.txt (list of transcripts)') + + print('') + print('CAVA ensembl_db successfully finished: ', datetime.datetime.now()) + print("---------------------------------------------------------------------------------------\n") From de5f77bb429595ed65735ff83b2a20e52e29e10c Mon Sep 17 00:00:00 2001 From: dariober Date: Thu, 23 Aug 2018 09:06:33 +0100 Subject: [PATCH 07/13] File renamed --- README | 84 ---------------------------------------------------------- 1 file changed, 84 deletions(-) delete mode 100644 README diff --git a/README b/README deleted file mode 100644 index 89f1fcf..0000000 --- a/README +++ /dev/null @@ -1,84 +0,0 @@ - -CAVA v1.2.3 README -================== - -1 INTRODUCTION --------------- - -CAVA (Clinical Annotation of VAriants) is a lightweight, fast, flexible and easy-to-use Next Generation Sequencing (NGS) variant annotation tool. It implements a clinical sequencing nomenclature (CSN), a fixed variant annotation consistent with the principles of the Human Genome Variation Society (HGVS) guidelines, optimised for automated clinical variant annotation of NGS data. - -CAVA has been extensively tested on exome data and is being used in the Mainstreaming Cancer Genetics (MCG) programme which applies NGS to increase the availability and affordability of clinical testing of cancer predisposition genes. - - -2 PUBLICATION -------------- - -If you use CAVA, please cite: - -Márton Münz, Elise Ruark, Anthony Renwick, Emma Ramsay, Matthew Clarke, Shazia Mahamdallie, Victoria Cloke, Sheila Seal, Ann Strydom, Gerton Lunter, Nazneen Rahman. CSN and CAVA: variant annotation tools for rapid, robust next-generation sequencing analysis in the clinical setting. Genome Medicine 7:76, doi:10.1186/s13073-015-0195-6 (2015). - - -3 DEPENDENCIES --------------- - -To install and run CAVA v1.2.3 you will need the following dependencies installed: -- Python 2.7.9 or later (Python2 series) -- GCC and GNU make -- virtualenv - -If your system is missing GCC and GNU make, these can be installed as follows: - -On a Mac, the easiest way to set them up is to install Xcode Command Line Tools. - -On a Debian or Ubuntu Linux, they can be set up by installing the build-essential package: -sudo apt-get install build-essential - -If not already installed on your system, virtualenv can be set up by: -pip install virtualenv - - -4 INSTALLATION ON LINUX OR MAC ------------------------------- - -CAVA v1.2.3 can be downloaded from https://github.com/RahmanTeam/CAVA/releases/tag/v1.2.3 -in either .zip or .tar.gz format. - -To unpack these run one of the following commands: - -unzip CAVA-1.2.3.zip - -or: - -tar -xvzf CAVA-1.2.3.tar.gz - -and then you can install CAVA with the following command from the CAVA-1.2.3 directory: - -./install.sh - -CAVA uses virtualenv and pip to manage all its extra dependencies, which means that it will not clutter up your system by installing things globally. Everything it installs will go into a sub-directory in the CAVA-1.2.3 directory. If you delete CAVA then everything it has installed will also be deleted. - -Once the installation script has finished successfully, CAVA is ready for use. - - -5 RUNNING CAVA --------------- - -CAVA can be run with the following simple command: - -CAVA-1.2.3/cava -c config.txt -i input.vcf -o output - -It requires three command line arguments: -the name of the configuration file (-c), the name of the input file (-i) and the prefix of the output file name (-o). - - -6 DOCUMENTATION ---------------- - -See CAVA-v1.2,3_doc.pdf for detailed documentation and examples. - - -7 LICENCE ---------- - -CAVA is released under MIT licence (see the LICENCE file). - From a59a6217f7b3cff0b00f2e14fc81776194770820 Mon Sep 17 00:00:00 2001 From: dariober Date: Tue, 28 Aug 2018 15:44:26 +0100 Subject: [PATCH 08/13] Make transcript selection reproducible In transcript selection mode, the selected transcript may depend on the order the transcripts are returned by the dict.items() method. This order is not guaranteed to be predictable. We sort the list of candidate transcripts by ENST. In this way we know, at least, that in case of ties between candidates we consistently select the last one. (See also comments in the script) --- ensembldb/main.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/ensembldb/main.py b/ensembldb/main.py index cbac673..7691cc4 100644 --- a/ensembldb/main.py +++ b/ensembldb/main.py @@ -149,11 +149,20 @@ def selectTranscript(self): if len(ccds_set) > 0: candidates = ccds_set else: candidates = nonccds_set + # Sort candidates by ENST. In case there multiple selectable + # transcripts, the selected one does not depend on the order they are + # returned by .items() + candidates.sort(key=lambda x: x.ENST) + selected = Transcript() selected.PROTL = selected.CDNAL = -1 for t in candidates: - if t.PROTL > selected.PROTL: selected = t - elif t.PROTL == selected.PROTL and t.CDNAL > selected.CDNAL: selected = t + # Note that we return the *last* selected candidate since we + # overwrite the variable `selected`. + if t.PROTL > selected.PROTL: + selected = t + elif t.PROTL == selected.PROTL and t.CDNAL > selected.CDNAL: + selected = t return selected From 4b4d0870e54180acad8a0cab21601e9f5abaaca0 Mon Sep 17 00:00:00 2001 From: dariober Date: Mon, 10 Sep 2018 11:14:50 +0100 Subject: [PATCH 09/13] Add test suite A small unit test to check the basic funcionality of the python3 port --- test.sh | 38 ++++++++++++++++++++++ test/CAVA_config.txt | 77 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 test.sh create mode 100644 test/CAVA_config.txt diff --git a/test.sh b/test.sh new file mode 100644 index 0000000..1466ac8 --- /dev/null +++ b/test.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +# Test CAVA suite. After having installed CAVA, execute this script as: +# +# bash test.sh + +set -e +set -o pipefail + +# Set up + +if [ ! -f test/tmp.input.vcf.gz ] +then + curl ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/common_all_20180418.vcf.gz \ + | gunzip \ + | awk '$1 ~ "^#" || NR % 1000 == 0' \ + | gzip > test/tmp.input.vcf.gz +fi + +if [ ! -f test/hg38.fa ] +then + curl http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz | gunzip > test/tmp.hg38.fa + samtools faidx test/tmp.hg38.fa +fi + +# Test: prepare database +./ensembl_db -e 75 -o test/tmp.db + +# Lengths in bp not kb: +gunzip -c test/tmp.db.gz | grep -F '+/919bp/1/918bp/305' > /dev/null + +# Test: annotate +./cava -i test/tmp.input.vcf.gz -o test/tmp. -c test/CAVA_config.txt -t 8 + +# Tear down +# ========= + +rm test/tmp.* diff --git a/test/CAVA_config.txt b/test/CAVA_config.txt new file mode 100644 index 0000000..8250d73 --- /dev/null +++ b/test/CAVA_config.txt @@ -0,0 +1,77 @@ +# Input file format +# Possible values: VCF or TXT | Optional: yes | Default value: VCF +@inputformat = VCF + +# Output file format +# Possible values: VCF or TSV | Optional: yes | Default value: VCF +@outputformat = VCF + +# Absolute path to reference genome file +# Possible values: string | Optional: no +@reference = test/tmp.hg38.fa + +# Absolute path to Ensembl transcript database file +# Possible values: string | Optional: yes (if not specified, default transcript database will be used) +@ensembl = test/tmp.db.gz + +# Absolute path to dbSNP database file +# Possible values: string | Optional: yes +##@dbsnp = absolute/path/to/dbsnp/database + +# Are variants with neither transcript nor dbSNP annotation to be included in the output? +# Possible values: TRUE or FALSE | Optional: yes | Default value: TRUE +@nonannot = TRUE + +# Are only records with PASS filter value included in the output? +# Possible values: TRUE or FALSE | Optional: yes | Default value: FALSE +@filter = FALSE + +# Types of variants to be annotated and outputted +# Possible values: ALL, SUBSTITUTION, INDEL, INSERTION, DELETION or COMPLEX | Optional: yes | Default value: ALL +@type = ALL + +# Name of compressed BED file specifying genomic regions variant annotation is restricted to +# Possible values: string | Optional: yes +##@target = . + +# Name of file providing a list of the gene identifiers variant annotation is restricted to +# Note: gene identifiers need to be given on separate lines in the file +# Possible values: string | Optional: yes +##@genelist = . + +# Name of file providing a list of transcript identifiers variant annotation is restricted to +# Note: transcript identifiers need to be given on separate lines in the file +# Possible values: string | Optional: yes +##@transcriptlist = . + +# Name of file providing a list of the dbSNP identifiers variant annotation is restricted to +# Note: dbSNP identifiers need to be given on separate lines in the file +# Possible values: string | Optional: yes +##@snplist = . + +# Is a log file to be written? +# Possible values: TRUE or FALSE | Optional: yes | Default value: FALSE +@logfile = TRUE + +# Which ontology is used for reporting consequence type? +# Possible values: CLASS, SO or BOTH | Optional: yes | Default value: BOTH +@ontology = BOTH + +# Definition of variant impact levels (reported by the IMPACT annotation flag) +# Different impact levels are separated by | and a comma-separated list of CLASS terms must be given for each level +# Possible values: string | Optional: yes +# Default value: SG,ESS,FS | SS5,IM,SL,EE,IF,NSY | SY,SS,INT,5PU,3PU +@impactdef = SG,ESS,FS | SS5,IM,SL,EE,IF,NSY | SY,SS,INT,5PU,3PU + +# Are alternative annotations outputted? +# If TRUE, the ALTANN and ALTCLASS/ALTSO annotation flags are reported instead of the ALTFLAG flag +# Possible values: TRUE or FALSE | Optional: yes | Default value: TRUE +@givealt = TRUE + +# Number of bases into the intron used as the splice site region +# Possible values: integer >= 6 | Optional: yes | Default value: 8 +@ssrange = 8 + +# Is the prefix CAVA_ added to annotation flag names in VCF output? +# Possible values: TRUE or FALSE | Optional: yes | Default value: FALSE +@prefix = FALSE From 83e3eecfeb5a5af391b5492a3666d7084c9a5682 Mon Sep 17 00:00:00 2001 From: dariober Date: Mon, 10 Sep 2018 11:16:16 +0100 Subject: [PATCH 10/13] Do not round transcript lengths to kilo-bases We report transcript lengths in base-pairs, i.e. as integers instead of kilobases (as floats). --- ensembldb/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ensembldb/main.py b/ensembldb/main.py index 7691cc4..be22077 100644 --- a/ensembldb/main.py +++ b/ensembldb/main.py @@ -35,7 +35,7 @@ def getInfoString(self): if self.STRAND == '1': ret = '+/' else: ret = '-/' cdna = self.getcDNALength() - return ret+str(round((self.POSEND-self.POS+1) / 1000, 1))+'kb/'+str(len(self.EXONS))+'/'+str(round(cdna / 1000, 1))+'kb/'+str(self.getProteinLength()) + return ret+str(self.POSEND-self.POS+1)+'bp/'+str(len(self.EXONS))+'/'+str(cdna)+'bp/'+str(self.getProteinLength()) # Get cDNA length of the transcript From 437c0b2ac656ed8bb1cf869c636acfe63597f23d Mon Sep 17 00:00:00 2001 From: dariober Date: Mon, 10 Sep 2018 11:17:44 +0100 Subject: [PATCH 11/13] Ignore downloaded public test data --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 3bbebb7..c71be5a 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,6 @@ /*/*/__pycache__ /*/*/*/__pycache__ doc/sphinx/_build +/test/tmp.* +/test/hg38.fa +/test/hg38.fa.fai From 9a3c399f7cdb6c0ed605f18ce6bf74ab608f2fcb Mon Sep 17 00:00:00 2001 From: dariober Date: Mon, 10 Sep 2018 11:18:25 +0100 Subject: [PATCH 12/13] Explicity use python3 --- bin/CAVA.py | 2 +- bin/EnsemblDB.py | 2 +- bin/dbSNPDB.py | 2 +- cava_/conseq.py | 4 ++-- cava_/core.py | 2 +- cava_/csn.py | 2 +- cava_/data.py | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/bin/CAVA.py b/bin/CAVA.py index d1c910b..7ddb738 100755 --- a/bin/CAVA.py +++ b/bin/CAVA.py @@ -1,4 +1,4 @@ -#!env/bin/python +#!env/bin/python3 from optparse import OptionParser from cava_ import main diff --git a/bin/EnsemblDB.py b/bin/EnsemblDB.py index a73f5b9..0dfadaa 100644 --- a/bin/EnsemblDB.py +++ b/bin/EnsemblDB.py @@ -1,4 +1,4 @@ -#!env/bin/python +#!env/bin/python3 from optparse import OptionParser from ensembldb import main diff --git a/bin/dbSNPDB.py b/bin/dbSNPDB.py index d7ad51a..47d3be5 100755 --- a/bin/dbSNPDB.py +++ b/bin/dbSNPDB.py @@ -1,4 +1,4 @@ -#!env/bin/python +#!env/bin/python3 import os import sys diff --git a/cava_/conseq.py b/cava_/conseq.py index 2b10616..01b5d64 100755 --- a/cava_/conseq.py +++ b/cava_/conseq.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # CLASS annotation @@ -219,4 +219,4 @@ def checkUTR(transcript, variant): y = variant.pos + len(variant.ref) - 1 if transcript.isPositionOutsideCDS_5prime(x) or transcript.isPositionOutsideCDS_5prime(y): return 'UTR5' if transcript.isPositionOutsideCDS_3prime(x) or transcript.isPositionOutsideCDS_3prime(y): return 'UTR3' - return '' \ No newline at end of file + return '' diff --git a/cava_/core.py b/cava_/core.py index 0e2c291..024285d 100755 --- a/cava_/core.py +++ b/cava_/core.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Collection of basic classes and functions diff --git a/cava_/csn.py b/cava_/csn.py index 3426f7e..77ea7ca 100755 --- a/cava_/csn.py +++ b/cava_/csn.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # CSN annotation diff --git a/cava_/data.py b/cava_/data.py index ee5671a..7d03f72 100755 --- a/cava_/data.py +++ b/cava_/data.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Classes providing interfaces with annotation databases and the reference genome From cd1c955fd24e017e99328484c50830309a31864f Mon Sep 17 00:00:00 2001 From: Dario Beraldi Date: Fri, 28 Sep 2018 11:14:05 +0100 Subject: [PATCH 13/13] Fix filename in if-condition --- test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test.sh b/test.sh index 1466ac8..70f18b6 100644 --- a/test.sh +++ b/test.sh @@ -17,7 +17,7 @@ then | gzip > test/tmp.input.vcf.gz fi -if [ ! -f test/hg38.fa ] +if [ ! -f test/tmp.hg38.fa ] then curl http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz | gunzip > test/tmp.hg38.fa samtools faidx test/tmp.hg38.fa