diff --git a/.gitignore b/.gitignore index 3bbebb7..c71be5a 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,6 @@ /*/*/__pycache__ /*/*/*/__pycache__ doc/sphinx/_build +/test/tmp.* +/test/hg38.fa +/test/hg38.fa.fai diff --git a/README b/README.md similarity index 87% rename from README rename to README.md index 89f1fcf..1af04f3 100644 --- a/README +++ b/README.md @@ -1,3 +1,15 @@ + + +* [CAVA v1.2.3 README](#cava-v123-readme) + * [1 INTRODUCTION](#1-introduction) + * [2 PUBLICATION](#2-publication) + * [3 DEPENDENCIES](#3-dependencies) + * [4 INSTALLATION ON LINUX OR MAC](#4-installation-on-linux-or-mac) + * [5 RUNNING CAVA](#5-running-cava) + * [6 DOCUMENTATION](#6-documentation) + * [7 LICENCE](#7-licence) + + CAVA v1.2.3 README ================== @@ -22,7 +34,7 @@ Márton Münz, Elise Ruark, Anthony Renwick, Emma Ramsay, Matthew Clarke, Shazia -------------- To install and run CAVA v1.2.3 you will need the following dependencies installed: -- Python 2.7.9 or later (Python2 series) +- Python 3 - GCC and GNU make - virtualenv diff --git a/bin/CAVA.py b/bin/CAVA.py index 102b625..7ddb738 100755 --- a/bin/CAVA.py +++ b/bin/CAVA.py @@ -1,11 +1,11 @@ -#!env/bin/python +#!env/bin/python3 from optparse import OptionParser from cava_ import main from cava_ import helper # Version -version = 'v1.2.3' +version = 'v2.0.0' # Read default configuration file name from the default_config_path file default_config_file = helper.defaultConfigPath() diff --git a/bin/EnsemblDB.py b/bin/EnsemblDB.py index 69fac2b..0dfadaa 100644 --- a/bin/EnsemblDB.py +++ b/bin/EnsemblDB.py @@ -1,10 +1,10 @@ -#!env/bin/python +#!env/bin/python3 from optparse import OptionParser from ensembldb import main # Version -version = '1.2.3' +version = '2.0.0' # Command line argument parsing descr = 'CAVA ensembl_db v' + version diff --git a/bin/dbSNPDB.py b/bin/dbSNPDB.py index 62893fe..47d3be5 100755 --- a/bin/dbSNPDB.py +++ b/bin/dbSNPDB.py @@ -1,4 +1,4 @@ -#!env/bin/python +#!env/bin/python3 import os import sys @@ -54,7 +54,7 @@ def processData(options, datafile, IDs): sys.stdout.write('\rProcessing SNPs on all chrs ... OK') sys.stdout.flush() - print '' + print('') datafile.close() outfile.close() return counter @@ -83,16 +83,16 @@ def printMetaData(datafile): line = line.strip() if line.startswith('#'): if line.startswith('##dbSNP_BUILD_ID='): - print 'Input file build: '+line[17:] + print('Input file build: '+line[17:]) ret = int(line[17:]) - if line.startswith('##reference='): print 'Reference: '+line[12:]+'\n' + if line.startswith('##reference='): print('Reference: '+line[12:]+'\n') else: return ret ####################################################################################################################### # Version -version = '1.2.3' +version = '2.0.0' descr = 'CAVA dbsnp_db v' + version epilog = '\nExample usage: CAVA-{}/dbsnp_db -d 00-All.vcf.gz -s 138 -o out\n\n'.format(version) @@ -106,39 +106,39 @@ def printMetaData(datafile): # Check options if options.release is None: - print '\nError: no dbSNP release specified' - print 'Please use option -s to specify dbSNP release version\n' + print('\nError: no dbSNP release specified') + print('Please use option -s to specify dbSNP release version\n') quit() if options.data is None: - print '\nError: no 00-All.vcf.gz data file specified' - print 'Please use option -d to specify path to data file\n' + print('\nError: no 00-All.vcf.gz data file specified') + print('Please use option -d to specify path to data file\n') quit() if not os.path.isfile(options.data): - print '\nError: 00-All.vcf.gz file (' + options.data + ') cannot be found.\n' + print('\nError: 00-All.vcf.gz file (' + options.data + ') cannot be found.\n') quit() # Print out version information -print "\n---------------------------------------------------------------------------------------" -print 'CAVA ' + version + ' dbSNP database preparation tool (dbsnp_db) is now running.' -print 'Started: ', datetime.datetime.now(), '\n' +print("\n---------------------------------------------------------------------------------------") +print('CAVA ' + version + ' dbSNP database preparation tool (dbsnp_db) is now running.') +print('Started: ', datetime.datetime.now(), '\n') # Open data fie for reading -datafile = gzip.open(options.data, 'r') +datafile = gzip.open(options.data, 'rt') # Print out meta data build = printMetaData(datafile) datafile.seek(0) if int(options.release) > build: - print 'Error: requested release must be <=' + str(build) + '\n' + print('Error: requested release must be <=' + str(build) + '\n') quit() # Print out info -print 'Requested dbSNP release: ' + str(options.release) +print('Requested dbSNP release: ' + str(options.release)) IDs = [] if options.input is not None: IDs = readIDs(options.input) - print '\nInput file contains ' + str(len(IDs)) + ' dbSNP IDs to be included in the database' -print '' + print('\nInput file contains ' + str(len(IDs)) + ' dbSNP IDs to be included in the database') +print('') # Create compressed output file N = processData(options, datafile, IDs) @@ -150,13 +150,13 @@ def printMetaData(datafile): os.remove(options.output) # Print out summary information -print '\nA total of ' + str(N) + ' SNPs have been retrieved\n' -print '---------------------' -print 'Output files created:' -print '---------------------' -print options.output + '.gz (SNP database)' -print options.output + '.gz.tbi (index file)' -print '' -print 'CAVA dbsnp_db successfully finished: ', datetime.datetime.now() -print "---------------------------------------------------------------------------------------\n" +print('\nA total of ' + str(N) + ' SNPs have been retrieved\n') +print('---------------------') +print('Output files created:') +print('---------------------') +print(options.output + '.gz (SNP database)') +print(options.output + '.gz.tbi (index file)') +print('') +print('CAVA dbsnp_db successfully finished: ', datetime.datetime.now()) +print("---------------------------------------------------------------------------------------\n") diff --git a/cava_/conseq.py b/cava_/conseq.py index 2b10616..01b5d64 100755 --- a/cava_/conseq.py +++ b/cava_/conseq.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # CLASS annotation @@ -219,4 +219,4 @@ def checkUTR(transcript, variant): y = variant.pos + len(variant.ref) - 1 if transcript.isPositionOutsideCDS_5prime(x) or transcript.isPositionOutsideCDS_5prime(y): return 'UTR5' if transcript.isPositionOutsideCDS_3prime(x) or transcript.isPositionOutsideCDS_3prime(y): return 'UTR3' - return '' \ No newline at end of file + return '' diff --git a/cava_/core.py b/cava_/core.py index 9049b54..024285d 100755 --- a/cava_/core.py +++ b/cava_/core.py @@ -1,11 +1,11 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Collection of basic classes and functions ####################################################################################################################### -from __future__ import division + import os import logging import gzip @@ -342,7 +342,7 @@ def output(self, outformat, outfile, options, genelist, transcriptlist, snplist, # Writing record to the output file if stdout: - print '\t'.join(record) + print('\t'.join(record)) else: outfile.write('\t'.join(record) + '\n') @@ -381,7 +381,7 @@ def output(self, outformat, outfile, options, genelist, transcriptlist, snplist, # Writing record to the output file if stdout: - print record + rest + print(record + rest) else: outfile.write(record + rest + '\n') @@ -748,13 +748,13 @@ def read(self): line = line.strip() if line.startswith('@'): key = line[1:line.index('=')].strip() - if key in self.defs.keys(): + if key in list(self.defs.keys()): (typeofvar, default) = self.defs[key] if typeofvar == 'string': self.args[key] = line[line.find('=') + 1:].strip() if typeofvar == 'list': self.args[key] = line[line.find('=') + 1:].strip().split(',') if typeofvar == 'boolean': self.args[key] = (line[line.find('=') + 1:].strip().upper() == 'TRUE') - for key, (typeofvar, default) in self.defs.iteritems(): - if not key in self.args.keys(): self.args[key] = default + for key, (typeofvar, default) in self.defs.items(): + if not key in list(self.args.keys()): self.args[key] = default ####################################################################################################################### @@ -764,7 +764,7 @@ def read(self): # Reading gene, transcript or snp list from file def readSet(options, tag): ret = set() - if tag in options.args.keys() and not (options.args[tag] == '' or options.args[tag] == '.'): + if tag in list(options.args.keys()) and not (options.args[tag] == '' or options.args[tag] == '.'): for line in open(options.args[tag]): line = line.strip() if line == '' or line == '.': continue @@ -807,7 +807,7 @@ def writeHeader(options, header, outfile, stdout): if options.args['outputformat'] == 'VCF': if header == '': if stdout: - print '##fileformat=VCFv4.1\n' + dateline + '\n' + headerinfo + '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO' + print('##fileformat=VCFv4.1\n' + dateline + '\n' + headerinfo + '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO') else: outfile.write('##fileformat=VCFv4.1\n' + dateline + '\n' + headerinfo + '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n') else: @@ -821,7 +821,7 @@ def writeHeader(options, header, outfile, stdout): continue headerm.append(x) if stdout: - print startline + '\n' + dateline + '\n' + headerinfo + '\n'.join(headerm)+'\n' + print(startline + '\n' + dateline + '\n' + headerinfo + '\n'.join(headerm)+'\n') else: outfile.write(startline + '\n' + dateline + '\n' + headerinfo + '\n'.join(headerm)+'\n') @@ -845,7 +845,7 @@ def writeHeader(options, header, outfile, stdout): str += '\tDBSNP' if stdout: - print str + print(str) else: outfile.write(str + '\n') @@ -854,7 +854,7 @@ def writeHeader(options, header, outfile, stdout): def countRecords(filename): ret = 0 if filename.endswith('.gz'): - inputf = gzip.open(filename, 'r') + inputf = gzip.open(filename, 'rt') else: inputf = open(filename) for line in inputf: @@ -868,10 +868,10 @@ def checkOptions(options): # Checking if @inputformat was given correct value str = options.args['inputformat'].upper() if not (str == 'VCF' or str == 'TXT'): - print 'ERROR: incorrect value of the tag @inputformat.' - print '(Allowed values: \'VCF\' or \'TXT\')' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: incorrect value of the tag @inputformat.') + print('(Allowed values: \'VCF\' or \'TXT\')') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('Incorrect value of the tag @inputformat.') logging.info('No output file written. CAVA quit.') @@ -880,10 +880,10 @@ def checkOptions(options): # Checking if @outputformat was given correct value str = options.args['outputformat'].upper() if not (str == 'VCF' or str == 'TSV'): - print 'ERROR: incorrect value of the tag @outputformat.' - print '(Allowed values: \'VCF\' or \'TSV\')' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: incorrect value of the tag @outputformat.') + print('(Allowed values: \'VCF\' or \'TSV\')') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('Incorrect value of the tag @outputformat.') logging.info('No output file written. CAVA quit.') @@ -893,10 +893,10 @@ def checkOptions(options): str = options.args['type'].upper() if not ( str == 'ALL' or str == 'SUBSTITUTION' or str == 'INDEL' or str == 'INSERTION' or str == 'DELETION' or str == 'COMPLEX'): - print 'ERROR: incorrect value of the tag @type.' - print '(Allowed values: \'all\', \'substitution\', \'indel\', \'insertion\', \'deletion\' or \'complex\')' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: incorrect value of the tag @type.') + print('(Allowed values: \'all\', \'substitution\', \'indel\', \'insertion\', \'deletion\' or \'complex\')') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('Incorrect value of the tag @type.') logging.info('No output file written. CAVA quit.') @@ -905,10 +905,10 @@ def checkOptions(options): # Checking if @ssrange was given correct value ssrange = int(options.args['ssrange']) if not ssrange >= 6: - print 'ERROR: incorrect value of the tag @ssrange.' - print '(Minimum value allowed is 6.)' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: incorrect value of the tag @ssrange.') + print('(Minimum value allowed is 6.)') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('Incorrect value of the tag @ssrange.') logging.info('No output file written. CAVA quit.') @@ -917,10 +917,10 @@ def checkOptions(options): # Checking if @ontology was given correct value str = options.args['ontology'].upper() if not (str == 'CLASS' or str == 'SO' or str == 'BOTH'): - print 'ERROR: incorrect value of the tag @ontology.' - print '(Allowed values: \'CLASS\' or \'SO\' or \'both\')' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: incorrect value of the tag @ontology.') + print('(Allowed values: \'CLASS\' or \'SO\' or \'both\')') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('Incorrect value of the tag @ontology.') logging.info('No output file written. CAVA quit.') @@ -928,9 +928,9 @@ def checkOptions(options): # Checking if @reference file exists if not os.path.isfile(options.args['reference']): - print 'ERROR: the file given as @reference does not exist.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the file given as @reference does not exist.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The file given as @reference does not exist.') logging.info('No output file written. CAVA quit.') @@ -938,9 +938,9 @@ def checkOptions(options): # Checking if @reference index file exists if not os.path.isfile(options.args['reference'] + '.fai'): - print 'ERROR: the .fa.fai index file for @reference is not found.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the .fa.fai index file for @reference is not found.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The .fa.fai index file for @reference is not found.') logging.info('No output file written. CAVA quit.') @@ -949,9 +949,9 @@ def checkOptions(options): # Checking if @ensembl file exists if not (options.args['ensembl'] == '.' or options.args['ensembl'] == '') and not os.path.isfile( options.args['ensembl']): - print 'ERROR: the file given as @ensembl does not exist.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the file given as @ensembl does not exist.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The file given as @ensembl does not exist.') logging.info('No output file written. CAVA quit.') @@ -960,9 +960,9 @@ def checkOptions(options): # Checking if @ensembl index file exists if not (options.args['ensembl'] == '.' or options.args['ensembl'] == '') and not os.path.isfile( options.args['ensembl'] + '.tbi'): - print 'ERROR: the .gz.tbi index file for @ensembl is not found.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the .gz.tbi index file for @ensembl is not found.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The .gz.tbi index file for @ensembl is not found.') logging.info('No output file written. CAVA quit.') @@ -970,9 +970,9 @@ def checkOptions(options): # Checking if @dbsnp file exists if not (options.args['dbsnp'] == '.' or options.args['dbsnp'] == '') and not os.path.isfile(options.args['dbsnp']): - print 'ERROR: the file given as @dbsnp does not exist.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the file given as @dbsnp does not exist.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The file given as @dbsnp does not exist.') logging.info('No output file written. CAVA quit.') @@ -981,9 +981,9 @@ def checkOptions(options): # Checking if @dbsnp index file exists if not (options.args['dbsnp'] == '.' or options.args['dbsnp'] == '') and not os.path.isfile( options.args['dbsnp'] + '.tbi'): - print 'ERROR: the .gz.tbi index file for @dbsnp is not found.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the .gz.tbi index file for @dbsnp is not found.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The .gz.tbi index file for @dbsnp is not found.') logging.info('No output file written. CAVA quit.') @@ -992,9 +992,9 @@ def checkOptions(options): # Checking if @target file exists if not (options.args['target'] == '.' or options.args['target'] == '') and not os.path.isfile( options.args['target']): - print 'ERROR: the file given as @target does not exist.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the file given as @target does not exist.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The file given as @target does not exist.') logging.info('No output file written. CAVA quit.') @@ -1003,9 +1003,9 @@ def checkOptions(options): # Checking if @target index file exists if not (options.args['target'] == '.' or options.args['target'] == '') and not os.path.isfile( options.args['target'] + '.tbi'): - print 'ERROR: the .bed.tbi index file for @target is not found.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the .bed.tbi index file for @target is not found.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The .bed.tbi index file for @target is not found.') logging.info('No output file written. CAVA quit.') @@ -1014,9 +1014,9 @@ def checkOptions(options): # Checking if @genelist file exists if not (options.args['genelist'] == '.' or options.args['genelist'] == '') and not os.path.isfile( options.args['genelist']): - print 'ERROR: the file given as @genelist does not exist.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the file given as @genelist does not exist.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The file given as @genelist does not exist.') logging.info('No output file written. CAVA quit.') @@ -1025,9 +1025,9 @@ def checkOptions(options): # Checking if @transcriptlist file exists if not (options.args['transcriptlist'] == '.' or options.args['transcriptlist'] == '') and not os.path.isfile( options.args['transcriptlist']): - print 'ERROR: the file given as @transcriptlist does not exist.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the file given as @transcriptlist does not exist.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The file given as @transcriptlist does not exist.') logging.info('No output file written. CAVA quit.') @@ -1036,9 +1036,9 @@ def checkOptions(options): # Checking if @snplist file exists if not (options.args['snplist'] == '.' or options.args['snplist'] == '') and not os.path.isfile( options.args['snplist']): - print 'ERROR: the file given as @snplist does not exist.' - print '\nNo output file written. CAVA quit.' - print "--------------------------------------------------------------------\n" + print('ERROR: the file given as @snplist does not exist.') + print('\nNo output file written. CAVA quit.') + print("--------------------------------------------------------------------\n") if options.args['logfile']: logging.error('The file given as @snplist does not exist.') logging.info('No output file written. CAVA quit.') diff --git a/cava_/csn.py b/cava_/csn.py index 89fc1e5..77ea7ca 100755 --- a/cava_/csn.py +++ b/cava_/csn.py @@ -1,10 +1,10 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # CSN annotation ####################################################################################################################### -import core +from . import core # Class representing a CSN annotation @@ -175,7 +175,7 @@ def makeProteinString(variant, transcript, reference, prot, mutprot, coord1): # Checking if there was no change in protein sequence if prot == mutprot: - idx = coord1 / 3 + idx = int(coord1 / 3) if coord1 % 3 > 0: idx += 1 return '_p.=', (str(idx), prot[idx-1], prot[idx-1]) @@ -281,7 +281,7 @@ def transformToCSNCoordinate(pos, transcript): if transcript.strand == 1: # Checking if genomic position is within intron if prevExonEnd < pos < exon.start + 1: - if pos <= (exon.start + 1 - prevExonEnd) / 2 + prevExonEnd: + if pos <= int((exon.start + 1 - prevExonEnd) / 2) + prevExonEnd: x, y = transformToCSNCoordinate(prevExonEnd, transcript) return x, pos - prevExonEnd else: @@ -290,7 +290,7 @@ def transformToCSNCoordinate(pos, transcript): else: # Checking if genomic position is within intron if exon.end < pos < prevExonEnd: - if pos >= (prevExonEnd - exon.end + 1) / 2 + exon.end: + if pos >= int((prevExonEnd - exon.end + 1) / 2) + exon.end: x, y = transformToCSNCoordinate(prevExonEnd, transcript) return x, prevExonEnd - pos else: @@ -320,7 +320,7 @@ def transformToCSNCoordinate(pos, transcript): if transcript.strand == 1: # Checking if genomic position is within intron if prevExonEnd < pos < exon.start + 1: - if pos <= (exon.start + 1 - prevExonEnd) / 2 + prevExonEnd: + if pos <= int((exon.start + 1 - prevExonEnd) / 2) + prevExonEnd: x, y = transformToCSNCoordinate(prevExonEnd, transcript) return x, pos - prevExonEnd else: @@ -329,7 +329,7 @@ def transformToCSNCoordinate(pos, transcript): else: # Checking if genomic position is within intron if exon.end < pos < prevExonEnd: - if pos >= (prevExonEnd - exon.end + 1) / 2 + exon.end: + if pos >= int((prevExonEnd - exon.end + 1) / 2) + exon.end: x, y = transformToCSNCoordinate(prevExonEnd, transcript) return x, prevExonEnd - pos else: diff --git a/cava_/data.py b/cava_/data.py index 5fa95ef..7d03f72 100755 --- a/cava_/data.py +++ b/cava_/data.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Classes providing interfaces with annotation databases and the reference genome @@ -7,9 +7,9 @@ import os import sys -import conseq -import core -import csn +from . import conseq +from . import core +from . import csn sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + '/pysamdir') import pysam @@ -69,22 +69,22 @@ def findTranscripts(self, variant, strand, reference): hitdict2[transcript.TRANSCRIPT] = transcript # Find transcripts with which the variant fully or partially overlaps - for key, transcript in hitdict1.iteritems(): + for key, transcript in hitdict1.items(): if len(self.genelist) > 0 and transcript.geneSymbol not in self.genelist: continue if len(self.transcriptlist) > 0 and transcript.TRANSCRIPT not in self.transcriptlist: continue - if key in hitdict2.keys(): + if key in list(hitdict2.keys()): ret[key] = transcript else: if not variant.isInsertion(): retOUT[key] = transcript if not variant.isInsertion(): - for key, transcript in hitdict2.iteritems(): + for key, transcript in hitdict2.items(): if len(self.genelist) > 0 and transcript.geneSymbol not in self.genelist: continue if len(self.transcriptlist) > 0 and transcript.TRANSCRIPT not in self.transcriptlist: continue - if not key in hitdict1.keys(): + if not key in list(hitdict1.keys()): retOUT[key] = transcript else: @@ -190,8 +190,8 @@ def annotate(self, variant, reference, impactdir): transcripts_plus, transcriptsOUT_plus = self.findTranscripts(variant_plus, 1, reference) transcripts_minus, transcriptsOUT_minus = self.findTranscripts(variant_minus, -1, reference) - transcripts = set(transcripts_plus.keys() + transcripts_minus.keys()) - transcriptsOUT = set(transcriptsOUT_plus.keys() + transcriptsOUT_minus.keys()) + transcripts = set(list(transcripts_plus.keys()) + list(transcripts_minus.keys())) + transcriptsOUT = set(list(transcriptsOUT_plus.keys()) + list(transcriptsOUT_minus.keys())) transcripts = sorted(list(transcripts)) transcriptsOUT = sorted(list(transcriptsOUT)) @@ -201,7 +201,7 @@ def annotate(self, variant, reference, impactdir): if TRANSCRIPT in transcripts: continue - if TRANSCRIPT in transcriptsOUT_plus.keys(): transcript = transcriptsOUT_plus[TRANSCRIPT] + if TRANSCRIPT in list(transcriptsOUT_plus.keys()): transcript = transcriptsOUT_plus[TRANSCRIPT] else: transcript = transcriptsOUT_minus[TRANSCRIPT] if len(TRANSCRIPTstring) > 0: @@ -228,10 +228,10 @@ def annotate(self, variant, reference, impactdir): TRINFOstring += transcript.TRINFO if transcript.strand == 1: - if TRANSCRIPT in transcriptsOUT_plus.keys(): LOCstring += 'OUT' + if TRANSCRIPT in list(transcriptsOUT_plus.keys()): LOCstring += 'OUT' else: LOCstring += '.' else: - if TRANSCRIPT in transcriptsOUT_minus.keys(): LOCstring += 'OUT' + if TRANSCRIPT in list(transcriptsOUT_minus.keys()): LOCstring += 'OUT' else: LOCstring += '.' CSNstring += '.' @@ -250,7 +250,7 @@ def annotate(self, variant, reference, impactdir): # Iterating through the list of transcripts for TRANSCRIPT in transcripts: - if TRANSCRIPT in transcripts_plus.keys(): transcript = transcripts_plus[TRANSCRIPT] + if TRANSCRIPT in list(transcripts_plus.keys()): transcript = transcripts_plus[TRANSCRIPT] else: transcript = transcripts_minus[TRANSCRIPT] # Separating annotations by different transcripts with colon @@ -279,13 +279,13 @@ def annotate(self, variant, reference, impactdir): TRINFOstring += transcript.TRINFO # Creating the LOC annotation - if TRANSCRIPT in transcripts_plus.keys(): loc_plus = transcript.whereIsThisVariant(variant_plus) - elif TRANSCRIPT in transcriptsOUT_plus.keys(): loc_plus = 'OUT' + if TRANSCRIPT in list(transcripts_plus.keys()): loc_plus = transcript.whereIsThisVariant(variant_plus) + elif TRANSCRIPT in list(transcriptsOUT_plus.keys()): loc_plus = 'OUT' else: loc_plus = '.' if difference: - if TRANSCRIPT in transcripts_minus.keys(): loc_minus = transcript.whereIsThisVariant(variant_minus) - elif TRANSCRIPT in transcriptsOUT_minus.keys(): loc_minus = 'OUT' + if TRANSCRIPT in list(transcripts_minus.keys()): loc_minus = transcript.whereIsThisVariant(variant_minus) + elif TRANSCRIPT in list(transcriptsOUT_minus.keys()): loc_minus = 'OUT' else: loc_minus = '.' else: loc_minus = loc_plus @@ -299,7 +299,7 @@ def annotate(self, variant, reference, impactdir): if notexonic_plus and notexonic_minus: protein = '' else: - if not transcript.TRANSCRIPT in self.proteinSeqs.keys(): + if not transcript.TRANSCRIPT in list(self.proteinSeqs.keys()): protein, exonseqs = transcript.getProteinSequence(reference, None, None) if len(self.proteinSeqs) > 5: @@ -327,13 +327,13 @@ def annotate(self, variant, reference, impactdir): # Creating the CSN annotations both for left and right aligned variant - if TRANSCRIPT in transcripts_plus.keys(): + if TRANSCRIPT in list(transcripts_plus.keys()): csn_plus, protchange_plus = csn.getAnnotation(variant_plus, transcript, reference, protein, mutprotein_plus) csn_plus_str = csn_plus.getAsString() else: csn_plus_str, protchange_plus = '.', ('.','.','.') if difference: - if TRANSCRIPT in transcripts_minus.keys(): + if TRANSCRIPT in list(transcripts_minus.keys()): csn_minus, protchange_minus = csn.getAnnotation(variant_minus, transcript, reference, protein, mutprotein_minus) csn_minus_str = csn_minus.getAsString() else: csn_minus_str, protchange_minus = '.', ('.','.','.') @@ -353,12 +353,12 @@ def annotate(self, variant, reference, impactdir): if not impactdir == None or self.options.args['ontology'].upper() in ['CLASS', 'BOTH']: # Creating the CLASS annotations both for left and right aligned variant - if TRANSCRIPT in transcripts_plus.keys(): + if TRANSCRIPT in list(transcripts_plus.keys()): class_plus = conseq.getClassAnnotation(variant_plus, transcript, protein, mutprotein_plus, loc_plus, int(self.options.args['ssrange'])) else: class_plus = '.' if difference: - if TRANSCRIPT in transcripts_minus.keys(): + if TRANSCRIPT in list(transcripts_minus.keys()): class_minus = conseq.getClassAnnotation(variant_minus, transcript, protein, mutprotein_minus, loc_minus, int(self.options.args['ssrange'])) else: class_minus = '.' else: class_minus = class_plus @@ -367,14 +367,14 @@ def annotate(self, variant, reference, impactdir): # Determining the IMPACT flag if not impactdir == None: - if TRANSCRIPT in transcripts_plus.keys(): - if class_plus in impactdir.keys(): impact_plus = impactdir[class_plus] + if TRANSCRIPT in list(transcripts_plus.keys()): + if class_plus in list(impactdir.keys()): impact_plus = impactdir[class_plus] else: impact_plus = 'None' else: impact_plus = '.' - if TRANSCRIPT in transcripts_minus.keys(): - if class_minus in impactdir.keys(): impact_minus = impactdir[class_minus] + if TRANSCRIPT in list(transcripts_minus.keys()): + if class_minus in list(impactdir.keys()): impact_minus = impactdir[class_minus] else: impact_minus = 'None' else: impact_minus = '.' @@ -383,12 +383,12 @@ def annotate(self, variant, reference, impactdir): if self.options.args['ontology'].upper() in ['SO', 'BOTH']: # Creating the SO annotations both for left and right aligned variant - if TRANSCRIPT in transcripts_plus.keys(): + if TRANSCRIPT in list(transcripts_plus.keys()): so_plus = conseq.getSequenceOntologyAnnotation(variant_plus, transcript, protein, mutprotein_plus, loc_plus) else: so_plus = '.' if difference: - if TRANSCRIPT in transcripts_minus.keys(): + if TRANSCRIPT in list(transcripts_minus.keys()): so_minus = conseq.getSequenceOntologyAnnotation(variant_minus, transcript, protein, mutprotein_minus, loc_minus) else: so_minus = '.' diff --git a/cava_/main.py b/cava_/main.py index 6669ce3..15b164b 100644 --- a/cava_/main.py +++ b/cava_/main.py @@ -1,6 +1,3 @@ - -from __future__ import division - import datetime import gzip import logging @@ -9,20 +6,20 @@ import sys import pysam -import core +from . import core from cava_.data import Ensembl from cava_.data import Reference from cava_.data import dbSNP -from core import Options -from core import Record +from .core import Options +from .core import Record # Printing out welcome meassage def printStartInfo(ver): starttime = datetime.datetime.now() - print "\n-----------------------------------------------------------------------" - print 'CAVA (Clinical Annotation of VAriants) ' + ver + ' is now running.' - print 'Started: ', str(starttime), '\n' + print("\n-----------------------------------------------------------------------") + print('CAVA (Clinical Annotation of VAriants) ' + ver + ' is now running.') + print('Started: ', str(starttime), '\n') return starttime @@ -32,12 +29,12 @@ def printInputFileNames(copts, options): outfn = copts.output + '.vcf' else: outfn = copts.output + '.txt' - print 'Configuration file: ' + copts.conf - print 'Input file (' + options.args['inputformat'] + '): ' + copts.input - print 'Output file (' + options.args['outputformat'] + '): ' + outfn - if options.args['logfile']: print 'Log file: ' + copts.output + '.log' + print('Configuration file: ' + copts.conf) + print('Input file (' + options.args['inputformat'] + '): ' + copts.input) + print('Output file (' + options.args['outputformat'] + '): ' + outfn) + if options.args['logfile']: print('Log file: ' + copts.output + '.log') if copts.threads > 1: - print '\nMultithreading: ' + str(copts.threads) + ' threads' + print('\nMultithreading: ' + str(copts.threads) + ' threads') if options.args['logfile']: logging.info('Configuration file - ' + copts.conf) @@ -52,7 +49,7 @@ def printInputFileNames(copts, options): # Printing out number of records in the input file def printNumOfRecords(numOfRecords): - print '\nInput file contains ' + str(numOfRecords) + ' records to annotate.\n' + print('\nInput file contains ' + str(numOfRecords) + ' records to annotate.\n') # Initializing progress information @@ -73,7 +70,7 @@ def printProgressInfo(counter, numOfRecords): def finalizeProgressInfo(): sys.stdout.write('\rAnnotating variants ... 100.0%') sys.stdout.flush() - print ' - Done.' + print(' - Done.') # Printing out goodbye message @@ -83,11 +80,11 @@ def printEndInfo(options, copts, starttime): outfn = copts.output + '.vcf' else: outfn = copts.output + '.txt' - print '\n(Size of output file: ' + str(round(os.stat(outfn).st_size / 1000, 1)) + ' Kbyte)' - print '\nCAVA (Clinical Annotation of VAriants) successfully finished.' - print 'Ended: ', str(endtime) - print 'Total runtime: '+str(endtime-starttime) - print "-----------------------------------------------------------------------\n" + print('\n(Size of output file: ' + str(round(os.stat(outfn).st_size / 1000, 1)) + ' Kbyte)') + print('\nCAVA (Clinical Annotation of VAriants) successfully finished.') + print('Ended: ', str(endtime)) + print('Total runtime: '+str(endtime-starttime)) + print("-----------------------------------------------------------------------\n") if options.args['logfile']: logging.info('100% of records annotated.') if not copts.stdout: logging.info('Output file = ' + str(round(os.stat(outfn).st_size / 1000, 1)) + ' Kbyte') @@ -100,7 +97,7 @@ def findFileBreaks(inputf, threads): started = False counter = 0 - if inputf.endswith('.gz'): infile = gzip.open(inputf, 'r') + if inputf.endswith('.gz'): infile = gzip.open(inputf, 'rt') else: infile = open(inputf) for line in infile: @@ -123,7 +120,7 @@ def findFileBreaks(inputf, threads): def readHeader(inputfn): ret = [] - if inputfn.endswith('.gz'): infile = gzip.open(inputfn, 'r') + if inputfn.endswith('.gz'): infile = gzip.open(inputfn, 'rt') else: infile = open(inputfn) for line in infile: @@ -197,7 +194,7 @@ def __init__(self, threadidx, options, copts, startline, endline, genelist, tran # Input file if copts.input.endswith('.gz'): - self.infile = gzip.open(copts.input, 'r') + self.infile = gzip.open(copts.input, 'rt') else: self.infile = open(copts.input) @@ -310,14 +307,14 @@ def run(copts, version, default_config_file): # Check if input and configuration files exist if copts.conf is None: - print '\nError: no configuration file specified.' - print 'Please use option -c or add the absolute path to the default_config_path file.\n' + print('\nError: no configuration file specified.') + print('Please use option -c or add the absolute path to the default_config_path file.\n') quit() if not os.path.isfile(copts.conf): - print '\nError: configuration file (' + copts.conf + ') cannot be found.\n' + print('\nError: configuration file (' + copts.conf + ') cannot be found.\n') quit() if not os.path.isfile(copts.input): - print '\nError: input file (' + copts.input + ') cannot be found.\n' + print('\nError: input file (' + copts.input + ') cannot be found.\n') quit() # Reading options from configuration file @@ -393,4 +390,4 @@ def run(copts, version, default_config_file): # Printing out summary information and end time if not copts.stdout: - printEndInfo(options, copts, starttime) \ No newline at end of file + printEndInfo(options, copts, starttime) diff --git a/ensembldb/main.py b/ensembldb/main.py index 29ba886..be22077 100644 --- a/ensembldb/main.py +++ b/ensembldb/main.py @@ -1,13 +1,11 @@ -from __future__ import division import os import datetime import sys -import urllib +import urllib.request, urllib.parse, urllib.error import gzip import pysam from operator import itemgetter - # Class representing a transcript class Transcript(object): @@ -22,8 +20,8 @@ def __init__(self): self.POSEND = None self.GENETYPE = None self.TRANSTYPE = None - self.CODING_START = None - self.CODING_END = None + self.CODING_START = -1 + self.CODING_END = -1 self.CODING_START_RELATIVE = None self.CCDS = None self.EXONS = [] @@ -37,7 +35,7 @@ def getInfoString(self): if self.STRAND == '1': ret = '+/' else: ret = '-/' cdna = self.getcDNALength() - return ret+str(round((self.POSEND-self.POS+1)/1000,1))+'kb/'+str(len(self.EXONS))+'/'+str(round(cdna/1000,1))+'kb/'+str(self.getProteinLength()) + return ret+str(self.POSEND-self.POS+1)+'bp/'+str(len(self.EXONS))+'/'+str(cdna)+'bp/'+str(self.getProteinLength()) # Get cDNA length of the transcript @@ -54,10 +52,14 @@ def getProteinLength(self): for exon in self.EXONS: if exon.END < self.CODING_START: continue if exon.START > self.CODING_END: continue - if exon.START <= self.CODING_START <= exon.END: start = self.CODING_START - else: start = exon.START + 1 - if exon.START <= self.CODING_END <= exon.END: end = self.CODING_END - else: end = exon.END + if exon.START <= self.CODING_START <= exon.END: + start = self.CODING_START + else: + start = exon.START + 1 + if exon.START <= self.CODING_END <= exon.END: + end = self.CODING_END + else: + end = exon.END codingdna += end - start + 1 else: for exon in self.EXONS: @@ -73,8 +75,10 @@ def getProteinLength(self): # Check if it is a candidate transcript def isCandidate(self): - if not (self.GENETYPE=='protein_coding' and self.TRANSTYPE=='protein_coding'): return False - return (self.CODING_START is not None and self.CODING_END is not None) and self.isComplete + if not (self.GENETYPE=='protein_coding' and self.TRANSTYPE=='protein_coding'): + return False + # return (self.CODING_START is not None and self.CODING_END is not None) and self.isComplete + return (self.CODING_START > -1 and self.CODING_END > -1) and self.isComplete # Output transcript @@ -138,18 +142,27 @@ def __init__(self, symbol, ensg): def selectTranscript(self): ccds_set = [] nonccds_set = [] - for enst,transcript in self.TRANSCRIPTS.iteritems(): + for enst,transcript in self.TRANSCRIPTS.items(): if transcript.CCDS: ccds_set.append(transcript) else: nonccds_set.append(transcript) if len(ccds_set) > 0: candidates = ccds_set else: candidates = nonccds_set + # Sort candidates by ENST. In case there multiple selectable + # transcripts, the selected one does not depend on the order they are + # returned by .items() + candidates.sort(key=lambda x: x.ENST) + selected = Transcript() selected.PROTL = selected.CDNAL = -1 for t in candidates: - if t.PROTL > selected.PROTL: selected = t - elif t.PROTL == selected.PROTL and t.CDNAL > selected.CDNAL: selected = t + # Note that we return the *last* selected candidate since we + # overwrite the variable `selected`. + if t.PROTL > selected.PROTL: + selected = t + elif t.PROTL == selected.PROTL and t.CDNAL > selected.CDNAL: + selected = t return selected @@ -157,17 +170,17 @@ def selectTranscript(self): # Output all or selected transcripts def output(self, outfile, outfile_list, select, mcg_transcripts): if select: - if self.SYMBOL in mcg_transcripts.keys(): + if self.SYMBOL in list(mcg_transcripts.keys()): ok = False - for _,transcript in self.TRANSCRIPTS.iteritems(): + for _,transcript in self.TRANSCRIPTS.items(): if transcript.ENST in mcg_transcripts[self.SYMBOL]: transcript.output(outfile,outfile_list) ok = True - if self.SYMBOL not in mcg_transcripts.keys() or not ok: + if self.SYMBOL not in list(mcg_transcripts.keys()) or not ok: transcript = self.selectTranscript() transcript.output(outfile,outfile_list) else: - for _,transcript in self.TRANSCRIPTS.iteritems(): + for _,transcript in self.TRANSCRIPTS.items(): transcript.output(outfile,outfile_list) @@ -209,11 +222,11 @@ def sortRecords(records, idx1, idx2): chroms = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', 'MT', 'X', 'Y'] for i in range(len(chroms)): chrom = chroms[i] - if chrom in records.keys(): + if chrom in list(records.keys()): records[chrom] = sorted(records[chrom], key=itemgetter(idx1,idx2)) for i in range(len(chroms)): chrom = chroms[i] - if chrom in records.keys(): + if chrom in list(records.keys()): for record in records[chrom]: ret.append(record) return ret @@ -245,7 +258,7 @@ def process_data(options, genome_build, version): line = line.strip() if line == '': continue cols = line.split('\t') - if cols[0] not in mcg_transcripts.keys(): mcg_transcripts[cols[0]] = set() + if cols[0] not in list(mcg_transcripts.keys()): mcg_transcripts[cols[0]] = set() mcg_transcripts[cols[0]].add(cols[1]) # Changing transcript for certain releases if int(options.ensembl)>=71: mcg_transcripts['BMPR1A'] = {'ENST00000372037'} @@ -261,11 +274,11 @@ def process_data(options, genome_build, version): transIDs = set() if options.input is not None: transIDs = readTranscriptIDs(options.input) - print '\nOnly ' + str(len(transIDs)) + ' transcripts read from ' + options.input + ' are considered\n' - else: print '\nAll transcripts from the Ensembl release are considered\n' + print('\nOnly ' + str(len(transIDs)) + ' transcripts read from ' + options.input + ' are considered\n') + else: print('\nAll transcripts from the Ensembl release are considered\n') # Print out info - if options.select: print 'Transcript selection switched on\n' + if options.select: print('Transcript selection switched on\n') # Load candidate and CCDS data for Ensembl <75 candidates = dict() @@ -275,7 +288,7 @@ def process_data(options, genome_build, version): line=line.strip() if line=='': continue cols = line.split('\t') - if cols[0] not in candidates.keys(): candidates[cols[0]]=dict() + if cols[0] not in list(candidates.keys()): candidates[cols[0]]=dict() candidates[cols[0]][cols[1]]=int(cols[2]) # Download Ensembl data @@ -283,9 +296,9 @@ def process_data(options, genome_build, version): sys.stdout.flush() url = 'ftp://ftp.ensembl.org/pub/release-' + options.ensembl + '/gtf/homo_sapiens/Homo_sapiens.' + genome_build + '.' + options.ensembl + '.gtf.gz' try: - urllib.urlretrieve(url, 'ensembl_data.gz') + urllib.request.urlretrieve(url, 'ensembl_data.gz') except: - print '\n\nCannot connect to Ensembl FTP site. No internet connection?\n' + print('\n\nCannot connect to Ensembl FTP site. No internet connection?\n') quit() sys.stdout.write('OK\n') @@ -296,7 +309,7 @@ def process_data(options, genome_build, version): first = True prevenst = '' transcript = None - for line in gzip.open('ensembl_data.gz', 'r'): + for line in gzip.open('ensembl_data.gz', 'rt'): line = line.strip() if line.startswith('#'): continue cols = line.split('\t') @@ -322,7 +335,8 @@ def process_data(options, genome_build, version): if not first: transcript.finalize() if transcript.isCandidate(): - if transcript.ENSG not in genesdata.keys(): genesdata[transcript.ENSG] = Gene(transcript.GENE, transcript.ENSG) + if transcript.ENSG not in list(genesdata.keys()): + genesdata[transcript.ENSG] = Gene(transcript.GENE, transcript.ENSG) genesdata[transcript.ENSG].TRANSCRIPTS[transcript.ENST] = transcript # Initialize new Transcript object @@ -358,20 +372,20 @@ def process_data(options, genome_build, version): if cols[2] == 'start_codon': if transcript.STRAND == '1': - if transcript.CODING_START is None or int(cols[3]) < transcript.CODING_START: transcript.CODING_START = int(cols[3]) + if transcript.CODING_START < 0 or int(cols[3]) < transcript.CODING_START: transcript.CODING_START = int(cols[3]) else: - if transcript.CODING_START is None or int(cols[4]) > transcript.CODING_START: transcript.CODING_START = int(cols[4]) + if transcript.CODING_START < 0 or int(cols[4]) > transcript.CODING_START: transcript.CODING_START = int(cols[4]) if cols[2] == 'stop_codon': if transcript.STRAND == '1': - if transcript.CODING_END is None or int(cols[4]) > transcript.CODING_END: transcript.CODING_END = int(cols[4]) + if transcript.CODING_END < 0 or int(cols[4]) > transcript.CODING_END: transcript.CODING_END = int(cols[4]) else: - if transcript.CODING_END is None or int(cols[3]) < transcript.CODING_END: transcript.CODING_END = int(cols[3]) + if transcript.CODING_END < 0 or int(cols[3]) < transcript.CODING_END: transcript.CODING_END = int(cols[3]) # Check if transcript is complete and is a CCDS transcript if transcript.isComplete is None: if int(options.ensembl) < 75: - if transcript.ENST in candidates[transcript.CHROM].keys(): + if transcript.ENST in list(candidates[transcript.CHROM].keys()): transcript.CCDS = (candidates[transcript.CHROM][transcript.ENST] == 1) transcript.isComplete = True else: @@ -388,14 +402,14 @@ def process_data(options, genome_build, version): if transcript is not None: transcript.finalize() if transcript.isCandidate(): - if transcript.ENSG not in genesdata.keys(): genesdata[transcript.ENSG] = Gene(transcript.GENE, transcript.ENSG) + if transcript.ENSG not in list(genesdata.keys()): genesdata[transcript.ENSG] = Gene(transcript.GENE, transcript.ENSG) genesdata[transcript.ENSG].TRANSCRIPTS[transcript.ENST] = transcript # If no transcript ID from the input file was found in the Ensembl release if len(genesdata) == 0: - print '\n\nNo transcripts from '+options.input+' found in Ensembl release.' - print '\nNo transcript database created.' - print "-----------------------------------------------------------------\n" + print('\n\nNo transcripts from '+options.input+' found in Ensembl release.') + print('\nNo transcript database created.') + print("-----------------------------------------------------------------\n") os.remove('ensembl_data.gz') quit() @@ -408,7 +422,7 @@ def process_data(options, genome_build, version): outfile_list.write('ENSG\tGENE\tENST\n') # Output transcripts of each gene - for ensg, gene in genesdata.iteritems(): gene.output(outfile,outfile_list,options.select,mcg_transcripts) + for ensg, gene in genesdata.items(): gene.output(outfile,outfile_list,options.select,mcg_transcripts) # Close temporary output files outfile.close() @@ -423,7 +437,7 @@ def process_data(options, genome_build, version): line.rstrip() record = line.split('\t') record[6] = int(record[6]) - if record[4] in data.keys(): + if record[4] in list(data.keys()): data[record[4]].append(record) else: data[record[4]] = [] @@ -472,18 +486,18 @@ def is_number(s): def run(options, version): # Checking if all required options specified if options.ensembl is None: - print '\nError: no Ensembl release specified. Use option -h to get help!\n' + print('\nError: no Ensembl release specified. Use option -h to get help!\n') quit() if not is_number(options.ensembl): - print '\nError: Ensembl release specified is not an integer. Use option -h to get help!\n' + print('\nError: Ensembl release specified is not an integer. Use option -h to get help!\n') quit() if options.output is None: - print '\nError: no output file name specified. Use option -h to get help!\n' + print('\nError: no output file name specified. Use option -h to get help!\n') quit() # Must use Ensembl release >= 70 if not (int(options.ensembl) >= 70 or int(options.ensembl) == 65): - print '\nError: This version works with Ensembl v65 or >= v70.\n' + print('\nError: This version works with Ensembl v65 or >= v70.\n') quit() # Genome build @@ -491,17 +505,17 @@ def run(options, version): genome_build = 'GRCh37' if int(options.ensembl) <= 75 else 'GRCh38' # Printing out version information - print "\n---------------------------------------------------------------------------------------" - print 'CAVA ' + version + ' transcript database preparation tool (ensembl_db) is now running' - print 'Started: ', datetime.datetime.now(), '\n' + print("\n---------------------------------------------------------------------------------------") + print('CAVA ' + version + ' transcript database preparation tool (ensembl_db) is now running') + print('Started: ', datetime.datetime.now(), '\n') # Print info - print 'Ensembl version: ' + options.ensembl - print 'Reference genome: ' + genome_build + print('Ensembl version: ' + options.ensembl) + print('Reference genome: ' + genome_build) # Creating compressed output file Nretrieved = process_data(options, genome_build, version) - print '\nA total of ' + str(Nretrieved) + ' transcripts have been retrieved\n' + print('\nA total of ' + str(Nretrieved) + ' transcripts have been retrieved\n') # Indexing output file with Tabix indexFile(options) @@ -510,14 +524,14 @@ def run(options, version): os.remove(options.output) # Printing out summary information - print '' - print '---------------------' - print 'Output files created:' - print '---------------------' - print options.output + '.gz (transcript database)' - print options.output + '.gz.tbi (index file)' - print options.output + '.txt (list of transcripts)' - - print '' - print 'CAVA ensembl_db successfully finished: ', datetime.datetime.now() - print "---------------------------------------------------------------------------------------\n" + print('') + print('---------------------') + print('Output files created:') + print('---------------------') + print(options.output + '.gz (transcript database)') + print(options.output + '.gz.tbi (index file)') + print(options.output + '.txt (list of transcripts)') + + print('') + print('CAVA ensembl_db successfully finished: ', datetime.datetime.now()) + print("---------------------------------------------------------------------------------------\n") diff --git a/install.sh b/install.sh index 53797aa..f1ff5a3 100755 --- a/install.sh +++ b/install.sh @@ -4,7 +4,7 @@ unset PYTHONPATH ABSOLUTE_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) PIP_ARGS='--no-cache-dir --ignore-installed --force-reinstall' -virtualenv -p python2.7 env +virtualenv -p python3 env source ${ABSOLUTE_PATH}/env/bin/activate diff --git a/requirements.txt b/requirements.txt index 99b373c..a890ffd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ pep8==1.7.0 py==1.4.32 pylint==1.7.2 pyparsing==2.2.0 -pysam==0.7.7 +pysam==0.15.0 pytest==3.0.7 pytest-cov>=2.2.0 radon==2.0.2 diff --git a/setup.py b/setup.py index 44dc8d8..c384b48 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name= 'CAVA', - version = '1.2.3', + version = '2.0.0', description = 'CAVA (Clinical Annotation of VAriants)', url = 'https://github.com/RahmanTeam/CAVA', author = 'RahmanTeam', diff --git a/test.sh b/test.sh new file mode 100644 index 0000000..70f18b6 --- /dev/null +++ b/test.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +# Test CAVA suite. After having installed CAVA, execute this script as: +# +# bash test.sh + +set -e +set -o pipefail + +# Set up + +if [ ! -f test/tmp.input.vcf.gz ] +then + curl ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/common_all_20180418.vcf.gz \ + | gunzip \ + | awk '$1 ~ "^#" || NR % 1000 == 0' \ + | gzip > test/tmp.input.vcf.gz +fi + +if [ ! -f test/tmp.hg38.fa ] +then + curl http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz | gunzip > test/tmp.hg38.fa + samtools faidx test/tmp.hg38.fa +fi + +# Test: prepare database +./ensembl_db -e 75 -o test/tmp.db + +# Lengths in bp not kb: +gunzip -c test/tmp.db.gz | grep -F '+/919bp/1/918bp/305' > /dev/null + +# Test: annotate +./cava -i test/tmp.input.vcf.gz -o test/tmp. -c test/CAVA_config.txt -t 8 + +# Tear down +# ========= + +rm test/tmp.* diff --git a/test/CAVA_config.txt b/test/CAVA_config.txt new file mode 100644 index 0000000..8250d73 --- /dev/null +++ b/test/CAVA_config.txt @@ -0,0 +1,77 @@ +# Input file format +# Possible values: VCF or TXT | Optional: yes | Default value: VCF +@inputformat = VCF + +# Output file format +# Possible values: VCF or TSV | Optional: yes | Default value: VCF +@outputformat = VCF + +# Absolute path to reference genome file +# Possible values: string | Optional: no +@reference = test/tmp.hg38.fa + +# Absolute path to Ensembl transcript database file +# Possible values: string | Optional: yes (if not specified, default transcript database will be used) +@ensembl = test/tmp.db.gz + +# Absolute path to dbSNP database file +# Possible values: string | Optional: yes +##@dbsnp = absolute/path/to/dbsnp/database + +# Are variants with neither transcript nor dbSNP annotation to be included in the output? +# Possible values: TRUE or FALSE | Optional: yes | Default value: TRUE +@nonannot = TRUE + +# Are only records with PASS filter value included in the output? +# Possible values: TRUE or FALSE | Optional: yes | Default value: FALSE +@filter = FALSE + +# Types of variants to be annotated and outputted +# Possible values: ALL, SUBSTITUTION, INDEL, INSERTION, DELETION or COMPLEX | Optional: yes | Default value: ALL +@type = ALL + +# Name of compressed BED file specifying genomic regions variant annotation is restricted to +# Possible values: string | Optional: yes +##@target = . + +# Name of file providing a list of the gene identifiers variant annotation is restricted to +# Note: gene identifiers need to be given on separate lines in the file +# Possible values: string | Optional: yes +##@genelist = . + +# Name of file providing a list of transcript identifiers variant annotation is restricted to +# Note: transcript identifiers need to be given on separate lines in the file +# Possible values: string | Optional: yes +##@transcriptlist = . + +# Name of file providing a list of the dbSNP identifiers variant annotation is restricted to +# Note: dbSNP identifiers need to be given on separate lines in the file +# Possible values: string | Optional: yes +##@snplist = . + +# Is a log file to be written? +# Possible values: TRUE or FALSE | Optional: yes | Default value: FALSE +@logfile = TRUE + +# Which ontology is used for reporting consequence type? +# Possible values: CLASS, SO or BOTH | Optional: yes | Default value: BOTH +@ontology = BOTH + +# Definition of variant impact levels (reported by the IMPACT annotation flag) +# Different impact levels are separated by | and a comma-separated list of CLASS terms must be given for each level +# Possible values: string | Optional: yes +# Default value: SG,ESS,FS | SS5,IM,SL,EE,IF,NSY | SY,SS,INT,5PU,3PU +@impactdef = SG,ESS,FS | SS5,IM,SL,EE,IF,NSY | SY,SS,INT,5PU,3PU + +# Are alternative annotations outputted? +# If TRUE, the ALTANN and ALTCLASS/ALTSO annotation flags are reported instead of the ALTFLAG flag +# Possible values: TRUE or FALSE | Optional: yes | Default value: TRUE +@givealt = TRUE + +# Number of bases into the intron used as the splice site region +# Possible values: integer >= 6 | Optional: yes | Default value: 8 +@ssrange = 8 + +# Is the prefix CAVA_ added to annotation flag names in VCF output? +# Possible values: TRUE or FALSE | Optional: yes | Default value: FALSE +@prefix = FALSE