From 6c66ad1a9b129d4e5b0895940fbb6074174b11a1 Mon Sep 17 00:00:00 2001 From: averagehat Date: Wed, 10 Feb 2016 12:25:46 -0500 Subject: [PATCH 01/41] added tagreads (tags all reads, doesn't skip any) --- bioframework/tagreads.py | 186 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 bioframework/tagreads.py diff --git a/bioframework/tagreads.py b/bioframework/tagreads.py new file mode 100644 index 0000000..d717080 --- /dev/null +++ b/bioframework/tagreads.py @@ -0,0 +1,186 @@ +import argparse +import sys +import samtools +import re +import os.path +#from ngs_mapper.bam import sortbam, indexbam +#import shutil + +import log +logger = log.setup_logger('tagreads',log.get_config()) + +# Exception for when headers exist +class HeaderExists(Exception): pass + +# The next 3 tuples have to be the same length and each index in each is related the same index in each tuple +# AKA zip( IDS, PLATFORMS, ID_MAP ) should work as expected +# Read group ID list +IDS = ('Roche454', 'IonTorrent', 'MiSeq', 'Sanger') +# Valid platforms for read groups +PLATFORMS = ('L454', 'IONTORRENT', 'ILLUMINA', 'CAPILLARY') +# Read name map to ID name +ID_MAP = ( + re.compile( '[0-9A-Z]{14}' ), + re.compile( '[A-Z0-9]{5}:\d{1,}:\d{1,}' ), + re.compile( 'M[0-9]{5}:\d+:\d{9}-[A-Z0-9]{5}:\d:\d{4}:\d{4,5}:\d{4,5}' ), + re.compile( '.*' ) +) +# Read Group Template +RG_TEMPLATE = { + 'SM': None, + 'ID': None, + 'PL': None, + 'CN': None +} + +def main(): + args = parse_args() + for bam in args.bamfiles: + tag_bam( bam, args.SM, args.CN ) + +def tag_bam( bam, SM, CN ): + logger.info( "Gathering existing header for {0}".format(bam) ) + hdr = get_rg_headers( bam, SM, CN ) + tag_reads( bam, hdr ) + +def tag_read( untagged_read, tags ): + ''' + Tags a given samtools.SamRow with various tags + Does not replace existing tags + + @param untagged_read - samtools.SamRow + @param tags - List of valid samspec optional field strings(aka ['RG:Z:Value']) + + @returns a tagged read + ''' + #if untagged_read.FLAG >= 2048: + # # Skip supplementary + # logger.debug( "Skipping read {0} because it is supplementary".format(untagged_read.QNAME) ) + # return untagged_read + # Append the new tags + if untagged_read._tags and untagged_read._tags[-1] != '\t': + untagged_read._tags += '\t' + # Append the tags that do not already exist + # tag+'\t' is used to make it an exact match + tags = [tag for tag in tags if tag+'\t' not in untagged_read._tags] + untagged_read._tags += '\t'.join( tags ) + # Remove trailing tab if all the tags were duplicate + untagged_read._tags = untagged_read._tags.rstrip() + # Return the tagged read + return untagged_read + +def tag_readgroup( read ): + ''' + Tags a given read with the readgroup that the qname + belongs to depending on how it maps to ID_MAP + + @param read - samtools.SamRow + + @returns SamRow that is tagged with the appropriate read group + ''' + rg = get_rg_for_read( read ) + #logger.debug( "Tagging {0} with Read group {0}".format(read.qname,rg) ) + return tag_read( read, ['RG:Z:'+rg] ) + +def tag_reads( bam, hdr ): + ''' + Sets header of bam and tags all reads appropriately for each platform + Overwrites existing header + + @param bam - Bam file to tag reads in + @param hdr - Header string to set in the bam(needs newline at the end) + ''' + # Open the existing bam to fetch the reads to modify from + untagged_bam = samtools.view( bam ) + # Open a file to write the sam output to with the tagged reads + samf = bam.replace('.bam','.sam') + with open( samf, 'w' ) as sam: + # 
Write the hdr to the file first + sam.write( hdr ) + # Tag the reads + logger.info( "Tagging reads for {0}".format(bam) ) + for read in untagged_bam: + samrow = samtools.SamRow(read) + read = tag_readgroup( samrow ) + sam.write( str(read) + '\n' ) + # Close stdout + untagged_bam.close() + logger.info( "Finished tagging reads for {0}".format(bam) ) + logger.info( "Sorting {0}".format(bam) ) +# b = samtools.view( samf, h=True, S=True, b=True ) +# sortbam( b, bam ) +# # Close the fh +# b.close() +# # Remove temp sam file +# # maybe some day could even just use pipes all the way through :) +# os.unlink( samf ) +# logger.info( "Indexing {0}".format(bam) ) +# indexbam( bam ) + +def get_rg_for_read( aread ): + ''' Gets the read group name for the given samtools.SamRow ''' + rname = aread.QNAME + for i, p in enumerate( ID_MAP ): + if p.match( rname ): + return IDS[i] + raise ValueError( "{0} is from an unknown platform and cannot be tagged".format(rname) ) + +def get_rg_headers( bam, SM=None, CN=None ): + old_header = get_bam_header( bam ) + '\n' + + for id, pl in zip( IDS, PLATFORMS ): + # Skip headers that exist already + if 'ID:{0}\t'.format(id) in old_header: + continue + + rg = '@RG\tID:{0}\tSM:{1}\t' + if SM is None: + SM = os.path.basename(bam).replace( '.bam', '' ) + + if CN is not None: + rg += 'CN:{2}\tPL:{3}' + old_header += rg.format( id, SM, CN, pl ) + '\n' + else: + rg += 'PL:{2}' + old_header += rg.format( id, SM, pl ) + '\n' + + return old_header + +def get_bam_header( bam ): + r = samtools.view( bam, H=True ) + hdr = r.read() + r.close() + return hdr.rstrip() + +def parse_args( args=sys.argv[1:] ): + from ngs_mapper import config + conf_parser, args, config, configfile = config.get_config_argparse(args) + defaults = config['tagreads'] + + parser = argparse.ArgumentParser( + description='''Adds Sanger, MiSeq, 454 and IonTorrent read groups to a bam file. + Will tag all reads in the alignment for each of the platforms based on the read name''', + parents=[conf_parser] + ) + + parser.add_argument( + dest='bamfiles', + nargs='+', + help='Bamfile to add read groups to for platforms' + ) + + parser.add_argument( + '-SM', + dest='SM', + default=defaults['SM']['default'], + help=defaults['SM']['help'] + ) + + parser.add_argument( + '-CN', + dest='CN', + default=defaults['CN']['default'], + help=defaults['CN']['help'] + ) + + return parser.parse_args( args ) From ed0ac2b7db840cbffcbfbce90f5239691127c8cb Mon Sep 17 00:00:00 2001 From: averagehat Date: Thu, 11 Feb 2016 10:49:16 -0500 Subject: [PATCH 02/41] added tagreads --- bioframework/tagreads.py | 5 +++++ jip_modules/tag_bam.jip | 9 +++++++++ 2 files changed, 14 insertions(+) create mode 100644 jip_modules/tag_bam.jip diff --git a/bioframework/tagreads.py b/bioframework/tagreads.py index d717080..6d285ed 100644 --- a/bioframework/tagreads.py +++ b/bioframework/tagreads.py @@ -183,4 +183,9 @@ def parse_args( args=sys.argv[1:] ): help=defaults['CN']['help'] ) + parser.add_argument( + '--output', + '-o', + dest='output' + ) return parser.parse_args( args ) diff --git a/jip_modules/tag_bam.jip b/jip_modules/tag_bam.jip new file mode 100644 index 0000000..f05a72f --- /dev/null +++ b/jip_modules/tag_bam.jip @@ -0,0 +1,9 @@ +#!/usr/bin/env jip +# Usage: +# tagreads [] [-h] [--config CONFIG] [--version] [-SM ] [-CN ] [-o ] +# Options: +# -SM A tag +# -CN Another tag +# -o, --output= Output fastq. 
If input was interleave, output will be as well + +tagreads ${input|else('/dev/stdin')} ${SM|arg} ${CN|arg} ${output|else('/dev/stdout')} From 4aa14ca20ab4d85a2b53a0922fc91f421532b65e Mon Sep 17 00:00:00 2001 From: averagehat Date: Thu, 11 Feb 2016 11:35:16 -0500 Subject: [PATCH 03/41] tag_bam now sorts and indexes afterwards --- jip_modules/tag_bam.jip | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/jip_modules/tag_bam.jip b/jip_modules/tag_bam.jip index f05a72f..2857ef2 100644 --- a/jip_modules/tag_bam.jip +++ b/jip_modules/tag_bam.jip @@ -6,4 +6,7 @@ # -CN Another tag # -o, --output= Output fastq. If input was interleave, output will be as well -tagreads ${input|else('/dev/stdin')} ${SM|arg} ${CN|arg} ${output|else('/dev/stdout')} +#tagreads ${input|else('/dev/stdin')} ${SM|arg} ${CN|arg} ${output|else('/dev/stdout')} + +tagreads ${input|else('/dev/stdin')} ${SM|arg} ${CN|arg} |\ +samtools view -Sbh - | samtools sort - ${output} | samtools index ${output} From 4d46b24a0464139bcf52231e7320fc484ee00083 Mon Sep 17 00:00:00 2001 From: averagehat Date: Thu, 11 Feb 2016 11:42:49 -0500 Subject: [PATCH 04/41] added freebayes.jip --- jip_modules/freebayes.jip | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 jip_modules/freebayes.jip diff --git a/jip_modules/freebayes.jip b/jip_modules/freebayes.jip new file mode 100644 index 0000000..5159839 --- /dev/null +++ b/jip_modules/freebayes.jip @@ -0,0 +1,16 @@ + +#!/usr/bin/env jip +# +# run freebayes +# +# Usage: +# run_freebayes [] -f [-o ] +# +# Options: +# -o, --output= Output fastq. If input was interleave, output will be as well +# -f, The reference +# the input is assumed to be unpaired fastq +# Help: +# Requires a bam file () and a reference (-f) + +freebayes ${input|else("-")} ${reference|arg} ${output|arg(">")} From 4c9933917ddb30955255a28de7f9ecab5ec42d73 Mon Sep 17 00:00:00 2001 From: averagehat Date: Thu, 11 Feb 2016 11:55:20 -0500 Subject: [PATCH 05/41] tagreads now writes to separate file or stdout --- bioframework/tagreads.py | 45 +++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/bioframework/tagreads.py b/bioframework/tagreads.py index 6d285ed..1f8b15a 100644 --- a/bioframework/tagreads.py +++ b/bioframework/tagreads.py @@ -6,8 +6,8 @@ #from ngs_mapper.bam import sortbam, indexbam #import shutil -import log -logger = log.setup_logger('tagreads',log.get_config()) +#import log +#logger = log.setup_logger('tagreads',log.get_config()) # Exception for when headers exist class HeaderExists(Exception): pass @@ -36,12 +36,12 @@ class HeaderExists(Exception): pass def main(): args = parse_args() for bam in args.bamfiles: - tag_bam( bam, args.SM, args.CN ) + tag_bam( bam, args.SM, args.CN, args.output ) -def tag_bam( bam, SM, CN ): - logger.info( "Gathering existing header for {0}".format(bam) ) +def tag_bam( bam, SM, CN, output ): + #logger.info( "Gathering existing header for {0}".format(bam) ) hdr = get_rg_headers( bam, SM, CN ) - tag_reads( bam, hdr ) + tag_reads( bam, hdr, output ) def tag_read( untagged_read, tags ): ''' @@ -55,7 +55,7 @@ def tag_read( untagged_read, tags ): ''' #if untagged_read.FLAG >= 2048: # # Skip supplementary - # logger.debug( "Skipping read {0} because it is supplementary".format(untagged_read.QNAME) ) + # #logger.debug( "Skipping read {0} because it is supplementary".format(untagged_read.QNAME) ) # return untagged_read # Append the new tags if untagged_read._tags and untagged_read._tags[-1] != '\t': @@ -79,10 
+79,10 @@ def tag_readgroup( read ): @returns SamRow that is tagged with the appropriate read group ''' rg = get_rg_for_read( read ) - #logger.debug( "Tagging {0} with Read group {0}".format(read.qname,rg) ) + ##logger.debug( "Tagging {0} with Read group {0}".format(read.qname,rg) ) return tag_read( read, ['RG:Z:'+rg] ) -def tag_reads( bam, hdr ): +def tag_reads( bam, hdr, output ): ''' Sets header of bam and tags all reads appropriately for each platform Overwrites existing header @@ -93,20 +93,21 @@ def tag_reads( bam, hdr ): # Open the existing bam to fetch the reads to modify from untagged_bam = samtools.view( bam ) # Open a file to write the sam output to with the tagged reads - samf = bam.replace('.bam','.sam') - with open( samf, 'w' ) as sam: + if output: sam = open(output, 'w') + else: sam = sys.stdout # Write the hdr to the file first - sam.write( hdr ) - # Tag the reads - logger.info( "Tagging reads for {0}".format(bam) ) - for read in untagged_bam: - samrow = samtools.SamRow(read) - read = tag_readgroup( samrow ) - sam.write( str(read) + '\n' ) + sam.write( hdr ) + # Tag the reads + #logger.info( "Tagging reads for {0}".format(bam) ) + for read in untagged_bam: + samrow = samtools.SamRow(read) + read = tag_readgroup( samrow ) + sam.write( str(read) + '\n' ) # Close stdout untagged_bam.close() - logger.info( "Finished tagging reads for {0}".format(bam) ) - logger.info( "Sorting {0}".format(bam) ) + sam.close() + #logger.info( "Finished tagging reads for {0}".format(bam) ) + #logger.info( "Sorting {0}".format(bam) ) # b = samtools.view( samf, h=True, S=True, b=True ) # sortbam( b, bam ) # # Close the fh @@ -114,7 +115,7 @@ def tag_reads( bam, hdr ): # # Remove temp sam file # # maybe some day could even just use pipes all the way through :) # os.unlink( samf ) -# logger.info( "Indexing {0}".format(bam) ) +# #logger.info( "Indexing {0}".format(bam) ) # indexbam( bam ) def get_rg_for_read( aread ): @@ -189,3 +190,5 @@ def parse_args( args=sys.argv[1:] ): dest='output' ) return parser.parse_args( args ) + +if __name__ == '__main__': main() From 6244e292e2f1affc06a433932a7d777cbd70f986 Mon Sep 17 00:00:00 2001 From: averagehat Date: Thu, 11 Feb 2016 12:58:22 -0500 Subject: [PATCH 06/41] foo --- bioframework/tagreads.py | 3 ++- jip_modules/tag_bam.jip | 18 ++++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) mode change 100644 => 100755 jip_modules/tag_bam.jip diff --git a/bioframework/tagreads.py b/bioframework/tagreads.py index 1f8b15a..42fa565 100644 --- a/bioframework/tagreads.py +++ b/bioframework/tagreads.py @@ -5,7 +5,8 @@ import os.path #from ngs_mapper.bam import sortbam, indexbam #import shutil - +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE,SIG_DFL) #http://stackoverflow.com/questions/14207708/ioerror-errno-32-broken-pipe-python #import log #logger = log.setup_logger('tagreads',log.get_config()) diff --git a/jip_modules/tag_bam.jip b/jip_modules/tag_bam.jip old mode 100644 new mode 100755 index 2857ef2..1c988ae --- a/jip_modules/tag_bam.jip +++ b/jip_modules/tag_bam.jip @@ -1,12 +1,18 @@ #!/usr/bin/env jip # Usage: -# tagreads [] [-h] [--config CONFIG] [--version] [-SM ] [-CN ] [-o ] +# tag_bam [] [--config CONFIG] [--version] [--SM ] [--CN ] -o +# # Options: -# -SM A tag -# -CN Another tag +# --SM A tag +# --CN Another tag # -o, --output= Output fastq. 
If input was interleave, output will be as well -#tagreads ${input|else('/dev/stdin')} ${SM|arg} ${CN|arg} ${output|else('/dev/stdout')} -tagreads ${input|else('/dev/stdin')} ${SM|arg} ${CN|arg} |\ -samtools view -Sbh - | samtools sort - ${output} | samtools index ${output} +#%begin command bash +# +python bioframework/tagreads.py ${input|else('/dev/stdin')} ${SM|arg('-SM')} ${CN|arg('-CN')} | samtools view - -Sbh | samtools sort - ${output} +samtools index ${output}.bam + +#%end + + From eb79428696c39756c4d3316384273f566b777fbe Mon Sep 17 00:00:00 2001 From: averagehat Date: Thu, 11 Feb 2016 12:58:34 -0500 Subject: [PATCH 07/41] fixing freebayes --- jip_modules/freebayes.jip | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 jip_modules/freebayes.jip diff --git a/jip_modules/freebayes.jip b/jip_modules/freebayes.jip old mode 100644 new mode 100755 From 44fbf7439a4a3637dc894c8d9d24a7d5a31db991 Mon Sep 17 00:00:00 2001 From: averagehat Date: Thu, 11 Feb 2016 15:24:55 -0500 Subject: [PATCH 08/41] added bioframes dependency from git url --- .travis.yml | 1 + requirements.txt | 1 + 2 files changed, 2 insertions(+) create mode 100644 requirements.txt diff --git a/.travis.yml b/.travis.yml index d759796..81c8819 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,6 +23,7 @@ install: - conda config --add channels bioconda - conda install --file requirements-conda.txt - pip install -r tests/requirements-dev.txt + - pip install -r requirements.txt - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi - if [[ $TRAVIS_PYTHON_VERSION == '3.4' ]]; then pip install robotframework-python3; else pip install robotframework; fi - python setup.py install diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..676cf9f --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +git+https://github.com/VDBWRAIR/bioframes.git From 2d7c592cdc51d8c5516d0281b89aecb5046983bc Mon Sep 17 00:00:00 2001 From: averagehat Date: Thu, 11 Feb 2016 16:07:27 -0500 Subject: [PATCH 09/41] added jip consensus --- jip_modules/consensus.jip | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 jip_modules/consensus.jip diff --git a/jip_modules/consensus.jip b/jip_modules/consensus.jip new file mode 100644 index 0000000..0ceba21 --- /dev/null +++ b/jip_modules/consensus.jip @@ -0,0 +1,19 @@ +#!/usr/bin/env jip +# +# Usage: +# index_filter_quality -r -v [-o ] --minimum +# +# Options: +# -o, --output= Output files [default: stdout] +# -r, --reference= Reference fasta +# -v, --vcf= VCF file generated by freebayes + + +#%begin command python + +from bioframes import consensus +result = consensus.make_consensus("${bam}", "${ref}", "${vcf}") +with open("${output}", 'w') as out: + out.write(result) + +#%end From e21e14d8cab45b496af600b161ef616234ae5fc4 Mon Sep 17 00:00:00 2001 From: averagehat Date: Fri, 12 Feb 2016 16:11:11 -0500 Subject: [PATCH 10/41] added new implementation of consensus --- bioframework/consensus.py | 131 ++++++++++++++++++++++++++++++++++++++ jip_modules/consensus.jip | 6 +- requirements.txt | 4 +- 3 files changed, 136 insertions(+), 5 deletions(-) create mode 100644 bioframework/consensus.py diff --git a/bioframework/consensus.py b/bioframework/consensus.py new file mode 100644 index 0000000..a6b3c4d --- /dev/null +++ b/bioframework/consensus.py @@ -0,0 +1,131 @@ +from toolz.itertoolz import first, rest, peek +from operator import itemgetter as get +from toolz import compose, curry +from functools import partial +from 
toolz.dicttoolz import merge +from itertools import ifilter, imap, groupby, takewhile +from contracts import contract +from Bio import SeqIO, SeqRecord +import vcf + +############# +# Constants # +############# + +AMBIGUITY_TABLE = { 'A': 'A', 'T': 'T', 'G': 'G', 'C': 'C', 'N': 'N', + 'AC': 'M', 'AG': 'R', 'AT': 'W', 'CG': 'S', 'CT': + 'Y', 'GT': 'K', 'ACG': 'V', 'ACT': 'H', 'AGT': 'D', + 'CGT': 'B', 'ACGT': 'N' } + +MAJORITY_PERCENTAGE = 80 +MIN_DEPTH = 10 + +######################## +# Function combintaors # +######################## + +''' just a way to compose filters and mapping functions together ''' +def compose_transormer(transformer, combinator, *funcs): + return partial(transformer, reduce(combinator, funcs)) +_and = lambda f,g: lambda x: f(x) and g(x) +compose_filters = partial(compose_transormer, ifilter, _and) +compose_mappers = partial(compose_transormer, imap, compose) + + +########### +# Reducer # +########### +@contract(reference=str, muts='seq(tuple(str, str, int))' ) +def make_consensus(reference, muts): + ''' Actually builds a consensus string by recursively applying + the mutations.''' + def _do_build((accString, string, lastPos), (x, y, bigPos)): + pos = bigPos - lastPos + return (accString + (string[:pos] + y), string[pos+len(x):], bigPos+len(x)) + return reduce(_do_build, muts, ('', reference, 0)) + + +############## +# Mappers # +############## +@contract(min_depth=int, majoirty_percentage=int, rec=dict, returns='tuple(str, str, int)') +def call_base(min_depth, majoirty_percentage, rec): + '''if a base is under the min depth, it gets called as an `N`. + If the alternate base is >= `majoirty_percentage`, it will be + converted to an ambiguous base if it includes multiple bases. + Otherwise, the reference base replaces itself (gets ignored). + Returns a tuple of the form: + (referene_base, alternate_base, position) + where reference_base == alternate_base if the reference was preferred.''' + get_ambiguous = compose(AMBIGUITY_TABLE.__getitem__, ''.join, sorted) + if rec['DP'] < min_depth: alt = 'N'# (rec['ref'], 'N', rec['POS']) + elif alt_over_percent(majoirty_percentage)(rec): + alt = get_ambiguous(rec['alt']) + else: alt = rec['ref'] + return (rec['ref'], alt, rec['pos']) + +def flatten_vcf_record(rec): + return merge({ + 'alt' : rec.ALT, 'ref' : rec.REF, + 'pos' : rec.POS, 'chrom' : rec.CHROM}, + rec.INFO) + +############## +# Filters # +############## +ref_and_alt_differ = lambda x: x[0] != x[1] +alt_over_percent = lambda n: lambda rec: (n/float(100)) > rec['AO']/float(rec['DP']) + + +############## +# Group By # +############## +def group_muts_by_refs(references, muts): + '''group and sort the mutations so that they match the order of the references.''' + mut_groups = groupby(muts, get('CHROM')) + def index_of_ref(key): + return sum(1 for _ in takewhile(lambda x: x.id != key, references)) + muts_by_ref = sorted(mut_groups, key=index_of_ref) + return muts_by_ref + + + +############### +# Runner # +############### +@contract(references='SeqRecord', muts='seq(dict)', mind=int, majority=int) +def all_consensuses(references, muts, mind, majority): + ''' generates conesnsuses, including for flu and other mult-reference VCFs. + applies filters and base callers to the mutations. 
+ then builds the consensus using these calls and `make_consensus`''' + mappers = [partial(call_base, mind, majority)] + filters = [ref_and_alt_differ] + muts_by_ref = group_muts_by_refs(references, muts) + muts = imap(get(0), muts_by_ref) + def single_ref_consensus(recs, ref): + fix_recs = compose(compose_filters(filters), compose_mappers(mappers)) + ref_str = str(ref.seq) + return make_consensus(ref_str, fix_recs(recs)) + return references, imap(single_ref_consensus, muts, references) + + + +########## +# I/O # +########## + +def consensus_str(ref, consensus): + return ">{0}:Consensus\n{1}".format(ref.id, consensus) + +make_fasta_str = compose('\n'.join, + partial(imap, consensus_str), all_consensuses) + +@contract(ref_fasta=str, vcf=str, mind=int, majority=int) +def run(ref_fasta, freebayes_vcf, outpath, mind, majority): + refs = SeqIO.parse(ref_fasta, 'fasta') + with open(freebayes_vcf, 'r') as vcf_handle, open(outpath, 'w') as out: + muts = imap(flatten_vcf_record, vcf.Reader(vcf_handle)) + result = make_fasta_str(refs, muts, mind, majority) + out.write(result) + return 0 + diff --git a/jip_modules/consensus.jip b/jip_modules/consensus.jip index 0ceba21..c2ad138 100644 --- a/jip_modules/consensus.jip +++ b/jip_modules/consensus.jip @@ -11,9 +11,7 @@ #%begin command python -from bioframes import consensus -result = consensus.make_consensus("${bam}", "${ref}", "${vcf}") -with open("${output}", 'w') as out: - out.write(result) +from bioframework import consensus +consens.run("${bam}", "${ref}", "${vcf}", "${out}") #%end diff --git a/requirements.txt b/requirements.txt index 676cf9f..f0346dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,3 @@ -git+https://github.com/VDBWRAIR/bioframes.git +contracts +toolz +pyvcf From 44be3d0ae77d79bcde0060756808077497af70dc Mon Sep 17 00:00:00 2001 From: michaelpanciera Date: Mon, 22 Feb 2016 17:01:31 -0500 Subject: [PATCH 11/41] update --- bioframework/consensus.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/bioframework/consensus.py b/bioframework/consensus.py index a6b3c4d..2c365a9 100644 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -48,6 +48,28 @@ def _do_build((accString, string, lastPos), (x, y, bigPos)): ############## # Mappers # ############## + +#when there are multiple alts, zip through with each of them +#zip(*alts), character by character. compare the percentages, and +#sum the percentages for each base. (groupby, sum) pick each character (call each base) based on the given rules (using call_base). +def call_many(min_depth, majoirty_percentage, rec): + if hasattr(rec['alt'], '__iter__'): + fields = ['AO', 'DP', 'alt'] + vals = zip(*get(*fields)(rec)) + muts = map(lambda items: dict(zip(fields, items)), vals) + def extract(rec): + AO, DP, alt = get(*fields)(rec) + #length = len(alt) + #return repeat(AO, length), repeat(DP, length), alt + return repeat(AO), repeat(DP), alt + + def call_multiple(*recs): + for (ao, dp, alt) in zip(*recs): + groups = groupby(recs, get('alt')) + #map() + pass #prep and map using call_base + else: + return call_base(min_depth, majoirty_percentage, rec) @contract(min_depth=int, majoirty_percentage=int, rec=dict, returns='tuple(str, str, int)') def call_base(min_depth, majoirty_percentage, rec): '''if a base is under the min depth, it gets called as an `N`. 
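The calling rule that consensus.py converges on over this patch and the next can be summarized as: call N when coverage is below the minimum depth, call a single alternate base outright when it exceeds the majority threshold, and otherwise collapse every base seen in more than 20% of reads (counting the leftover depth as support for the reference) into an IUPAC ambiguity code. The following is a minimal standalone sketch of that rule, assuming the AMBIGUITY_TABLE defined in consensus.py and ignoring the insertion/deletion handling the real call_base_multi_alts performs; the name sketch_call_base and the hard-coded 0.8/0.2 thresholds are illustrative assumptions, not part of the committed code.

AMBIGUITY_TABLE = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T', 'N': 'N',
                   'AC': 'M', 'AG': 'R', 'AT': 'W', 'CG': 'S', 'CT': 'Y',
                   'GT': 'K', 'ACG': 'V', 'ACT': 'H', 'AGT': 'D',
                   'CGT': 'B', 'ACGT': 'N'}

def sketch_call_base(depth, alt_counts, ref, min_depth=10):
    """alt_counts maps each alternate base to its read count at one position;
    reads not accounted for by any alternate are treated as supporting ref."""
    if depth < min_depth:
        return 'N'                                # too little coverage to call anything
    for base, count in alt_counts.items():
        if count / float(depth) > 0.8:            # a clear majority alternate wins outright
            return base
    ref_count = depth - sum(alt_counts.values())  # remaining reads support the reference
    candidates = dict(alt_counts, **({ref: ref_count} if ref else {}))
    over20 = ''.join(sorted(b for b, c in candidates.items()
                            if c / float(depth) > 0.2))
    return AMBIGUITY_TABLE[over20] if over20 else ''

# e.g. sketch_call_base(100, {'G': 55, 'T': 30}, 'A') -> 'K'  (G and T both over 20%)
#      sketch_call_base(100, {'G': 85}, 'A')          -> 'G'  (majority alternate)
#      sketch_call_base(5,   {'G': 5},  'A')          -> 'N'  (under minimum depth)

The patch below extends this idea to multiple alternate alleles of differing lengths by aligning the alts column by column and applying the same per-position rule.
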
From 549a244a15730c4c188852c4727a30b977bd7904 Mon Sep 17 00:00:00 2001 From: averagehat Date: Tue, 23 Feb 2016 17:58:38 -0500 Subject: [PATCH 12/41] consensus now handles multiple alts --- bioframework/consensus.py | 113 +++++++++++++++++++++++++++++--------- 1 file changed, 87 insertions(+), 26 deletions(-) mode change 100644 => 100755 bioframework/consensus.py diff --git a/bioframework/consensus.py b/bioframework/consensus.py old mode 100644 new mode 100755 index 2c365a9..15003c7 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -2,8 +2,8 @@ from operator import itemgetter as get from toolz import compose, curry from functools import partial -from toolz.dicttoolz import merge -from itertools import ifilter, imap, groupby, takewhile +from toolz.dicttoolz import merge, dissoc +from itertools import ifilter, imap, groupby, takewhile, repeat, starmap, izip_longest from contracts import contract from Bio import SeqIO, SeqRecord import vcf @@ -35,7 +35,7 @@ def compose_transormer(transformer, combinator, *funcs): ########### # Reducer # ########### -@contract(reference=str, muts='seq(tuple(str, str, int))' ) +#@contract(reference=str, muts='seq(tuple(str, str, int))' ) def make_consensus(reference, muts): ''' Actually builds a consensus string by recursively applying the mutations.''' @@ -52,26 +52,87 @@ def _do_build((accString, string, lastPos), (x, y, bigPos)): #when there are multiple alts, zip through with each of them #zip(*alts), character by character. compare the percentages, and #sum the percentages for each base. (groupby, sum) pick each character (call each base) based on the given rules (using call_base). -def call_many(min_depth, majoirty_percentage, rec): - if hasattr(rec['alt'], '__iter__'): - fields = ['AO', 'DP', 'alt'] - vals = zip(*get(*fields)(rec)) - muts = map(lambda items: dict(zip(fields, items)), vals) - def extract(rec): - AO, DP, alt = get(*fields)(rec) - #length = len(alt) - #return repeat(AO, length), repeat(DP, length), alt - return repeat(AO), repeat(DP), alt - - def call_multiple(*recs): - for (ao, dp, alt) in zip(*recs): - groups = groupby(recs, get('alt')) - #map() - pass #prep and map using call_base +from toolz.dicttoolz import merge_with +#@contract(dp=int,ref=str,alts='dict(string,int)') +from toolz.dicttoolz import valfilter +def find_val(pred, d): + res = valfilter(pred, d) + return res if res else None +def find(pred, xs): + res = ifilter(pred, xs) + try: + return next(res) + except: + return None +#TODO: Failing Case: +# a = {'ref': u'AT', 'pos': 2, 'AO': (51771, 41537, 42398, 9342), 'alt': [u'A', +# u'TT', u'AATTG', u'AAGAA'], 'chrom': u'o', 'DP': 87288} +# bioframework.consensus.call_many(10, 80, a) + + +def call_base_multi_alts(min_depth, majority_percentage, dp, alts, ref): + #TODO: majority_percentage gets ignored, so replace constants + #TODO: behavior is undefined if sum(AO) > dp. 
+ import ipdb; ipdb.set_trace() + if dp < min_depth: #could call REF here sometimes + return 'N' + total_ao = lambda: sum(alts.values()) # avoid evaluating unless necessary + + if ref is None: # this is an insert + if total_ao()/float(dp) < .50: + return ref + # if the insert is above threshold, keep going and call the insert like a normal base + if '-' in alts: + if alts['-']/float(dp) > .50: # a deletion + return '' + dp -= alts['-'] + alts_without_insert = dissoc(alts, '-') else: - return call_base(min_depth, majoirty_percentage, rec) -@contract(min_depth=int, majoirty_percentage=int, rec=dict, returns='tuple(str, str, int)') + alts_without_insert = alts + over_depth = lambda x: lambda depth: depth/float(dp) > x + picked_alt = valfilter(over_depth(0.8), alts_without_insert) + if picked_alt: + return picked_alt.keys()[0] + #add ref so that it will be considered in creating ambiguous base + alts_with_ref = merge(alts_without_insert, ({ref : (dp - total_ao()) } if ref else {})) + over20 = valfilter(over_depth(0.2), alts_with_ref) + as_ambiguous = ''.join(sorted(over20.keys())) + # this could return a single base, (including the reference), becuase i.e. A => A in the ambiguity table + return AMBIGUITY_TABLE[as_ambiguous] + + +def call_many(min_depth, majority_percentage, _rec): + #TODO: switch to generators + if not hasattr(_rec['alt'], '__iter__'): #TODO: put this somewhere else + rec = merge(_rec, dict(alt=[_rec['alt']], AO=[_rec['AO']])) + else: rec = _rec + muts = zip(rec['AO'], rec['alt']) + ref, dp, pos = rec['ref'], rec['DP'], rec['pos'] + longest_len = max(map(lambda x: len(x[-1]), muts)) + longest_len = max(longest_len, len(ref)) + def fill_gap(r): + ao, s = r + return (ao, s + (longest_len - len(s)) * '-') + xs = map(fill_gap, muts) # fill in the shorter alts with '-'. + def merge_sum(x,y): + return x if y is None else (y if x is None else merge_with(sum, x, y)) + def seq_count(acc, ao_and_nts): + ao, nts = ao_and_nts + return map(merge_sum, acc, [{nt:ao} for nt in nts]) + # create a list of {base : count}, where the index matches the position + mut_dicts = reduce(seq_count, xs, [{}]) + base_caller = partial(call_base_multi_alts, min_depth, majority_percentage, dp) + res = map(base_caller, mut_dicts, ref) + # trim None values at the end, (which indicate deletion) + result = takewhile(bool, res) + return (ref, ''.join(result), pos) + # return call_base(min_depth, majority_percentage, rec) + + +#@contract(min_depth=int, majoirty_percentage=int, rec=dict, returns='tuple(str, str, int)') +#DEPRECATED def call_base(min_depth, majoirty_percentage, rec): +#DEPRECATED '''if a base is under the min depth, it gets called as an `N`. If the alternate base is >= `majoirty_percentage`, it will be converted to an ambiguous base if it includes multiple bases. 
@@ -79,7 +140,7 @@ def call_base(min_depth, majoirty_percentage, rec): Returns a tuple of the form: (referene_base, alternate_base, position) where reference_base == alternate_base if the reference was preferred.''' - get_ambiguous = compose(AMBIGUITY_TABLE.__getitem__, ''.join, sorted) + get_ambiguous = compose(lambda x: AMBIGUITY_TABLE.get(x, x), ''.join, sorted) if rec['DP'] < min_depth: alt = 'N'# (rec['ref'], 'N', rec['POS']) elif alt_over_percent(majoirty_percentage)(rec): alt = get_ambiguous(rec['alt']) @@ -96,7 +157,7 @@ def flatten_vcf_record(rec): # Filters # ############## ref_and_alt_differ = lambda x: x[0] != x[1] -alt_over_percent = lambda n: lambda rec: (n/float(100)) > rec['AO']/float(rec['DP']) +alt_over_percent = lambda n: lambda rec: rec['AO']/float(rec['DP']) > (n/float(100)) ############## @@ -115,12 +176,12 @@ def index_of_ref(key): ############### # Runner # ############### -@contract(references='SeqRecord', muts='seq(dict)', mind=int, majority=int) +#@contract(references='SeqRecord', muts='seq(dict)', mind=int, majority=int) def all_consensuses(references, muts, mind, majority): ''' generates conesnsuses, including for flu and other mult-reference VCFs. applies filters and base callers to the mutations. then builds the consensus using these calls and `make_consensus`''' - mappers = [partial(call_base, mind, majority)] + mappers = [partial(call_many, mind, majority)] filters = [ref_and_alt_differ] muts_by_ref = group_muts_by_refs(references, muts) muts = imap(get(0), muts_by_ref) @@ -142,7 +203,7 @@ def consensus_str(ref, consensus): make_fasta_str = compose('\n'.join, partial(imap, consensus_str), all_consensuses) -@contract(ref_fasta=str, vcf=str, mind=int, majority=int) +#@contract(ref_fasta=str, vcf=str, mind=int, majority=int) def run(ref_fasta, freebayes_vcf, outpath, mind, majority): refs = SeqIO.parse(ref_fasta, 'fasta') with open(freebayes_vcf, 'r') as vcf_handle, open(outpath, 'w') as out: From aefe452d01ebe8e7892c3a51fa638b03ca3b653f Mon Sep 17 00:00:00 2001 From: Panciera Date: Wed, 24 Feb 2016 13:02:00 -0500 Subject: [PATCH 13/41] various fixes --- bioframework/consensus.py | 84 ++++++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 33 deletions(-) diff --git a/bioframework/consensus.py b/bioframework/consensus.py index 15003c7..184668d 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -1,10 +1,26 @@ +""" +Usage: + consensus --ref --vcf [--mind ] [--majority ] [-o ] + +Options: + --ref= Reference fasta file + --vcf= VCF output + --majority= Percentage required [default: 80] + --mind= minimum depth to call base non-N [default: 10] + --output,-o= output file [default: ] +""" + + from toolz.itertoolz import first, rest, peek from operator import itemgetter as get from toolz import compose, curry from functools import partial -from toolz.dicttoolz import merge, dissoc +from toolz.dicttoolz import merge, dissoc, merge_with, valfilter from itertools import ifilter, imap, groupby, takewhile, repeat, starmap, izip_longest -from contracts import contract +#from contracts import contract +import os, sys +from docopt import docopt +from schema import Schema, Use from Bio import SeqIO, SeqRecord import vcf @@ -42,38 +58,27 @@ def make_consensus(reference, muts): def _do_build((accString, string, lastPos), (x, y, bigPos)): pos = bigPos - lastPos return (accString + (string[:pos] + y), string[pos+len(x):], bigPos+len(x)) - return reduce(_do_build, muts, ('', reference, 0)) + return reduce(_do_build, muts, ('', reference, 
0))[0] ############## # Mappers # ############## -#when there are multiple alts, zip through with each of them -#zip(*alts), character by character. compare the percentages, and -#sum the percentages for each base. (groupby, sum) pick each character (call each base) based on the given rules (using call_base). -from toolz.dicttoolz import merge_with -#@contract(dp=int,ref=str,alts='dict(string,int)') -from toolz.dicttoolz import valfilter -def find_val(pred, d): - res = valfilter(pred, d) - return res if res else None -def find(pred, xs): - res = ifilter(pred, xs) - try: - return next(res) - except: - return None #TODO: Failing Case: # a = {'ref': u'AT', 'pos': 2, 'AO': (51771, 41537, 42398, 9342), 'alt': [u'A', # u'TT', u'AATTG', u'AAGAA'], 'chrom': u'o', 'DP': 87288} # bioframework.consensus.call_many(10, 80, a) - +#@contract(dp=int,ref=str,alts='dict(string,int)') def call_base_multi_alts(min_depth, majority_percentage, dp, alts, ref): + """when there are multiple alts, zip through with each of them + zip(*alts), character by character. compare the percentages, and + sum the percentages for each base. (groupby, sum) pick each character + (call each base) based on the given rules (using call_base).""" #TODO: majority_percentage gets ignored, so replace constants #TODO: behavior is undefined if sum(AO) > dp. - import ipdb; ipdb.set_trace() + #import ipdb; ipdb.set_trace() if dp < min_depth: #could call REF here sometimes return 'N' total_ao = lambda: sum(alts.values()) # avoid evaluating unless necessary @@ -112,7 +117,7 @@ def call_many(min_depth, majority_percentage, _rec): longest_len = max(longest_len, len(ref)) def fill_gap(r): ao, s = r - return (ao, s + (longest_len - len(s)) * '-') + return (ao, str(s) + (longest_len - len(s)) * '-') xs = map(fill_gap, muts) # fill in the shorter alts with '-'. 
def merge_sum(x,y): return x if y is None else (y if x is None else merge_with(sum, x, y)) @@ -126,7 +131,6 @@ def seq_count(acc, ao_and_nts): # trim None values at the end, (which indicate deletion) result = takewhile(bool, res) return (ref, ''.join(result), pos) - # return call_base(min_depth, majority_percentage, rec) #@contract(min_depth=int, majoirty_percentage=int, rec=dict, returns='tuple(str, str, int)') @@ -165,7 +169,7 @@ def flatten_vcf_record(rec): ############## def group_muts_by_refs(references, muts): '''group and sort the mutations so that they match the order of the references.''' - mut_groups = groupby(muts, get('CHROM')) + mut_groups = groupby(muts, get('chrom')) def index_of_ref(key): return sum(1 for _ in takewhile(lambda x: x.id != key, references)) muts_by_ref = sorted(mut_groups, key=index_of_ref) @@ -184,31 +188,45 @@ def all_consensuses(references, muts, mind, majority): mappers = [partial(call_many, mind, majority)] filters = [ref_and_alt_differ] muts_by_ref = group_muts_by_refs(references, muts) - muts = imap(get(0), muts_by_ref) + muts = map(compose(list, get(1)), muts_by_ref) def single_ref_consensus(recs, ref): - fix_recs = compose(compose_filters(filters), compose_mappers(mappers)) + fix_recs = compose(compose_filters(*filters), compose_mappers(*mappers)) ref_str = str(ref.seq) return make_consensus(ref_str, fix_recs(recs)) return references, imap(single_ref_consensus, muts, references) - ########## # I/O # ########## - def consensus_str(ref, consensus): return ">{0}:Consensus\n{1}".format(ref.id, consensus) -make_fasta_str = compose('\n'.join, - partial(imap, consensus_str), all_consensuses) #@contract(ref_fasta=str, vcf=str, mind=int, majority=int) -def run(ref_fasta, freebayes_vcf, outpath, mind, majority): +def run(ref_fasta, freebayes_vcf, outfile, mind, majority): refs = SeqIO.parse(ref_fasta, 'fasta') - with open(freebayes_vcf, 'r') as vcf_handle, open(outpath, 'w') as out: + with open(freebayes_vcf, 'r') as vcf_handle: muts = imap(flatten_vcf_record, vcf.Reader(vcf_handle)) - result = make_fasta_str(refs, muts, mind, majority) - out.write(result) + refs, muts = list(refs), list(muts) + refs, seqs = all_consensuses(refs, muts, mind, majority) + strings = imap(consensus_str, refs, seqs) + result = '\n'.join(strings) + outfile.write(result) + outfile.close() return 0 +def main(): + scheme = Schema( + { '--vcf' : os.path.isfile, + '--ref' : os.path.isfile, + '--majority' : Use(int), + '--mind' : Use(int), + '--output' : Use(lambda x: sys.stdout if not x else open(x, 'w'))}) + raw_args = docopt(__doc__, version='Version 1.0') + args = scheme.validate(raw_args) + run(args['--ref'], args['--vcf'], args['--output'], + args['--mind'], args['--output']) + +if __name__ == '__main__': + main() From e36a4c1720593ec30452ed578770561d1baad030 Mon Sep 17 00:00:00 2001 From: Panciera Date: Wed, 24 Feb 2016 13:22:58 -0500 Subject: [PATCH 14/41] refactoring to simplify and remove deprecated funcs --- bioframework/consensus.py | 56 ++++++--------------------------------- 1 file changed, 8 insertions(+), 48 deletions(-) diff --git a/bioframework/consensus.py b/bioframework/consensus.py index 184668d..cd2c060 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -9,8 +9,6 @@ --mind= minimum depth to call base non-N [default: 10] --output,-o= output file [default: ] """ - - from toolz.itertoolz import first, rest, peek from operator import itemgetter as get from toolz import compose, curry @@ -36,18 +34,6 @@ MAJORITY_PERCENTAGE = 80 MIN_DEPTH = 10 
-######################## -# Function combintaors # -######################## - -''' just a way to compose filters and mapping functions together ''' -def compose_transormer(transformer, combinator, *funcs): - return partial(transformer, reduce(combinator, funcs)) -_and = lambda f,g: lambda x: f(x) and g(x) -compose_filters = partial(compose_transormer, ifilter, _and) -compose_mappers = partial(compose_transormer, imap, compose) - - ########### # Reducer # ########### @@ -132,38 +118,12 @@ def seq_count(acc, ao_and_nts): result = takewhile(bool, res) return (ref, ''.join(result), pos) - -#@contract(min_depth=int, majoirty_percentage=int, rec=dict, returns='tuple(str, str, int)') -#DEPRECATED -def call_base(min_depth, majoirty_percentage, rec): -#DEPRECATED - '''if a base is under the min depth, it gets called as an `N`. - If the alternate base is >= `majoirty_percentage`, it will be - converted to an ambiguous base if it includes multiple bases. - Otherwise, the reference base replaces itself (gets ignored). - Returns a tuple of the form: - (referene_base, alternate_base, position) - where reference_base == alternate_base if the reference was preferred.''' - get_ambiguous = compose(lambda x: AMBIGUITY_TABLE.get(x, x), ''.join, sorted) - if rec['DP'] < min_depth: alt = 'N'# (rec['ref'], 'N', rec['POS']) - elif alt_over_percent(majoirty_percentage)(rec): - alt = get_ambiguous(rec['alt']) - else: alt = rec['ref'] - return (rec['ref'], alt, rec['pos']) - def flatten_vcf_record(rec): return merge({ 'alt' : rec.ALT, 'ref' : rec.REF, 'pos' : rec.POS, 'chrom' : rec.CHROM}, rec.INFO) -############## -# Filters # -############## -ref_and_alt_differ = lambda x: x[0] != x[1] -alt_over_percent = lambda n: lambda rec: rec['AO']/float(rec['DP']) > (n/float(100)) - - ############## # Group By # ############## @@ -180,20 +140,20 @@ def index_of_ref(key): ############### # Runner # ############### + #@contract(references='SeqRecord', muts='seq(dict)', mind=int, majority=int) def all_consensuses(references, muts, mind, majority): ''' generates conesnsuses, including for flu and other mult-reference VCFs. applies filters and base callers to the mutations. 
then builds the consensus using these calls and `make_consensus`''' - mappers = [partial(call_many, mind, majority)] - filters = [ref_and_alt_differ] muts_by_ref = group_muts_by_refs(references, muts) - muts = map(compose(list, get(1)), muts_by_ref) - def single_ref_consensus(recs, ref): - fix_recs = compose(compose_filters(*filters), compose_mappers(*mappers)) - ref_str = str(ref.seq) - return make_consensus(ref_str, fix_recs(recs)) - return references, imap(single_ref_consensus, muts, references) + mut_groups = map(compose(list, get(1)), muts_by_ref) + def single_consensus(muts, ref): + the_muts = map(partial(call_many, mind, majority), muts) + ref_and_alt_differ = lambda x: x[0] != x[1] + real_muts = filter(ref_and_alt_differ, the_muts) + return make_consensus(str(ref.seq), real_muts) + return references, imap(single_consensus, mut_groups, references) ########## From 46f2df9cb23951e771a37a3b27b6adcb6aad0cc8 Mon Sep 17 00:00:00 2001 From: Panciera Date: Fri, 26 Feb 2016 22:20:18 -0500 Subject: [PATCH 15/41] added tests,refactored vcf dict flattening --- bioframework/consensus.py | 10 +++---- tests/test_consensus.py | 62 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 5 deletions(-) create mode 100644 tests/test_consensus.py diff --git a/bioframework/consensus.py b/bioframework/consensus.py index cd2c060..ce433e0 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -92,11 +92,8 @@ def call_base_multi_alts(min_depth, majority_percentage, dp, alts, ref): return AMBIGUITY_TABLE[as_ambiguous] -def call_many(min_depth, majority_percentage, _rec): +def call_many(min_depth, majority_percentage, rec): #TODO: switch to generators - if not hasattr(_rec['alt'], '__iter__'): #TODO: put this somewhere else - rec = merge(_rec, dict(alt=[_rec['alt']], AO=[_rec['AO']])) - else: rec = _rec muts = zip(rec['AO'], rec['alt']) ref, dp, pos = rec['ref'], rec['DP'], rec['pos'] longest_len = max(map(lambda x: len(x[-1]), muts)) @@ -119,10 +116,13 @@ def seq_count(acc, ao_and_nts): return (ref, ''.join(result), pos) def flatten_vcf_record(rec): - return merge({ + _rec = merge({ 'alt' : rec.ALT, 'ref' : rec.REF, 'pos' : rec.POS, 'chrom' : rec.CHROM}, rec.INFO) + if not hasattr(_rec['alt'], '__iter__'): #TODO: put this somewhere else + return merge(_rec, dict(alt=[_rec['alt']], AO=[_rec['AO']])) + else: return _rec ############## # Group By # diff --git a/tests/test_consensus.py b/tests/test_consensus.py new file mode 100644 index 0000000..5f78e67 --- /dev/null +++ b/tests/test_consensus.py @@ -0,0 +1,62 @@ +from biottest.biohypothesis import ref_with_vcf_dicts_strategy_factory +from hypothesis import given +from hypothesis import strategies as st +from hypothesis import given, assume +from bioframework.consensus import call_many, all_consensuses +import unittest + +simple_vcf_dict_strategy = st.tuples(st.text(string.ascii_letters), + st.integers(min_value=1), + st.text(string.ascii_letters)) \ + .flatmap(lambda tup: + vcf_dict_strategy_factory(*tup)) +pos_int = st.integers(min_value=0) +def just_ref(*args): + return next(all_consensuses(*args)[1]) +class CallBaseHypothesisTest(unittest.TestCase): + @given(simple_vcf_dict_strategy, pos_int) + def test_under_mind_is_N(self, mut, mind): + assume(mut['DP'] < mind) + result = call_many(mind, 80, mut)[1] + self.assertTrue(all(map(lambda x: x == 'N', result))) + + @given(simple_vcf_dict_strategy) + def test_ao_under_minority_is_ref(self, mut): + assume(sum(mut['AO']) / mut['DP'] < 0.2) + result = call_many(0, 80, mut)[1] + 
self.assertEquals(result == mut['ref']) + + @given(simple_vcf_dict_strategy) + def test_over_minoriy_is_alt(self, mut): + assume(sum(mut['AO']) / mut['DP'] > 0.2) + assume(len(mut['alt']) == 1) + result = call_many(0, 80, mut)[1] + self.assertEquals(result == mut['alt']) + +class ConsesusExampleTest(unittest.TestCase): + def test_single_example(self): + muts = [{ + 'pos' : 2, + 'ref' : 'CG', + 'alt' : 'TT', + 'AO' : 99, + 'DP' : 100, + 'chrom' : 'X' + }, + { + 'pos' : 6, + 'ref' : 'C', + 'alt' : ['G', 'A'], + 'AO' : [15, 120], + 'DP' : 150, + 'chrom' : 'X' + }] + ref = 'ACGTACGT' + expected = 'ATTTAAGT' + result = just_ref([ref], muts, 10, 80) + self.assertEquals(expected, result) + +class ConsensusHypothesisTest(unittest.TestCase): + from collections import Counter + + From 1894e0a92704f3992f68bb2371f745a3080c62c6 Mon Sep 17 00:00:00 2001 From: Panciera Date: Wed, 2 Mar 2016 12:47:55 -0500 Subject: [PATCH 16/41] add missing dependencies --- setup.py | 1 + tests/requirements-dev.txt | 1 + 2 files changed, 2 insertions(+) mode change 100644 => 100755 setup.py mode change 100644 => 100755 tests/requirements-dev.txt diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index 292f24b..0c97ac2 --- a/setup.py +++ b/setup.py @@ -23,6 +23,7 @@ def jip_modules(path=bioframework.JIP_PATH): install_requires = [ 'pyjip', 'toolz', + 'docopt', ], data_files=[ (join(sys.prefix,'bin'), jip_modules()), diff --git a/tests/requirements-dev.txt b/tests/requirements-dev.txt old mode 100644 new mode 100755 index d96de86..f0fa163 --- a/tests/requirements-dev.txt +++ b/tests/requirements-dev.txt @@ -4,3 +4,4 @@ python-coveralls testfixtures sh hypothesis +https://github.com/VDBWRAIR/biotest From 4633cc6f832fca1ab7449a2973341b4ea8ac129c Mon Sep 17 00:00:00 2001 From: Panciera Date: Wed, 2 Mar 2016 12:48:55 -0500 Subject: [PATCH 17/41] fixed ordering bug --- bioframework/consensus.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bioframework/consensus.py b/bioframework/consensus.py index ce433e0..7dc0387 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -131,7 +131,10 @@ def group_muts_by_refs(references, muts): '''group and sort the mutations so that they match the order of the references.''' mut_groups = groupby(muts, get('chrom')) def index_of_ref(key): - return sum(1 for _ in takewhile(lambda x: x.id != key, references)) + chrom=key[0] + assert(type(chrom) == str) + index_of_chrom = map(lambda x: x.id, references).index(chrom) + return index_of_chrom muts_by_ref = sorted(mut_groups, key=index_of_ref) return muts_by_ref From 05b04e875c2b90ff2d7ab0498cba8e08ab86cb87 Mon Sep 17 00:00:00 2001 From: Panciera Date: Wed, 2 Mar 2016 13:20:52 -0500 Subject: [PATCH 18/41] fixed groupby bug --- bioframework/consensus.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bioframework/consensus.py b/bioframework/consensus.py index 7dc0387..8f39e07 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -129,13 +129,16 @@ def flatten_vcf_record(rec): ############## def group_muts_by_refs(references, muts): '''group and sort the mutations so that they match the order of the references.''' - mut_groups = groupby(muts, get('chrom')) + #NOTE: muts will already be "sorted" in that they are grouped together in the vcf + #fix the groupby so it doesn't incidentally drain the first object of the group + unzip = lambda x: zip(*x) + chroms, groups = unzip(map(lambda kv: (kv[0], list(kv[1])), groupby(muts, get('chrom')))) def 
index_of_ref(key): chrom=key[0] assert(type(chrom) == str) index_of_chrom = map(lambda x: x.id, references).index(chrom) return index_of_chrom - muts_by_ref = sorted(mut_groups, key=index_of_ref) + _, muts_by_ref = unzip(sorted(zip(chroms, groups), key=index_of_ref)) return muts_by_ref From 8eb0c38cc6242aec8775bb66e794c1a28b7e72ea Mon Sep 17 00:00:00 2001 From: Panciera Date: Wed, 2 Mar 2016 16:32:18 -0500 Subject: [PATCH 19/41] fixed make_consensus --- bioframework/consensus.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/bioframework/consensus.py b/bioframework/consensus.py index 8f39e07..d9cc462 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -44,7 +44,8 @@ def make_consensus(reference, muts): def _do_build((accString, string, lastPos), (x, y, bigPos)): pos = bigPos - lastPos return (accString + (string[:pos] + y), string[pos+len(x):], bigPos+len(x)) - return reduce(_do_build, muts, ('', reference, 0))[0] + result, remaining, _ = reduce(_do_build, muts, ('', reference, 0)) + return result + remaining ############## @@ -89,7 +90,7 @@ def call_base_multi_alts(min_depth, majority_percentage, dp, alts, ref): over20 = valfilter(over_depth(0.2), alts_with_ref) as_ambiguous = ''.join(sorted(over20.keys())) # this could return a single base, (including the reference), becuase i.e. A => A in the ambiguity table - return AMBIGUITY_TABLE[as_ambiguous] + return AMBIGUITY_TABLE[as_ambiguous] if as_ambiguous != '' else '' def call_many(min_depth, majority_percentage, rec): @@ -153,13 +154,15 @@ def all_consensuses(references, muts, mind, majority): applies filters and base callers to the mutations. then builds the consensus using these calls and `make_consensus`''' muts_by_ref = group_muts_by_refs(references, muts) - mut_groups = map(compose(list, get(1)), muts_by_ref) + #mut_groups = map(compose(list, get(1)), muts_by_ref) + #real_muts = filter(ref_and_alt_differ, the_muts) def single_consensus(muts, ref): the_muts = map(partial(call_many, mind, majority), muts) ref_and_alt_differ = lambda x: x[0] != x[1] - real_muts = filter(ref_and_alt_differ, the_muts) + # vcf is index-starting-at-1 + real_muts = map(lambda (a,b,pos): (a,b,pos-1), filter(ref_and_alt_differ, the_muts)) return make_consensus(str(ref.seq), real_muts) - return references, imap(single_consensus, mut_groups, references) + return references, imap(single_consensus, muts_by_ref, references) ########## From 8e3062c2917b14f031d55559729878a4d5932f32 Mon Sep 17 00:00:00 2001 From: Panciera Date: Wed, 2 Mar 2016 16:32:56 -0500 Subject: [PATCH 20/41] more tests --- tests/test_consensus.py | 71 ++++++++++++++++++++++++++++++----------- 1 file changed, 53 insertions(+), 18 deletions(-) diff --git a/tests/test_consensus.py b/tests/test_consensus.py index 5f78e67..30910ce 100644 --- a/tests/test_consensus.py +++ b/tests/test_consensus.py @@ -1,14 +1,18 @@ -from biottest.biohypothesis import ref_with_vcf_dicts_strategy_factory +from __future__ import division +from biotest.biohypothesis import ref_with_vcf_dicts_strategy_factory, \ + make_seqrec, vcf_dict_strategy_factory from hypothesis import given +from fn import _ from hypothesis import strategies as st from hypothesis import given, assume -from bioframework.consensus import call_many, all_consensuses +from bioframework.consensus import call_many, all_consensuses, make_consensus +import string import unittest simple_vcf_dict_strategy = st.tuples(st.text(string.ascii_letters), st.integers(min_value=1), - st.text(string.ascii_letters)) \ 
- .flatmap(lambda tup: + st.text(alphabet='ACTGN', min_size=1, max_size=6)) \ + .flatmap(lambda tup:\ vcf_dict_strategy_factory(*tup)) pos_int = st.integers(min_value=0) def just_ref(*args): @@ -24,22 +28,41 @@ def test_under_mind_is_N(self, mut, mind): def test_ao_under_minority_is_ref(self, mut): assume(sum(mut['AO']) / mut['DP'] < 0.2) result = call_many(0, 80, mut)[1] - self.assertEquals(result == mut['ref']) + self.assertEquals(result, mut['ref']) @given(simple_vcf_dict_strategy) - def test_over_minoriy_is_alt(self, mut): - assume(sum(mut['AO']) / mut['DP'] > 0.2) + def test_over_majority_is_alt(self, mut): + #TODO: this is slow + assume(sum(mut['AO']) / mut['DP'] > 0.8) assume(len(mut['alt']) == 1) result = call_many(0, 80, mut)[1] - self.assertEquals(result == mut['alt']) + self.assertEquals(result, mut['alt'][0]) + +#Commented out because it's not actually always true, +# e.g. mut={'ref': u'AA', 'pos': 1, 'AO': [784313725491], 'alt': [u'A'], +# 'chrom': u'', 'DP': 3921568627454}) +# should result in AA +# @given(simple_vcf_dict_strategy) +# def test_over_minoriy_is_not_ref(self, mut): +# assume(sum(mut['AO']) / mut['DP'] > 0.2) +# result = call_many(0, 80, mut)[1] +# self.assertNotEquals(result, mut['ref']) class ConsesusExampleTest(unittest.TestCase): + def test_make_consensus_example(self): + muts = [('CG', 'TT', 1), + ('C', 'A', 5)] + ref = 'ACGTACGT' + expected = 'ATTTAAGT' + actual = make_consensus(ref, muts) + self.assertEquals(expected, actual) + def test_single_example(self): muts = [{ 'pos' : 2, 'ref' : 'CG', - 'alt' : 'TT', - 'AO' : 99, + 'alt' : ['TT'], + 'AO' : [99], 'DP' : 100, 'chrom' : 'X' }, @@ -51,12 +74,24 @@ def test_single_example(self): 'DP' : 150, 'chrom' : 'X' }] - ref = 'ACGTACGT' - expected = 'ATTTAAGT' - result = just_ref([ref], muts, 10, 80) - self.assertEquals(expected, result) - -class ConsensusHypothesisTest(unittest.TestCase): - from collections import Counter - + ref = make_seqrec('X', 'ACGTACGT') + expected = 'ATTTAAGT' + result = just_ref([ref], muts, 10, 80) + self.assertEquals(expected, result) +from collections import Counter +countof = lambda c: lambda x: Counter(x).get(c, 0) +#class ConsensusHypothesisTest(unittest.TestCase): +# @st.random_module +# @given(ref_with_vcf_dicts_strategy_factory()) +# def test_n_count(self, ref_and_muts): +# ref, muts = ref_and_muts +# originalNs = countof('N')(ref) +# assume(not filter(lambda x: 'N' in x, muts)) +# expectedNs = len(filter(_['DP'] < 10, muts)) + originalNs +# result = just_ref([ref], muts, 10, 80) +# self.assertEquals(countof('N')(result), expectedNs) +# +# +# +# From 33d7db7a7728d7fb594279ee21e73581345f60bc Mon Sep 17 00:00:00 2001 From: Panciera Date: Wed, 2 Mar 2016 16:40:08 -0500 Subject: [PATCH 21/41] add failing test --- tests/test_consensus.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/test_consensus.py b/tests/test_consensus.py index 30910ce..3ba6d9c 100644 --- a/tests/test_consensus.py +++ b/tests/test_consensus.py @@ -81,17 +81,17 @@ def test_single_example(self): from collections import Counter countof = lambda c: lambda x: Counter(x).get(c, 0) -#class ConsensusHypothesisTest(unittest.TestCase): -# @st.random_module -# @given(ref_with_vcf_dicts_strategy_factory()) -# def test_n_count(self, ref_and_muts): -# ref, muts = ref_and_muts -# originalNs = countof('N')(ref) -# assume(not filter(lambda x: 'N' in x, muts)) -# expectedNs = len(filter(_['DP'] < 10, muts)) + originalNs -# result = just_ref([ref], muts, 10, 80) -# 
self.assertEquals(countof('N')(result), expectedNs) -# -# -# -# +class ConsensusHypothesisTest(unittest.TestCase): + #@st.random_module + @given(ref_with_vcf_dicts_strategy_factory(), st.random_module()) + def test_n_count(self, ref_and_muts, rand): + ref, muts = ref_and_muts + originalNs = countof('N')(ref) + assume(not filter(lambda x: 'N' in x, muts)) + expectedNs = len(filter(_['DP'] < 10, muts)) + originalNs + result = just_ref([ref], muts, 10, 80) + self.assertEquals(countof('N')(result), expectedNs) + + + + From d3bfe633fb7c05bb2e831e3ca7f4ea078e4ca425 Mon Sep 17 00:00:00 2001 From: michaelpanciera Date: Thu, 3 Mar 2016 09:12:05 -0500 Subject: [PATCH 22/41] fixed git+ protocol of biotest dep. --- tests/requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/requirements-dev.txt b/tests/requirements-dev.txt index f0fa163..b139172 100755 --- a/tests/requirements-dev.txt +++ b/tests/requirements-dev.txt @@ -4,4 +4,4 @@ python-coveralls testfixtures sh hypothesis -https://github.com/VDBWRAIR/biotest +git+https://github.com/VDBWRAIR/biotest From 2faecbfaeea69ef3d8b3a291bc7359e40c87193b Mon Sep 17 00:00:00 2001 From: michaelpanciera Date: Thu, 3 Mar 2016 09:40:59 -0500 Subject: [PATCH 23/41] fixed contracts to actual pypy name --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f0346dc..dbdcd86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -contracts +pycontracts toolz pyvcf From 613e825729a4b598bc1789ee9db7f4202873ba91 Mon Sep 17 00:00:00 2001 From: michaelpanciera Date: Thu, 3 Mar 2016 18:17:51 -0500 Subject: [PATCH 24/41] 10/10 working --- bioframework/consensus.py | 61 ++++++++++++++++---------- tests/test_consensus.py | 92 +++++++++++++++++++++++++++++++++++---- 2 files changed, 120 insertions(+), 33 deletions(-) diff --git a/bioframework/consensus.py b/bioframework/consensus.py index d9cc462..05a1970 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -9,19 +9,22 @@ --mind= minimum depth to call base non-N [default: 10] --output,-o= output file [default: ] """ -from toolz.itertoolz import first, rest, peek +#stdlib from operator import itemgetter as get -from toolz import compose, curry from functools import partial -from toolz.dicttoolz import merge, dissoc, merge_with, valfilter from itertools import ifilter, imap, groupby, takewhile, repeat, starmap, izip_longest -#from contracts import contract import os, sys -from docopt import docopt -from schema import Schema, Use -from Bio import SeqIO, SeqRecord -import vcf - +from typing import Tuple, Dict, List, Iterator, Iterable, Any, Callable + +from Bio import SeqIO #done +from Bio.SeqRecord import SeqRecord #done +import vcf #done +from vcf.model import _Record +#from toolz import compose +from toolz.dicttoolz import merge, dissoc, merge_with, valfilter #todo +from docopt import docopt #ignore +from schema import Schema, Use #ignore +from contracts import contract, new_contract #can ignore ############# # Constants # ############# @@ -33,19 +36,20 @@ MAJORITY_PERCENTAGE = 80 MIN_DEPTH = 10 - +Mut = Tuple[str, str, int] ########### # Reducer # ########### -#@contract(reference=str, muts='seq(tuple(str, str, int))' ) +@contract(reference='string', muts='list(tuple(string, string, int))' ) def make_consensus(reference, muts): + # type: (str, List[Mut]) -> Tuple[str, List[Mut]] ''' Actually builds a consensus string by recursively applying the mutations.''' def 
_do_build((accString, string, lastPos), (x, y, bigPos)): pos = bigPos - lastPos return (accString + (string[:pos] + y), string[pos+len(x):], bigPos+len(x)) result, remaining, _ = reduce(_do_build, muts, ('', reference, 0)) - return result + remaining + return result + remaining, muts ############## @@ -57,15 +61,15 @@ def _do_build((accString, string, lastPos), (x, y, bigPos)): # u'TT', u'AATTG', u'AAGAA'], 'chrom': u'o', 'DP': 87288} # bioframework.consensus.call_many(10, 80, a) -#@contract(dp=int,ref=str,alts='dict(string,int)') +#@contract(min_depth='number,>=0', majority_percentage='number,>=0,<=100',dp='number,>=0', ref='string|None',alts='dict(string: number)') def call_base_multi_alts(min_depth, majority_percentage, dp, alts, ref): + # type: (int, int, int, Dict[str, int], str) -> str """when there are multiple alts, zip through with each of them zip(*alts), character by character. compare the percentages, and sum the percentages for each base. (groupby, sum) pick each character (call each base) based on the given rules (using call_base).""" #TODO: majority_percentage gets ignored, so replace constants #TODO: behavior is undefined if sum(AO) > dp. - #import ipdb; ipdb.set_trace() if dp < min_depth: #could call REF here sometimes return 'N' total_ao = lambda: sum(alts.values()) # avoid evaluating unless necessary @@ -92,8 +96,9 @@ def call_base_multi_alts(min_depth, majority_percentage, dp, alts, ref): # this could return a single base, (including the reference), becuase i.e. A => A in the ambiguity table return AMBIGUITY_TABLE[as_ambiguous] if as_ambiguous != '' else '' - +#@contract(min_depth='number,>=0', majority_percentage='number,>=0,<=100', rec='dict', returns='tuple(string, string, int)') def call_many(min_depth, majority_percentage, rec): + # type: (int, int, Dict) -> Mut #TODO: switch to generators muts = zip(rec['AO'], rec['alt']) ref, dp, pos = rec['ref'], rec['DP'], rec['pos'] @@ -110,13 +115,15 @@ def seq_count(acc, ao_and_nts): return map(merge_sum, acc, [{nt:ao} for nt in nts]) # create a list of {base : count}, where the index matches the position mut_dicts = reduce(seq_count, xs, [{}]) - base_caller = partial(call_base_multi_alts, min_depth, majority_percentage, dp) + base_caller = partial(call_base_multi_alts, min_depth, majority_percentage, dp) # type: Callable[[Dict[Any,Any], str], str] res = map(base_caller, mut_dicts, ref) # trim None values at the end, (which indicate deletion) result = takewhile(bool, res) return (ref, ''.join(result), pos) +@contract(rec='dict',returns='dict') def flatten_vcf_record(rec): + # type: (_Record) -> Dict[str, Any] _rec = merge({ 'alt' : rec.ALT, 'ref' : rec.REF, 'pos' : rec.POS, 'chrom' : rec.CHROM}, @@ -128,15 +135,20 @@ def flatten_vcf_record(rec): ############## # Group By # ############## +#NOTE: could possibly drop lists, use fn.Stream all the time, +# and write a Stream instance for contracts like: +# https://github.com/AndreaCensi/contracts/blob/831ec7a5260ceb8960540ba0cb6cc26370cf2d82/src/contracts/library/lists.py +@contract(references='list[N]($SeqRecord),N>0', muts='list(dict)',returns='tuple(list(dict))') def group_muts_by_refs(references, muts): + # type: (List[SeqRecord], List[Dict[Any, Any]]) -> Iterable[List[Dict]] '''group and sort the mutations so that they match the order of the references.''' #NOTE: muts will already be "sorted" in that they are grouped together in the vcf #fix the groupby so it doesn't incidentally drain the first object of the group unzip = lambda x: zip(*x) chroms, groups = unzip(map(lambda 
kv: (kv[0], list(kv[1])), groupby(muts, get('chrom')))) + @contract(key='tuple(string,list)') def index_of_ref(key): chrom=key[0] - assert(type(chrom) == str) index_of_chrom = map(lambda x: x.id, references).index(chrom) return index_of_chrom _, muts_by_ref = unzip(sorted(zip(chroms, groups), key=index_of_ref)) @@ -150,17 +162,17 @@ def index_of_ref(key): #@contract(references='SeqRecord', muts='seq(dict)', mind=int, majority=int) def all_consensuses(references, muts, mind, majority): + # type: (Iterable[SeqRecord], Iterable[Dict[Any,Any]], int, int) -> Tuple[List[str], Iterator[Tuple[str, List[Mut]]]] ''' generates conesnsuses, including for flu and other mult-reference VCFs. applies filters and base callers to the mutations. then builds the consensus using these calls and `make_consensus`''' muts_by_ref = group_muts_by_refs(references, muts) - #mut_groups = map(compose(list, get(1)), muts_by_ref) - #real_muts = filter(ref_and_alt_differ, the_muts) def single_consensus(muts, ref): the_muts = map(partial(call_many, mind, majority), muts) ref_and_alt_differ = lambda x: x[0] != x[1] # vcf is index-starting-at-1 - real_muts = map(lambda (a,b,pos): (a,b,pos-1), filter(ref_and_alt_differ, the_muts)) + #real_muts = map(lambda (a,b,pos): (a,b,pos-1), filter(ref_and_alt_differ, the_muts)) + real_muts = map(lambda x: (x[0], x[1], x[2] - 1), filter(ref_and_alt_differ, the_muts)) return make_consensus(str(ref.seq), real_muts) return references, imap(single_consensus, muts_by_ref, references) @@ -168,24 +180,25 @@ def single_consensus(muts, ref): ########## # I/O # ########## -def consensus_str(ref, consensus): +def consensus_str(ref, consensus): # type: (SeqRecord, str) -> str return ">{0}:Consensus\n{1}".format(ref.id, consensus) #@contract(ref_fasta=str, vcf=str, mind=int, majority=int) def run(ref_fasta, freebayes_vcf, outfile, mind, majority): + # type: (str, str, str, int, int) -> int refs = SeqIO.parse(ref_fasta, 'fasta') with open(freebayes_vcf, 'r') as vcf_handle: muts = imap(flatten_vcf_record, vcf.Reader(vcf_handle)) refs, muts = list(refs), list(muts) - refs, seqs = all_consensuses(refs, muts, mind, majority) - strings = imap(consensus_str, refs, seqs) + refs, seqs_and_muts = all_consensuses(refs, muts, mind, majority) + strings = imap(consensus_str, refs, imap(get(0), seqs_and_muts)) result = '\n'.join(strings) outfile.write(result) outfile.close() return 0 -def main(): +def main(): # type () -> None scheme = Schema( { '--vcf' : os.path.isfile, '--ref' : os.path.isfile, diff --git a/tests/test_consensus.py b/tests/test_consensus.py index 3ba6d9c..55f81f5 100644 --- a/tests/test_consensus.py +++ b/tests/test_consensus.py @@ -2,11 +2,13 @@ from biotest.biohypothesis import ref_with_vcf_dicts_strategy_factory, \ make_seqrec, vcf_dict_strategy_factory from hypothesis import given -from fn import _ +#from fn import _ from hypothesis import strategies as st from hypothesis import given, assume +from operator import itemgetter as get from bioframework.consensus import call_many, all_consensuses, make_consensus import string +import itertools import unittest simple_vcf_dict_strategy = st.tuples(st.text(string.ascii_letters), @@ -15,8 +17,10 @@ .flatmap(lambda tup:\ vcf_dict_strategy_factory(*tup)) pos_int = st.integers(min_value=0) + +#TODO: these 10, 80 for trhesh and majority_percentage should be factored out and possibly be strategies themselves def just_ref(*args): - return next(all_consensuses(*args)[1]) + return next(all_consensuses(*args)[1])[0] class 
CallBaseHypothesisTest(unittest.TestCase): @given(simple_vcf_dict_strategy, pos_int) def test_under_mind_is_N(self, mut, mind): @@ -54,7 +58,7 @@ def test_make_consensus_example(self): ('C', 'A', 5)] ref = 'ACGTACGT' expected = 'ATTTAAGT' - actual = make_consensus(ref, muts) + actual = make_consensus(ref, muts)[0] self.assertEquals(expected, actual) def test_single_example(self): @@ -78,20 +82,90 @@ def test_single_example(self): expected = 'ATTTAAGT' result = just_ref([ref], muts, 10, 80) self.assertEquals(expected, result) - +ref_with_vcf_dicts_strategy = ref_with_vcf_dicts_strategy_factory().map( + lambda (r, muts): (make_seqrec(muts[0]['chrom'], r), muts)) from collections import Counter countof = lambda c: lambda x: Counter(x).get(c, 0) -class ConsensusHypothesisTest(unittest.TestCase): - #@st.random_module - @given(ref_with_vcf_dicts_strategy_factory(), st.random_module()) +def run_cons(*args): + _, alt_and_cons = all_consensuses(*args) + cons, alts = zip(*alt_and_cons) + return cons[0], alts[0] +class ConsensusHypothesisTest(unittest.TestCase): + #ref_and_muts=(SeqRecord(seq=Seq(u'AAAAAAAAAA', IUPACAmbiguousDNA()), id=u'', name='', description='', dbxrefs=[]), [ +# {'ref': u'A', 'pos': 1, 'AO': [479, 777, 119, 604], 'alt': [u'G', u'C', u'G', u'TG'], 'chrom': u'', 'DP': 2635}, +# {'ref': u'A', 'pos': 3, 'AO': [291, 241, 583, 420], 'alt': [u'CTG', u'C', u'G', u'C'], 'chrom': u'', 'DP': 1627}]), rand=random.seed(0)) +# AssertionError: 1 != 0 + + @given(ref_with_vcf_dicts_strategy, st.random_module()) def test_n_count(self, ref_and_muts, rand): ref, muts = ref_and_muts originalNs = countof('N')(ref) - assume(not filter(lambda x: 'N' in x, muts)) - expectedNs = len(filter(_['DP'] < 10, muts)) + originalNs + alts = map(get('alt'), muts) + assume(not any(map(lambda x: 'N' in x, itertools.chain(*alts)))) + # needed because ACGT -> N + assume(not filter(lambda x: len(x) > 3, alts)) + expectedNs = len(filter(lambda x: x['DP'] < 10, muts)) + originalNs result = just_ref([ref], muts, 10, 80) self.assertEquals(countof('N')(result), expectedNs) + @given(ref_with_vcf_dicts_strategy) + def test_less_or_equal_length_when_no_inserts(self, ref_and_muts): + ref, muts = ref_and_muts + cons, alts = run_cons([ref], muts, 10, 80) + assume(not any(map(lambda x: len(x[0]) < len(x[1]), alts))) + self.assertGreaterEqual(len(ref), len(cons)) + + @given(ref_with_vcf_dicts_strategy) + def assume_greater_or_equal_length_when_no_deletions(self, ref_and_muts): + ref, muts = ref_and_muts + def has_deletion(mut): + filter(lambda x: len(x) < mut['ref'], mut['alt']) + assume(not any(map(has_deletion, muts))) + result = just_ref([ref], muts, 10, 80) + self.assertLesserEqual(len(ref), len(result)) + +class ConsensusMetamorphicTests(unittest.TestCase): + '''paper on metamorphic testing: http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-10-24 ''' + @given(ref_with_vcf_dicts_strategy, st.integers(), st.integers()) + def test_more_or_equal_ns_with_lower_threshold(self, ref_and_muts, n1, n2): + ref, muts = ref_and_muts + assume(n1 < n2) + cons1 = just_ref([ref], muts, n1, 80) + cons2 = just_ref([ref], muts, n2, 80) + nsCount1, nsCount2 = countof('N')(cons1), countof('N')(cons2) + self.assertLessEqual(nsCount1, nsCount2) + + @given(ref_with_vcf_dicts_strategy) + def test_consensus_from_consensus_contains_more_alts(self, ref_and_muts): + ref, muts = ref_and_muts + assume(not any(map(lambda x: len(x['alt']) > 1, muts))) + n1 = 10 + cons1, alts = run_cons([ref], muts, n1, 80) + assume(not any(map(lambda x: 
len(x[0]) > len(x[1]), alts))) + cons2, _ = run_cons([make_seqrec(muts[0]['chrom'], cons1)], muts, n1, 80) + picked_alts = map(get(1), alts) + altCounts1 = sum(map(lambda f: f(cons1), map(countof, picked_alts))) + altCounts2 = sum(map(lambda f: f(cons2), map(countof, picked_alts))) + self.assertLessEqual(altCounts1, altCounts2) + #NOTE: the below test appears to be meaningless, + # ass the values are always equal + percent = st.integers(min_value=0, max_value=100) + @given(ref_with_vcf_dicts_strategy, percent, percent) + def test_lower_majority_required_contains_more_alts(self, ref_and_muts, p1, p2): + ref, muts = ref_and_muts + assume(p1 < p2) + assume(not any(map(lambda x: len(x['alt']) > 1, muts))) + n1 = 10 + cons1, alts = run_cons([ref], muts, n1, p1) + assume(not any(map(lambda x: len(x[0]) > len(x[1]), alts))) + cons2, _ = run_cons([ref], muts, n1, p2) + picked_alts = map(get(1), alts) + altCounts1 = sum(map(lambda f: f(cons1), map(countof, picked_alts))) + altCounts2 = sum(map(lambda f: f(cons2), map(countof, picked_alts))) + self.assertLessEqual(altCounts1, altCounts2) + #TODO: could test for exceptions when e.g., + # 1. POS is repeated + # 2. POS is greater than the size of sequence From 0de7577847ee06821a34874a575ecfe499516eaa Mon Sep 17 00:00:00 2001 From: michaelpanciera Date: Thu, 3 Mar 2016 21:20:21 -0500 Subject: [PATCH 25/41] baic types and examples --- MyPy/Bio/SeqIO.pyi | 5 +++ MyPy/Bio/SeqRecord.py | 6 ++++ MyPy/Bio/__init__.py | 0 MyPy/example.py | 44 ++++++++++++++++++++++++ MyPy/toolz/__init__.py | 0 MyPy/toolz/dicttoolz.py | 32 +++++++++++++++++ MyPy/types.py | 76 +++++++++++++++++++++++++++++++++++++++++ MyPy/vcf/__init__.py | 10 ++++++ MyPy/vcf/model.py | 3 ++ 9 files changed, 176 insertions(+) create mode 100644 MyPy/Bio/SeqIO.pyi create mode 100644 MyPy/Bio/SeqRecord.py create mode 100644 MyPy/Bio/__init__.py create mode 100644 MyPy/example.py create mode 100644 MyPy/toolz/__init__.py create mode 100644 MyPy/toolz/dicttoolz.py create mode 100644 MyPy/types.py create mode 100644 MyPy/vcf/__init__.py create mode 100644 MyPy/vcf/model.py diff --git a/MyPy/Bio/SeqIO.pyi b/MyPy/Bio/SeqIO.pyi new file mode 100644 index 0000000..baaf8a9 --- /dev/null +++ b/MyPy/Bio/SeqIO.pyi @@ -0,0 +1,5 @@ +from Bio.SeqRecord import SeqRecord +from typing import Generator, Any, Iterator + +def parse(*anything): # type: (*Any) -> Iterator[SeqRecord] + pass diff --git a/MyPy/Bio/SeqRecord.py b/MyPy/Bio/SeqRecord.py new file mode 100644 index 0000000..8bdc6c6 --- /dev/null +++ b/MyPy/Bio/SeqRecord.py @@ -0,0 +1,6 @@ +# from Bio.SeqIO import SeqIO +from typing import NamedTuple +class Stringable(object): + def __str__(self): # type: () -> str + pass +SeqRecord = NamedTuple('SeqRecord', [('id', str), ('seq', Stringable)]) diff --git a/MyPy/Bio/__init__.py b/MyPy/Bio/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/MyPy/example.py b/MyPy/example.py new file mode 100644 index 0000000..4f9bd9f --- /dev/null +++ b/MyPy/example.py @@ -0,0 +1,44 @@ +from typing import List, Dict, Generator, Iterator, Iterable, Tuple +from Bio import SeqIO +from itertools import imap +from Bio.SeqRecord import SeqRecord +def test_long(): # type: () -> int + return 11999999L +def test_seqIO_map_fails(s): # type: (str) -> List[SeqRecord] + return map(lambda x: x.id, SeqIO.parse(s)) + +#def test_seqIO_map_fails2(s): # type: (str) -> Iterator[SeqRecord] +# return map(lambda x: x.id, SeqIO.parse(s)) +def test_seqIO_map_passes(s): # type: (str) -> Iterable[str] + return imap(lambda x: x.id, 
SeqIO.parse(s)) + +def test_seqIO(s): # type: (str) -> Iterator[SeqRecord] + return SeqIO.parse(s) +def test_list_seqIO(s): # type: (str) -> List[SeqRecord] + return list(SeqIO.parse(s)) +def test_seqIO_fails(s): # type: (str) -> List[str] + return SeqIO.parse(s) +def test_should_pass(s): # type: (SeqRecord) -> str + return s.id +def test_should_fail(s): # type: (SeqRecord) -> int + return s.id +#def test_should_fail(): # type: () -> List[SeqRecord] +# return 3 + +#a = test_should_fail() +def test_ordered_dict(od): # type: (Dict[str,int]) -> Dict[str,int] + return 1 #type error 1 +# +#a = test_ordered_dict(1) #type error 2 +# +#def test_me(): +# a = test_ordered_dict(1) # type error 3 is not reported + +####def test_ordered_dict(od: typing.Dict[str,int]) -> typing.Dict[str,int]: +#### return 1 #type error 1 +#### +####a = test_ordered_dict(1) #type error 2 +#### +####def test_me(): +#### a = test_ordered_dict(1) # type error 3 is not reported +### diff --git a/MyPy/toolz/__init__.py b/MyPy/toolz/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/MyPy/toolz/dicttoolz.py b/MyPy/toolz/dicttoolz.py new file mode 100644 index 0000000..6f7178e --- /dev/null +++ b/MyPy/toolz/dicttoolz.py @@ -0,0 +1,32 @@ +from typing import Dict, Any, Callable, TypeVar +K = TypeVar('K') +V = TypeVar('V') +V2 = TypeVar('V2') +V3 = TypeVar('V3') +def merge(d1, d2): # type: (Dict[K,V], Dict[K,V]) -> Dict[K,V] + pass + +def dissoc(d, k): # type: (Dict[K,V], K) -> Dict[K,V] + pass + +def merge_with(f, d1, d2): # type: (Callable[[V,V2], V3], Dict[K,V], Dict[K,V2]) -> Dict[K,V3] + pass + +def valfilter(f, d): # type: (Callable[[V], bool], Dict[K,V]) -> Dict[K,V] + pass + + + +#from typing import Dict, Any, Callable, TypeVar +#T = TypeVar('T') +#def merge(d1, d2): # type: (Dict[Any,Any], Dict[Any,Any]) -> Dict[Any,Any] +# pass +# +#def dissoc(d, k): # type: (Dict[Any,Any], Any) -> Dict[Any,Any] +# pass +# +#def merge_with(f, d1, d2): # type: (Callable, Dict[Any,Any], Dict[Any,Any]) -> Dict[Any,Any] +# pass +# +#def valfilter(f, d): # type: (Callable, Dict[Any,Any]) -> Dict[Any,Any] +# pass diff --git a/MyPy/types.py b/MyPy/types.py new file mode 100644 index 0000000..9c40861 --- /dev/null +++ b/MyPy/types.py @@ -0,0 +1,76 @@ +from hypothesis import strategies as st +from typing import Dict, Tuple, List, Iterator, Set, Union, Optional, TypingMeta, NamedTuple +import re +import operator +from functools import partial +import string +from collections import namedtuple, OrderedDict +compose = lambda f,g: lambda *x: f(g(*x)) +''' +support: +- [x] NamedTuple +- [ ] Automatic function arguments +could also say, "given a function, generate random return values that it might give" because functions are also annotated with return values +''' + +# Just an exmaple of a named tuple +VCFRow = NamedTuple("VCFRow", + [('ref', str), + ('AO', List[int]), + ('DP', int), + ('chrom',str), + ('pos', int), + ('alt', List[str])]) + +primitives = { + str : st.text(), + int : st.integers(), + bool : st.booleans(), + float : st.floats(), + type(None) : st.none(), + unicode : st.characters(), + bytes : st.binary() # this is weird because str == bytes in py2 +} # missing: fractions, decimal + +#TODO: add Iterable, handle Sequence, etc. +def resolve(x): # type: (TypingMeta) -> hypothesis.strategies.SearchStrategy + if x in primitives: + strat = primitives[x] + elif hasattr(x, '_fields'): + # NamedTuple isn't a type, so this can't be a subclass check + try: + #Only way I know how to extract the name so it's pretty... 
+ name = re.compile("([^\.]+)'>$").search(str(x)).groups()[0] + except: + name = str(x) + fts = OrderedDict(x._field_types) + nt = namedtuple(name, fts.keys()) + vals = map(resolve, fts.values()) + strat = st.tuples(*vals).map(lambda x: nt(*x)) + elif issubclass(x, Dict): + strat = st.dictionaries(*map(resolve, x.__parameters__)) + elif issubclass(x, Tuple): + strat = st.tuples(*map(resolve, x.__tuple_params__)) + elif issubclass(x, Union): + strat = operator.ior(*map(resolve, x.__union_params__)) + elif issubclass(x, Optional): + # Optional[X] is equivalent to Union[X, type(None)]. second param is always Nonetype. + value = x.__union_params__[0] + strat = (resolve(value) | st.none()) + else: # a list-type-ish + collections = { + Iterator : lambda x: st.lists(x).map(iter), + List : st.lists, + Set : st.sets + } #TODO: missing: Iterable , etc. + # For some reason List[T] not a subclass of List: issubclass(x, List) == False. So do these hijinks + params = x.__parameters__ + assert len(params) == 1, "Wrong type %s, not a list-like" % x + matches = filter(lambda k: k == x.__origin__, collections.keys()) + assert len(matches) == 1, "Should have exactly one match. %s matched with %s" % (x, matches) + collection_strat = collections[matches[0]] + strat = collection_strat(resolve(params[0])) + return strat +# see https://docs.python.org/3/library/typing.html +# not Generics +# not Callables diff --git a/MyPy/vcf/__init__.py b/MyPy/vcf/__init__.py new file mode 100644 index 0000000..fbb79f9 --- /dev/null +++ b/MyPy/vcf/__init__.py @@ -0,0 +1,10 @@ +from typing import Union, Dict, List, NamedTuple, Iterator + +#fields = [("ALT", Union[str, List[str]]), ("REF", str), ("POS", int), ("CHROM", str), ("INFO", Dict[str, Union[int, List[int]]])] +# +#VCFRecord = NamedTuple('VCFRecord', fields) + +VCFRecord = NamedTuple('VCFRecord', [("ALT", Union[str, List[str]]), ("REF", str), ("POS", int), ("CHROM", str), ("INFO", Dict[str, Union[int, List[int]]])] +) +def Reader(s): # type: (str) -> Iterator[VCFRecord] + pass diff --git a/MyPy/vcf/model.py b/MyPy/vcf/model.py new file mode 100644 index 0000000..02b1e9a --- /dev/null +++ b/MyPy/vcf/model.py @@ -0,0 +1,3 @@ +from typing import Union, Dict, List, NamedTuple, Iterator +_Record = NamedTuple('VCFRecord', [("ALT", Union[str, List[str]]), ("REF", str), ("POS", int), ("CHROM", str), ("INFO", Dict[str, Union[int, List[int]]])] +) From bf1b67bbd4f7f889b1267883119f014187340b39 Mon Sep 17 00:00:00 2001 From: michaelpanciera Date: Thu, 3 Mar 2016 22:13:49 -0500 Subject: [PATCH 26/41] typechecking works. 
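The checks rely on PEP 484 comment-style annotations, which stay Python 2 compatible. A minimal illustration of the form used throughout consensus.py (longest_alt is an invented helper, not part of the codebase):

    from typing import List, Tuple

    Mut = Tuple[str, str, int]  # (reference, called bases, position), as in consensus.py

    def longest_alt(muts): # type: (List[Mut]) -> int
        # mypy --py2 reads the comment annotation and flags callers that
        # pass anything other than a list of (str, str, int) triples.
        return max([len(called) for _, called, _ in muts] or [0])
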
--- MyPy/vcf/model.py | 3 -- bioframework/consensus.py | 63 ++++++++++++++----------- {MyPy => mypy}/Bio/SeqIO.pyi | 0 {MyPy => mypy}/Bio/SeqRecord.py | 0 {MyPy => mypy}/Bio/__init__.py | 0 {MyPy/toolz => mypy}/__init__.py | 0 {MyPy => mypy}/example.py | 0 mypy/out/docopt.pyi | 76 +++++++++++++++++++++++++++++++ mypy/out/hypothesis/__init__.pyi | 8 ++++ mypy/out/schema.pyi | 31 +++++++++++++ mypy/toolz/__init__.py | 0 {MyPy => mypy}/toolz/dicttoolz.py | 0 {MyPy => mypy}/types.py | 0 {MyPy => mypy}/vcf/__init__.py | 5 +- mypy/vcf/model.py | 3 ++ 15 files changed, 157 insertions(+), 32 deletions(-) delete mode 100644 MyPy/vcf/model.py rename {MyPy => mypy}/Bio/SeqIO.pyi (100%) rename {MyPy => mypy}/Bio/SeqRecord.py (100%) rename {MyPy => mypy}/Bio/__init__.py (100%) rename {MyPy/toolz => mypy}/__init__.py (100%) rename {MyPy => mypy}/example.py (100%) create mode 100644 mypy/out/docopt.pyi create mode 100644 mypy/out/hypothesis/__init__.pyi create mode 100644 mypy/out/schema.pyi create mode 100644 mypy/toolz/__init__.py rename {MyPy => mypy}/toolz/dicttoolz.py (100%) rename {MyPy => mypy}/types.py (100%) rename {MyPy => mypy}/vcf/__init__.py (69%) create mode 100644 mypy/vcf/model.py diff --git a/MyPy/vcf/model.py b/MyPy/vcf/model.py deleted file mode 100644 index 02b1e9a..0000000 --- a/MyPy/vcf/model.py +++ /dev/null @@ -1,3 +0,0 @@ -from typing import Union, Dict, List, NamedTuple, Iterator -_Record = NamedTuple('VCFRecord', [("ALT", Union[str, List[str]]), ("REF", str), ("POS", int), ("CHROM", str), ("INFO", Dict[str, Union[int, List[int]]])] -) diff --git a/bioframework/consensus.py b/bioframework/consensus.py index 05a1970..8c0da19 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -14,7 +14,7 @@ from functools import partial from itertools import ifilter, imap, groupby, takewhile, repeat, starmap, izip_longest import os, sys -from typing import Tuple, Dict, List, Iterator, Iterable, Any, Callable +from typing import Tuple, Dict, List, Iterator, Iterable, Any, Callable, NamedTuple, BinaryIO from Bio import SeqIO #done from Bio.SeqRecord import SeqRecord #done @@ -24,15 +24,21 @@ from toolz.dicttoolz import merge, dissoc, merge_with, valfilter #todo from docopt import docopt #ignore from schema import Schema, Use #ignore -from contracts import contract, new_contract #can ignore +#from contracts import contract, new_contract #can ignore +#from mypy.types import VCFRow ############# # Constants # ############# +VCFRow = NamedTuple("VCFRow", + [('ref', str), + ('AO', List[int]), + ('DP', int), + ('chrom',str), + ('pos', int), + ('alt', List[str])]) +#VcfRow = namedtuple("VcfRow", VCFRow._fields) # type: (*Any) -> VCFRow -AMBIGUITY_TABLE = { 'A': 'A', 'T': 'T', 'G': 'G', 'C': 'C', 'N': 'N', - 'AC': 'M', 'AG': 'R', 'AT': 'W', 'CG': 'S', 'CT': - 'Y', 'GT': 'K', 'ACG': 'V', 'ACT': 'H', 'AGT': 'D', - 'CGT': 'B', 'ACGT': 'N' } +AMBIGUITY_TABLE = { 'A': 'A', 'T': 'T', 'G': 'G', 'C': 'C', 'N': 'N', 'AC': 'M', 'AG': 'R', 'AT': 'W', 'CG': 'S', 'CT': 'Y', 'GT': 'K', 'ACG': 'V', 'ACT': 'H', 'AGT': 'D', 'CGT': 'B', 'ACGT': 'N' } MAJORITY_PERCENTAGE = 80 MIN_DEPTH = 10 @@ -40,7 +46,7 @@ ########### # Reducer # ########### -@contract(reference='string', muts='list(tuple(string, string, int))' ) +#@contract(reference='string', muts='list(tuple(string, string, int))' ) def make_consensus(reference, muts): # type: (str, List[Mut]) -> Tuple[str, List[Mut]] ''' Actually builds a consensus string by recursively applying @@ -98,10 +104,10 @@ def call_base_multi_alts(min_depth, 
majority_percentage, dp, alts, ref): #@contract(min_depth='number,>=0', majority_percentage='number,>=0,<=100', rec='dict', returns='tuple(string, string, int)') def call_many(min_depth, majority_percentage, rec): - # type: (int, int, Dict) -> Mut + # type: (int, int, VCFRow) -> Mut #TODO: switch to generators - muts = zip(rec['AO'], rec['alt']) - ref, dp, pos = rec['ref'], rec['DP'], rec['pos'] + muts = zip(rec.AO, rec.alt) + ref, dp, pos = rec.ref, rec.DP, rec.pos longest_len = max(map(lambda x: len(x[-1]), muts)) longest_len = max(longest_len, len(ref)) def fill_gap(r): @@ -115,22 +121,23 @@ def seq_count(acc, ao_and_nts): return map(merge_sum, acc, [{nt:ao} for nt in nts]) # create a list of {base : count}, where the index matches the position mut_dicts = reduce(seq_count, xs, [{}]) - base_caller = partial(call_base_multi_alts, min_depth, majority_percentage, dp) # type: Callable[[Dict[Any,Any], str], str] + base_caller = lambda m,r: call_base_multi_alts(min_depth, majority_percentage, dp, m, r) # # # ?Callable[[Dict[Any,Any], str], str] res = map(base_caller, mut_dicts, ref) # trim None values at the end, (which indicate deletion) result = takewhile(bool, res) return (ref, ''.join(result), pos) -@contract(rec='dict',returns='dict') +#@contract(rec='dict',returns='dict') def flatten_vcf_record(rec): - # type: (_Record) -> Dict[str, Any] + # type: (_Record) -> VCFRow _rec = merge({ 'alt' : rec.ALT, 'ref' : rec.REF, 'pos' : rec.POS, 'chrom' : rec.CHROM}, rec.INFO) if not hasattr(_rec['alt'], '__iter__'): #TODO: put this somewhere else - return merge(_rec, dict(alt=[_rec['alt']], AO=[_rec['AO']])) - else: return _rec + d = merge(_rec, dict(alt=[_rec['alt']], AO=[_rec['AO']])) + else: d = _rec + return VCFRow(**d) ############## # Group By # @@ -138,16 +145,16 @@ def flatten_vcf_record(rec): #NOTE: could possibly drop lists, use fn.Stream all the time, # and write a Stream instance for contracts like: # https://github.com/AndreaCensi/contracts/blob/831ec7a5260ceb8960540ba0cb6cc26370cf2d82/src/contracts/library/lists.py -@contract(references='list[N]($SeqRecord),N>0', muts='list(dict)',returns='tuple(list(dict))') +#@contract(references='list[N]($SeqRecord),N>0', muts='list(dict)',returns='tuple(list(dict))') def group_muts_by_refs(references, muts): - # type: (List[SeqRecord], List[Dict[Any, Any]]) -> Iterable[List[Dict]] + # type: (List[SeqRecord], List[VCFRow]) -> List[List[VCFRow]] '''group and sort the mutations so that they match the order of the references.''' #NOTE: muts will already be "sorted" in that they are grouped together in the vcf #fix the groupby so it doesn't incidentally drain the first object of the group unzip = lambda x: zip(*x) chroms, groups = unzip(map(lambda kv: (kv[0], list(kv[1])), groupby(muts, get('chrom')))) - @contract(key='tuple(string,list)') - def index_of_ref(key): + #@contract(key='tuple(string,list)') + def index_of_ref(key): # type: (Tuple[str, List[SeqRecord]]) -> int chrom=key[0] index_of_chrom = map(lambda x: x.id, references).index(chrom) return index_of_chrom @@ -162,13 +169,15 @@ def index_of_ref(key): #@contract(references='SeqRecord', muts='seq(dict)', mind=int, majority=int) def all_consensuses(references, muts, mind, majority): - # type: (Iterable[SeqRecord], Iterable[Dict[Any,Any]], int, int) -> Tuple[List[str], Iterator[Tuple[str, List[Mut]]]] + # type: (List[SeqRecord], List[VCFRow], int, int) -> Tuple[List[SeqRecord], Iterable[Tuple[str, List[Mut]]]] ''' generates conesnsuses, including for flu and other mult-reference VCFs. 
applies filters and base callers to the mutations. then builds the consensus using these calls and `make_consensus`''' muts_by_ref = group_muts_by_refs(references, muts) def single_consensus(muts, ref): - the_muts = map(partial(call_many, mind, majority), muts) + # type: (List[VCFRow], SeqRecord) -> Tuple[str, List[Mut]] + #the_muts = map(partial(call_many, mind, majority), muts) + the_muts = map(lambda x: call_many(mind, majority, x), muts) ref_and_alt_differ = lambda x: x[0] != x[1] # vcf is index-starting-at-1 #real_muts = map(lambda (a,b,pos): (a,b,pos-1), filter(ref_and_alt_differ, the_muts)) @@ -186,13 +195,13 @@ def consensus_str(ref, consensus): # type: (SeqRecord, str) -> str #@contract(ref_fasta=str, vcf=str, mind=int, majority=int) def run(ref_fasta, freebayes_vcf, outfile, mind, majority): - # type: (str, str, str, int, int) -> int - refs = SeqIO.parse(ref_fasta, 'fasta') + # type: (str, str, BinaryIO, int, int) -> int + _refs = SeqIO.parse(ref_fasta, 'fasta') with open(freebayes_vcf, 'r') as vcf_handle: - muts = imap(flatten_vcf_record, vcf.Reader(vcf_handle)) - refs, muts = list(refs), list(muts) - refs, seqs_and_muts = all_consensuses(refs, muts, mind, majority) - strings = imap(consensus_str, refs, imap(get(0), seqs_and_muts)) + _muts = map(flatten_vcf_record, vcf.Reader(vcf_handle)) + refs, muts = list(_refs), list(_muts) + the_refs, seqs_and_muts = all_consensuses(refs, muts, mind, majority) + strings = imap(consensus_str, the_refs, imap(get(0), seqs_and_muts)) result = '\n'.join(strings) outfile.write(result) outfile.close() diff --git a/MyPy/Bio/SeqIO.pyi b/mypy/Bio/SeqIO.pyi similarity index 100% rename from MyPy/Bio/SeqIO.pyi rename to mypy/Bio/SeqIO.pyi diff --git a/MyPy/Bio/SeqRecord.py b/mypy/Bio/SeqRecord.py similarity index 100% rename from MyPy/Bio/SeqRecord.py rename to mypy/Bio/SeqRecord.py diff --git a/MyPy/Bio/__init__.py b/mypy/Bio/__init__.py similarity index 100% rename from MyPy/Bio/__init__.py rename to mypy/Bio/__init__.py diff --git a/MyPy/toolz/__init__.py b/mypy/__init__.py similarity index 100% rename from MyPy/toolz/__init__.py rename to mypy/__init__.py diff --git a/MyPy/example.py b/mypy/example.py similarity index 100% rename from MyPy/example.py rename to mypy/example.py diff --git a/mypy/out/docopt.pyi b/mypy/out/docopt.pyi new file mode 100644 index 0000000..6f9431c --- /dev/null +++ b/mypy/out/docopt.pyi @@ -0,0 +1,76 @@ +# Stubs for docopt (Python 2) +# +# NOTE: This dynamically typed stub was automatically generated by stubgen. + +from typing import Any + +class DocoptLanguageError(Exception): ... + +class DocoptExit(SystemExit): + usage = ... # type: Any + def __init__(self, message=''): ... + +class Pattern: + def __eq__(self, other): ... + def __hash__(self): ... + def fix(self): ... + def fix_identities(self, uniq=None): ... + def fix_repeating_arguments(self): ... + @property + def either(self): ... + +class ChildPattern(Pattern): + name = ... # type: Any + value = ... # type: Any + def __init__(self, name, value=None): ... + def flat(self, *types): ... + def match(self, left, collected=None): ... + +class ParentPattern(Pattern): + children = ... # type: Any + def __init__(self, *children): ... + def flat(self, *types): ... + +class Argument(ChildPattern): + def single_match(self, left): ... + @classmethod + def parse(class_, source): ... + +class Command(Argument): + name = ... # type: Any + value = ... # type: Any + def __init__(self, name, value=False): ... + def single_match(self, left): ... 
+ +class Option(ChildPattern): + value = ... # type: Any + def __init__(self, short=None, long=None, argcount=0, value=False): ... + @classmethod + def parse(class_, option_description): ... + def single_match(self, left): ... + @property + def name(self): ... + +class Required(ParentPattern): + def match(self, left, collected=None): ... + +class Optional(ParentPattern): + def match(self, left, collected=None): ... + +class AnyOptions(Optional): ... + +class OneOrMore(ParentPattern): + def match(self, left, collected=None): ... + +class Either(ParentPattern): + def match(self, left, collected=None): ... + +class TokenStream(list): + error = ... # type: Any + def __init__(self, source, error): ... + def move(self): ... + def current(self): ... + +class Dict(dict): ... + +def docopt(doc, argv=None, help=True, version=None, options_first=False): ... diff --git a/mypy/out/hypothesis/__init__.pyi b/mypy/out/hypothesis/__init__.pyi new file mode 100644 index 0000000..764f9f7 --- /dev/null +++ b/mypy/out/hypothesis/__init__.pyi @@ -0,0 +1,8 @@ +# Stubs for hypothesis (Python 2) +# +# NOTE: This dynamically typed stub was automatically generated by stubgen. + +from hypothesis._settings import settings as settings, Verbosity as Verbosity +from hypothesis.version import __version_info__ as __version_info__, __version__ as __version__ +from hypothesis.control import assume as assume, note as note, reject as reject +from hypothesis.core import given as given, find as find, example as example, seed as seed diff --git a/mypy/out/schema.pyi b/mypy/out/schema.pyi new file mode 100644 index 0000000..3eb2140 --- /dev/null +++ b/mypy/out/schema.pyi @@ -0,0 +1,31 @@ +# Stubs for schema (Python 2) +# +# NOTE: This dynamically typed stub was automatically generated by stubgen. + +from typing import Any + +class SchemaError(Exception): + autos = ... # type: Any + errors = ... # type: Any + def __init__(self, autos, errors): ... + @property + def code(self): ... + +class And: + def __init__(self, *args, **kw): ... + def validate(self, data): ... + +class Or(And): + def validate(self, data): ... + +class Use: + def __init__(self, callable_, error=None): ... + def validate(self, data): ... + +def priority(s): ... + +class Schema: + def __init__(self, schema, error=None): ... + def validate(self, data): ... + +class Optional(Schema): ... 
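The stub files above are type skeletons only. The two-dict merge_with signature stubbed for toolz in PATCH 25 mirrors how consensus.py combines per-base counts; a usage sketch with invented values (the real toolz.merge_with is variadic, the stub narrows it to the two-dict case used here):

    from toolz.dicttoolz import merge_with

    counts_a = {'A': 3, 'C': 1}
    counts_b = {'A': 2, 'G': 4}
    merged = merge_with(sum, counts_a, counts_b)  # {'A': 5, 'C': 1, 'G': 4}
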
diff --git a/mypy/toolz/__init__.py b/mypy/toolz/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/MyPy/toolz/dicttoolz.py b/mypy/toolz/dicttoolz.py similarity index 100% rename from MyPy/toolz/dicttoolz.py rename to mypy/toolz/dicttoolz.py diff --git a/MyPy/types.py b/mypy/types.py similarity index 100% rename from MyPy/types.py rename to mypy/types.py diff --git a/MyPy/vcf/__init__.py b/mypy/vcf/__init__.py similarity index 69% rename from MyPy/vcf/__init__.py rename to mypy/vcf/__init__.py index fbb79f9..29c9cc8 100644 --- a/MyPy/vcf/__init__.py +++ b/mypy/vcf/__init__.py @@ -1,4 +1,5 @@ -from typing import Union, Dict, List, NamedTuple, Iterator +from typing import Union, Dict, List, NamedTuple, Iterator, BinaryIO +from vcf.model import _Record #fields = [("ALT", Union[str, List[str]]), ("REF", str), ("POS", int), ("CHROM", str), ("INFO", Dict[str, Union[int, List[int]]])] # @@ -6,5 +7,5 @@ VCFRecord = NamedTuple('VCFRecord', [("ALT", Union[str, List[str]]), ("REF", str), ("POS", int), ("CHROM", str), ("INFO", Dict[str, Union[int, List[int]]])] ) -def Reader(s): # type: (str) -> Iterator[VCFRecord] +def Reader(s): # type: (BinaryIO) -> Iterator[_Record] pass diff --git a/mypy/vcf/model.py b/mypy/vcf/model.py new file mode 100644 index 0000000..9266869 --- /dev/null +++ b/mypy/vcf/model.py @@ -0,0 +1,3 @@ +from typing import Union, Dict, List, NamedTuple, Iterator +_Record = NamedTuple('_Record', [("ALT", Union[str, List[str]]), ("REF", str), ("POS", int), ("CHROM", str), ("INFO", Dict[str, Union[int, List[int]]])] +) From c2aecb8c613804931daec26c82810665c8d7ee95 Mon Sep 17 00:00:00 2001 From: michaelpanciera Date: Thu, 3 Mar 2016 22:17:27 -0500 Subject: [PATCH 27/41] added readme for types --- mypy/README.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 mypy/README.md diff --git a/mypy/README.md b/mypy/README.md new file mode 100644 index 0000000..aec5641 --- /dev/null +++ b/mypy/README.md @@ -0,0 +1,6 @@ +1. Install [mypy](https://github.com/python/mypy#quick-start) + +2. Run mypy MYPYPATH=$PWD/mypy:$PWD/mypy/out mypy --py2 bioframework/consensus.py + +If needed, uses `stubgen` to generate more stub files for other libraries. 
+ From c09b94bfbb5c46087a16a57428e3678ebc175cbc Mon Sep 17 00:00:00 2001 From: michaelpanciera Date: Thu, 3 Mar 2016 22:28:22 -0500 Subject: [PATCH 28/41] fixed tests, now passing --- bioframework/consensus.py | 2 +- tests/test_consensus.py | 37 +++++++++++++++++++------------------ 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/bioframework/consensus.py b/bioframework/consensus.py index 8c0da19..c6384ea 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -152,7 +152,7 @@ def group_muts_by_refs(references, muts): #NOTE: muts will already be "sorted" in that they are grouped together in the vcf #fix the groupby so it doesn't incidentally drain the first object of the group unzip = lambda x: zip(*x) - chroms, groups = unzip(map(lambda kv: (kv[0], list(kv[1])), groupby(muts, get('chrom')))) + chroms, groups = unzip(map(lambda kv: (kv[0], list(kv[1])), groupby(muts, lambda x: x.chrom))) #@contract(key='tuple(string,list)') def index_of_ref(key): # type: (Tuple[str, List[SeqRecord]]) -> int chrom=key[0] diff --git a/tests/test_consensus.py b/tests/test_consensus.py index 55f81f5..47076c1 100644 --- a/tests/test_consensus.py +++ b/tests/test_consensus.py @@ -6,7 +6,7 @@ from hypothesis import strategies as st from hypothesis import given, assume from operator import itemgetter as get -from bioframework.consensus import call_many, all_consensuses, make_consensus +from bioframework.consensus import call_many, all_consensuses, make_consensus, VCFRow import string import itertools import unittest @@ -15,7 +15,7 @@ st.integers(min_value=1), st.text(alphabet='ACTGN', min_size=1, max_size=6)) \ .flatmap(lambda tup:\ - vcf_dict_strategy_factory(*tup)) + vcf_dict_strategy_factory(*tup)).map(lambda d: VCFRow(**d)) pos_int = st.integers(min_value=0) #TODO: these 10, 80 for trhesh and majority_percentage should be factored out and possibly be strategies themselves @@ -24,23 +24,23 @@ def just_ref(*args): class CallBaseHypothesisTest(unittest.TestCase): @given(simple_vcf_dict_strategy, pos_int) def test_under_mind_is_N(self, mut, mind): - assume(mut['DP'] < mind) + assume(mut.DP < mind) result = call_many(mind, 80, mut)[1] self.assertTrue(all(map(lambda x: x == 'N', result))) @given(simple_vcf_dict_strategy) def test_ao_under_minority_is_ref(self, mut): - assume(sum(mut['AO']) / mut['DP'] < 0.2) + assume(sum(mut.AO) / mut.DP < 0.2) result = call_many(0, 80, mut)[1] - self.assertEquals(result, mut['ref']) + self.assertEquals(result, mut.ref) @given(simple_vcf_dict_strategy) def test_over_majority_is_alt(self, mut): #TODO: this is slow - assume(sum(mut['AO']) / mut['DP'] > 0.8) - assume(len(mut['alt']) == 1) + assume(sum(mut.AO) / mut.DP > 0.8) + assume(len(mut.alt) == 1) result = call_many(0, 80, mut)[1] - self.assertEquals(result, mut['alt'][0]) + self.assertEquals(result, mut.alt[0]) #Commented out because it's not actually always true, # e.g. 
mut={'ref': u'AA', 'pos': 1, 'AO': [784313725491], 'alt': [u'A'], @@ -48,9 +48,9 @@ def test_over_majority_is_alt(self, mut): # should result in AA # @given(simple_vcf_dict_strategy) # def test_over_minoriy_is_not_ref(self, mut): -# assume(sum(mut['AO']) / mut['DP'] > 0.2) +# assume(sum(mut.AO) / mut.DP > 0.2) # result = call_many(0, 80, mut)[1] -# self.assertNotEquals(result, mut['ref']) +# self.assertNotEquals(result, mut.ref) class ConsesusExampleTest(unittest.TestCase): def test_make_consensus_example(self): @@ -62,7 +62,7 @@ def test_make_consensus_example(self): self.assertEquals(expected, actual) def test_single_example(self): - muts = [{ + raw_muts = [{ 'pos' : 2, 'ref' : 'CG', 'alt' : ['TT'], @@ -78,12 +78,13 @@ def test_single_example(self): 'DP' : 150, 'chrom' : 'X' }] + muts = map(lambda d: VCFRow(**d), raw_muts) ref = make_seqrec('X', 'ACGTACGT') expected = 'ATTTAAGT' result = just_ref([ref], muts, 10, 80) self.assertEquals(expected, result) ref_with_vcf_dicts_strategy = ref_with_vcf_dicts_strategy_factory().map( - lambda (r, muts): (make_seqrec(muts[0]['chrom'], r), muts)) + lambda (r, muts): (make_seqrec(muts[0]['chrom'], r), map(lambda d: VCFRow(**d), muts))) from collections import Counter countof = lambda c: lambda x: Counter(x).get(c, 0) def run_cons(*args): @@ -100,11 +101,11 @@ class ConsensusHypothesisTest(unittest.TestCase): def test_n_count(self, ref_and_muts, rand): ref, muts = ref_and_muts originalNs = countof('N')(ref) - alts = map(get('alt'), muts) + alts = map(lambda x: x.alt, muts) assume(not any(map(lambda x: 'N' in x, itertools.chain(*alts)))) # needed because ACGT -> N assume(not filter(lambda x: len(x) > 3, alts)) - expectedNs = len(filter(lambda x: x['DP'] < 10, muts)) + originalNs + expectedNs = len(filter(lambda x: x.DP < 10, muts)) + originalNs result = just_ref([ref], muts, 10, 80) self.assertEquals(countof('N')(result), expectedNs) @@ -119,7 +120,7 @@ def test_less_or_equal_length_when_no_inserts(self, ref_and_muts): def assume_greater_or_equal_length_when_no_deletions(self, ref_and_muts): ref, muts = ref_and_muts def has_deletion(mut): - filter(lambda x: len(x) < mut['ref'], mut['alt']) + filter(lambda x: len(x) < mut.ref, mut.alt) assume(not any(map(has_deletion, muts))) result = just_ref([ref], muts, 10, 80) self.assertLesserEqual(len(ref), len(result)) @@ -138,11 +139,11 @@ def test_more_or_equal_ns_with_lower_threshold(self, ref_and_muts, n1, n2): @given(ref_with_vcf_dicts_strategy) def test_consensus_from_consensus_contains_more_alts(self, ref_and_muts): ref, muts = ref_and_muts - assume(not any(map(lambda x: len(x['alt']) > 1, muts))) + assume(not any(map(lambda x: len(x.alt) > 1, muts))) n1 = 10 cons1, alts = run_cons([ref], muts, n1, 80) assume(not any(map(lambda x: len(x[0]) > len(x[1]), alts))) - cons2, _ = run_cons([make_seqrec(muts[0]['chrom'], cons1)], muts, n1, 80) + cons2, _ = run_cons([make_seqrec(muts[0].chrom, cons1)], muts, n1, 80) picked_alts = map(get(1), alts) altCounts1 = sum(map(lambda f: f(cons1), map(countof, picked_alts))) altCounts2 = sum(map(lambda f: f(cons2), map(countof, picked_alts))) @@ -156,7 +157,7 @@ def test_consensus_from_consensus_contains_more_alts(self, ref_and_muts): def test_lower_majority_required_contains_more_alts(self, ref_and_muts, p1, p2): ref, muts = ref_and_muts assume(p1 < p2) - assume(not any(map(lambda x: len(x['alt']) > 1, muts))) + assume(not any(map(lambda x: len(x.alt) > 1, muts))) n1 = 10 cons1, alts = run_cons([ref], muts, n1, p1) assume(not any(map(lambda x: len(x[0]) > len(x[1]), alts))) 
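A concrete reading of the helpers the property tests above lean on, with invented values: countof is a curried Counter lookup, and the depth-threshold check in ConsensusMetamorphicTests asserts that raising the minimum depth can only keep or increase the number of N calls.

    from collections import Counter

    countof = lambda c: lambda s: Counter(s).get(c, 0)  # same shape as in the tests
    assert countof('N')('ACNNGT') == 2

    # The metamorphic relation then reads: for fixed ref and muts,
    # countof('N')(consensus at mind=n1) <= countof('N')(consensus at mind=n2)
    # whenever n1 < n2.
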
From 6a0bd962f9d3f96907705cd959d1e41a15c97cd5 Mon Sep 17 00:00:00 2001 From: michaelpanciera Date: Thu, 3 Mar 2016 22:31:54 -0500 Subject: [PATCH 29/41] removing old types-hypothesis generator file --- mypy/types.py | 76 --------------------------------------------------- 1 file changed, 76 deletions(-) delete mode 100644 mypy/types.py diff --git a/mypy/types.py b/mypy/types.py deleted file mode 100644 index 9c40861..0000000 --- a/mypy/types.py +++ /dev/null @@ -1,76 +0,0 @@ -from hypothesis import strategies as st -from typing import Dict, Tuple, List, Iterator, Set, Union, Optional, TypingMeta, NamedTuple -import re -import operator -from functools import partial -import string -from collections import namedtuple, OrderedDict -compose = lambda f,g: lambda *x: f(g(*x)) -''' -support: -- [x] NamedTuple -- [ ] Automatic function arguments -could also say, "given a function, generate random return values that it might give" because functions are also annotated with return values -''' - -# Just an exmaple of a named tuple -VCFRow = NamedTuple("VCFRow", - [('ref', str), - ('AO', List[int]), - ('DP', int), - ('chrom',str), - ('pos', int), - ('alt', List[str])]) - -primitives = { - str : st.text(), - int : st.integers(), - bool : st.booleans(), - float : st.floats(), - type(None) : st.none(), - unicode : st.characters(), - bytes : st.binary() # this is weird because str == bytes in py2 -} # missing: fractions, decimal - -#TODO: add Iterable, handle Sequence, etc. -def resolve(x): # type: (TypingMeta) -> hypothesis.strategies.SearchStrategy - if x in primitives: - strat = primitives[x] - elif hasattr(x, '_fields'): - # NamedTuple isn't a type, so this can't be a subclass check - try: - #Only way I know how to extract the name so it's pretty... - name = re.compile("([^\.]+)'>$").search(str(x)).groups()[0] - except: - name = str(x) - fts = OrderedDict(x._field_types) - nt = namedtuple(name, fts.keys()) - vals = map(resolve, fts.values()) - strat = st.tuples(*vals).map(lambda x: nt(*x)) - elif issubclass(x, Dict): - strat = st.dictionaries(*map(resolve, x.__parameters__)) - elif issubclass(x, Tuple): - strat = st.tuples(*map(resolve, x.__tuple_params__)) - elif issubclass(x, Union): - strat = operator.ior(*map(resolve, x.__union_params__)) - elif issubclass(x, Optional): - # Optional[X] is equivalent to Union[X, type(None)]. second param is always Nonetype. - value = x.__union_params__[0] - strat = (resolve(value) | st.none()) - else: # a list-type-ish - collections = { - Iterator : lambda x: st.lists(x).map(iter), - List : st.lists, - Set : st.sets - } #TODO: missing: Iterable , etc. - # For some reason List[T] not a subclass of List: issubclass(x, List) == False. So do these hijinks - params = x.__parameters__ - assert len(params) == 1, "Wrong type %s, not a list-like" % x - matches = filter(lambda k: k == x.__origin__, collections.keys()) - assert len(matches) == 1, "Should have exactly one match. 
%s matched with %s" % (x, matches) - collection_strat = collections[matches[0]] - strat = collection_strat(resolve(params[0])) - return strat -# see https://docs.python.org/3/library/typing.html -# not Generics -# not Callables From 6cbd2e16d83ed64f278cb1bb088f3c7dd165a4c9 Mon Sep 17 00:00:00 2001 From: michaelpanciera Date: Thu, 3 Mar 2016 22:32:18 -0500 Subject: [PATCH 30/41] added missing typing requirement --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index dbdcd86..26a06bb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ pycontracts toolz pyvcf +typing From 4427de8e70b997a8b6a262342f1dda796c5912ed Mon Sep 17 00:00:00 2001 From: michaelpanciera Date: Thu, 3 Mar 2016 22:44:40 -0500 Subject: [PATCH 31/41] missing deps. --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 26a06bb..8e01b72 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,5 @@ pycontracts toolz pyvcf typing +docopt +schema From 50cf3c9273760ff8d0b702ff34d4c46c5e744e57 Mon Sep 17 00:00:00 2001 From: Panciera Date: Fri, 4 Mar 2016 10:18:25 -0500 Subject: [PATCH 32/41] add typing module dep. --- setup.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/setup.py b/setup.py index 0c97ac2..1d09b17 100755 --- a/setup.py +++ b/setup.py @@ -24,8 +24,12 @@ def jip_modules(path=bioframework.JIP_PATH): 'pyjip', 'toolz', 'docopt', + 'schema', + 'pyvcf', + 'typing' ], data_files=[ (join(sys.prefix,'bin'), jip_modules()), ] ) + From 23afece1421c36dcc1fb2cb09c21250e016cb87c Mon Sep 17 00:00:00 2001 From: Panciera Date: Fri, 4 Mar 2016 10:47:59 -0500 Subject: [PATCH 33/41] fix failing test --- tests/test_consensus.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/test_consensus.py b/tests/test_consensus.py index 47076c1..955e086 100644 --- a/tests/test_consensus.py +++ b/tests/test_consensus.py @@ -91,7 +91,7 @@ def run_cons(*args): _, alt_and_cons = all_consensuses(*args) cons, alts = zip(*alt_and_cons) return cons[0], alts[0] -class ConsensusHypothesisTest(unittest.TestCase): +class ConsensusHypothesisTest(unittest.TestCase): #ref_and_muts=(SeqRecord(seq=Seq(u'AAAAAAAAAA', IUPACAmbiguousDNA()), id=u'', name='', description='', dbxrefs=[]), [ # {'ref': u'A', 'pos': 1, 'AO': [479, 777, 119, 604], 'alt': [u'G', u'C', u'G', u'TG'], 'chrom': u'', 'DP': 2635}, # {'ref': u'A', 'pos': 3, 'AO': [291, 241, 583, 420], 'alt': [u'CTG', u'C', u'G', u'C'], 'chrom': u'', 'DP': 1627}]), rand=random.seed(0)) @@ -102,9 +102,11 @@ def test_n_count(self, ref_and_muts, rand): ref, muts = ref_and_muts originalNs = countof('N')(ref) alts = map(lambda x: x.alt, muts) - assume(not any(map(lambda x: 'N' in x, itertools.chain(*alts)))) + refs = map(lambda x: x.ref, muts) + assume(not filter(lambda x: 'N' in x, itertools.chain(*alts))) + assume(not filter(lambda x: len(x) > 1, itertools.chain(*alts))) + assume(not filter(lambda x: len(x) > 1, refs)) # needed because ACGT -> N - assume(not filter(lambda x: len(x) > 3, alts)) expectedNs = len(filter(lambda x: x.DP < 10, muts)) + originalNs result = just_ref([ref], muts, 10, 80) self.assertEquals(countof('N')(result), expectedNs) @@ -134,7 +136,7 @@ def test_more_or_equal_ns_with_lower_threshold(self, ref_and_muts, n1, n2): cons1 = just_ref([ref], muts, n1, 80) cons2 = just_ref([ref], muts, n2, 80) nsCount1, nsCount2 = countof('N')(cons1), countof('N')(cons2) - self.assertLessEqual(nsCount1, nsCount2) + 
self.assertLessEqual(nsCount1, nsCount2) @given(ref_with_vcf_dicts_strategy) def test_consensus_from_consensus_contains_more_alts(self, ref_and_muts): @@ -147,7 +149,7 @@ def test_consensus_from_consensus_contains_more_alts(self, ref_and_muts): picked_alts = map(get(1), alts) altCounts1 = sum(map(lambda f: f(cons1), map(countof, picked_alts))) altCounts2 = sum(map(lambda f: f(cons2), map(countof, picked_alts))) - self.assertLessEqual(altCounts1, altCounts2) + self.assertLessEqual(altCounts1, altCounts2) #NOTE: the below test appears to be meaningless, From bb7f93734e53e714b3a2057094dab1d587894268 Mon Sep 17 00:00:00 2001 From: Panciera Date: Fri, 4 Mar 2016 13:51:19 -0500 Subject: [PATCH 34/41] add support for trimming reference --- bioframework/consensus.py | 18 +++++++++++++++--- mypy/sh.py | 3 +++ 2 files changed, 18 insertions(+), 3 deletions(-) create mode 100644 mypy/sh.py diff --git a/bioframework/consensus.py b/bioframework/consensus.py index c6384ea..84bfdf4 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -14,14 +14,17 @@ from functools import partial from itertools import ifilter, imap, groupby, takewhile, repeat, starmap, izip_longest import os, sys +import collections + from typing import Tuple, Dict, List, Iterator, Iterable, Any, Callable, NamedTuple, BinaryIO from Bio import SeqIO #done from Bio.SeqRecord import SeqRecord #done import vcf #done from vcf.model import _Record +import sh #todo #from toolz import compose -from toolz.dicttoolz import merge, dissoc, merge_with, valfilter #todo +from toolz.dicttoolz import merge, dissoc, merge_with, valfilter #done from docopt import docopt #ignore from schema import Schema, Use #ignore #from contracts import contract, new_contract #can ignore @@ -36,8 +39,6 @@ ('chrom',str), ('pos', int), ('alt', List[str])]) -#VcfRow = namedtuple("VcfRow", VCFRow._fields) # type: (*Any) -> VCFRow - AMBIGUITY_TABLE = { 'A': 'A', 'T': 'T', 'G': 'G', 'C': 'C', 'N': 'N', 'AC': 'M', 'AG': 'R', 'AT': 'W', 'CG': 'S', 'CT': 'Y', 'GT': 'K', 'ACG': 'V', 'ACT': 'H', 'AGT': 'D', 'CGT': 'B', 'ACGT': 'N' } MAJORITY_PERCENTAGE = 80 @@ -192,6 +193,17 @@ def single_consensus(muts, ref): def consensus_str(ref, consensus): # type: (SeqRecord, str) -> str return ">{0}:Consensus\n{1}".format(ref.id, consensus) +def zero_coverage_positions(bam_file, ref_file): # type: (str, str) -> Iterable[int] + pileup = sh.Command('mpileup')(bam_file, f=ref_file, _iter=True) + get_pos = lambda x: int(x.split()[1]) # type: Callable[[str],int] + return imap(get_pos, pileup) + +#TODO: is pileup 0-based or 1-based index? 
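# Illustration only (not part of the patch): samtools mpileup rows are
# tab-separated "chrom  pos  ref  depth  bases  quals", and the position
# column is 1-based, which answers the TODO above.  Two made-up rows:
_pileup_rows = ["ref1\t4\tA\t2\t..\tFF", "ref1\t5\tC\t3\t...\tFFF"]
assert [int(r.split()[1]) for r in _pileup_rows] == [4, 5]
# By default mpileup reports only covered positions, so the function above
# yields the positions that are present in the pileup output.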
+def trim_ref(ref, positions): # type: (str, Iterator[int]) -> str + start, end = next(positions), collections.deque(positions, 1)[0] + return '-'*start + ref[:start:end] + '-'*(len(ref) - end) + + #@contract(ref_fasta=str, vcf=str, mind=int, majority=int) def run(ref_fasta, freebayes_vcf, outfile, mind, majority): diff --git a/mypy/sh.py b/mypy/sh.py new file mode 100644 index 0000000..ee8e4e9 --- /dev/null +++ b/mypy/sh.py @@ -0,0 +1,3 @@ +from typing import Callable, Any, Union, List, Iterator +def Command(s): # type: (str) -> Callable[...,Union[List[str],Iterator[str]]] + pass From 272a115ed63eaf7faa8452bf359ea081639c1eed Mon Sep 17 00:00:00 2001 From: Mike Panciera Date: Fri, 4 Mar 2016 14:14:32 -0500 Subject: [PATCH 35/41] fix readme formatting --- mypy/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mypy/README.md b/mypy/README.md index aec5641..26906fb 100644 --- a/mypy/README.md +++ b/mypy/README.md @@ -1,6 +1,6 @@ 1. Install [mypy](https://github.com/python/mypy#quick-start) -2. Run mypy MYPYPATH=$PWD/mypy:$PWD/mypy/out mypy --py2 bioframework/consensus.py +2. Run mypy: `MYPYPATH=$PWD/mypy:$PWD/mypy/out mypy --py2 bioframework/consensus.py` If needed, uses `stubgen` to generate more stub files for other libraries. From bbd8af8ee784e0578550deedff94138ae5a30f03 Mon Sep 17 00:00:00 2001 From: Panciera Date: Tue, 22 Mar 2016 16:03:40 -0400 Subject: [PATCH 36/41] fix VCFRow instantiation --- bioframework/consensus.py | 21 ++++++++++++++------- mypy/toolz/dicttoolz.py | 2 ++ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/bioframework/consensus.py b/bioframework/consensus.py index 84bfdf4..8dd433c 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -24,7 +24,7 @@ from vcf.model import _Record import sh #todo #from toolz import compose -from toolz.dicttoolz import merge, dissoc, merge_with, valfilter #done +from toolz.dicttoolz import merge, dissoc, merge_with, valfilter, keyfilter #done from docopt import docopt #ignore from schema import Schema, Use #ignore #from contracts import contract, new_contract #can ignore @@ -36,6 +36,8 @@ [('ref', str), ('AO', List[int]), ('DP', int), + ('QA', List[int]), + ('QR', int), ('chrom',str), ('pos', int), ('alt', List[str])]) @@ -43,6 +45,7 @@ MAJORITY_PERCENTAGE = 80 MIN_DEPTH = 10 +#PMut = NamedTuple('PMut', [('AO', int), ('DP', int), ('QA', int), ('QR', int)]) Mut = Tuple[str, str, int] ########### # Reducer # @@ -52,7 +55,8 @@ def make_consensus(reference, muts): # type: (str, List[Mut]) -> Tuple[str, List[Mut]] ''' Actually builds a consensus string by recursively applying the mutations.''' - def _do_build((accString, string, lastPos), (x, y, bigPos)): + def _do_build(t1, t2): # type: (Tuple[int,str,int], Tuple[str,str,int]) -> Tuple[str,str,int] + (accString, string, lastPos), (x, y, bigPos) = t1, t2 pos = bigPos - lastPos return (accString + (string[:pos] + y), string[pos+len(x):], bigPos+len(x)) result, remaining, _ = reduce(_do_build, muts, ('', reference, 0)) @@ -77,8 +81,8 @@ def call_base_multi_alts(min_depth, majority_percentage, dp, alts, ref): (call each base) based on the given rules (using call_base).""" #TODO: majority_percentage gets ignored, so replace constants #TODO: behavior is undefined if sum(AO) > dp. 
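# Worked illustration of the thresholds described above, with invented counts:
# at DP=100 and A=55, G=30, C=10, no single alt clears the 80% call, A and G
# clear the 20% ambiguity cutoff, and the joined, sorted key 'AG' maps to 'R'
# in AMBIGUITY_TABLE.
_counts, _dp = {'A': 55, 'G': 30, 'C': 10}, 100
_key = ''.join(sorted(b for b, n in _counts.items() if n / float(_dp) > 0.2))
assert _key == 'AG'  # AMBIGUITY_TABLE['AG'] == 'R'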
- if dp < min_depth: #could call REF here sometimes - return 'N' +# if dp < min_depth: #could call REF here sometimes +# return 'N' total_ao = lambda: sum(alts.values()) # avoid evaluating unless necessary if ref is None: # this is an insert @@ -121,7 +125,7 @@ def seq_count(acc, ao_and_nts): ao, nts = ao_and_nts return map(merge_sum, acc, [{nt:ao} for nt in nts]) # create a list of {base : count}, where the index matches the position - mut_dicts = reduce(seq_count, xs, [{}]) + mut_dicts = reduce(seq_count, xs, [{}]) # type: Iterable[Dict[str,int]] base_caller = lambda m,r: call_base_multi_alts(min_depth, majority_percentage, dp, m, r) # # # ?Callable[[Dict[Any,Any], str], str] res = map(base_caller, mut_dicts, ref) # trim None values at the end, (which indicate deletion) @@ -136,8 +140,11 @@ def flatten_vcf_record(rec): 'pos' : rec.POS, 'chrom' : rec.CHROM}, rec.INFO) if not hasattr(_rec['alt'], '__iter__'): #TODO: put this somewhere else - d = merge(_rec, dict(alt=[_rec['alt']], AO=[_rec['AO']])) + d = merge(_rec, dict(alt=[_rec['alt']], + AO=[_rec['AO']], + QA=[_rec['QA']])) else: d = _rec + d = keyfilter(VCFRow._fields.__contains__, d) return VCFRow(**d) ############## @@ -219,7 +226,7 @@ def run(ref_fasta, freebayes_vcf, outfile, mind, majority): outfile.close() return 0 -def main(): # type () -> None +def main(): # type: () -> None scheme = Schema( { '--vcf' : os.path.isfile, '--ref' : os.path.isfile, diff --git a/mypy/toolz/dicttoolz.py b/mypy/toolz/dicttoolz.py index 6f7178e..22feb1c 100644 --- a/mypy/toolz/dicttoolz.py +++ b/mypy/toolz/dicttoolz.py @@ -15,6 +15,8 @@ def merge_with(f, d1, d2): # type: (Callable[[V,V2], V3], Dict[K,V], Dict[K,V2]) def valfilter(f, d): # type: (Callable[[V], bool], Dict[K,V]) -> Dict[K,V] pass +def keyfilter(f, d): # type: (Callable[[K], bool], Dict[K,V]) -> Dict[K,V] + pass #from typing import Dict, Any, Callable, TypeVar From c556a76c34d7aa5c8d721443deec0b7b3742562c Mon Sep 17 00:00:00 2001 From: Panciera Date: Wed, 23 Mar 2016 16:52:08 -0400 Subject: [PATCH 37/41] more info in consensus's stuff --- bioframework/consensus.py | 128 +++++++++++++++++++++++--------------- tests/test_consensus.py | 6 +- 2 files changed, 83 insertions(+), 51 deletions(-) diff --git a/bioframework/consensus.py b/bioframework/consensus.py index 8dd433c..da29f64 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -10,9 +10,12 @@ --output,-o= output file [default: ] """ #stdlib +#TO type-check: (requires python3) +# $ MYPYPATH=$HOME/bioframework/mypy mypy --py2 --disallow-untyped-calls bioframework/consensus.py + from operator import itemgetter as get from functools import partial -from itertools import ifilter, imap, groupby, takewhile, repeat, starmap, izip_longest +from itertools import ifilter, ifilterfalse, imap, groupby, takewhile, repeat, starmap, izip_longest import os, sys import collections @@ -25,8 +28,8 @@ import sh #todo #from toolz import compose from toolz.dicttoolz import merge, dissoc, merge_with, valfilter, keyfilter #done -from docopt import docopt #ignore -from schema import Schema, Use #ignore +#from docopt import docopt #ignore +#from schema import Schema, Use #ignore #from contracts import contract, new_contract #can ignore #from mypy.types import VCFRow ############# @@ -45,7 +48,7 @@ MAJORITY_PERCENTAGE = 80 MIN_DEPTH = 10 -#PMut = NamedTuple('PMut', [('AO', int), ('DP', int), ('QA', int), ('QR', int)]) +PMut = NamedTuple('PMut', [('ALT', str), ('AO', int), ('DP', int), ('QA', int), ('QR', int)]) Mut = Tuple[str, str, int] 
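# Quick illustration of the two shapes used below, with invented values: a
# PMut records one candidate base in a single alignment column together with
# its counts and qualities, and a Mut is the (reference, called string,
# position) triple returned by call_many; all_consensuses shifts the 1-based
# VCF position to 0-based before handing Muts to make_consensus.
_example_pmut = PMut(ALT='T', AO=42, DP=50, QA=1400, QR=300)
_example_mut = ('CG', 'TT', 1)  # type: Mut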
########### # Reducer # @@ -55,7 +58,7 @@ def make_consensus(reference, muts): # type: (str, List[Mut]) -> Tuple[str, List[Mut]] ''' Actually builds a consensus string by recursively applying the mutations.''' - def _do_build(t1, t2): # type: (Tuple[int,str,int], Tuple[str,str,int]) -> Tuple[str,str,int] + def _do_build(t1, t2): # type: (Tuple[str,str,int], Tuple[str,str,int]) -> Tuple[str,str,int] (accString, string, lastPos), (x, y, bigPos) = t1, t2 pos = bigPos - lastPos return (accString + (string[:pos] + y), string[pos+len(x):], bigPos+len(x)) @@ -74,7 +77,7 @@ def _do_build(t1, t2): # type: (Tuple[int,str,int], Tuple[str,str,int]) -> Tuple #@contract(min_depth='number,>=0', majority_percentage='number,>=0,<=100',dp='number,>=0', ref='string|None',alts='dict(string: number)') def call_base_multi_alts(min_depth, majority_percentage, dp, alts, ref): - # type: (int, int, int, Dict[str, int], str) -> str + # type: (int, int, int, List[PMut], str) -> str """when there are multiple alts, zip through with each of them zip(*alts), character by character. compare the percentages, and sum the percentages for each base. (groupby, sum) pick each character @@ -83,58 +86,81 @@ def call_base_multi_alts(min_depth, majority_percentage, dp, alts, ref): #TODO: behavior is undefined if sum(AO) > dp. # if dp < min_depth: #could call REF here sometimes # return 'N' - total_ao = lambda: sum(alts.values()) # avoid evaluating unless necessary + get_ao = lambda x: x.AO + total_ao = sum(map(get_ao, alts)) if ref is None: # this is an insert - if total_ao()/float(dp) < .50: + if total_ao/float(dp) < .50: return ref # if the insert is above threshold, keep going and call the insert like a normal base - if '-' in alts: - if alts['-']/float(dp) > .50: # a deletion + is_insert = lambda x: x.ALT == '-' # type: Callable[[PMut],bool] + inserts = list(filter(is_insert, alts)) + if inserts: + assert len(inserts) == 1 + ins = inserts[0] + if ins.AO/float(dp) > .50: # a deletion return '' - dp -= alts['-'] - alts_without_insert = dissoc(alts, '-') + dp -= ins.AO + alts_without_insert = list(ifilterfalse(is_insert, alts)) else: alts_without_insert = alts - over_depth = lambda x: lambda depth: depth/float(dp) > x - picked_alt = valfilter(over_depth(0.8), alts_without_insert) + over_depth = lambda x: lambda y: y.AO/float(dp) > x + picked_alt = filter(over_depth(0.8), alts_without_insert) if picked_alt: - return picked_alt.keys()[0] + return picked_alt[0].ALT #add ref so that it will be considered in creating ambiguous base - alts_with_ref = merge(alts_without_insert, ({ref : (dp - total_ao()) } if ref else {})) - over20 = valfilter(over_depth(0.2), alts_with_ref) - as_ambiguous = ''.join(sorted(over20.keys())) + if ref: + alts.append(PMut(ALT=ref, AO=(dp-total_ao), DP=dp, QA=-1, QR=-1)) + over20 = filter(over_depth(0.2), alts) + as_ambiguous = ''.join(sorted(map(lambda x: x.ALT, over20))) # this could return a single base, (including the reference), becuase i.e. 
A => A in the ambiguity table return AMBIGUITY_TABLE[as_ambiguous] if as_ambiguous != '' else '' -#@contract(min_depth='number,>=0', majority_percentage='number,>=0,<=100', rec='dict', returns='tuple(string, string, int)') -def call_many(min_depth, majority_percentage, rec): - # type: (int, int, VCFRow) -> Mut - #TODO: switch to generators - muts = zip(rec.AO, rec.alt) - ref, dp, pos = rec.ref, rec.DP, rec.pos +def pmuts_from_row(row): + # type: (VCFRow) -> List[PMut] + muts = zip(row.QA, row.AO, row.alt) longest_len = max(map(lambda x: len(x[-1]), muts)) - longest_len = max(longest_len, len(ref)) + longest_len = max(longest_len, len(row.ref)) def fill_gap(r): - ao, s = r - return (ao, str(s) + (longest_len - len(s)) * '-') - xs = map(fill_gap, muts) # fill in the shorter alts with '-'. + qa, ao, s = r + return (qa, ao, str(s) + (longest_len - len(s)) * '-') def merge_sum(x,y): - return x if y is None else (y if x is None else merge_with(sum, x, y)) + def combine(tups): + return sum(map(get(0), tups)), sum(map(get(1), tups)) + #return x if y is None else (y if x is None else merge_with(sum, x, y)) + return x if y is None else (y if x is None else merge_with(combine, x, y)) def seq_count(acc, ao_and_nts): - ao, nts = ao_and_nts - return map(merge_sum, acc, [{nt:ao} for nt in nts]) + qa, ao, nts = ao_and_nts + #return map(merge_sum, acc, [{nt:ao, 'qa': qa} for nt in nts]) + return map(merge_sum, acc, [{nt:(ao, qa)} for nt in nts]) + xs = map(fill_gap, muts) # fill in the shorter alts with '-'. # create a list of {base : count}, where the index matches the position mut_dicts = reduce(seq_count, xs, [{}]) # type: Iterable[Dict[str,int]] - base_caller = lambda m,r: call_base_multi_alts(min_depth, majority_percentage, dp, m, r) # # # ?Callable[[Dict[Any,Any], str], str] - res = map(base_caller, mut_dicts, ref) + def to_pmuts(d): # type: (Dict[str,int]) -> List[PMut] + #qa = d.pop('qa') + #assert len(d.values()) == 1 + make = lambda k,v: PMut(k, AO=v[0], QA=v[1], DP=row.DP,QR=row.QR) + return list(starmap(make, d.items())) +# res = PMut(d.keys()[0], d.values()[0], row.DP, qa, row.QR) +# return res + res = list(map(to_pmuts, mut_dicts)) + return res + +def call_many(min_depth, majority_percentage, rec): + # type: (int, int, VCFRow) -> Mut + muts_by_column = pmuts_from_row(rec) + #TODO: switch to generators + base_caller = lambda m,r: call_base_multi_alts( + min_depth, majority_percentage, rec.DP, m, r) # # # ?Callable[[Dict[Any,Any], str], str] + res = map(base_caller, muts_by_column, rec.ref) # trim None values at the end, (which indicate deletion) result = takewhile(bool, res) - return (ref, ''.join(result), pos) + return (rec.ref, ''.join(result), rec.pos) #@contract(rec='dict',returns='dict') def flatten_vcf_record(rec): # type: (_Record) -> VCFRow + fields = ['ref', 'AO', 'DP', 'QA', 'QR', 'chrom' 'pos', 'alt'] _rec = merge({ 'alt' : rec.ALT, 'ref' : rec.REF, 'pos' : rec.POS, 'chrom' : rec.CHROM}, @@ -144,7 +170,7 @@ def flatten_vcf_record(rec): AO=[_rec['AO']], QA=[_rec['QA']])) else: d = _rec - d = keyfilter(VCFRow._fields.__contains__, d) + d = keyfilter(fields.__contains__, d) return VCFRow(**d) ############## @@ -226,17 +252,21 @@ def run(ref_fasta, freebayes_vcf, outfile, mind, majority): outfile.close() return 0 -def main(): # type: () -> None - scheme = Schema( - { '--vcf' : os.path.isfile, - '--ref' : os.path.isfile, - '--majority' : Use(int), - '--mind' : Use(int), - '--output' : Use(lambda x: sys.stdout if not x else open(x, 'w'))}) - raw_args = docopt(__doc__, version='Version 
1.0') - args = scheme.validate(raw_args) - run(args['--ref'], args['--vcf'], args['--output'], - args['--mind'], args['--output']) - -if __name__ == '__main__': - main() + + + + +#def main(): # type: () -> None +# scheme = Schema( +# { '--vcf' : os.path.isfile, +# '--ref' : os.path.isfile, +# '--majority' : Use(int), +# '--mind' : Use(int), +# '--output' : Use(lambda x: sys.stdout if not x else open(x, 'w'))}) +# raw_args = docopt(__doc__, version='Version 1.0') +# args = scheme.validate(raw_args) +# run(args['--ref'], args['--vcf'], args['--output'], +# args['--mind'], args['--output']) + +#if __name__ == '__main__': +# main() diff --git a/tests/test_consensus.py b/tests/test_consensus.py index 955e086..54ec1a1 100644 --- a/tests/test_consensus.py +++ b/tests/test_consensus.py @@ -2,6 +2,7 @@ from biotest.biohypothesis import ref_with_vcf_dicts_strategy_factory, \ make_seqrec, vcf_dict_strategy_factory from hypothesis import given +from collections import Counter #from fn import _ from hypothesis import strategies as st from hypothesis import given, assume @@ -84,8 +85,9 @@ def test_single_example(self): result = just_ref([ref], muts, 10, 80) self.assertEquals(expected, result) ref_with_vcf_dicts_strategy = ref_with_vcf_dicts_strategy_factory().map( - lambda (r, muts): (make_seqrec(muts[0]['chrom'], r), map(lambda d: VCFRow(**d), muts))) -from collections import Counter + #lambda (r, muts): (make_seqrec(muts[0]['chrom'], r), map(lambda d: VCFRow(**d), muts))) + lambda tup: (make_seqrec(tup[1][0]['chrom'], tup[0]), map(lambda d: VCFRow(QA=pos_int,QR=pos_int,**d), tup[1]))) + countof = lambda c: lambda x: Counter(x).get(c, 0) def run_cons(*args): _, alt_and_cons = all_consensuses(*args) From e770b20902469183baafceb710684675a6004e31 Mon Sep 17 00:00:00 2001 From: Panciera Date: Wed, 23 Mar 2016 17:35:09 -0400 Subject: [PATCH 38/41] add doc for low-depth handling --- bioframework/consensus.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/bioframework/consensus.py b/bioframework/consensus.py index da29f64..fc40b91 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -86,6 +86,21 @@ def call_base_multi_alts(min_depth, majority_percentage, dp, alts, ref): #TODO: behavior is undefined if sum(AO) > dp. # if dp < min_depth: #could call REF here sometimes # return 'N' + +# """ +# 1) "if the total quality of the alternates is < 200 (25 * 8), don't call it an alternate." +# 2) "if the total quality of the references is < 200, don't call the reference." +# 3) "if neither the reference nor the alternate can be called, call an N." +# """ +# altsTooLow = sum(map(lambda x: x.QA)) < 200 +# refTooLow = alts[0].QR < 200 +# +# if altsTooLow and refTooLow: +# return 'N' +# +# elif altsTooLow: +# return ref + get_ao = lambda x: x.AO total_ao = sum(map(get_ao, alts)) From c77a809b82b5f8f1ab56fac99fea82077e45f076 Mon Sep 17 00:00:00 2001 From: Panciera Date: Wed, 23 Mar 2016 17:35:54 -0400 Subject: [PATCH 39/41] restore lowdepth->N so tests can maybe pass --- bioframework/consensus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bioframework/consensus.py b/bioframework/consensus.py index fc40b91..2ebcb58 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -84,8 +84,8 @@ def call_base_multi_alts(min_depth, majority_percentage, dp, alts, ref): (call each base) based on the given rules (using call_base).""" #TODO: majority_percentage gets ignored, so replace constants #TODO: behavior is undefined if sum(AO) > dp. 
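+    # e.g. with MIN_DEPTH = 10, a column covered by only dp=6 reads is called 'N'
+    # by the restored check below, whatever the alternate counts are (hypothetical dp).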
-#    if dp < min_depth: #could call REF here sometimes
-#        return 'N'
+    if dp < min_depth: #could call REF here sometimes
+        return 'N'

From 101f5aa51040491927e09dbb9849cb06b4f7cbfb Mon Sep 17 00:00:00 2001
From: Panciera
Date: Wed, 23 Mar 2016 17:42:07 -0400
Subject: [PATCH 40/41] fixed duplicate args error

---
 tests/test_consensus.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_consensus.py b/tests/test_consensus.py
index 54ec1a1..932bf81 100644
--- a/tests/test_consensus.py
+++ b/tests/test_consensus.py
@@ -86,7 +86,7 @@ def test_single_example(self):
         self.assertEquals(expected, result)
 ref_with_vcf_dicts_strategy = ref_with_vcf_dicts_strategy_factory().map(
     #lambda (r, muts): (make_seqrec(muts[0]['chrom'], r), map(lambda d: VCFRow(**d), muts)))
-    lambda tup: (make_seqrec(tup[1][0]['chrom'], tup[0]), map(lambda d: VCFRow(QA=pos_int,QR=pos_int,**d), tup[1])))
+    lambda tup: (make_seqrec(tup[1][0]['chrom'], tup[0]), map(lambda d: VCFRow(**d), tup[1])))
 
 countof = lambda c: lambda x: Counter(x).get(c, 0)
 def run_cons(*args):

From 97958974b59542fc96b10dc26a91a8dc3ed50d8b Mon Sep 17 00:00:00 2001
From: Panciera
Date: Mon, 17 Apr 2017 13:52:06 -0400
Subject: [PATCH 41/41] ignore

---
 newfile | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 newfile

diff --git a/newfile b/newfile
new file mode 100644
index 0000000..e69de29
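The per-column calling rules touched by patches 37-39 can be summarized in a small
standalone sketch. Everything below is illustrative only: PMut is re-declared locally,
the IUPAC table is a hand-written subset, and the 80%/20%/MIN_DEPTH thresholds simply
mirror the constants used in consensus.py.

    from collections import namedtuple

    PMut = namedtuple('PMut', 'ALT AO DP QA QR')
    AMBIGUITY = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T',
                 'AG': 'R', 'CT': 'Y', 'ACG': 'V', 'ACGT': 'N'}

    def call_column(min_depth, dp, alts, ref):
        # low coverage: call 'N' outright (the check restored in PATCH 39)
        if dp < min_depth:
            return 'N'
        # a single dominant alternate (>80% of depth) is called as-is
        for a in alts:
            if a.AO / float(dp) > 0.8:
                return a.ALT
        # otherwise build an ambiguity code from every base over 20%, reference included
        counts = {a.ALT: a.AO for a in alts}
        counts[ref] = counts.get(ref, 0) + (dp - sum(a.AO for a in alts))
        over20 = ''.join(sorted(b for b, n in counts.items() if n / float(dp) > 0.2))
        return AMBIGUITY.get(over20, 'N')

    print(call_column(10, 100, [PMut('A', 85, 100, 3000, 400)], 'C'))   # 'A'
    print(call_column(10, 100, [PMut('A', 40, 100, 1500, 900),
                                PMut('G', 35, 100, 1200, 900)], 'C'))   # 'V'
    print(call_column(10, 6,   [PMut('A', 6, 6, 200, 0)], 'C'))         # 'N'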