diff --git a/.travis.yml b/.travis.yml
index d759796..81c8819 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -23,6 +23,7 @@ install:
 - conda config --add channels bioconda
 - conda install --file requirements-conda.txt
 - pip install -r tests/requirements-dev.txt
+- pip install -r requirements.txt
 - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi
 - if [[ $TRAVIS_PYTHON_VERSION == '3.4' ]]; then pip install robotframework-python3; else pip install robotframework; fi
 - python setup.py install
diff --git a/bioframework/consensus.py b/bioframework/consensus.py
new file mode 100755
index 0000000..2ebcb58
--- /dev/null
+++ b/bioframework/consensus.py
@@ -0,0 +1,287 @@
+"""
+Usage:
+    consensus --ref=<ref> --vcf=<vcf> [--mind=<depth>] [--majority=<percent>] [-o <output>]
+
+Options:
+    --ref=<ref>           Reference fasta file
+    --vcf=<vcf>           VCF input file (as produced by freebayes)
+    --majority=<percent>  Percentage required to call a base [default: 80]
+    --mind=<depth>        Minimum depth to call a base non-N [default: 10]
+    --output,-o=<output>  Output file [default: ]
+"""
+#stdlib
+# To type-check (requires python3):
+#   $ MYPYPATH=$HOME/bioframework/mypy mypy --py2 --disallow-untyped-calls bioframework/consensus.py
+
+from operator import itemgetter as get
+from functools import partial
+from itertools import ifilter, ifilterfalse, imap, groupby, takewhile, repeat, starmap, izip_longest
+import os, sys
+import collections
+
+from typing import Tuple, Dict, List, Iterator, Iterable, Any, Callable, NamedTuple, BinaryIO
+
+from Bio import SeqIO                # done
+from Bio.SeqRecord import SeqRecord  # done
+import vcf                           # done
+from vcf.model import _Record
+import sh                            # todo
+#from toolz import compose
+from toolz.dicttoolz import merge, dissoc, merge_with, valfilter, keyfilter  # done
+#from docopt import docopt           # ignore
+#from schema import Schema, Use      # ignore
+#from contracts import contract, new_contract  # can ignore
+#from mypy.types import VCFRow
+
+#############
+# Constants #
+#############
+VCFRow = NamedTuple("VCFRow",
+                    [('ref', str),
+                     ('AO', List[int]),
+                     ('DP', int),
+                     ('QA', List[int]),
+                     ('QR', int),
+                     ('chrom', str),
+                     ('pos', int),
+                     ('alt', List[str])])
+AMBIGUITY_TABLE = {'A': 'A', 'T': 'T', 'G': 'G', 'C': 'C', 'N': 'N',
+                   'AC': 'M', 'AG': 'R', 'AT': 'W', 'CG': 'S', 'CT': 'Y',
+                   'GT': 'K', 'ACG': 'V', 'ACT': 'H', 'AGT': 'D',
+                   'CGT': 'B', 'ACGT': 'N'}
+
+MAJORITY_PERCENTAGE = 80
+MIN_DEPTH = 10
+PMut = NamedTuple('PMut', [('ALT', str), ('AO', int), ('DP', int), ('QA', int), ('QR', int)])
+Mut = Tuple[str, str, int]
+
+###########
+# Reducer #
+###########
+#@contract(reference='string', muts='list(tuple(string, string, int))' )
+def make_consensus(reference, muts):
+    # type: (str, List[Mut]) -> Tuple[str, List[Mut]]
+    '''Actually builds a consensus string by successively applying
+    the mutations.'''
+    def _do_build(t1, t2):  # type: (Tuple[str,str,int], Tuple[str,str,int]) -> Tuple[str,str,int]
+        (accString, string, lastPos), (x, y, bigPos) = t1, t2
+        pos = bigPos - lastPos
+        return (accString + (string[:pos] + y), string[pos+len(x):], bigPos+len(x))
+    result, remaining, _ = reduce(_do_build, muts, ('', reference, 0))
+    return result + remaining, muts
+
+##############
+#  Mappers   #
+##############
+
+#TODO: Failing Case:
+# a = {'ref': u'AT', 'pos': 2, 'AO': (51771, 41537, 42398, 9342), 'alt': [u'A',
+#      u'TT', u'AATTG', u'AAGAA'], 'chrom': u'o', 'DP': 87288}
+# bioframework.consensus.call_many(10, 80, a)
+
+#@contract(min_depth='number,>=0', majority_percentage='number,>=0,<=100', dp='number,>=0', ref='string|None', alts='dict(string: number)')
+def call_base_multi_alts(min_depth, majority_percentage, dp, alts, ref):
+    # type: (int, int, int, List[PMut], str) -> str
+    """When there are multiple alts, zip through them character by character
+    (zip(*alts)), compare the percentages, and sum the percentages for each
+    base (groupby, sum). Call each base according to the given rules
+    (using call_base)."""
+    #TODO: majority_percentage gets ignored, so replace constants
+    #TODO: behavior is undefined if sum(AO) > dp.
+    if dp < min_depth:  # could call REF here sometimes
+        return 'N'
+
+#    """
+#    1) "if the total quality of the alternates is < 200 (25 * 8), don't call it an alternate."
+#    2) "if the total quality of the references is < 200, don't call the reference."
+#    3) "if neither the reference nor the alternate can be called, call an N."
+#    """
+#    altsTooLow = sum(map(lambda x: x.QA, alts)) < 200
+#    refTooLow = alts[0].QR < 200
+#
+#    if altsTooLow and refTooLow:
+#        return 'N'
+#
+#    elif altsTooLow:
+#        return ref
+
+    get_ao = lambda x: x.AO
+    total_ao = sum(map(get_ao, alts))
+
+    if ref is None:  # this is an insert
+        if total_ao/float(dp) < .50:
+            return ref
+        # if the insert is above threshold, keep going and call the insert like a normal base
+    is_insert = lambda x: x.ALT == '-'  # type: Callable[[PMut],bool]
+    inserts = list(filter(is_insert, alts))
+    if inserts:
+        assert len(inserts) == 1
+        ins = inserts[0]
+        if ins.AO/float(dp) > .50:  # a deletion
+            return ''
+        dp -= ins.AO
+        alts_without_insert = list(ifilterfalse(is_insert, alts))
+    else:
+        alts_without_insert = alts
+    over_depth = lambda x: lambda y: y.AO/float(dp) > x
+    picked_alt = filter(over_depth(0.8), alts_without_insert)
+    if picked_alt:
+        return picked_alt[0].ALT
+    # add ref so that it will be considered in creating the ambiguous base
+    if ref:
+        alts.append(PMut(ALT=ref, AO=(dp-total_ao), DP=dp, QA=-1, QR=-1))
+    over20 = filter(over_depth(0.2), alts)
+    as_ambiguous = ''.join(sorted(map(lambda x: x.ALT, over20)))
+    # this may return a single base (including the reference), because e.g. 'A' => 'A' in the ambiguity table
+    return AMBIGUITY_TABLE[as_ambiguous] if as_ambiguous != '' else ''
+
+def pmuts_from_row(row):
+    # type: (VCFRow) -> List[PMut]
+    muts = zip(row.QA, row.AO, row.alt)
+    longest_len = max(map(lambda x: len(x[-1]), muts))
+    longest_len = max(longest_len, len(row.ref))
+    def fill_gap(r):
+        qa, ao, s = r
+        return (qa, ao, str(s) + (longest_len - len(s)) * '-')
+    def merge_sum(x, y):
+        def combine(tups):
+            return sum(map(get(0), tups)), sum(map(get(1), tups))
+        #return x if y is None else (y if x is None else merge_with(sum, x, y))
+        return x if y is None else (y if x is None else merge_with(combine, x, y))
+    def seq_count(acc, ao_and_nts):
+        qa, ao, nts = ao_and_nts
+        #return map(merge_sum, acc, [{nt:ao, 'qa': qa} for nt in nts])
+        return map(merge_sum, acc, [{nt: (ao, qa)} for nt in nts])
+    xs = map(fill_gap, muts)  # fill in the shorter alts with '-'.
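+    # Worked example (hypothetical numbers, not from the test suite): for a
+    # row with ref='AT', alt=['A', 'TT'], AO=[6, 4], QA=[60, 40], longest_len
+    # is 2, so fill_gap pads 'A' to 'A-' while 'TT' is untouched. The reduce
+    # below then folds xs into one {base: (AO, QA)} dict per alignment column:
+    # [{'A': (6, 60), 'T': (4, 40)}, {'-': (6, 60), 'T': (4, 40)}].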
+    # create a list of {base : (AO, QA)}, where the index matches the position
+    mut_dicts = reduce(seq_count, xs, [{}])  # type: Iterable[Dict[str, Tuple[int,int]]]
+    def to_pmuts(d):  # type: (Dict[str, Tuple[int,int]]) -> List[PMut]
+        #qa = d.pop('qa')
+        #assert len(d.values()) == 1
+        make = lambda k, v: PMut(k, AO=v[0], QA=v[1], DP=row.DP, QR=row.QR)
+        return list(starmap(make, d.items()))
+#        res = PMut(d.keys()[0], d.values()[0], row.DP, qa, row.QR)
+#        return res
+    res = list(map(to_pmuts, mut_dicts))
+    return res
+
+def call_many(min_depth, majority_percentage, rec):
+    # type: (int, int, VCFRow) -> Mut
+    muts_by_column = pmuts_from_row(rec)
+    #TODO: switch to generators
+    base_caller = lambda m, r: call_base_multi_alts(
+        min_depth, majority_percentage, rec.DP, m, r)  # ?Callable[[List[PMut], str], str]
+    res = map(base_caller, muts_by_column, rec.ref)
+    # trim None values at the end (these indicate a deletion)
+    result = takewhile(bool, res)
+    return (rec.ref, ''.join(result), rec.pos)
+
+#@contract(rec='dict',returns='dict')
+def flatten_vcf_record(rec):
+    # type: (_Record) -> VCFRow
+    fields = ['ref', 'AO', 'DP', 'QA', 'QR', 'chrom', 'pos', 'alt']
+    _rec = merge({
+        'alt': rec.ALT, 'ref': rec.REF,
+        'pos': rec.POS, 'chrom': rec.CHROM},
+        rec.INFO)
+    if not hasattr(_rec['alt'], '__iter__'):  #TODO: put this somewhere else
+        d = merge(_rec, dict(alt=[_rec['alt']],
+                             AO=[_rec['AO']],
+                             QA=[_rec['QA']]))
+    else:
+        d = _rec
+    d = keyfilter(fields.__contains__, d)
+    return VCFRow(**d)
+
+##############
+#  Group By  #
+##############
+#NOTE: could possibly drop lists, use fn.Stream all the time,
+# and write a Stream instance for contracts like:
+# https://github.com/AndreaCensi/contracts/blob/831ec7a5260ceb8960540ba0cb6cc26370cf2d82/src/contracts/library/lists.py
+#@contract(references='list[N]($SeqRecord),N>0', muts='list(dict)', returns='tuple(list(dict))')
+def group_muts_by_refs(references, muts):
+    # type: (List[SeqRecord], List[VCFRow]) -> List[List[VCFRow]]
+    '''Group and sort the mutations so that they match the order of the references.'''
+    #NOTE: muts will already be "sorted" in that they are grouped together in the vcf
+    # fix the groupby so it doesn't incidentally drain the first object of the group
+    unzip = lambda x: zip(*x)
+    chroms, groups = unzip(map(lambda kv: (kv[0], list(kv[1])), groupby(muts, lambda x: x.chrom)))
+    #@contract(key='tuple(string,list)')
+    def index_of_ref(key):  # type: (Tuple[str, List[SeqRecord]]) -> int
+        chrom = key[0]
+        index_of_chrom = map(lambda x: x.id, references).index(chrom)
+        return index_of_chrom
+    _, muts_by_ref = unzip(sorted(zip(chroms, groups), key=index_of_ref))
+    return muts_by_ref
+
+###############
+#   Runner    #
+###############
+
+#@contract(references='SeqRecord', muts='seq(dict)', mind=int, majority=int)
+def all_consensuses(references, muts, mind, majority):
+    # type: (List[SeqRecord], List[VCFRow], int, int) -> Tuple[List[SeqRecord], Iterable[Tuple[str, List[Mut]]]]
+    '''Generates consensuses, including for flu and other multi-reference VCFs.
+    Applies filters and base callers to the mutations,
+    then builds the consensus using these calls and `make_consensus`.'''
+    muts_by_ref = group_muts_by_refs(references, muts)
+    def single_consensus(muts, ref):
+        # type: (List[VCFRow], SeqRecord) -> Tuple[str, List[Mut]]
+        #the_muts = map(partial(call_many, mind, majority), muts)
+        the_muts = map(lambda x: call_many(mind, majority, x), muts)
+        ref_and_alt_differ = lambda x: x[0] != x[1]
+        # vcf positions are 1-based
+        #real_muts = map(lambda (a,b,pos): (a,b,pos-1), filter(ref_and_alt_differ, the_muts))
+        real_muts = map(lambda x: (x[0], x[1], x[2] - 1), filter(ref_and_alt_differ, the_muts))
+        return make_consensus(str(ref.seq), real_muts)
+    return references, imap(single_consensus, muts_by_ref, references)
+
+##########
+#  I/O   #
+##########
+def consensus_str(ref, consensus):  # type: (SeqRecord, str) -> str
+    return ">{0}:Consensus\n{1}".format(ref.id, consensus)
+
+def zero_coverage_positions(bam_file, ref_file):  # type: (str, str) -> Iterable[int]
+    pileup = sh.Command('mpileup')(bam_file, f=ref_file, _iter=True)
+    get_pos = lambda x: int(x.split()[1])  # type: Callable[[str],int]
+    return imap(get_pos, pileup)
+
+#TODO: is pileup 0-based or 1-based index?
+def trim_ref(ref, positions):  # type: (str, Iterator[int]) -> str
+    start, end = next(positions), collections.deque(positions, 1)[0]
+    return '-'*start + ref[start:end] + '-'*(len(ref) - end)
+
+#@contract(ref_fasta=str, vcf=str, mind=int, majority=int)
+def run(ref_fasta, freebayes_vcf, outfile, mind, majority):
+    # type: (str, str, BinaryIO, int, int) -> int
+    _refs = SeqIO.parse(ref_fasta, 'fasta')
+    with open(freebayes_vcf, 'r') as vcf_handle:
+        _muts = map(flatten_vcf_record, vcf.Reader(vcf_handle))
+        refs, muts = list(_refs), list(_muts)
+    the_refs, seqs_and_muts = all_consensuses(refs, muts, mind, majority)
+    strings = imap(consensus_str, the_refs, imap(get(0), seqs_and_muts))
+    result = '\n'.join(strings)
+    outfile.write(result)
+    outfile.close()
+    return 0
+
+#def main():  # type: () -> None
+#    scheme = Schema(
+#        { '--vcf' : os.path.isfile,
+#          '--ref' : os.path.isfile,
+#          '--majority' : Use(int),
+#          '--mind' : Use(int),
+#          '--output' : Use(lambda x: sys.stdout if not x else open(x, 'w'))})
+#    raw_args = docopt(__doc__, version='Version 1.0')
+#    args = scheme.validate(raw_args)
+#    run(args['--ref'], args['--vcf'], args['--output'],
+#        args['--mind'], args['--majority'])
+
+#if __name__ == '__main__':
+#    main()
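A quick trace of the `make_consensus` fold above, using the worked example from tests/test_consensus.py; the intermediate tuples are the `(built_prefix, untouched_suffix, last_position)` accumulator of `_do_build`:

```python
from bioframework.consensus import make_consensus

ref = 'ACGTACGT'
muts = [('CG', 'TT', 1), ('C', 'A', 5)]  # (ref_piece, replacement, 0-based pos)
# start: ('', 'ACGTACGT', 0)
# mut 1: ('A' + 'TT', 'TACGT', 3)        -- 'CG' at pos 1 becomes 'TT'
# mut 2: ('ATT' + 'TA' + 'A', 'GT', 6)   -- 'C' at pos 5 becomes 'A'
assert make_consensus(ref, muts)[0] == 'ATTTAAGT'
```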
diff --git a/bioframework/tagreads.py b/bioframework/tagreads.py
new file mode 100644
index 0000000..42fa565
--- /dev/null
+++ b/bioframework/tagreads.py
@@ -0,0 +1,195 @@
+import argparse
+import sys
+import samtools
+import re
+import os.path
+#from ngs_mapper.bam import sortbam, indexbam
+#import shutil
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)  # http://stackoverflow.com/questions/14207708/ioerror-errno-32-broken-pipe-python
+#import log
+#logger = log.setup_logger('tagreads', log.get_config())
+
+# Exception for when headers exist
+class HeaderExists(Exception): pass
+
+# The next three tuples have to be the same length, and index i of one relates
+# to index i of the others, i.e. zip( IDS, PLATFORMS, ID_MAP ) works as expected.
+# Read group ID list
+IDS = ('Roche454', 'IonTorrent', 'MiSeq', 'Sanger')
+# Valid platforms for read groups
+PLATFORMS = ('L454', 'IONTORRENT', 'ILLUMINA', 'CAPILLARY')
+# Read name map to ID name
+ID_MAP = (
+    re.compile( '[0-9A-Z]{14}' ),
+    re.compile( '[A-Z0-9]{5}:\d{1,}:\d{1,}' ),
+    re.compile( 'M[0-9]{5}:\d+:\d{9}-[A-Z0-9]{5}:\d:\d{4}:\d{4,5}:\d{4,5}' ),
+    re.compile( '.*' )
+)
+# Read Group Template
+RG_TEMPLATE = {
+    'SM': None,
+    'ID': None,
+    'PL': None,
+    'CN': None
+}
+
+def main():
+    args = parse_args()
+    for bam in args.bamfiles:
+        tag_bam( bam, args.SM, args.CN, args.output )
+
+def tag_bam( bam, SM, CN, output ):
+    #logger.info( "Gathering existing header for {0}".format(bam) )
+    hdr = get_rg_headers( bam, SM, CN )
+    tag_reads( bam, hdr, output )
+
+def tag_read( untagged_read, tags ):
+    '''
+    Tags a given samtools.SamRow with various tags.
+    Does not replace existing tags.
+
+    @param untagged_read - samtools.SamRow
+    @param tags - List of valid samspec optional field strings (aka ['RG:Z:Value'])
+
+    @returns a tagged read
+    '''
+    #if untagged_read.FLAG >= 2048:
+    #    # Skip supplementary
+    #    #logger.debug( "Skipping read {0} because it is supplementary".format(untagged_read.QNAME) )
+    #    return untagged_read
+    # Append the new tags
+    if untagged_read._tags and untagged_read._tags[-1] != '\t':
+        untagged_read._tags += '\t'
+    # Append only the tags that do not already exist
+    # (tag+'\t' is used to make it an exact match)
+    tags = [tag for tag in tags if tag+'\t' not in untagged_read._tags]
+    untagged_read._tags += '\t'.join( tags )
+    # Remove the trailing tab if all the tags were duplicates
+    untagged_read._tags = untagged_read._tags.rstrip()
+    # Return the tagged read
+    return untagged_read
+
+def tag_readgroup( read ):
+    '''
+    Tags a given read with the read group that its qname
+    belongs to, depending on how it matches against ID_MAP.
+
+    @param read - samtools.SamRow
+
+    @returns SamRow that is tagged with the appropriate read group
+    '''
+    rg = get_rg_for_read( read )
+    ##logger.debug( "Tagging {0} with read group {1}".format(read.qname, rg) )
+    return tag_read( read, ['RG:Z:'+rg] )
+
+def tag_reads( bam, hdr, output ):
+    '''
+    Sets the header of the bam and tags all reads appropriately for each platform.
+    Overwrites the existing header.
+
+    @param bam - Bam file to tag reads in
+    @param hdr - Header string to set in the bam (needs a newline at the end)
+    '''
+    # Open the existing bam to fetch the reads to modify
+    untagged_bam = samtools.view( bam )
+    # Open a file to write the sam output with the tagged reads
+    if output: sam = open(output, 'w')
+    else: sam = sys.stdout
+    # Write the hdr to the file first
+    sam.write( hdr )
+    # Tag the reads
+    #logger.info( "Tagging reads for {0}".format(bam) )
+    for read in untagged_bam:
+        samrow = samtools.SamRow(read)
+        read = tag_readgroup( samrow )
+        sam.write( str(read) + '\n' )
+    # Close stdout
+    untagged_bam.close()
+    sam.close()
+    #logger.info( "Finished tagging reads for {0}".format(bam) )
+    #logger.info( "Sorting {0}".format(bam) )
+#    b = samtools.view( samf, h=True, S=True, b=True )
+#    sortbam( b, bam )
+#    # Close the fh
+#    b.close()
+#    # Remove temp sam file
+#    # maybe some day could even just use pipes all the way through :)
+#    os.unlink( samf )
+#    #logger.info( "Indexing {0}".format(bam) )
+#    indexbam( bam )
+
+def get_rg_for_read( aread ):
+    ''' Gets the read group name for the given samtools.SamRow '''
+    rname = aread.QNAME
+    for i, p in enumerate( ID_MAP ):
+        if p.match( rname ):
+            return IDS[i]
+    raise ValueError( "{0} is from an unknown platform and cannot be tagged".format(rname) )
+
+def get_rg_headers( bam, SM=None, CN=None ):
+    old_header = get_bam_header( bam ) + '\n'
+
+    for id, pl in zip( IDS, PLATFORMS ):
+        # Skip headers that exist already
+        if 'ID:{0}\t'.format(id) in old_header:
+            continue
+
+        rg = '@RG\tID:{0}\tSM:{1}\t'
+        if SM is None:
+            SM = os.path.basename(bam).replace( '.bam', '' )
+
+        if CN is not None:
+            rg += 'CN:{2}\tPL:{3}'
+            old_header += rg.format( id, SM, CN, pl ) + '\n'
+        else:
+            rg += 'PL:{2}'
+            old_header += rg.format( id, SM, pl ) + '\n'
+
+    return old_header
+
+def get_bam_header( bam ):
+    r = samtools.view( bam, H=True )
+    hdr = r.read()
+    r.close()
+    return hdr.rstrip()
+
+def parse_args( args=sys.argv[1:] ):
+    from ngs_mapper import config
+    conf_parser, args, config, configfile = config.get_config_argparse(args)
+    defaults = config['tagreads']
+
+    parser = argparse.ArgumentParser(
+        description='''Adds Sanger, MiSeq, 454 and IonTorrent read groups to a bam file.
+        Will tag all reads in the alignment for each of the platforms based on the read name''',
+        parents=[conf_parser]
+    )
+
+    parser.add_argument(
+        dest='bamfiles',
+        nargs='+',
+        help='Bamfile to add read groups to for platforms'
+    )
+
+    parser.add_argument(
+        '-SM',
+        dest='SM',
+        default=defaults['SM']['default'],
+        help=defaults['SM']['help']
+    )
+
+    parser.add_argument(
+        '-CN',
+        dest='CN',
+        default=defaults['CN']['default'],
+        help=defaults['CN']['help']
+    )
+
+    parser.add_argument(
+        '--output',
+        '-o',
+        dest='output'
+    )
+    return parser.parse_args( args )
+
+if __name__ == '__main__': main()
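The ID_MAP regexes above are tried in order by `get_rg_for_read`, with the Sanger `.*` pattern as the catch-all. A minimal sketch of that mapping on bare read names (hypothetical QNAMEs; the real function takes a samtools.SamRow, and importing tagreads assumes its samtools wrapper module is available):

```python
from bioframework.tagreads import IDS, ID_MAP

def platform_for(qname):
    # mirrors get_rg_for_read: the first pattern that matches the name wins
    for rg_id, pattern in zip(IDS, ID_MAP):
        if pattern.match(qname):
            return rg_id

assert platform_for('ABCDEFGHIJKLMN') == 'Roche454'    # 14 alphanumerics
assert platform_for('ABCDE:123:4567') == 'IonTorrent'
assert platform_for('M00001:4:000000001-A1B2C:1:1101:1234:5678') == 'MiSeq'
assert platform_for('anything-else') == 'Sanger'       # '.*' catch-all
```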
diff --git a/jip_modules/consensus.jip b/jip_modules/consensus.jip
new file mode 100644
index 0000000..c2ad138
--- /dev/null
+++ b/jip_modules/consensus.jip
@@ -0,0 +1,17 @@
+#!/usr/bin/env jip
+#
+# Usage:
+#     consensus -r <reference> -v <vcf> [-o <output>] --minimum <minimum>
+#
+# Options:
+#     -o, --output <output>        Output file [default: stdout]
+#     -r, --reference <reference>  Reference fasta
+#     -v, --vcf <vcf>              VCF file generated by freebayes
+
+
+#%begin command python
+
+from bioframework import consensus
+consensus.run("${bam}", "${ref}", "${vcf}", "${out}")
+
+#%end
diff --git a/jip_modules/freebayes.jip b/jip_modules/freebayes.jip
new file mode 100755
index 0000000..5159839
--- /dev/null
+++ b/jip_modules/freebayes.jip
@@ -0,0 +1,16 @@
+#!/usr/bin/env jip
+#
+# run freebayes
+#
+# Usage:
+#     run_freebayes [<input>] -f <reference> [-o <output>]
+#
+# Options:
+#     -o, --output <output>        The output VCF
+#     -f, --reference <reference>  The reference
+#     <input>                      The input bam (read from stdin when omitted)
+# Help:
+#     Requires a bam file (<input>) and a reference (-f)
+
+freebayes ${input|else("-")} ${reference|arg} ${output|arg(">")}
diff --git a/jip_modules/tag_bam.jip b/jip_modules/tag_bam.jip
new file mode 100755
index 0000000..1c988ae
--- /dev/null
+++ b/jip_modules/tag_bam.jip
@@ -0,0 +1,18 @@
+#!/usr/bin/env jip
+# Usage:
+#     tag_bam [<input>] [--config CONFIG] [--version] [--SM <SM>] [--CN <CN>] -o <output>
+#
+# Options:
+#     --SM <SM>              The sample-name (SM) tag
+#     --CN <CN>              The sequencing-center (CN) tag
+#     -o, --output <output>  Output prefix for the sorted, indexed bam
+
+
+#%begin command bash
+#
+python bioframework/tagreads.py ${input|else('/dev/stdin')} ${SM|arg('-SM')} ${CN|arg('-CN')} | samtools view - -Sbh | samtools sort - ${output}
+samtools index ${output}.bam
+
+#%end
diff --git a/mypy/Bio/SeqIO.pyi b/mypy/Bio/SeqIO.pyi
new file mode 100644
index 0000000..baaf8a9
--- /dev/null
+++ b/mypy/Bio/SeqIO.pyi
@@ -0,0 +1,5 @@
+from Bio.SeqRecord import SeqRecord
+from typing import Generator, Any, Iterator
+
+def parse(*anything):  # type: (*Any) -> Iterator[SeqRecord]
+    pass
diff --git a/mypy/Bio/SeqRecord.py b/mypy/Bio/SeqRecord.py
new file mode 100644
index 0000000..8bdc6c6
--- /dev/null
+++ b/mypy/Bio/SeqRecord.py
@@ -0,0 +1,6 @@
+# from Bio.SeqIO import SeqIO
+from typing import NamedTuple
+class Stringable(object):
+    def __str__(self):  # type: () -> str
+        pass
+SeqRecord = NamedTuple('SeqRecord', [('id', str), ('seq', Stringable)])
diff --git a/mypy/Bio/__init__.py b/mypy/Bio/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mypy/README.md b/mypy/README.md
new file mode 100644
index 0000000..26906fb
--- /dev/null
+++ b/mypy/README.md
@@ -0,0 +1,6 @@
+1. Install [mypy](https://github.com/python/mypy#quick-start)
+
+2. Run mypy: `MYPYPATH=$PWD/mypy:$PWD/mypy/out mypy --py2 bioframework/consensus.py`
+
+If needed, use `stubgen` to generate more stub files for other libraries.
+
diff --git a/mypy/__init__.py b/mypy/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mypy/example.py b/mypy/example.py
new file mode 100644
index 0000000..4f9bd9f
--- /dev/null
+++ b/mypy/example.py
@@ -0,0 +1,44 @@
+from typing import List, Dict, Generator, Iterator, Iterable, Tuple
+from Bio import SeqIO
+from itertools import imap
+from Bio.SeqRecord import SeqRecord
+def test_long():  # type: () -> int
+    return 11999999L
+def test_seqIO_map_fails(s):  # type: (str) -> List[SeqRecord]
+    return map(lambda x: x.id, SeqIO.parse(s))
+
+#def test_seqIO_map_fails2(s):  # type: (str) -> Iterator[SeqRecord]
+#    return map(lambda x: x.id, SeqIO.parse(s))
+def test_seqIO_map_passes(s):  # type: (str) -> Iterable[str]
+    return imap(lambda x: x.id, SeqIO.parse(s))
+
+def test_seqIO(s):  # type: (str) -> Iterator[SeqRecord]
+    return SeqIO.parse(s)
+def test_list_seqIO(s):  # type: (str) -> List[SeqRecord]
+    return list(SeqIO.parse(s))
+def test_seqIO_fails(s):  # type: (str) -> List[str]
+    return SeqIO.parse(s)
+def test_should_pass(s):  # type: (SeqRecord) -> str
+    return s.id
+def test_should_fail(s):  # type: (SeqRecord) -> int
+    return s.id
+#def test_should_fail():  # type: () -> List[SeqRecord]
+#    return 3
+
+#a = test_should_fail()
+def test_ordered_dict(od):  # type: (Dict[str,int]) -> Dict[str,int]
+    return 1  # type error 1
+#
+#a = test_ordered_dict(1)  # type error 2
+#
+#def test_me():
+#    a = test_ordered_dict(1)  # type error 3 is not reported
+
+####def test_ordered_dict(od: typing.Dict[str,int]) -> typing.Dict[str,int]:
+####    return 1  # type error 1
+####
+####a = test_ordered_dict(1)  # type error 2
+####
+####def test_me():
+####    a = test_ordered_dict(1)  # type error 3 is not reported
+###
diff --git a/mypy/out/docopt.pyi b/mypy/out/docopt.pyi
new file mode 100644
index 0000000..6f9431c
--- /dev/null
+++ b/mypy/out/docopt.pyi
@@ -0,0 +1,76 @@
+# Stubs for docopt (Python 2)
+#
+# NOTE: This dynamically typed stub was automatically generated by stubgen.
+
+from typing import Any
+
+class DocoptLanguageError(Exception): ...
+
+class DocoptExit(SystemExit):
+    usage = ...  # type: Any
+    def __init__(self, message=''): ...
+
+class Pattern:
+    def __eq__(self, other): ...
+    def __hash__(self): ...
+    def fix(self): ...
+    def fix_identities(self, uniq=None): ...
+    def fix_repeating_arguments(self): ...
+    @property
+    def either(self): ...
+
+class ChildPattern(Pattern):
+    name = ...  # type: Any
+    value = ...  # type: Any
+    def __init__(self, name, value=None): ...
+    def flat(self, *types): ...
+    def match(self, left, collected=None): ...
+
+class ParentPattern(Pattern):
+    children = ...  # type: Any
+    def __init__(self, *children): ...
+    def flat(self, *types): ...
+
+class Argument(ChildPattern):
+    def single_match(self, left): ...
+    @classmethod
+    def parse(class_, source): ...
+
+class Command(Argument):
+    name = ...  # type: Any
+    value = ...  # type: Any
+    def __init__(self, name, value=False): ...
+    def single_match(self, left): ...
+
+class Option(ChildPattern):
+    value = ...  # type: Any
+    def __init__(self, short=None, long=None, argcount=0, value=False): ...
+    @classmethod
+    def parse(class_, option_description): ...
+    def single_match(self, left): ...
+    @property
+    def name(self): ...
+
+class Required(ParentPattern):
+    def match(self, left, collected=None): ...
+
+class Optional(ParentPattern):
+    def match(self, left, collected=None): ...
+
+class AnyOptions(Optional): ...
+
+class OneOrMore(ParentPattern):
+    def match(self, left, collected=None): ...
+
+class Either(ParentPattern):
+    def match(self, left, collected=None): ...
+
+class TokenStream(list):
+    error = ...  # type: Any
+    def __init__(self, source, error): ...
+    def move(self): ...
+    def current(self): ...
+
+class Dict(dict): ...
+
+def docopt(doc, argv=None, help=True, version=None, options_first=False): ...
diff --git a/mypy/out/hypothesis/__init__.pyi b/mypy/out/hypothesis/__init__.pyi
new file mode 100644
index 0000000..764f9f7
--- /dev/null
+++ b/mypy/out/hypothesis/__init__.pyi
@@ -0,0 +1,8 @@
+# Stubs for hypothesis (Python 2)
+#
+# NOTE: This dynamically typed stub was automatically generated by stubgen.
+
+from hypothesis._settings import settings as settings, Verbosity as Verbosity
+from hypothesis.version import __version_info__ as __version_info__, __version__ as __version__
+from hypothesis.control import assume as assume, note as note, reject as reject
+from hypothesis.core import given as given, find as find, example as example, seed as seed
diff --git a/mypy/out/schema.pyi b/mypy/out/schema.pyi
new file mode 100644
index 0000000..3eb2140
--- /dev/null
+++ b/mypy/out/schema.pyi
@@ -0,0 +1,31 @@
+# Stubs for schema (Python 2)
+#
+# NOTE: This dynamically typed stub was automatically generated by stubgen.
+
+from typing import Any
+
+class SchemaError(Exception):
+    autos = ...  # type: Any
+    errors = ...  # type: Any
+    def __init__(self, autos, errors): ...
+    @property
+    def code(self): ...
+
+class And:
+    def __init__(self, *args, **kw): ...
+    def validate(self, data): ...
+
+class Or(And):
+    def validate(self, data): ...
+
+class Use:
+    def __init__(self, callable_, error=None): ...
+    def validate(self, data): ...
+
+def priority(s): ...
+
+class Schema:
+    def __init__(self, schema, error=None): ...
+    def validate(self, data): ...
+
+class Optional(Schema): ...
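The docopt and schema stubs above exist so the (currently commented-out) CLI entry point in bioframework/consensus.py can be type-checked. For reference, a minimal sketch of that wiring, mirroring the commented `main()` rather than adding new behavior (it reads the usage string from the module's docstring):

```python
import os, sys
from docopt import docopt
from schema import Schema, Use
from bioframework import consensus

def main():  # type: () -> None
    scheme = Schema(
        { '--vcf' : os.path.isfile,
          '--ref' : os.path.isfile,
          '--majority' : Use(int),
          '--mind' : Use(int),
          '--output' : Use(lambda x: sys.stdout if not x else open(x, 'w'))})
    args = scheme.validate(docopt(consensus.__doc__, version='Version 1.0'))
    consensus.run(args['--ref'], args['--vcf'], args['--output'],
                  args['--mind'], args['--majority'])
```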
diff --git a/mypy/sh.py b/mypy/sh.py
new file mode 100644
index 0000000..ee8e4e9
--- /dev/null
+++ b/mypy/sh.py
@@ -0,0 +1,3 @@
+from typing import Callable, Any, Union, List, Iterator
+def Command(s):  # type: (str) -> Callable[..., Union[List[str], Iterator[str]]]
+    pass
diff --git a/mypy/toolz/__init__.py b/mypy/toolz/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mypy/toolz/dicttoolz.py b/mypy/toolz/dicttoolz.py
new file mode 100644
index 0000000..22feb1c
--- /dev/null
+++ b/mypy/toolz/dicttoolz.py
@@ -0,0 +1,34 @@
+from typing import Dict, Any, Callable, TypeVar
+K = TypeVar('K')
+V = TypeVar('V')
+V2 = TypeVar('V2')
+V3 = TypeVar('V3')
+def merge(d1, d2):  # type: (Dict[K,V], Dict[K,V]) -> Dict[K,V]
+    pass
+
+def dissoc(d, k):  # type: (Dict[K,V], K) -> Dict[K,V]
+    pass
+
+def merge_with(f, d1, d2):  # type: (Callable[[V,V2], V3], Dict[K,V], Dict[K,V2]) -> Dict[K,V3]
+    pass
+
+def valfilter(f, d):  # type: (Callable[[V], bool], Dict[K,V]) -> Dict[K,V]
+    pass
+
+def keyfilter(f, d):  # type: (Callable[[K], bool], Dict[K,V]) -> Dict[K,V]
+    pass
+
+#from typing import Dict, Any, Callable, TypeVar
+#T = TypeVar('T')
+#def merge(d1, d2):  # type: (Dict[Any,Any], Dict[Any,Any]) -> Dict[Any,Any]
+#    pass
+#
+#def dissoc(d, k):  # type: (Dict[Any,Any], Any) -> Dict[Any,Any]
+#    pass
+#
+#def merge_with(f, d1, d2):  # type: (Callable, Dict[Any,Any], Dict[Any,Any]) -> Dict[Any,Any]
+#    pass
+#
+#def valfilter(f, d):  # type: (Callable, Dict[Any,Any]) -> Dict[Any,Any]
+#    pass
diff --git a/mypy/vcf/__init__.py b/mypy/vcf/__init__.py
new file mode 100644
index 0000000..29c9cc8
--- /dev/null
+++ b/mypy/vcf/__init__.py
@@ -0,0 +1,11 @@
+from typing import Union, Dict, List, NamedTuple, Iterator, BinaryIO
+from vcf.model import _Record
+
+#fields = [("ALT", Union[str, List[str]]), ("REF", str), ("POS", int), ("CHROM", str), ("INFO", Dict[str, Union[int, List[int]]])]
+#
+#VCFRecord = NamedTuple('VCFRecord', fields)
+
+VCFRecord = NamedTuple('VCFRecord',
+    [("ALT", Union[str, List[str]]), ("REF", str), ("POS", int),
+     ("CHROM", str), ("INFO", Dict[str, Union[int, List[int]]])])
+def Reader(s):  # type: (BinaryIO) -> Iterator[_Record]
+    pass
diff --git a/mypy/vcf/model.py b/mypy/vcf/model.py
new file mode 100644
index 0000000..9266869
--- /dev/null
+++ b/mypy/vcf/model.py
@@ -0,0 +1,3 @@
+from typing import Union, Dict, List, NamedTuple, Iterator
+_Record = NamedTuple('_Record',
+    [("ALT", Union[str, List[str]]), ("REF", str), ("POS", int),
+     ("CHROM", str), ("INFO", Dict[str, Union[int, List[int]]])])
diff --git a/newfile b/newfile
new file mode 100644
index 0000000..e69de29
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8e01b72
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+pycontracts
+toolz
+pyvcf
+typing
+docopt
+schema
diff --git a/setup.py b/setup.py
old mode 100644
new mode 100755
index 292f24b..1d09b17
--- a/setup.py
+++ b/setup.py
@@ -23,8 +23,13 @@ def jip_modules(path=bioframework.JIP_PATH):
 install_requires = [
     'pyjip',
     'toolz',
+    'docopt',
+    'schema',
+    'pyvcf',
+    'typing'
 ],
 data_files=[
     (join(sys.prefix,'bin'), jip_modules()),
 ]
 )
+
diff --git a/tests/requirements-dev.txt b/tests/requirements-dev.txt
old mode 100644
new mode 100755
index d96de86..b139172
--- a/tests/requirements-dev.txt
+++ b/tests/requirements-dev.txt
@@ -4,3 +4,4 @@ python-coveralls
 testfixtures
 sh
 hypothesis
+git+https://github.com/VDBWRAIR/biotest
diff --git a/tests/test_consensus.py b/tests/test_consensus.py
new file mode 100644
index 0000000..932bf81
--- /dev/null
+++ b/tests/test_consensus.py
@@ -0,0 +1,176 @@
+from __future__ import division
+from biotest.biohypothesis import ref_with_vcf_dicts_strategy_factory, \
+    make_seqrec, vcf_dict_strategy_factory
+from collections import Counter
+#from fn import _
+from hypothesis import strategies as st
+from hypothesis import given, assume
+from operator import itemgetter as get
+from bioframework.consensus import call_many, all_consensuses, make_consensus, VCFRow
+import string
+import itertools
+import unittest
+
+simple_vcf_dict_strategy = st.tuples(
+    st.text(string.ascii_letters),
+    st.integers(min_value=1),
+    st.text(alphabet='ACTGN', min_size=1, max_size=6)) \
+    .flatmap(lambda tup: vcf_dict_strategy_factory(*tup)) \
+    .map(lambda d: VCFRow(**d))
+pos_int = st.integers(min_value=0)
+
+#TODO: the 10 and 80 used below for threshold and majority_percentage should
+# be factored out and could be strategies themselves
+def just_ref(*args):
+    return next(all_consensuses(*args)[1])[0]
+
+class CallBaseHypothesisTest(unittest.TestCase):
+    @given(simple_vcf_dict_strategy, pos_int)
+    def test_under_mind_is_N(self, mut, mind):
+        assume(mut.DP < mind)
+        result = call_many(mind, 80, mut)[1]
+        self.assertTrue(all(map(lambda x: x == 'N', result)))
+
+    @given(simple_vcf_dict_strategy)
+    def test_ao_under_minority_is_ref(self, mut):
+        assume(sum(mut.AO) / mut.DP < 0.2)
+        result = call_many(0, 80, mut)[1]
+        self.assertEquals(result, mut.ref)
+
+    @given(simple_vcf_dict_strategy)
+    def test_over_majority_is_alt(self, mut):
+        #TODO: this is slow
+        assume(sum(mut.AO) / mut.DP > 0.8)
+        assume(len(mut.alt) == 1)
+        result = call_many(0, 80, mut)[1]
+        self.assertEquals(result, mut.alt[0])
+
+# Commented out because it's not actually always true,
+# e.g. mut={'ref': u'AA', 'pos': 1, 'AO': [784313725491], 'alt': [u'A'],
+#           'chrom': u'', 'DP': 3921568627454}
+# should result in AA
+#    @given(simple_vcf_dict_strategy)
+#    def test_over_minority_is_not_ref(self, mut):
+#        assume(sum(mut.AO) / mut.DP > 0.2)
+#        result = call_many(0, 80, mut)[1]
+#        self.assertNotEquals(result, mut.ref)
+
+class ConsensusExampleTest(unittest.TestCase):
+    def test_make_consensus_example(self):
+        muts = [('CG', 'TT', 1),
+                ('C', 'A', 5)]
+        ref = 'ACGTACGT'
+        expected = 'ATTTAAGT'
+        actual = make_consensus(ref, muts)[0]
+        self.assertEquals(expected, actual)
+
+    def test_single_example(self):
+        raw_muts = [{
+            'pos': 2,
+            'ref': 'CG',
+            'alt': ['TT'],
+            'AO': [99],
+            'DP': 100,
+            'chrom': 'X'
+        },
+        {
+            'pos': 6,
+            'ref': 'C',
+            'alt': ['G', 'A'],
+            'AO': [15, 120],
+            'DP': 150,
+            'chrom': 'X'
+        }]
+        muts = map(lambda d: VCFRow(**d), raw_muts)
+        ref = make_seqrec('X', 'ACGTACGT')
+        expected = 'ATTTAAGT'
+        result = just_ref([ref], muts, 10, 80)
+        self.assertEquals(expected, result)
+
+ref_with_vcf_dicts_strategy = ref_with_vcf_dicts_strategy_factory().map(
+    #lambda (r, muts): (make_seqrec(muts[0]['chrom'], r), map(lambda d: VCFRow(**d), muts)))
+    lambda tup: (make_seqrec(tup[1][0]['chrom'], tup[0]), map(lambda d: VCFRow(**d), tup[1])))
+
+countof = lambda c: lambda x: Counter(x).get(c, 0)
+
+def run_cons(*args):
+    _, alt_and_cons = all_consensuses(*args)
+    cons, alts = zip(*alt_and_cons)
+    return cons[0], alts[0]
+
+class ConsensusHypothesisTest(unittest.TestCase):
+    # Failing example found by hypothesis:
+    # ref_and_muts=(SeqRecord(seq=Seq(u'AAAAAAAAAA', IUPACAmbiguousDNA()), id=u'', name='', description='', dbxrefs=[]), [
+    #     {'ref': u'A', 'pos': 1, 'AO': [479, 777, 119, 604], 'alt': [u'G', u'C', u'G', u'TG'], 'chrom': u'', 'DP': 2635},
+    #     {'ref': u'A', 'pos': 3, 'AO': [291, 241, 583, 420], 'alt': [u'CTG', u'C', u'G', u'C'], 'chrom': u'', 'DP': 1627}]), rand=random.seed(0))
+    # AssertionError: 1 != 0
+    @given(ref_with_vcf_dicts_strategy, st.random_module())
+    def test_n_count(self, ref_and_muts, rand):
+        ref, muts = ref_and_muts
+        originalNs = countof('N')(ref)
+        alts = map(lambda x: x.alt, muts)
+        refs = map(lambda x: x.ref, muts)
+        assume(not filter(lambda x: 'N' in x, itertools.chain(*alts)))
+        assume(not filter(lambda x: len(x) > 1, itertools.chain(*alts)))
+        assume(not filter(lambda x: len(x) > 1, refs))
+        # needed because ACGT -> N
+        expectedNs = len(filter(lambda x: x.DP < 10, muts)) + originalNs
+        result = just_ref([ref], muts, 10, 80)
+        self.assertEquals(countof('N')(result), expectedNs)
+
+    @given(ref_with_vcf_dicts_strategy)
+    def test_less_or_equal_length_when_no_inserts(self, ref_and_muts):
+        ref, muts = ref_and_muts
+        cons, alts = run_cons([ref], muts, 10, 80)
+        assume(not any(map(lambda x: len(x[0]) < len(x[1]), alts)))
+        self.assertGreaterEqual(len(ref), len(cons))
+
+    @given(ref_with_vcf_dicts_strategy)
+    def test_greater_or_equal_length_when_no_deletions(self, ref_and_muts):
+        ref, muts = ref_and_muts
+        def has_deletion(mut):
+            return filter(lambda x: len(x) < len(mut.ref), mut.alt)
+        assume(not any(map(has_deletion, muts)))
+        result = just_ref([ref], muts, 10, 80)
+        self.assertLessEqual(len(ref), len(result))
+
+class ConsensusMetamorphicTests(unittest.TestCase):
+    '''Paper on metamorphic testing: http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-10-24'''
+    @given(ref_with_vcf_dicts_strategy, st.integers(), st.integers())
+    def test_more_or_equal_ns_with_higher_threshold(self, ref_and_muts, n1, n2):
+        ref, muts = ref_and_muts
+        assume(n1 < n2)
+        cons1 = just_ref([ref], muts, n1, 80)
+        cons2 = just_ref([ref], muts, n2, 80)
+        nsCount1, nsCount2 = countof('N')(cons1), countof('N')(cons2)
+        self.assertLessEqual(nsCount1, nsCount2)
+
+    @given(ref_with_vcf_dicts_strategy)
+    def test_consensus_from_consensus_contains_more_alts(self, ref_and_muts):
+        ref, muts = ref_and_muts
+        assume(not any(map(lambda x: len(x.alt) > 1, muts)))
+        n1 = 10
+        cons1, alts = run_cons([ref], muts, n1, 80)
+        assume(not any(map(lambda x: len(x[0]) > len(x[1]), alts)))
+        cons2, _ = run_cons([make_seqrec(muts[0].chrom, cons1)], muts, n1, 80)
+        picked_alts = map(get(1), alts)
+        altCounts1 = sum(map(lambda f: f(cons1), map(countof, picked_alts)))
+        altCounts2 = sum(map(lambda f: f(cons2), map(countof, picked_alts)))
+        self.assertLessEqual(altCounts1, altCounts2)
+
+    #NOTE: the test below appears to be meaningless,
+    # as the values are always equal
+    percent = st.integers(min_value=0, max_value=100)
+    @given(ref_with_vcf_dicts_strategy, percent, percent)
+    def test_lower_majority_required_contains_more_alts(self, ref_and_muts, p1, p2):
+        ref, muts = ref_and_muts
+        assume(p1 < p2)
+        assume(not any(map(lambda x: len(x.alt) > 1, muts)))
+        n1 = 10
+        cons1, alts = run_cons([ref], muts, n1, p1)
+        assume(not any(map(lambda x: len(x[0]) > len(x[1]), alts)))
+        cons2, _ = run_cons([ref], muts, n1, p2)
+        picked_alts = map(get(1), alts)
+        altCounts1 = sum(map(lambda f: f(cons1), map(countof, picked_alts)))
+        altCounts2 = sum(map(lambda f: f(cons2), map(countof, picked_alts)))
+        self.assertLessEqual(altCounts1, altCounts2)
+
+    #TODO: could test for exceptions when e.g.,
+    # 1. POS is repeated
+    # 2. POS is greater than the size of the sequence
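To make the base-calling rules exercised by these tests concrete, a few spot checks of `call_base_multi_alts` (hypothetical read counts, following the code paths in bioframework/consensus.py rather than the test suite):

```python
from bioframework.consensus import call_base_multi_alts, PMut

# depth below min_depth: always 'N'
low = [PMut(ALT='A', AO=3, DP=5, QA=90, QR=50)]
assert call_base_multi_alts(10, 80, 5, low, 'T') == 'N'

# a single alt over the hard-coded 0.8 cutoff is called outright
major = [PMut(ALT='G', AO=90, DP=100, QA=2700, QR=300)]
assert call_base_multi_alts(10, 80, 100, major, 'A') == 'G'

# no majority: every base over 20% (the reference included) folds into an
# IUPAC ambiguity code; here A (40%) + C (60%) -> 'M'
mixed = [PMut(ALT='C', AO=60, DP=100, QA=1800, QR=1200)]
assert call_base_multi_alts(10, 80, 100, mixed, 'A') == 'M'
```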