From 3f5b91ee8301053c743ea9922ac6f595bf9283b6 Mon Sep 17 00:00:00 2001 From: Panciera Date: Thu, 28 Jul 2016 14:38:39 -0400 Subject: [PATCH 01/14] makefile for doing freebayes-based consensus on ngs_mapper output directory --- bioframework/Makefile | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 bioframework/Makefile diff --git a/bioframework/Makefile b/bioframework/Makefile new file mode 100644 index 0000000..255239f --- /dev/null +++ b/bioframework/Makefile @@ -0,0 +1,21 @@ + +BAM=$(NGSDIR)/$(notdir $(NGSDIR)).bam +REF=$(NGSDIR)/$(notdir $(NGSDIR)).ref.fasta +TAGGED=$(NGSDIR)/tagged.bam +FBVCF=$(NGSDIR)/freebayes.vcf +FBCONSENSUS=$(NGSDIR)/freebayes_consensus.fasta + +all: $(FBCONSENSUS) + +$(TAGGED): $(BAM) + python tagreads.py $< | samtools view - -Sbh | samtools sort - $(TAGGED:.bam=) + +$(TAGGED).bai: $(TAGGED) + samtools index $< + +$(FBVCF): $(TAGGED) $(NGSDIR)/tagged.bam.bai $(REF) + freebayes -f $(REF) $< > $@ + +$(FBCONSENSUS): $(BAM) $(REF) $(FBVCF) + python consensus.py --vcf $< --ref $(REF) > $@ + #cd ~/bioframes && python bioframes/consensus.py $^ > $@ From f8132a0e8786e24b7ced2e21a31f91210f56fd12 Mon Sep 17 00:00:00 2001 From: Panciera Date: Fri, 29 Jul 2016 16:08:25 -0400 Subject: [PATCH 02/14] add consensus for freebayes --- bioframework/consensus.py | 10 +++++----- install.sh | 3 +++ requirements-conda.txt | 1 + {bioframework => scripts}/Makefile | 0 scripts/consensus.sh | 7 +++++++ setup.py | 6 ++++++ 6 files changed, 22 insertions(+), 5 deletions(-) create mode 100644 install.sh rename {bioframework => scripts}/Makefile (100%) create mode 100644 scripts/consensus.sh diff --git a/bioframework/consensus.py b/bioframework/consensus.py index 8dd433c..f443d03 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -22,7 +22,7 @@ from Bio.SeqRecord import SeqRecord #done import vcf #done from vcf.model import _Record -import sh #todo +# import sh #todo #from toolz import compose from toolz.dicttoolz import merge, dissoc, merge_with, valfilter, keyfilter #done from docopt import docopt #ignore @@ -200,10 +200,10 @@ def single_consensus(muts, ref): def consensus_str(ref, consensus): # type: (SeqRecord, str) -> str return ">{0}:Consensus\n{1}".format(ref.id, consensus) -def zero_coverage_positions(bam_file, ref_file): # type: (str, str) -> Iterable[int] - pileup = sh.Command('mpileup')(bam_file, f=ref_file, _iter=True) - get_pos = lambda x: int(x.split()[1]) # type: Callable[[str],int] - return imap(get_pos, pileup) +#def zero_coverage_positions(bam_file, ref_file): # type: (str, str) -> Iterable[int] +# pileup = sh.Command('mpileup')(bam_file, f=ref_file, _iter=True) +# get_pos = lambda x: int(x.split()[1]) # type: Callable[[str],int] +# return imap(get_pos, pileup) #TODO: is pileup 0-based or 1-based index? def trim_ref(ref, positions): # type: (str, Iterator[int]) -> str diff --git a/install.sh b/install.sh new file mode 100644 index 0000000..8886a67 --- /dev/null +++ b/install.sh @@ -0,0 +1,3 @@ +conda install --file requirements-conda.txt +pip install -r requirements.txt +python setup.py install diff --git a/requirements-conda.txt b/requirements-conda.txt index d2b04c0..62509a0 100644 --- a/requirements-conda.txt +++ b/requirements-conda.txt @@ -1,5 +1,6 @@ # Dependencies that are software and/or large such as scikit-learn... bwa samtools +freebayes cutadapt biopython diff --git a/bioframework/Makefile b/scripts/Makefile similarity index 100% rename from bioframework/Makefile rename to scripts/Makefile diff --git a/scripts/consensus.sh b/scripts/consensus.sh new file mode 100644 index 0000000..38c47ef --- /dev/null +++ b/scripts/consensus.sh @@ -0,0 +1,7 @@ +if [ -e $boo ]; +then + echo "Usage: $(basename $0) "; + exit 1 +fi; + +make NGSDIR=$1 diff --git a/setup.py b/setup.py index 1d09b17..4fcda06 100755 --- a/setup.py +++ b/setup.py @@ -18,7 +18,13 @@ def jip_modules(path=bioframework.JIP_PATH): description = bioframework.__description__, license = "GPLv2", keywords = bioframework.__keywords__, + # scripts = glob('scripts/*'), + scripts = ['scripts/consensus.sh'], entry_points = { + 'console_scripts': [ + 'tagreads = bioframework.tagreads:main', + 'fb_consensus = bioframework.consensus:main' + ] }, install_requires = [ 'pyjip', From 8904f94c95fdaf0de2f02c86784f3fe6c34cb9f0 Mon Sep 17 00:00:00 2001 From: Panciera Date: Fri, 29 Jul 2016 16:28:13 -0400 Subject: [PATCH 03/14] now calls makefile correctly --- scripts/consensus.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/consensus.sh b/scripts/consensus.sh index 38c47ef..eb09908 100644 --- a/scripts/consensus.sh +++ b/scripts/consensus.sh @@ -1,7 +1,7 @@ -if [ -e $boo ]; +if [ -z "$1" ]; then echo "Usage: $(basename $0) "; exit 1 fi; -make NGSDIR=$1 +make -f $(dirname $0)/Makefile NGSDIR=$1 From ac80fa0faad6b1d81210e51655fc8d5140068317 Mon Sep 17 00:00:00 2001 From: Panciera Date: Tue, 13 Sep 2016 16:51:26 -0400 Subject: [PATCH 04/14] missing imports (samtools, ngs_mapper) also fixed --- bioframework/tagreads.py | 13 +++++++++---- scripts/Makefile | 8 ++++---- scripts/consensus.sh | 4 +++- setup.py | 2 +- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/bioframework/tagreads.py b/bioframework/tagreads.py index 42fa565..5acd7d1 100644 --- a/bioframework/tagreads.py +++ b/bioframework/tagreads.py @@ -155,14 +155,19 @@ def get_bam_header( bam ): return hdr.rstrip() def parse_args( args=sys.argv[1:] ): - from ngs_mapper import config - conf_parser, args, config, configfile = config.get_config_argparse(args) - defaults = config['tagreads'] + defaults = { 'SM' : { 'default' : None, + 'help' : 'Sets the SM tag value inside of each read group. Default is the portion of the filname that preceeds the .bam[Default: %(default)s]' + }, + 'CN' : { 'default' : None, + 'help' : 'Sets the CN tag inside of each read group to the value specified.[Default: %(default)s]' + } + } + + # config['tagreads'] parser = argparse.ArgumentParser( description='''Adds Sanger, MiSeq, 454 and IonTorrent read groups to a bam file. Will tag all reads in the alignment for each of the platforms based on the read name''', - parents=[conf_parser] ) parser.add_argument( diff --git a/scripts/Makefile b/scripts/Makefile index 255239f..c025104 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -8,7 +8,7 @@ FBCONSENSUS=$(NGSDIR)/freebayes_consensus.fasta all: $(FBCONSENSUS) $(TAGGED): $(BAM) - python tagreads.py $< | samtools view - -Sbh | samtools sort - $(TAGGED:.bam=) + fb_tagreads $< | samtools view - -Sbh | samtools sort - $(TAGGED:.bam=) $(TAGGED).bai: $(TAGGED) samtools index $< @@ -16,6 +16,6 @@ $(TAGGED).bai: $(TAGGED) $(FBVCF): $(TAGGED) $(NGSDIR)/tagged.bam.bai $(REF) freebayes -f $(REF) $< > $@ -$(FBCONSENSUS): $(BAM) $(REF) $(FBVCF) - python consensus.py --vcf $< --ref $(REF) > $@ - #cd ~/bioframes && python bioframes/consensus.py $^ > $@ +$(FBCONSENSUS): $(FBVCF) $(REF) + fb_consensus --vcf $< --ref $(REF) > $@ +#cd ~/bioframes && python bioframes/consensus.py $^ > $@ diff --git a/scripts/consensus.sh b/scripts/consensus.sh index eb09908..263f86c 100644 --- a/scripts/consensus.sh +++ b/scripts/consensus.sh @@ -3,5 +3,7 @@ then echo "Usage: $(basename $0) "; exit 1 fi; +makefile=/media/VD_Research/Admin/PBS/Software/bioframework/scripts/Makefile -make -f $(dirname $0)/Makefile NGSDIR=$1 +# make -f $(dirname $0)/Makefile NGSDIR=$1 +make -f $makefile NGSDIR=$1 diff --git a/setup.py b/setup.py index 4fcda06..c56d402 100755 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ def jip_modules(path=bioframework.JIP_PATH): scripts = ['scripts/consensus.sh'], entry_points = { 'console_scripts': [ - 'tagreads = bioframework.tagreads:main', + 'fb_tagreads = bioframework.tagreads:main', 'fb_consensus = bioframework.consensus:main' ] }, From 09fb5148f48c1ef341b0378409c0c0a854e581e1 Mon Sep 17 00:00:00 2001 From: Panciera Date: Thu, 15 Sep 2016 14:24:40 -0400 Subject: [PATCH 05/14] fix so consensus includes reference as argument to match Makefile --- scripts/consensus.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/consensus.sh b/scripts/consensus.sh index 263f86c..dbba369 100644 --- a/scripts/consensus.sh +++ b/scripts/consensus.sh @@ -1,9 +1,9 @@ -if [ -z "$1" ]; +if [ -z "$1" ] || [ -z "$2" ]; then - echo "Usage: $(basename $0) "; + echo "Usage: $(basename $0) "; exit 1 fi; makefile=/media/VD_Research/Admin/PBS/Software/bioframework/scripts/Makefile # make -f $(dirname $0)/Makefile NGSDIR=$1 -make -f $makefile NGSDIR=$1 +make -f $makefile NGSDIR=$1 REF=$2 From f2922a2ffc89aea6737d07955e7e1ec5ccf34625 Mon Sep 17 00:00:00 2001 From: Panciera Date: Thu, 15 Sep 2016 14:25:15 -0400 Subject: [PATCH 06/14] freebayes consnesus now includes sample name in filename --- scripts/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/Makefile b/scripts/Makefile index c025104..7953f0b 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -3,7 +3,8 @@ BAM=$(NGSDIR)/$(notdir $(NGSDIR)).bam REF=$(NGSDIR)/$(notdir $(NGSDIR)).ref.fasta TAGGED=$(NGSDIR)/tagged.bam FBVCF=$(NGSDIR)/freebayes.vcf -FBCONSENSUS=$(NGSDIR)/freebayes_consensus.fasta +FBCONSENSUS=$(NGSDIR)/$(SAMPLE).freebayes_consensus.fasta +SAMPLE=$(basename $(NGSDIR)) all: $(FBCONSENSUS) From 8fb4ac62eb3c4bfec327b4d9ca793b52b7dabcca Mon Sep 17 00:00:00 2001 From: Panciera Date: Thu, 15 Sep 2016 14:44:06 -0400 Subject: [PATCH 07/14] consensus now accepts samplename argument to prepend to fasta IDs --- bioframework/consensus.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/bioframework/consensus.py b/bioframework/consensus.py index f443d03..9b2cbbc 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -1,12 +1,13 @@ """ Usage: - consensus --ref --vcf [--mind ] [--majority ] [-o ] + consensus --ref --vcf [--mind ] [--majority ] [-o ] [--sample ] Options: --ref= Reference fasta file --vcf= VCF output --majority= Percentage required [default: 80] --mind= minimum depth to call base non-N [default: 10] + --sample= sample name --output,-o= output file [default: ] """ #stdlib @@ -26,7 +27,7 @@ #from toolz import compose from toolz.dicttoolz import merge, dissoc, merge_with, valfilter, keyfilter #done from docopt import docopt #ignore -from schema import Schema, Use #ignore +from schema import Schema, Use, Optional #ignore #from contracts import contract, new_contract #can ignore #from mypy.types import VCFRow ############# @@ -197,8 +198,8 @@ def single_consensus(muts, ref): ########## # I/O # ########## -def consensus_str(ref, consensus): # type: (SeqRecord, str) -> str - return ">{0}:Consensus\n{1}".format(ref.id, consensus) +def consensus_str(sample, ref, consensus): # type: (SeqRecord, str) -> str + return ">{0}_{1}:Consensus\n{2}".format(sample if sample else '', ref.id, consensus) #def zero_coverage_positions(bam_file, ref_file): # type: (str, str) -> Iterable[int] # pileup = sh.Command('mpileup')(bam_file, f=ref_file, _iter=True) @@ -213,14 +214,14 @@ def trim_ref(ref, positions): # type: (str, Iterator[int]) -> str #@contract(ref_fasta=str, vcf=str, mind=int, majority=int) -def run(ref_fasta, freebayes_vcf, outfile, mind, majority): +def run(ref_fasta, freebayes_vcf, outfile, mind, majority, sample): # type: (str, str, BinaryIO, int, int) -> int _refs = SeqIO.parse(ref_fasta, 'fasta') with open(freebayes_vcf, 'r') as vcf_handle: _muts = map(flatten_vcf_record, vcf.Reader(vcf_handle)) refs, muts = list(_refs), list(_muts) the_refs, seqs_and_muts = all_consensuses(refs, muts, mind, majority) - strings = imap(consensus_str, the_refs, imap(get(0), seqs_and_muts)) + strings = imap(partial(consensus_str, sample), the_refs, imap(get(0), seqs_and_muts)) result = '\n'.join(strings) outfile.write(result) outfile.close() @@ -230,13 +231,14 @@ def main(): # type: () -> None scheme = Schema( { '--vcf' : os.path.isfile, '--ref' : os.path.isfile, + Optional('--sample') : lambda x: True, '--majority' : Use(int), '--mind' : Use(int), '--output' : Use(lambda x: sys.stdout if not x else open(x, 'w'))}) raw_args = docopt(__doc__, version='Version 1.0') args = scheme.validate(raw_args) run(args['--ref'], args['--vcf'], args['--output'], - args['--mind'], args['--output']) + args['--mind'], args['--output'], args['--sample']) if __name__ == '__main__': main() From 65abc69d39bfc6acc079e8bf009ccb5a9b9600e5 Mon Sep 17 00:00:00 2001 From: Panciera Date: Thu, 15 Sep 2016 14:44:30 -0400 Subject: [PATCH 08/14] Makefile now uses consensus.py sample argument --- scripts/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/Makefile b/scripts/Makefile index 7953f0b..3ac1f02 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -1,10 +1,10 @@ -BAM=$(NGSDIR)/$(notdir $(NGSDIR)).bam -REF=$(NGSDIR)/$(notdir $(NGSDIR)).ref.fasta +SAMPLE=$(notdir $(NGSDIR)) +BAM=$(NGSDIR)/$(SAMPLE).bam +REF=$(NGSDIR)/$(SAMPLE).ref.fasta TAGGED=$(NGSDIR)/tagged.bam FBVCF=$(NGSDIR)/freebayes.vcf FBCONSENSUS=$(NGSDIR)/$(SAMPLE).freebayes_consensus.fasta -SAMPLE=$(basename $(NGSDIR)) all: $(FBCONSENSUS) @@ -18,5 +18,5 @@ $(FBVCF): $(TAGGED) $(NGSDIR)/tagged.bam.bai $(REF) freebayes -f $(REF) $< > $@ $(FBCONSENSUS): $(FBVCF) $(REF) - fb_consensus --vcf $< --ref $(REF) > $@ + fb_consensus --vcf $< --ref $(REF) --sample $(SAMPLE) > $@ #cd ~/bioframes && python bioframes/consensus.py $^ > $@ From d414a08fd84a3196f9b6e86620e4818774aa1f4f Mon Sep 17 00:00:00 2001 From: Panciera Date: Thu, 15 Sep 2016 15:47:30 -0400 Subject: [PATCH 09/14] functionality for coverage checking --- bioframework/consensus.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/bioframework/consensus.py b/bioframework/consensus.py index 9b2cbbc..b6ecf5e 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -28,6 +28,7 @@ from toolz.dicttoolz import merge, dissoc, merge_with, valfilter, keyfilter #done from docopt import docopt #ignore from schema import Schema, Use, Optional #ignore +from plumbum.cmd import samtools #from contracts import contract, new_contract #can ignore #from mypy.types import VCFRow ############# @@ -211,6 +212,16 @@ def trim_ref(ref, positions): # type: (str, Iterator[int]) -> str start, end = next(positions), collections.deque(positions, 1)[0] return '-'*start + ref[:start:end] + '-'*(len(ref) - end) +def samtoolsDepth(bam): + lines = samtools['depth'][bam]().split('\n') + lines = map(str.split, lines) + stats = map(lambda x: { 'pos' : int(x[1]), 'depth' : int(x[2]) }, lines) + return stats + +def uncoveredPositions(reqDepth, bam): + depthStats = samtoolsDepth(bam) + underStats = filter(lambda x: x['depth'] < reqDepth, depthStats) + return map(lambda x: x['pos'], underStats) #@contract(ref_fasta=str, vcf=str, mind=int, majority=int) From 58de882683d24f6cf21ae95febba4e0814ceaf6a Mon Sep 17 00:00:00 2001 From: Panciera Date: Fri, 16 Sep 2016 16:43:45 -0400 Subject: [PATCH 10/14] consensus now considers coverage --- bioframework/consensus.py | 62 ++++++++++++++++++++++++++++++++------- setup.py | 3 +- 2 files changed, 53 insertions(+), 12 deletions(-) diff --git a/bioframework/consensus.py b/bioframework/consensus.py index b6ecf5e..60eb541 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -1,6 +1,6 @@ """ Usage: - consensus --ref --vcf [--mind ] [--majority ] [-o ] [--sample ] + consensus --ref --vcf [--mind ] [--majority ] [-o ] [--sample ] [--bam ] Options: --ref= Reference fasta file @@ -8,9 +8,15 @@ --majority= Percentage required [default: 80] --mind= minimum depth to call base non-N [default: 10] --sample= sample name - --output,-o= output file [default: ] + --bam= bam file, required to check for coverage + --output,-o= output file [default: ] """ #stdlib +# fb_consensus --ref +# /media/VD_Research/Analysis/ProjectBased_Analysis/melanie/share/Issue_11973_freeBayes_MP/Dengue/Issue_12657_80-20/Projects/2195/Den3__Thailand__FJ744727__2001.fasta +# --bam $dir/tagged.bam --mind 10 --vcf +# /media/VD_Research/Analysis/ProjectBased_Analysis/melanie/share/Issue_11973_freeBayes_MP/Dengue/Issue_12657_80-20/Projects/2195/freebayes.vcf + from operator import itemgetter as get from functools import partial from itertools import ifilter, imap, groupby, takewhile, repeat, starmap, izip_longest @@ -21,6 +27,8 @@ from Bio import SeqIO #done from Bio.SeqRecord import SeqRecord #done +from Bio.Seq import Seq + import vcf #done from vcf.model import _Record # import sh #todo @@ -212,25 +220,56 @@ def trim_ref(ref, positions): # type: (str, Iterator[int]) -> str start, end = next(positions), collections.deque(positions, 1)[0] return '-'*start + ref[:start:end] + '-'*(len(ref) - end) -def samtoolsDepth(bam): - lines = samtools['depth'][bam]().split('\n') - lines = map(str.split, lines) +# def samtoolsDepth(bam): +# lines = samtools['depth'][bam]().split('\n') +# lines = map(str.split, lines) +# stats = map(lambda x: { 'pos' : int(x[1]), 'depth' : int(x[2]) }, lines) +# return stats + +def samtoolsDepth(ref_id, bam): + lines = samtools['depth'][bam, '-r', ref_id]().split('\n') + lines = filter(lambda x: x.strip(), lines) + lines = map(lambda x: x.split('\t'), lines) stats = map(lambda x: { 'pos' : int(x[1]), 'depth' : int(x[2]) }, lines) return stats -def uncoveredPositions(reqDepth, bam): - depthStats = samtoolsDepth(bam) - underStats = filter(lambda x: x['depth'] < reqDepth, depthStats) - return map(lambda x: x['pos'], underStats) +#def uncoveredPositions(mind, bam): +# depthStats = samtoolsDepth(bam) +# underStats = filter(lambda x: x['depth'] < mind, depthStats) +# return map(lambda x: x['pos'], underStats) + +def uncoveredPositions(mind, bam, ref): + depthStats = samtoolsDepth(str(ref.id), bam) # use ref string + allPositions = range(1, len(ref.seq)+1) + underStats = filter(lambda x: x['depth'] < mind, depthStats) + underPositions = map(lambda x: x['pos'], underStats) + allDepthPositions = map(lambda x: x['pos'], depthStats) + zeroPositions = set(allPositions) - set(allDepthPositions) + return set(underPositions) | zeroPositions + # return map(lambda x: x['pos'], underStats) + +#def addNsAtUncovered(positions, ref): +# new_ref = ref +# for pos in positions: +# new_ref[pos-1] = 'N' +# return new_ref + +def addNsAtUncovered(mind, bam, ref_seq): + badPositions = uncoveredPositions(mind, bam, ref_seq) + new_ref = list(str(ref_seq.seq)) + for pos in badPositions: + new_ref[pos-1] = 'N' + return SeqRecord(seq=Seq(''.join(new_ref)), id=ref_seq.id) #@contract(ref_fasta=str, vcf=str, mind=int, majority=int) -def run(ref_fasta, freebayes_vcf, outfile, mind, majority, sample): +def run(ref_fasta, freebayes_vcf, outfile, mind, majority, sample, bam): # type: (str, str, BinaryIO, int, int) -> int _refs = SeqIO.parse(ref_fasta, 'fasta') with open(freebayes_vcf, 'r') as vcf_handle: _muts = map(flatten_vcf_record, vcf.Reader(vcf_handle)) refs, muts = list(_refs), list(_muts) + refs = map(partial(addNsAtUncovered, mind, bam), refs) the_refs, seqs_and_muts = all_consensuses(refs, muts, mind, majority) strings = imap(partial(consensus_str, sample), the_refs, imap(get(0), seqs_and_muts)) result = '\n'.join(strings) @@ -243,13 +282,14 @@ def main(): # type: () -> None { '--vcf' : os.path.isfile, '--ref' : os.path.isfile, Optional('--sample') : lambda x: True, + Optional('--bam') : lambda x: True, '--majority' : Use(int), '--mind' : Use(int), '--output' : Use(lambda x: sys.stdout if not x else open(x, 'w'))}) raw_args = docopt(__doc__, version='Version 1.0') args = scheme.validate(raw_args) run(args['--ref'], args['--vcf'], args['--output'], - args['--mind'], args['--output'], args['--sample']) + args['--mind'], args['--output'], args['--sample'], args['--bam']) if __name__ == '__main__': main() diff --git a/setup.py b/setup.py index c56d402..aaaac3f 100755 --- a/setup.py +++ b/setup.py @@ -32,7 +32,8 @@ def jip_modules(path=bioframework.JIP_PATH): 'docopt', 'schema', 'pyvcf', - 'typing' + 'typing', + 'plumbum' ], data_files=[ (join(sys.prefix,'bin'), jip_modules()), From 2ad9e55af2b29bccf7a4aa0f6d7b2965c374a982 Mon Sep 17 00:00:00 2001 From: Panciera Date: Thu, 27 Oct 2016 15:26:32 -0400 Subject: [PATCH 11/14] add minimum basequal parameter --- bioframework/consensus.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/bioframework/consensus.py b/bioframework/consensus.py index 60eb541..7dbcc41 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -238,7 +238,7 @@ def samtoolsDepth(ref_id, bam): # underStats = filter(lambda x: x['depth'] < mind, depthStats) # return map(lambda x: x['pos'], underStats) -def uncoveredPositions(mind, bam, ref): +def uncoveredPositions(mind, minbq, bam, ref): depthStats = samtoolsDepth(str(ref.id), bam) # use ref string allPositions = range(1, len(ref.seq)+1) underStats = filter(lambda x: x['depth'] < mind, depthStats) @@ -255,21 +255,21 @@ def uncoveredPositions(mind, bam, ref): # return new_ref -def addNsAtUncovered(mind, bam, ref_seq): - badPositions = uncoveredPositions(mind, bam, ref_seq) +def addNsAtUncovered(mind, minbq, bam, ref_seq): + badPositions = uncoveredPositions(mind, minbq, bam, ref_seq) new_ref = list(str(ref_seq.seq)) for pos in badPositions: new_ref[pos-1] = 'N' return SeqRecord(seq=Seq(''.join(new_ref)), id=ref_seq.id) #@contract(ref_fasta=str, vcf=str, mind=int, majority=int) -def run(ref_fasta, freebayes_vcf, outfile, mind, majority, sample, bam): +def run(ref_fasta, freebayes_vcf, outfile, mind, minbq, majority, sample, bam): # type: (str, str, BinaryIO, int, int) -> int _refs = SeqIO.parse(ref_fasta, 'fasta') with open(freebayes_vcf, 'r') as vcf_handle: _muts = map(flatten_vcf_record, vcf.Reader(vcf_handle)) refs, muts = list(_refs), list(_muts) - refs = map(partial(addNsAtUncovered, mind, bam), refs) + refs = map(partial(addNsAtUncovered, mind, minbq, bam), refs) the_refs, seqs_and_muts = all_consensuses(refs, muts, mind, majority) strings = imap(partial(consensus_str, sample), the_refs, imap(get(0), seqs_and_muts)) result = '\n'.join(strings) @@ -285,11 +285,12 @@ def main(): # type: () -> None Optional('--bam') : lambda x: True, '--majority' : Use(int), '--mind' : Use(int), + '--minbq' : Use(int), '--output' : Use(lambda x: sys.stdout if not x else open(x, 'w'))}) raw_args = docopt(__doc__, version='Version 1.0') args = scheme.validate(raw_args) run(args['--ref'], args['--vcf'], args['--output'], - args['--mind'], args['--output'], args['--sample'], args['--bam']) + args['--mind'], args['--minbq'], args['--output'], args['--sample'], args['--bam']) if __name__ == '__main__': main() From 5153042a3e69d4c4991c7af9b9897903633f812f Mon Sep 17 00:00:00 2001 From: Panciera Date: Thu, 27 Oct 2016 15:35:29 -0400 Subject: [PATCH 12/14] minbq now filters samtools depth call --- bioframework/consensus.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bioframework/consensus.py b/bioframework/consensus.py index 7dbcc41..84fdb01 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -7,6 +7,7 @@ --vcf= VCF output --majority= Percentage required [default: 80] --mind= minimum depth to call base non-N [default: 10] + --minbq= bases below this quality are ignored when computing depth [default: 0] --sample= sample name --bam= bam file, required to check for coverage --output,-o= output file [default: ] @@ -226,8 +227,8 @@ def trim_ref(ref, positions): # type: (str, Iterator[int]) -> str # stats = map(lambda x: { 'pos' : int(x[1]), 'depth' : int(x[2]) }, lines) # return stats -def samtoolsDepth(ref_id, bam): - lines = samtools['depth'][bam, '-r', ref_id]().split('\n') +def samtoolsDepth(ref_id, bam, minbq): + lines = samtools['depth'][bam, '-r', ref_id, '-q', minbq]().split('\n') lines = filter(lambda x: x.strip(), lines) lines = map(lambda x: x.split('\t'), lines) stats = map(lambda x: { 'pos' : int(x[1]), 'depth' : int(x[2]) }, lines) @@ -239,7 +240,7 @@ def samtoolsDepth(ref_id, bam): # return map(lambda x: x['pos'], underStats) def uncoveredPositions(mind, minbq, bam, ref): - depthStats = samtoolsDepth(str(ref.id), bam) # use ref string + depthStats = samtoolsDepth(str(ref.id), bam, minbq) # use ref string allPositions = range(1, len(ref.seq)+1) underStats = filter(lambda x: x['depth'] < mind, depthStats) underPositions = map(lambda x: x['pos'], underStats) From 8c09f79dfc0092a2b2e84f4ce0c721a4c3a29070 Mon Sep 17 00:00:00 2001 From: Panciera Date: Thu, 27 Oct 2016 16:29:26 -0400 Subject: [PATCH 13/14] fixed missing minbq in cmd args doc --- bioframework/consensus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bioframework/consensus.py b/bioframework/consensus.py index 84fdb01..746e915 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -1,6 +1,6 @@ """ Usage: - consensus --ref --vcf [--mind ] [--majority ] [-o ] [--sample ] [--bam ] + consensus --ref --vcf [--mind ] [--majority ] [-o ] [--sample ] [--bam ] [--minbq ] Options: --ref= Reference fasta file From 720dd6be628d159062bd3b0c230ee7137263ff02 Mon Sep 17 00:00:00 2001 From: Panciera Date: Thu, 27 Oct 2016 16:29:46 -0400 Subject: [PATCH 14/14] Makefile now uses minbq and mind. --- scripts/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/Makefile b/scripts/Makefile index 3ac1f02..1f85921 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -5,6 +5,8 @@ REF=$(NGSDIR)/$(SAMPLE).ref.fasta TAGGED=$(NGSDIR)/tagged.bam FBVCF=$(NGSDIR)/freebayes.vcf FBCONSENSUS=$(NGSDIR)/$(SAMPLE).freebayes_consensus.fasta +# MIND=0 +# MINBQ=0 all: $(FBCONSENSUS) @@ -18,5 +20,6 @@ $(FBVCF): $(TAGGED) $(NGSDIR)/tagged.bam.bai $(REF) freebayes -f $(REF) $< > $@ $(FBCONSENSUS): $(FBVCF) $(REF) - fb_consensus --vcf $< --ref $(REF) --sample $(SAMPLE) > $@ + #fb_consensus --vcf $< --ref $(REF) --sample $(SAMPLE) > $@ + fb_consensus --vcf $< --ref $(REF) --sample $(SAMPLE) --bam $(TAGGED) --mind $(MIND) --minbq $(MINBQ) > $@ #cd ~/bioframes && python bioframes/consensus.py $^ > $@