-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathbamUtils.py
More file actions
106 lines (93 loc) · 3.49 KB
/
bamUtils.py
File metadata and controls
106 lines (93 loc) · 3.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Useful functions for dealing with BAM files.
import warnings
import subprocess
import tempfile
import sys
import re
from Bio import SeqIO
def num_mismatches(ref, query):
"""Number of matches between query and reference."""
mismatches = 0
for i in range(len(ref)):
if query[i] != ref[i]:
mismatches += 1
return mismatches
def multi_align(ref1, ref2, queries):
"""
Do MSA of several query sequences to two references. Return the reference
index with the fewest mismatches for each sequence.
"""
tfile = tempfile.NamedTemporaryFile()
tfile.write(">ref1\n{}\n".format(ref1))
tfile.write(">ref2\n{}\n".format(ref2))
n_queries = 0
for query in queries:
tfile.write(">query{}\n{}\n".format(n_queries, query))
n_queries += 1
tfile.flush()
outfile = tempfile.TemporaryFile()
cmd = ["mafft", "--auto", tfile.name]
subprocess.Popen(cmd, stdout=outfile, stderr=subprocess.PIPE).communicate()
outfile.seek(0)
align = SeqIO.parse(outfile, "fasta")
ref1 = next(align)
ref2 = next(align)
scores = [0, 0]
ties = 0
for query in align:
mismatch1 = num_mismatches(ref1, query)
mismatch2 = num_mismatches(ref2, query)
if mismatch1 < mismatch2:
scores[0] += 1
elif mismatch1 > mismatch2:
scores[1] += 1
else:
ties += 1
if ties > 0:
msg = "Discarding {} reads aligning equally to ref and alt".format(ties)
warnings.warn(msg)
return scores
def count_indels(samfile, reffile, chrom, pos, ref, alt, min_mapq=20):
"""
Count occurences of the reference and alternate indel allele at a given
position, by alignment score.
"""
start = max(pos-100, 0)
end = pos+100
ref_seq = reffile.fetch(reference=chrom, start=start, end=end).decode("utf-8")
alt_seq = ref_seq[:pos-start-1] + alt + ref_seq[pos-start-1+len(ref):]
reads = samfile.fetch(chrom, pos, pos+len(ref))
counts = [0, 0]
reads = [r.seq for r in reads if r.mapq >= min_mapq]
scores = multi_align(ref_seq, alt_seq, reads)
return {ref: scores[0], alt: scores[1]}
def count_bases_pileup(pileup, position, min_baseq=15, min_mapq=20):
"""
Count the number of times each nucleotide occurs at a given position in a
pileup object, subject to minimum base quality and map quality constraints.
Return a dictionary keyed by nucleotides.
"""
counts = dict.fromkeys(["A", "T", "C", "G"], 0)
for x in pileup:
if position == x.pos + 1:
for read in x.pileups:
dup = read.alignment.is_duplicate
qc_fail = read.alignment.is_qcfail
low_mapq = read.alignment.mapq < min_mapq
if not (dup or qc_fail or low_mapq):
base_qual = ord(read.alignment.qual[read.qpos])-33
if base_qual >= min_baseq:
base = read.alignment.seq[read.qpos]
counts[base] += 1
return counts
def count_bases(samfile, reffile, chrom, pos):
"""
Count the number of times each nucleotide occurs at a given position in a
bam file. Return a dictionary keyed by nucleotides.
"""
start = max(pos-200, 0)
end = pos+200
reffile.fetch(reference=chrom, start=pos-1, end=pos)
ref_base = reffile.fetch(reference=chrom, start=pos-1, end=pos).decode("utf-8")
pileup = samfile.pileup(chrom, start, end, fastafile=reffile)
return count_bases_pileup(pileup, pos)