-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathRBNS_Gquad.py
More file actions
39 lines (33 loc) · 1.11 KB
/
RBNS_Gquad.py
File metadata and controls
39 lines (33 loc) · 1.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#Usage: python RBNS_Gquad.py <fastqfile> <outfile>
from Bio import SeqIO
import gzip
import re
import itertools
import sys
def getdensity(motif, fastq):
readcount = 0
motifcount = 0
fh = gzip.open(fastq, 'r')
for record in SeqIO.parse(fh, 'fastq'):
readcount +=1
if readcount % 1000000 == 0:
print 'Analyzing read {0}...'.format(readcount)
seq = str(record.seq)
motifmatches = re.findall(motif, seq)
motifcount += len(motifmatches)
return readcount, motifcount
def iteratemotifs(fastq, outfile):
with open(outfile, 'a') as f:
f.write(('\t').join(['fourmer', 'readcount', 'motifcount']) + '\n')
fourmercount = 0
bases = ['A', 'T', 'G', 'C']
all4mers = [''.join(x) for x in itertools.product(bases, repeat = 4)]
for fourmer in all4mers:
fourmercount +=1
if fourmercount % 10 == 0:
print '4mer {0} of 256...'.format(fourmercount)
motif = r'(?=({0}(.{{0,5}}){0}(.{{0,5}}){0}(.{{0,5}}){0}))'.format(fourmer)
readcount, motifcount = getdensity(motif, fastq)
with open(outfile, 'a') as f:
f.write(('\t').join([fourmer, str(readcount), str(motifcount)]) + '\n')
iteratemotifs(sys.argv[1], sys.argv[2])