-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathannotateNonATGStarts.py
More file actions
68 lines (46 loc) · 2.65 KB
/
annotateNonATGStarts.py
File metadata and controls
68 lines (46 loc) · 2.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#Takes a gff. Entries annotated as 'start_codon' are checked against fasta files of genome sequence to see if they are 'ATG'. If so, they are unchanged. If not, their category is changed to 'nonATG_start_codon'. Gives essentially the same gff file back. Requires a single fasta of the entire genome.
#~~~~!!!!Reads the entire genome into memory!!! Best run on coyote!!!!~~~~~~
#Usage: python annotateNonATGstarts.py <in_gfffile> <genomefasta> <out_gfffile>
import os
import sys
from Bio import SeqIO
def annotateNonATGStarts(gff, genomefasta, outfile):
seqdict = SeqIO.to_dict(SeqIO.parse(genomefasta, 'fasta'))
print '{0} sequences in fasta file'.format(len(seqdict))
gfffh = open(gff, 'r')
outfh = open(outfile, 'w')
number_of_starts = 0
number_of_ATG_starts = 0
number_of_nonATG_starts = 0
for line in gfffh:
line = line.strip().split('\t')
chrm = line[0]
cat = line[2]
start = int(line[3])
stop = int(line[4])
strand = line[6]
sequence = ''
if cat != 'start_codon' or 'random' in chrm: #skip chr*_random entries
outfh.write(('\t').join(line) + '\n')
elif cat == 'start_codon' and 'random' not in chrm:
number_of_starts +=1
if strand == '+':
sequence = str(seqdict[chrm].seq[start-1:stop].upper())
elif strand == '-':
sequence = str(seqdict[chrm].seq[start-1:stop].upper().reverse_complement())
if sequence == 'ATG':
number_of_ATG_starts +=1
outfh.write(('\t').join(line) + '\n')
elif sequence != 'ATG':
number_of_nonATG_starts +=1
line = [entry.replace('start_codon', 'nonATG_start_codon') for entry in line]
outfh.write(('\t').join(line) + '\n')
if number_of_starts > 0 and number_of_starts <= 1000 and number_of_starts % 100 == 0:
print 'Analyzed {0} start codons. So far {1} were ATG and {2} were non-ATG.'.format(number_of_starts, number_of_ATG_starts, number_of_nonATG_starts)
elif number_of_starts > 1000 and number_of_starts % 1000 == 0:
print 'Analyzed {0} start codons. So far {1} were ATG and {2} were non-ATG.'.format(number_of_starts, number_of_ATG_starts, number_of_nonATG_starts)
print 'Found {0} annotated start codons. Of these, {1} were ATG and {2} were non-ATG.'.format(number_of_starts, number_of_ATG_starts, number_of_nonATG_starts)
gfffh.close()
outfh.close()
if __name__ == '__main__':
annotateNonATGStarts(sys.argv[1], sys.argv[2], sys.argv[3])