-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathRIsplicesites.py
More file actions
108 lines (90 loc) · 4.07 KB
/
RIsplicesites.py
File metadata and controls
108 lines (90 loc) · 4.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import sys
import argparse
from Bio import SeqIO
def getRI5ss(events, genomefasta):
#Last 3 nt of exon, first 6 nt of intron
fiveprimesscoords = {} #{eventname : [chrm, start, stop, strand]}
fiveprimessseqs = {} #{eventname : sequence}
eventsfh = open(events, 'r')
for line in eventsfh:
eventname = line.strip()
line = line.strip().split('@')
upstreamexon = line[0]
upstreamexon = upstreamexon.split(':')
chrm = upstreamexon[0]
strand = upstreamexon[3]
if strand == '+':
upstreamexonstart = int(upstreamexon[1])
upstreamexonstop = int(upstreamexon[2]) #last nt of exon
elif strand == '-':
upstreamexonstart = int(upstreamexon[2])
upstreamexonstop = int(upstreamexon[1]) #last nt of exon
fiveprimesscoords[eventname] = [chrm, upstreamexonstart, upstreamexonstop, strand]
eventsfh.close()
#Get sequences
seq_dict = SeqIO.to_dict(SeqIO.parse(genomefasta, 'fasta'))
for event in fiveprimesscoords:
chrm = fiveprimesscoords[event][0]
upstreamexonend = fiveprimesscoords[event][2]
strand = fiveprimesscoords[event][3]
if strand == '+':
ssstart = upstreamexonend - 3
ssend = upstreamexonend + 6
sequence = seq_dict[chrm].seq[ssstart:ssend].upper()
elif strand == '-':
ssstart = upstreamexonend - 7
ssend = upstreamexonend + 2
sequence = seq_dict[chrm].seq[ssstart:ssend].reverse_complement().upper()
fiveprimessseqs[event] = str(sequence)
return fiveprimessseqs
def getRI3ss(events, genomefasta):
#Last 20 nt of intron, first 3 nt of exon
threeprimesscoords = {} #{eventname : [chrm, start, stop, strand]}
threeprimessseqs = {} #{eventname : sequence}
eventsfh = open(events, 'r')
for line in eventsfh:
eventname = line.strip()
line = line.strip().split('@')
downstreamexon = line[1]
downstreamexon = downstreamexon.split(':')
chrm = downstreamexon[0]
strand = downstreamexon[3]
if strand == '+':
downstreamexonstart = int(downstreamexon[1])
downstreamexonstop = int(downstreamexon[2]) #last nt of exon
elif strand == '-':
downstreamexonstart = int(downstreamexon[2])
downstreamexonstop = int(downstreamexon[1]) #last nt of exon
threeprimesscoords[eventname] = [chrm, downstreamexonstart, downstreamexonstop, strand]
eventsfh.close()
#Get sequences
seq_dict = SeqIO.to_dict(SeqIO.parse(genomefasta, 'fasta'))
for event in threeprimesscoords:
chrm = threeprimesscoords[event][0]
downstreamexonstart = threeprimesscoords[event][1]
strand = threeprimesscoords[event][3]
if strand == '+':
ssstart = downstreamexonstart - 21
ssend = downstreamexonstart + 2
sequence = seq_dict[chrm].seq[ssstart:ssend].upper()
elif strand == '-':
ssstart = downstreamexonstart - 3
ssend = downstreamexonstart + 20
sequence = seq_dict[chrm].seq[ssstart:ssend].reverse_complement().upper()
threeprimessseqs[event] = str(sequence)
return threeprimessseqs
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--events', type = str, help = 'File of RI events to consider. Required.', required = True)
parser.add_argument('--genomefasta', type = str, help = 'Genome sequence in fasta format. Required.', required = True)
parser.add_argument('--splicesite', type = int, help = 'Which splice site? 5 or 3? Required.', choices = [5,3], required = True)
parser.add_argument('--outfile', type = str, help = 'Output file.', required = True)
args = parser.parse_args()
if args.splicesite == 5:
seqs = getRI5ss(args.events, args.genomefasta)
elif args.splicesite == 3:
seqs = getRI3ss(args.events, args.genomefasta)
outfh = open(args.outfile, 'w')
for entry in seqs:
outfh.write('>' + entry + '\n' + seqs[entry] + '\n')
outfh.close()