-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathUTRuniqueseqs.py
More file actions
229 lines (185 loc) · 7.83 KB
/
UTRuniqueseqs.py
File metadata and controls
229 lines (185 loc) · 7.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
import gffutils
import os
from collections import defaultdict
import sys
from itertools import groupby, chain
from operator import itemgetter
import pickle
def transcriptfilters(transcript, db):
mrnaendpass = False
transcriptsupportlevelpass = False
#Are we confident in the 3' end of this mrnaendpass
if 'tag' not in transcript.attributes or 'mRNA_end_NF' not in transcript.attributes['tag']:
mrnaendpass = True
try:
tsl = transcript.attributes['transcript_support_level'][0]
except KeyError: #if this transcript doesnt have a transcript support level
tsl = None
if tsl != 'NA':
transcriptsupportlevelpass = True
if mrnaendpass and transcriptsupportlevelpass:
return True
else:
return False
def getall3pUTR(gff):
all3pUTRcoords = defaultdict(list) # {ENSGENE_chrm_strand : [[UTR1start, UTR1end], [UTR2start, UTR2end], [[UTR3exon1start, UTR3exon1end], [UTR3exon2start, UTR3exon2end]]]}
genecount = 0
geneboundaries = {} # {ensid : [genestart, genestop]}
no3pUTRcount = 0
#Make gff database
print 'Indexing gff...'
gff_fn = gff
db_fn = os.path.abspath(gff_fn) + '.db'
if os.path.isfile(db_fn) == False:
gffutils.create_db(gff_fn, db_fn, merge_strategy = 'merge', verbose = True)
db = gffutils.FeatureDB(db_fn)
print 'Done indexing!'
genes = db.features_of_type('gene')
for gene in genes:
genecount +=1
if genecount % 5000 == 0:
print 'Gene {0}...'.format(genecount)
geneboundaries[str(gene.id).replace('gene:', '')] = [gene.start, gene.end]
genename = str(gene.id) + '_' + str(gene.chrom) + '_' + gene.strand
strand = gene.strand
#If this gene doesn't have at least one coding 'transcript' (i.e. not 'NMD_transcript_variant', and not a ncRNA), skip it
codingtranscript = False
for transcript in db.children(gene, featuretype = 'transcript', level = 1, order_by = 'start'):
if len(list(db.children(transcript, featuretype = 'CDS', level = 1))) > 0:
codingtranscript = True
break
if not codingtranscript:
continue
all3pUTRcoords[genename] #initiate entry as list (defaultdict)
for transcript in db.children(gene, featuretype = 'transcript'):
#If there's no CDS in this transcript, skip it
if len(list(db.children(transcript, featuretype = 'CDS', level = 1))) == 0:
continue
#Does this transcript pass filters?
passtranscriptfilters = transcriptfilters(transcript, db)
if passtranscriptfilters == False:
continue
exoncoords = [] #[[exon1start, exon1stop], [exon2start, exon2stop]]
CDScoords = []
UTRcoords = [] #[UTRstart, UTRstop]
for exon in db.children(transcript, featuretype = 'exon', order_by = 'start'):
exoncoords.append([exon.start, exon.end])
for CDSexon in db.children(transcript, featuretype = 'CDS', order_by = 'start'):
CDScoords.append([CDSexon.start, CDSexon.end])
#3' UTR start is directly after CDS end
if transcript.strand == '+':
CDSend = max(CDScoords, key = itemgetter(1))[1]
if CDSend == transcript.end: #the transcript ends right where the CDS ends
continue
UTR3start = CDSend + 1
UTRcoords = [UTR3start, transcript.end]
elif transcript.strand == '-':
CDSend = min(CDScoords, key = itemgetter(0))[0]
if CDSend == transcript.start: #the transcript ends right where the CDS ends
continue
UTR3start = CDSend - 1
UTRcoords = [transcript.start, UTR3start]
###Check to see if the UTR is fully contained within the coordinates of one exon
singleexonUTR = False
for exoncoord in exoncoords:
exonstart = exoncoord[0]
exonend = exoncoord[1]
if exonstart <= UTRcoords[0] and exonend >= UTRcoords[1]:
singleexonUTR = True
#No duplicates
if [UTRcoords[0], UTRcoords[1]] not in all3pUTRcoords[genename]:
all3pUTRcoords[genename].append([UTRcoords[0], UTRcoords[1]])
if singleexonUTR == False:
#Get all positions that are both exonic and in the 3' UTR
overlappingbp = [] #sorted exonic positions in UTR
UTR3range = range(UTRcoords[0], UTRcoords[1] + 1)
for exoncoord in exoncoords:
exonrange = range(exoncoord[0], exoncoord[1] + 1)
overlap = set(UTR3range).intersection(exonrange)
for nt in sorted(list(overlap)):
overlappingbp.append(nt)
#Now get breaks in consecutive exonic positions
#http://stackoverflow.com/questions/2361945/detecting-consecutive-integers-in-a-list
UTRexoncoords = []
for k, g in groupby(enumerate(overlappingbp), lambda (index, item): index-item):
exonbp = map(itemgetter(1), g)
if len(exonbp) > 1:
UTRexoncoords.append([exonbp[0], exonbp[-1]])
##############ADD FLIP OF EXON CONNECTIVITY FOR - STRAND HERE??????
#if strand == '-':
#UTRexoncoords.reverse()
#No duplicates
if UTRexoncoords not in all3pUTRcoords[genename]:
all3pUTRcoords[genename].append(UTRexoncoords)
#Remove empty genes
#These are genes that have no UTRs assigned to them.
all3pUTRcoords = {gene:all3pUTRcoords[gene] for gene in all3pUTRcoords if all3pUTRcoords[gene]}
print 'Looked through {0} genes and retrived UTRs for {1} of them.'.format(genecount, len(all3pUTRcoords))
return all3pUTRcoords
def orderUTRs(all3pUTRcoords):
#all3pUTRcoords = {ENSGENE_chrm_strand : [[UTR1start, UTR1end], [UTR2start, UTR2end], [[UTR3exon1start, UTR3exon1end], [UTR3exon2start, UTR3exon2end]]]}
all3pUTRcoords_sorted = {} # {ENSGENE_chrm_strand : [[UTR1start, UTR1end], [UTR2start, UTR2end], [[UTR3exon1start, UTR3exon1end], [UTR3exon2start, UTR3exon2end]]]}
#For each gene:
#order UTRs by end (max end for + strand and min end for - strand)
for gene in all3pUTRcoords:
genename = gene.split('_')[0]
chrm = gene.split('_')[1]
strand = gene.split('_')[2]
utrs = all3pUTRcoords[gene]
#Skip genes that only have one UTR
if len(utrs) == 1:
continue
#print genename, strand
#print utrs
#Get the endpoints for each UTR
utrends = [] # list of utr end coords for this gene
if strand == '+':
for utr in utrs:
#If this is a single exon UTR, this will NOT be a nested list
if any(isinstance(i, list) for i in utr) == False:
utrend = max(utr)
utrends.append(utrend)
#If this is not a single exon UTR, this will be a nested list
#If this is a multi-exon UTR, it's slightly more complicated, but still the maximum value
#(doesn't matter if it's start or stop because stop is always greater than start) is what we want
elif any(isinstance(i, list) for i in utr) == True:
coords = list(chain.from_iterable(utr))
utrend = max(coords)
utrends.append(utrend)
elif strand == '-':
for utr in utrs:
#If this is a single exon UTR, this will NOT be a nested list
if any(isinstance(i, list) for i in utr) == False:
utrend = min(utr)
utrends.append(utrend)
#If this is not a single exon UTR, this will be a nested list
#If this is a multi-exon UTR, it's slightly more complicated, but still the maximum value
#(doesn't matter if it's start or stop because stop is always greater than start) is what we want
elif any(isinstance(i, list) for i in utr) == True:
coords = list(chain.from_iterable(utr))
utrend = min(coords)
utrends.append(utrend)
#print utrends
#Sort UTRs by their ends
#This can be done by sorting UTRs according to the values in utrends, since the ordering of the two is the same.
if strand == '+':
utrs_ends = zip(utrs, utrends)
utrs_ends.sort()
#print utrs_ends
utrs_sorted = [u for u, e in utrs_ends]
elif strand == '-':
utrs_ends = zip(utrs, utrends)
utrs_ends.sort(reverse = True)
#print utrs_ends
utrs_sorted = [u for u, e in utrs_ends]
all3pUTRcoords_sorted[gene] = utrs_sorted
return all3pUTRcoords_sorted
#all3pUTRcoords = getall3pUTR(sys.argv[1])
#outputfh = open('data.pkl', 'wb')
#pickle.dump(all3pUTRcoords, outputfh)
#outputfh.close()
pkl = open('data.pkl', 'rb')
all3pUTRcoords = pickle.load(pkl)
pkl.close()
all3pUTRcoords_sorted = orderUTRs(all3pUTRcoords)
print all3pUTRcoords_sorted