-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathReorderGff3v2.0.py
More file actions
86 lines (65 loc) · 3.25 KB
/
ReorderGff3v2.0.py
File metadata and controls
86 lines (65 loc) · 3.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#Reorders MISO Gffs by position of their start coord (if on + strand) or stop coord (if on - strand). Reorders so that most gene-distal isoform is listed first, followed by all of its children exons.
#Assumes each event (gene) only has two isoforms (mRNAs)!!!
#Usage: python ReorderGff3v2.0.py <gff_file> <event_type (AFE or ALE)> > output
import sys
import os
import gffutils
from operator import itemgetter
gff_fn = sys.argv[1]
event_type = sys.argv[2]
db_fn = os.path.basename(gff_fn) + '.db'
if os.path.isfile(db_fn) == False : #if database doesn't exist, create it
gffutils.create_db(gff_fn, db_fn)
db = gffutils.FeatureDB(db_fn)
genes= db.features_of_type('gene')
for gene in genes:
print gene #print gene line
bounds = []
for isoform in db.children(gene, featuretype = 'mRNA'):
if gene.strand == '+':
bounds.append([isoform.start, isoform.stop])
elif gene.strand == '-':
bounds.append([isoform.start, isoform.stop])
if len(bounds) == 1: #if there is only one isoform in event skip it
continue
for isoform in db.children(gene, featuretype = 'mRNA'):
if event_type == 'AFE':
if gene.strand == '+':
if bounds[0][0] != bounds[1][0]:
bounds = sorted(bounds, key = itemgetter(0)) #sort isoform start/stop by start coord, lowest to highest
elif bounds[0][0] == bounds[1][0]:
bounds = sorted(bounds, key = itemgetter(1)) #if start is shared, sort so shortest isoform is first
elif gene.strand == '-':
if bounds[0][1] != bounds[1][1]:
bounds = sorted(bounds, key = itemgetter(1), reverse = True) #sort isoform start/stop by stop coord, highest to lowest
elif bounds[0][1] == bounds[1][1]:
bounds = sorted(bounds, key = itemgetter(0), reverse = True)
elif event_type == 'ALE':
if gene.strand == '+':
if bounds[0][1] != bounds [1][1]:
bounds = sorted(bounds, key = itemgetter(1), reverse = True)
elif bounds[0][1] == bounds [1][1]:
bounds = sorted(bounds, key = itemgetter(0), reverse = True)
elif gene.strand == '-':
if bounds[0][0] != bounds[1][0]:
bounds = sorted(bounds, key = itemgetter(0))
elif bounds[0][0] == bounds[1][0]:
bounds = sorted(bounds, key = itemgetter(1))
for bound in bounds:
if gene.strand == '+':
for isoform in db.children(gene, featuretype = 'mRNA'):
start = bound[0]
stop = bound[1]
if start == isoform.start and stop == isoform.stop:
print isoform
for exon in db.children(isoform, featuretype = 'exon'):
print exon
elif gene.strand == '-':
for isoform in db.children(gene, featuretype = 'mRNA'):
start = bound[0]
stop = bound[1]
if start == isoform.start and stop == isoform.stop:
print isoform
for exon in db.children(isoform, featuretype = 'exon'):
print exon
os.remove(db_fn)