-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathMISO2gene.py
More file actions
109 lines (90 loc) · 4.48 KB
/
MISO2gene.py
File metadata and controls
109 lines (90 loc) · 4.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#This is made for ALEs with 2 isoforms. The idea is to take each isoform independently. Using a gtf that contains
#gene names in the gene_name field, intersect the coordinates for each isoform of the event with each entry in the
#gtf. Only consider as "hits" gene_names from the gtf file that overlap both isoforms. If more than one gene_name
#satisfies this, an error is printed and the gene_name is not added to the output.
import pysam
import gffutils
import os
import argparse
def getEvents(eventsfile):
events = []
eventsfh = open(eventsfile, 'r')
for line in eventsfh:
line = line.strip()
events.append(line)
eventsfh.close()
print 'Entered {0} events.'.format(len(events))
return events
def eventstocoords(events, annotation):
gff_fn = annotation
db_fn = os.path.basename(gff_fn) + '.db'
if os.path.isfile(db_fn) == False:
gffutils.create_db(gff_fn, db_fn, force=True)
db = gffutils.FeatureDB(db_fn)
coords = {} # {event : [[isoform1chrm, isoform1start, isoform1end, isoform1strand], [isoform2chrm, isoform2start, isoform2end, isoform2strand]]}
annotatedevents = db.features_of_type('gene')
for annotatedevent in annotatedevents:
for event in events:
eventID = annotatedevent.id
isoformcoords = []
if event == eventID:
for isoform in db.children(annotatedevent, featuretype = 'mRNA'):
isoformcoords.append([str(isoform.chrom), isoform.start, isoform.end, str(isoform.strand)])
coords[event] = isoformcoords
print 'Found {0} of {1} events in the annotation.'.format(len(coords), len(events))
for event in events:
if event not in coords:
print event
#Cleanup
os.remove(db_fn)
if os.path.isfile(db_fn + '-shm'):
os.remove(db_fn + '-shm')
if os.path.isfile(db_fn + '-wal'):
os.remove(db_fn + '-wal')
return coords
def getgenenames(coords, gtf):
eventstogenes = {} # {event : gene}
tabixfile = pysam.Tabixfile(gtf)
for event in coords:
isoform1genes = []
isoform2genes = []
isoform1chrm = coords[event][0][0]
isoform1start = coords[event][0][1]
isoform1end = coords[event][0][2]
isoform1strand = coords[event][0][3]
isoform2chrm = coords[event][1][0]
isoform2start = coords[event][1][1]
isoform2end = coords[event][1][2]
isoform2strand = coords[event][1][3]
for entry in tabixfile.fetch(isoform1chrm, isoform1start, isoform1end, isoform1strand, parser = pysam.asGTF()):
isoform1genes.append(entry.gene_name)
for entry in tabixfile.fetch(isoform2chrm, isoform2start, isoform2end, isoform2strand, parser = pysam.asGTF()):
isoform2genes.append(entry.gene_name)
#Collapse all duplicates
isoform1genes = list(set(isoform1genes))
isoform2genes = list(set(isoform2genes))
#Get genes that overlap both isoforms
isoformintersection = set(isoform1genes).intersection( set(isoform2genes) )
if len(isoformintersection) > 1:
print 'WARNING: more than one gene found for event {0}.'.format(event)
print event, list(isoformintersection)
elif len(isoformintersection) == 1:
eventstogenes[event] = list(isoformintersection)[0]
elif len(isoformintersection) == 0:
print 'No gene found for event {0}!!'.format(event)
print 'Found genes for {0} of {1} events.'.format(len(eventstogenes), len(coords))
return eventstogenes
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--events', type = str, help = 'List of MISO events you are interested in.', required = True)
parser.add_argument('--annotations', type = str, help = 'MISO annotations containing events you are interested in.', required = True)
parser.add_argument('--gtf', type = str, help = 'Gtf annotation of genome. Ones from cuffmerge work well. Must be compressed and tabix indexed with the index in the same directory as the gtf.', required = True)
parser.add_argument('--outfile', type = str, help = 'Output file.')
args = parser.parse_args()
events = getEvents(args.events)
coords = eventstocoords(events, args.annotations)
eventstogenes = getgenenames(coords, args.gtf)
outfh = open(args.outfile, 'w')
for event in eventstogenes:
outfh.write(str(event) + '\t' + str(eventstogenes[event]) + '\n')
outfh.close()