-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpull_mrna_junctions.py
More file actions
127 lines (97 loc) · 5.24 KB
/
pull_mrna_junctions.py
File metadata and controls
127 lines (97 loc) · 5.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python
# DESCRIPTION: This program uses the transcript FASTA file and extracts the junctions made by each pair of exons.
# The output is a bed file with all of these junctions.
#
# AUTHOR: Chelsea Tymms
import gffutils
import itertools
def start_pos(x):
""" Little function to let me sort gffutils objects by start position """
return x.start
def createExonArray(exonList,mergedExons):
""" Group overlapping exons together as an list within a list """
exonArray = []
for mergedExon in mergedExons:
exonSubset = []
for exon in exonList:
if exon.start >= mergedExon.start and exon.stop <= mergedExon.stop:
exonSubset.append(exon)
exonArray.append(exonSubset)
return exonArray
def createJunctionArray(exonArray):
""" Using itertools create all possible combination between groups of exons """
iterProduct = []
s=len(exonArray)
for i in range(0,s-1):
# for j in range (i+1,len(exonArray)):
j=i+1
for k in itertools.product(exonArray[i],exonArray[j]):
iterProduct.append(k)
return iterProduct
def defineRegion(index,junctionArray):
""" Instead of keeping the entire exon, only keep up to 75bp on either side of the junction """
if len(junctionArray[index][0]) >= 38:
exon1Start = junctionArray[index][0].stop - 37
exon1Stop = junctionArray[index][0].stop
else:
exon1Start = junctionArray[index][0].start
exon1Stop = junctionArray[index][0].stop
if len(junctionArray[index][1]) >= 38:
exon2Start = junctionArray[index][1].start
exon2Stop = junctionArray[index][1].start + 37
else:
exon2Start = junctionArray[index][1].start
exon2Stop = junctionArray[index][1].stop
return (exon1Start,exon1Stop,exon2Start,exon2Stop)
def main():
# Get the database
#fname='../gff/AceView.GRCh37.hg19.gtf'
#db_fn='AceView.GRCh37.hg19.gtf.db'
#db_fn='AceView.spliced.GRCh37.hg19.gtf.db'
db_fn='../../useful_dmel_data/flybase557/gff/dmel-all-no-analysis-r5.57.gff.db'
# gffutils.create_db(fname, db_fn, merge_strategy='merge', id_spec=["Name", "ID"])
db=gffutils.FeatureDB(db_fn)
rnas=db.features_of_type('mRNA')
# Open the output BED file
with open('../generated_files/mRNA_annotated_junctions.tsv','wb') as outputFile:
for rna in rnas:
# Get all exons
exonList = list(db.children(rna,featuretype='exon'))
# Skip genes that only have 1 exon, or genes that don't have an exon e.g. miRNA
if len(exonList) == 0 or len(exonList) == 1:
continue
# Convert to BED 0-based coordinate system. Only have to change start.
for exon in exonList:
exon.start = exon.start - 1
# Create list of overlapping exons
# Note: mod(mdg4) has exons on both strands, so ignore_strand needs
# to be set for it
#mergedExons = list(db.merge_features(exonList,ignore_strand=True))
mergedExons = list(db.merge(exonList,ignore_strand=True))
# Sort merged exons by start postiion
mergedExons.sort(key=start_pos)
# Using the merged exons to identify overlapping exons create an array where
# overlapping exons are grouped together
exonArray = createExonArray(exonList,mergedExons)
# Now create all possible junctions
junctionArray = createJunctionArray(exonArray)
# Create BED file
for i in range(0,len(junctionArray)):
# Only keep up to 75bp on each side of the junction
(exon1Start, exon1Stop, exon2Start, exon2Stop) = defineRegion(i,junctionArray)
# Construct various parts of the junction BED file
totalStart=exon1Start # Take exon1 start
totalStop=exon2Stop # Take exon2 stop
chrom=junctionArray[i][0].chrom
strand=junctionArray[i][0].strand
name="".join(junctionArray[i][0].attributes['Name']) + '|' + str(junctionArray[i][0].chrom) + ':' + str(junctionArray[i][0].start) + '-' + str(junctionArray[i][0].stop) + ':' + junctionArray[i][0].strand + '|' + "".join(junctionArray[i][1].attributes['Name']) + str(junctionArray[i][1].chrom) + ':' + str(junctionArray[i][1].start) + '-' + str(junctionArray[i][1].stop) + ':' + junctionArray[i][1].strand
cat_gene="|".join([gene.id for gene in db.parents(junctionArray[i][0], featuretype='gene', level=2)])
cat_xscript="|".join(junctionArray[i][0].attributes['Parent'])
xscript_gene="|".join([gene.id for gene in db.parents(rna, featuretype='gene')])
xscripter="|".join(rna.attributes['ID'])
exact_jnct=str(junctionArray[i][0].chrom) + ':' + str(junctionArray[i][0].stop) + ':' + str(junctionArray[i][1].start) + ':' + str(junctionArray[i][0].strand)
bedArray=[chrom,totalStart,totalStop,name,strand,totalStart,totalStop,cat_xscript,cat_gene,exact_jnct,xscripter,xscript_gene]
# Output to BED file
outputFile.write("\t".join(str(i) for i in bedArray)+"\n")
if __name__ == '__main__':
main()