-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathCLIPdensample.py
More file actions
141 lines (113 loc) · 6.3 KB
/
CLIPdensample.py
File metadata and controls
141 lines (113 loc) · 6.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import sys
import os
import tabix
import pysam
import gffutils
import argparse
import random
import numpy
from scipy import stats
def MbnlCLIPden(gff, clusters):
gff_fn = gff
db_fn = os.path.basename(gff_fn) + '.db'
if os.path.isfile(db_fn) == False:
gffutils.create_db(gff_fn, db_fn, force=True)
db = gffutils.FeatureDB(db_fn)
cluster_densities = {} # {UTR_ID : [number of intersecting clusters, UTR length, cluster density in that UTR]}
UTRlengths = []
number_of_UTRs = len(list(db.features_of_type('3\'UTR')))
print number_of_UTRs
three_prime_UTRs = db.features_of_type('3\'UTR')
clustertabix = pysam.Tabixfile(clusters)
total_intersecting_clusters = 0
for UTR in three_prime_UTRs:
UTR_intersecting_clusters = 0
UTRlength = float(UTR.stop - UTR.start)
UTRlengths.append(UTRlength)
try:
for bed in clustertabix.fetch(str(UTR.chrom), UTR.start, UTR.stop, parser = pysam.asBed()):
if bed.strand == UTR.strand:
'''
print UTRlength, UTR.id, bed.start, bed.end
'''
UTR_intersecting_clusters +=1
total_intersecting_clusters +=1
except ValueError:
#print 'Warning: GFF region has start {0} > end {1}. Skipping...'.format(UTR.start, UTR.stop)
continue
try:
cluster_density = float((UTR_intersecting_clusters) / (UTRlength))
except ZeroDivisionError:
#print 'WARNING: UTR {0} has length 0.'.format(UTR.id)
continue
cluster_densities[str(UTR.id)] = [UTR_intersecting_clusters, UTRlength, cluster_density]
total_sequence_length = sum(UTRlengths)
sys.stderr.write('Searched {0} nt from {1} UTRs and found {2} clusters within them. The overall cluster density is {3} clusters per nt \n'.format(int(total_sequence_length), number_of_UTRs, total_intersecting_clusters, float(float(total_intersecting_clusters) / float(total_sequence_length))))
os.remove(db_fn)
if os.path.isfile(db_fn + '-shm'):
os.remove(db_fn + '-shm')
if os.path.isfile(db_fn + '-wal'):
os.remove(db_fn + '-wal')
return cluster_densities
def samplevalues(distaldensities, proximaldensities, number_to_sample, times_to_sample, threshold):
#Given a set of clip cluster densities (distal and proximal) defined by the above function (in this case likely from
#a large set of "unaffected" UTRs, sample it some number of times, recording the mean clip density
#every time. Count what fraction of time this mean exceeds a "threshold", which can be the mean
#clip cluster density for "affected" UTRs.
#densitydict format = # {UTR_ID : [number of intersecting clusters, UTR length, cluster density in that UTR]}
ALEids = []
commonids = []
relativedensities = []
greater_than_threshold = 0
#commonids have to be present in distaldensites and proximaldensities
for UTR_ID in distaldensities:
#UTR_IDs are XXXX.A or XXXX.B, depending on which isoform they came from. The name of the parent ALE is therefore
#XXXX. Get this name.
ALEids.append(('.').join(UTR_ID.split('.')[:-1]))
for UTR_ID in proximaldensities:
if ('.').join(UTR_ID.split('.')[:-1]) in ALEids:
commonids.append(('.').join(UTR_ID.split('.')[:-1]))
print 'From {0} distal UTRs and {1} proximal UTRs, found {2} in common that make up ALEs.'.format(len(distaldensities), len(proximaldensities), len(commonids))
for i in range(int(times_to_sample)):
randomids = []
randomids = random.sample(commonids, int(number_to_sample))
distalclusters = 0
distallength = 0
proximalclusters = 0
proximallength = 0
for ALEID in randomids:
for UTRID in distaldensities:
if ALEID in UTRID:
distalclusters += distaldensities[UTRID][0]
distallength += distaldensities[UTRID][1]
for UTRID in proximaldensities:
if ALEID in UTRID:
proximalclusters += proximaldensities[UTRID][0]
proximallength += proximaldensities[UTRID][1]
distaldensity = distalclusters / float(distallength)
proximaldensity = proximalclusters / float(proximallength)
if proximaldensity == 0:
continue
relativedensity = distaldensity / proximaldensity
relativedensities.append(relativedensity)
if i % 100 == 0:
print 'Random sample {0}...'.format(i)
for density in relativedensities:
if density > threshold:
greater_than_threshold +=1
fractiongreater = greater_than_threshold / float(len(relativedensities))
print 'Of {0} random samplings of {1} UTRs each, {2} ({3}) had mean relative densities greater than the threshold.'.format(times_to_sample, number_to_sample,
greater_than_threshold, fractiongreater)
print 'The mean relative density was {0} with a SD of {1} and a SE of {2}.'.format(numpy.mean(relativedensities), numpy.std(relativedensities), stats.sem(relativedensities))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--distalgff', type = str, help = 'Gff of distal regions to interrogate.', required = True)
parser.add_argument('--proximalgff', type = str, help = 'Gff of proximal regions to interrogate.', required = True)
parser.add_argument('--clusters', type = str, help = 'Tabix indexed file of clip clusters.', required = True)
parser.add_argument('--numbertosample', type = int, help = 'Number of relative densities to sample in each sampling.', required = True)
parser.add_argument('--timestosample', type = int, help = 'Number of times to sample relative densities.', required = True)
parser.add_argument('--threshold', type = float, help = 'Threshold relative density to compare against.', required = True)
args = parser.parse_args()
distaldensities = MbnlCLIPden(args.distalgff, args.clusters)
proximaldensities = MbnlCLIPden(args.proximalgff, args.clusters)
samplevalues(distaldensities, proximaldensities, args.numbertosample, args.timestosample, args.threshold)