AnalysisScripts/BpProbMeta.py at master · TaliaferroLab/AnalysisScripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
#Take a fasta of sequences.  Fold it using a sliding window approach and return base pair probabilities for every nt.
#Each nt may or may not have multiple values assigned to it, depending on how many windows it fell in to.
#Take the mean bp prob across all windows as the bp prob for that nt.
#Then, divide into bins and take the mean bp of all nt in that bin.

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
import numpy as np
import subprocess
import time
import os
import sys
import argparse


def chopfasta(seq, windowsize, slidesize):
	#Given a seq, chop it into smaller fasta seqs of length fragsize, sliding slidesize across the sequence
	#Make a fasta record out of it, then combine all of these fasta records into one file
	currentlocation = 0
	fastarecords = []
	if len(seq) >= windowsize:
		while currentlocation + windowsize <= len(seq):
			currentseq = seq[currentlocation : currentlocation + windowsize]
			fastarecord = SeqRecord(Seq(currentseq, IUPAC.IUPACAmbiguousDNA), id = 'seq__{0}'.format(currentlocation), description = '')
			fastarecords.append(fastarecord)
			currentlocation += slidesize

	#Now start at the end and go backwards to ensure that every nt is covered in at least one window
		currentlocation = len(seq) - windowsize
		while currentlocation >= 0:
			currentseq = seq[currentlocation : currentlocation + windowsize]
			fastarecord = SeqRecord(Seq(currentseq, IUPAC.IUPACAmbiguousDNA), id = 'seq__{0}'.format(currentlocation), description = '')
			fastarecords.append(fastarecord)
			currentlocation -= slidesize

		SeqIO.write(fastarecords, 'currentseqs.fasta', 'fasta')


#Parse the output of an RNA fold (usually the folding of one of several sequences)
def parseRNAfoldoutput(outputfile, seq):
	#Given a sequence, make bp probabilities and return the summed bp probability for a base (the sum of its pairing to every other base)
	probs = {} # {base : bpprob}

	#Populate dictionary
	for i in range(len(seq)):
		probs[i] = 0

	#For some reason the slow point in the folding analysis is the writing of the .ps files.
	#This means that sometimes the folding will be finished, but the .ps file that this is trying to read is not made yet
	#Try to read it.  If it's not there, wait 1 sec and then try again.
	while True:
		try:
			bpfh = open(outputfile, 'r')
			break
		except IOError:
			time.sleep(1)

	#lines containing bp probs have 'ubox' in the 4th field
	for line in bpfh:
		line = line.strip().split(' ')
		if len(line) != 4:
			continue
		if line[3] == 'ubox':
			leftbase = int(line[0]) - 1 #make these 0 based
			rightbase = int(line[1]) - 1
			#3rd field is square root of bpprob
			bpprob = float(line[2])**2
			probs[leftbase] += bpprob
			probs[rightbase] += bpprob

	bpfh.close()

	return probs

#Parse the output of an RNA fold (usually the folding of one of several sequences)
def parseRNAfoldoutput(outputfile, seq):
	#Given a sequence, make bp probabilities and return the summed bp probability for a base (the sum of its pairing to every other base)
	probs = {} # {base : bpprob}

	#Populate dictionary
	for i in range(len(seq)):
		probs[i] = 0

	#For some reason the slow point in the folding analysis is the writing of the .ps files.
	#This means that sometimes the folding will be finished, but the .ps file that this is trying to read is not made yet
	#Try to read it.  If it's not there, wait 1 sec and then try again.
	while True:
		try:
			bpfh = open(outputfile, 'r')
			break
		except IOError:
			time.sleep(1)

	#lines containing bp probs have 'ubox' in the 4th field
	for line in bpfh:
		line = line.strip().split(' ')
		if len(line) != 4:
			continue
		if line[3] == 'ubox':
			leftbase = int(line[0]) - 1 #make these 0 based
			rightbase = int(line[1]) - 1
			#3rd field is square root of bpprob
			bpprob = float(line[2])**2
			probs[leftbase] += bpprob
			probs[rightbase] += bpprob

	bpfh.close()

	return probs

#Given a sequence (of one CDS or one UTR, for example), chop it into windowed sizes.
#Then fold each window.
#Keep track of the bp prob for each nt in each window.

def foldseq(seq, windowsize, slidesize):
	#If the sequence is smaller than the windowsize, we can't deal with it
	if len(seq) < windowsize:
		return None

	bpprobs = {} #{position (0-based) : [list of bpprobs for this position]}

	#populate bpprobs
	for i in range(len(seq)):
		bpprobs[i] = []

	#Chop the sequence into windows
	chopfasta(seq, windowsize, slidesize)

	#Fold the fasta file, send stdout to trash
	command = 'RNAfold -p < currentseqs.fasta > /dev/null'
	subprocess.Popen(command, shell = True)

	#Now parse the RNAfold outputs
	for record in SeqIO.parse('currentseqs.fasta', 'fasta'):
		rnafoldoutputfile = str(record.id) + '_dp.ps'
		windowstart = int(str(record.id).split('__')[1])
		probs = parseRNAfoldoutput(rnafoldoutputfile, str(record.seq))

		for position in probs:
			realseqposition = windowstart + position
			bpprobs[realseqposition].append(probs[position])

	#Take medians of the bp probs at every position across all windows as the bpprob for that position
	medianbpprobs = {}
	for position in bpprobs:
		medianbpprobs[position] = np.median(bpprobs[position])

	#Cleanup
	for f in os.listdir('.'):
		if f.endswith('_ss.ps') or f.endswith('_dp.ps'):
			os.remove(os.path.abspath(f))
	os.remove('currentseqs.fasta')

	return medianbpprobs


#Now turn the medianbpprobs into bins (like a metagene)
def turnintobins(medianbpprobs, numberofbins):
	#Given a list of nt positions, split it into n roughly-equal parts
	#http://stackoverflow.com/questions/2130016/splitting-a-list-of-arbitrary-size-into-only-roughly-n-equal-parts
	listofpositions = medianbpprobs.keys()
	bins = []
	k, m = divmod(len(listofpositions), numberofbins)
	gen = (listofpositions[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in xrange(numberofbins))
	for b in gen:
		bins.append(b)

	binnedbpprobs = {} #{bin : [bp probs of nt in that bin]}

	for ind, b in enumerate(bins):
		binnumber = ind + 1
		binnedbpprobs[binnumber] = []
		for nt in b:
			binnedbpprobs[binnumber].append(medianbpprobs[nt])

	#Now take the mean of all bpprobs in each bin as the bpprob for that bin
	meanbinnedbpprobs = {} #{bin : mean bpprob of nt in that bin}
	for binnumber in binnedbpprobs:
		meanbinnedbpprobs[binnumber] = np.mean(binnedbpprobs[binnumber])

	return meanbinnedbpprobs


#Now iterate over an entire fasta of sequences
def iteratefasta(fasta, windowsize, slidesize, numberofbins, outputfile, region, seqclass):
	if os.path.isfile(outputfile) == False:
		with open(outputfile, 'w') as outfh:
			outfh.write(('\t').join(['seqname', 'binnumber', 'bpprob', 'region', 'seqclass']) + '\n')

	for record in SeqIO.parse(fasta, 'fasta'):
		seq = str(record.seq.transcribe().upper())
		seqname = str(record.id)
		medianbpprobs = foldseq(seq, windowsize, slidesize)

		if not medianbpprobs: #if the length of the sequence is less than windowsize, medianbpprobs returns None
			continue

		meanbinnedbpprobs = turnintobins(medianbpprobs, numberofbins)
		with open(outputfile, 'a') as outfh:
			for binnumber in meanbinnedbpprobs:
				meanbpprob = meanbinnedbpprobs[binnumber]
				outfh.write(('\t').join([seqname, str(binnumber), str(meanbpprob), region, seqclass]) + '\n')


if __name__ == '__main__':
	parser = argparse.ArgumentParser()
	parser.add_argument('--fasta', type = str, help = 'Fasta file of sequences to fold.')
	parser.add_argument('--windowsize', type = int, help = 'Width of subsequences that get folded.')
	parser.add_argument('--slidesize', type = int, help = 'How far the window gets slid along the sequence to define the next window.')
	parser.add_argument('--numberofbins', type = int, help = 'How many bins to use for the metagene.')
	parser.add_argument('--region', type = str, help = 'Transcript region these sequences came from.  UTR? CDS?')
	parser.add_argument('--seqclass', type = str, help = 'Class of these sequences.  RO up? LR down?')
	parser.add_argument('--outputfile', type = str, help = 'Output file.')

	args = parser.parse_args()
	iteratefasta(args.fasta, args.windowsize, args.slidesize, args.numberofbins, args.outputfile, args.region, args.seqclass)