AnalysisScripts/GquadruplexConservation_WGGA.py at master · TaliaferroLab/AnalysisScripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#Using phylop scores, look at the conservation of G nucleotides contained with WGGA-type
#gquadruplex-forming motifs. Compare this to the conservation of other G nucleotides that are
#not in WGGA-type gquadruplex-forming motifs.  This is analogous to GquadruplexConservation.py,
#except that this uses WGGA to look for G-quadruplexes while GquadruplexConservation.py uses
#RNAfold.

from Bio import SeqIO
import argparse
import regex
import os
import gffutils
import pysam
import numpy as np

def getallGpositions(seq):
	gpositions = []
	seq = seq.upper()
	for ind, nt in enumerate(seq):
		if nt == 'G':
			gpositions.append(ind)

	return gpositions

def getWGGAgpositions(seq):
	wggagpositions = [] #0-based coords of Gs in WGGA gquadruplex motifs
	seq = seq.upper().replace('U', 'T')
	motif = r'(([AU]GGA(.{0,5})[AU]GGA(.{0,5})[AU]GGA(.{0,5})[AU]GGA))'

	#Get all overlapping instances of the motif with their start and end coords
	motifboundaries = [[m.start(), m.end()] for m in regex.finditer(motif, seq, overlapped = True)]
	for motifboundary in motifboundaries:
		motifstart = motifboundary[0]
		motifend = motifboundary[1]
		motifseq = seq[motifstart : motifend]
		for i in range(len(motifseq)):
			#Within each motif instance, get WGGA positions
			if motifseq[i:i+4] == 'AGGA' or motifseq[i:i+4] == 'TGGA':
				#The G positions have to be the middle to positions in WGGA
				gpositions = [motifstart + i + 1, motifstart + i + 2]
				for gposition in gpositions:
					if gposition not in wggagpositions:
						wggagpositions.append(gposition)

	return wggagpositions

def iteratefasta(fasta):
	gquadpositions = {} #{geneid : [wggagquadpositions, 0-based]}
	nongquadpositions = {} #{geneid : [nonwggagquadpositions, 0-based]}
	for record in SeqIO.parse(fasta, 'fasta'):
		genename = str(record.id).split('_')[0]
		seq = str(record.seq).upper().replace('U', 'T')
		nongquadgs = []
		allgpositions = getallGpositions(seq)
		wggagpositions = getWGGAgpositions(seq)
		for position in allgpositions:
			if position not in wggagpositions:
				nongquadgs.append(position)
		gquadpositions[genename] = wggagpositions
		nongquadpositions[genename] = nongquadgs

	return gquadpositions, nongquadpositions

def getcoords(gpositions, feature, gff):
	#Given a list of gpositions within a fasta, get their genome coordinates
	#The gff is one that corresponds to the fasta, usually a longestXXX.gff3.
	#Feature is the the featuretype (3rd field in gff) that you want to go after, usually UTR5, CDS, or UTR3

	#Get all genome coords for a feature that are exonic
	print 'Indexing gff...'
	gff_fn = gff
	db_fn = os.path.abspath(gff_fn) + '.db'
	if os.path.isfile(db_fn) == False:
		gffutils.create_db(gff_fn, db_fn, merge_strategy = 'merge', verbose = True)

	db = gffutils.FeatureDB(db_fn)
	print 'Done indexing!'

	features = db.features_of_type(feature)
	featurecounter = 0

	exoniccoords = {} # {geneid : [chrm, strand, [list of all positions that are exonic, e.g. 400, 401, 402, 403, 410, 411, 412, etc.]]}
	#These will end up being 1-based gff-style coords
	for feature in features:
		featurecounter +=1
		if featurecounter % 5000 == 0:
			print 'Feature {0}...'.format(featurecounter)
		geneid = feature.attributes['Parent'][0].replace('gene:', '')
		chrm = str(feature.chrom)
		strand = str(feature.strand)
		exonicpos = []
		for exon in db.children(feature, featuretype = 'exon', level = 1):
			exonicnts = range(exon.start, exon.end + 1)
			exonicpos = exonicpos + exonicnts
		if strand == '+':
			exonicpos = sorted(exonicpos)
		elif strand == '-':
			exonicpos = sorted(exonicpos)[::-1]
		exoniccoords[geneid] = [chrm, strand, exonicpos]

	#Now intersect g positions within the feature with exonic coords
	gcoords = {} #{geneid : [chrm, [coords]]}
	for gene in gpositions:
		gpos = gpositions[gene]
		chrm = exoniccoords[gene][0]
		strand = exoniccoords[gene][1]
		ecoords = exoniccoords[gene][2]
		gc = []
		for gp in gpos:
			if strand == '+':
				gc.append(ecoords[gp])
			#If it's on the minus strand we have to count backwards from the end
			elif strand == '-':
				gc.append(ecoords[gp])
		gcoords[gene] = [chrm, strand, sorted(gc)]

	os.remove(db_fn)

	return gcoords

def getphylopscores(phylopbed, gcoords):
	scores = [] #all scores
	scoresd = {} #{geneid : [scores]}
	tbx = pysam.Tabixfile(phylopbed)
	nt = 0
	ntwithscores = 0

	for gene in gcoords:
		pcscores = [] #list of all pc scores for coords in this gene
		chrm = gcoords[gene][0]
		coords = gcoords[gene][2]
		for coord in coords:
			nt +=1
			for row in tbx.fetch(chrm, coord, coord + 1, parser = pysam.asTuple()):
				score = float(row[4])
				if score:
					ntwithscores +=1
					scores.append(score)
					pcscores.append(score)
		if pcscores:
			scoresd[gene] = pcscores

	print 'Interrogated {0} nucleotides. Found phylop scores for {1} ({2}%) of them.'.format(nt, ntwithscores, round((ntwithscores / float(nt)), 4) * 100)

	return scores, scoresd


if __name__ == '__main__':
	parser = argparse.ArgumentParser()
	parser.add_argument('--fasta', type = str, help = 'Fasta file of sequences to consider.')
	parser.add_argument('--gff', type = str, help = 'Gff of regions.  Need it to get genomic coordinates.')
	parser.add_argument('--featurename', type = str, help = '3rd field of gff file of features you are interrogating. Usually UTR5, CDS, or UTR3.')
	parser.add_argument('--phylopbed', type = str, help = 'Phylop scores in gzipped, tabix indexed format.')
	args = parser.parse_args()

	gquadpositions, nongquadpositions = iteratefasta(args.fasta)
	gquadcoords = getcoords(gquadpositions, args.featurename, args.gff)
	nongquadcoords = getcoords(nongquadpositions, args.featurename, args.gff)
	scores, scoresd = getphylopscores(args.phylopbed, gquadcoords)
	print np.mean(scores)
	scores, scoresd = getphylopscores(args.phylopbed, nongquadcoords)
	print np.mean(scores)