AnalysisScripts/GquadruplexConservation.py at master · TaliaferroLab/AnalysisScripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#For some reason on coyote, the fastest sqlite for use with gffutils is 3.7.13

import pysam
from Bio import SeqIO
import sys
import gffutils
import os
import pysam
import argparse

def getquadgs(gquadoutfile):
	#First get the coords of G's that are predicted to be in Gquads from gquadout.txt file (from UTRfold_Gquadruplex.py)
	gquadpositions = {} #{geneid : [positions in sequence]}
	with open(gquadoutfile, 'r') as gqf:
		for line in gqf:
			line = line.strip().split('\t')
			if line[0] == 'seqname':
				continue
			#gene ids look like ENSMUSG00000020097.14_ENSMUSG00000020097.14
			geneid = line[0].split('_')[0]
			positions = line[4]
			if positions != 'none':
				positions = [int(pos) for pos in positions.split(',')] #these positions are 0-based relative to the start of the sequence
			elif positions == 'none':
				positions = []
			gquadpositions[geneid] = positions

	return gquadpositions

def getnongquadgs(gquadoutfile, fasta):
	#Given a gquadoutfile and a fasta of the corresponding sequences, get the positions of any G that was not predicted to be in a quadruplex

	#Get positions of all G's in fasta
	allgpos = {} #{geneid: [all g positions, 0-based]}
	for record in SeqIO.parse(fasta, 'fasta'):
		geneid = str(record.id).split('_')[0]
		seq = str(record.seq)
		gpos = [pos for pos, nt in enumerate(seq) if nt == 'G']
		allgpos[geneid] = gpos

	nongquadpositions = {}
	with open(gquadoutfile, 'r') as gqf:
		for line in gqf:
			line = line.strip().split('\t')
			if line[0] == 'seqname':
				continue
			geneid = line[0].split('_')[0]
			positions = line[4]
			if positions != 'none':
				gquadpositions = [int(pos) for pos in positions.split(',')] #these positions are 0-based relative to the start of the sequence
			elif positions == 'none':
				gquadpositions = []
			#Remove anything from allgpos that is present in positions
			allg = allgpos[geneid]
			for gquadposition in gquadpositions:
				allg.remove(gquadposition)
			nongquadpositions[geneid] = allg

	return nongquadpositions


def getcoords(gpositions, feature, gff):
	#Given a list of gpositions within a fasta, get their genome coordinates
	#The gff is one that corresponds to the fasta, usually a longestXXX.gff3.
	#Feature is the the featuretype (3rd field in gff) that you want to go after, usually UTR5, CDS, or UTR3

	#Get all genome coords for a feature that are exonic
	print 'Indexing gff...'
	gff_fn = gff
	db_fn = os.path.abspath(gff_fn) + '.db'
	if os.path.isfile(db_fn) == False:
		gffutils.create_db(gff_fn, db_fn, merge_strategy = 'merge', verbose = True)

	db = gffutils.FeatureDB(db_fn)
	print 'Done indexing!'

	features = db.features_of_type(feature)
	featurecounter = 0

	exoniccoords = {} # {geneid : [chrm, strand, [list of all positions that are exonic, e.g. 400, 401, 402, 403, 410, 411, 412, etc.]]}
	#These will end up being 1-based gff-style coords
	for feature in features:
		featurecounter +=1
		if featurecounter % 5000 == 0:
			print 'Feature {0}...'.format(featurecounter)
		geneid = feature.attributes['Parent'][0].replace('gene:', '')
		chrm = str(feature.chrom)
		strand = str(feature.strand)
		exonicpos = []
		for exon in db.children(feature, featuretype = 'exon', level = 1):
			exonicnts = range(exon.start, exon.end + 1)
			exonicpos = exonicpos + exonicnts
		if strand == '+':
			exonicpos = sorted(exonicpos)
		elif strand == '-':
			exonicpos = sorted(exonicpos)[::-1]
		exoniccoords[geneid] = [chrm, strand, exonicpos]

	#Now intersect g positions within the feature with exonic coords
	gcoords = {} #{geneid : [chrm, [coords]]}
	for gene in gpositions:
		gpos = gpositions[gene]
		chrm = exoniccoords[gene][0]
		strand = exoniccoords[gene][1]
		ecoords = exoniccoords[gene][2]
		gc = []
		for gp in gpos:
			if strand == '+':
				gc.append(ecoords[gp])
			#If it's on the minus strand we have to count backwards from the end
			elif strand == '-':
				gc.append(ecoords[gp])
		gcoords[gene] = [chrm, strand, sorted(gc)]

	return gcoords

def getphastscores(phastconsbed, gcoords):
	scores = [] #all scores
	scoresd = {} #{geneid : [scores]}
	tbx = pysam.Tabixfile(phastconsbed)
	nt = 0
	ntwithscores = 0

	for gene in gcoords:
		pcscores = [] #list of all pc scores for coords in this gene
		chrm = gcoords[gene][0]
		coords = gcoords[gene][2]
		for coord in coords:
			nt +=1
			for row in tbx.fetch(chrm, coord, coord + 1, parser = pysam.asTuple()):
				score = row[4]
				if score:
					ntwithscores +=1
					scores.append(score)
					pcscores.append(score)
		if pcscores:
			scoresd[gene] = pcscores

	print 'Interrogated {0} nucleotides. Found phastcons scores for {1} ({2}%) of them.'.format(nt, ntwithscores, round((ntwithscores / float(nt)), 4) * 100)

	return scores, scoresd


if __name__ == '__main__':
	parser = argparse.ArgumentParser()
	parser.add_argument('--gtype', choices =['quad', 'nonquad'], type = str, help = 'Interrogate Gs participating in quads or not participating?')
	parser.add_argument('--gquadout', type = str, help = 'File produced by UTRfold_gquadruplex.py')
	parser.add_argument('--fasta', type = str, help = 'Fasta file of regions. Only necessary if gtype == nonquad. Need it to find out where the other Gs are.')
	parser.add_argument('--gff', type = str, help = 'Gff of regions. Need it to get coordinates.')
	parser.add_argument('--featurename', type = str, help = '3rd field of gff file of features you are interrogating. Usually UTR5, CDS, or UTR3.')
	parser.add_argument('--sequenceclass', type = str, help = 'Sequence bin. Usually AllGenes, Unchanged, SD1, SD2, or SD3.')
	parser.add_argument('--phastconsbed', type = str, help = 'Phastcons scores with tabix index in same directory.')
	parser.add_argument('--outfile', type = str, help = 'Output file.')
	args = parser.parse_args()

	if not os.path.isfile(os.path.abspath(args.outfile)):
		with open(args.outfile, 'w') as f:
			f.write(('\t').join(['gene', 'score', 'gquad', 'Region', 'Class']) + '\n')

	if args.gtype == 'quad':
		gpositions = getquadgs(args.gquadout)
		gcoords = getcoords(gpositions, args.featurename, args.gff)
		scores, scoresd = getphastscores(args.phastconsbed, gcoords)

		with open(args.outfile, 'a') as f:
			for gene in scoresd:
				for score in scoresd[gene]:
					f.write(('\t').join([gene, str(score), 'true', args.featurename, args.sequenceclass]) + '\n')

	elif args.gtype == 'nonquad':
		gpositions = getnongquadgs(args.gquadout, args.fasta)
		gcoords = getcoords(gpositions, args.featurename, args.gff)
		scores, scoresd = getphastscores(args.phastconsbed, gcoords)

		with open(args.outfile, 'a') as f:
			for gene in scoresd:
				for score in scoresd[gene]:
					f.write(('\t').join([gene, str(score), 'false', args.featurename, args.sequenceclass]) + '\n')