AnalysisScripts/RFPmakeCDSfasta.py at master · TaliaferroLab/AnalysisScripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#Given a genome annotation, make a fasta of only CDS sequences, excluding the first 50 nt.

import gffutils
import os
from Bio import SeqIO
import gzip
import argparse

def getCDScoords(gff):
	CDScoords = {} #{ENSTID_chrm_strand : [[cdsexon1start, cdsexon1stop], [cdsexon2start, cdsexon2stop]]}
	print 'Indexing gff...'
	gff_fn = gff
	db_fn = os.path.abspath(gff_fn) + '.db'
	if os.path.isfile(db_fn) == False:
		gffutils.create_db(gff_fn, db_fn, merge_strategy = 'merge', verbose = True)

	db = gffutils.FeatureDB(db_fn)
	print 'Done indexing!'

	genes = db.features_of_type('gene')

	genecounter = 0
	txcounter = 0
	geneswithcdscounter = 0
	txwithcdscounter = 0
	for gene in genes:
		genecounter +=1
		if genecounter % 10000 == 0:
			print 'Gene {0}...'.format(genecounter)
		genehasCDS = False
		for transcript in db.children(gene, featuretype = 'transcript', level = 1, order_by = 'start'):
			txcounter +=1
			if len(list(db.children(transcript, featuretype = 'CDS'))) == 0:
				continue
			genehasCDS = True
			txwithcdscounter +=1
			txid = str(transcript.id)
			chrm = str(transcript.chrom)
			strand = str(transcript.strand)
			ID = txid + '_' + chrm + '_' + strand
			CDScoords[ID] = []
			for cdsexon in db.children(transcript, featuretype = 'CDS', level = 1, order_by = 'start'):
				exoncoords = [cdsexon.start, cdsexon.end]
				CDScoords[ID].append(exoncoords)

		if genehasCDS:
			geneswithcdscounter +=1

	print 'Looked through {0} genes. {1} had coding exons.'.format(genecounter, geneswithcdscounter)
	print 'Looked through {1} transcripts. {1} had coding exons.'.format(txcounter, txwithcdscounter)

	return CDScoords

def getSequences(CDScoords, genomefasta, outputfasta):
	chrmswithoutseq = [] #Chromsome names that are in allCDScoords but that don't have a fasta entry in genomefasta
	txcounter = 0
	seqcounter = 0
	tooshort = 0
	wrongends = 0


	print 'Indexing genome sequence...'
	seq_dict = SeqIO.to_dict(SeqIO.parse(gzip.open(genomefasta), 'fasta'))
	print 'Done indexing!'

	with open(outputfasta, 'w') as outfh:
		for tx in CDScoords:
			txcounter +=1
			CDSseq = ''
			txname = tx.split('_')[0].replace('transcript:', '').split('.')[0]
			chrm = tx.split('_')[1]
			strand = tx.split('_')[2]

			#Is this chromosome in genomefasta?
			if chrm not in seq_dict:
				if chrm not in chrmswithoutseq:
					print 'WARNING: No entry for chromosome {0} in genomefasta.'.format(chrm)
					chrmswithoutseq.append(chrm)
				continue

			for coords in CDScoords[tx]:
				start = coords[0]
				end = coords[1]
				if strand == '+':
					exonseq = seq_dict[chrm].seq[start-1:end].upper()
					CDSseq += exonseq
				elif strand == '-':
					exonseq = seq_dict[chrm].seq[start-1:end].reverse_complement().upper()
					newseq = exonseq + CDSseq
					CDSseq = newseq

			if len(CDSseq) < 200:
				tooshort +=1
				continue

			if CDSseq[0:3] == 'ATG' and (CDSseq[-3:] == 'TAA' or CDSseq[-3:] == 'TGA' or CDSseq[-3:] == 'TAG'):
				seqcounter +=1
				#Take off first 50 nt
				outfh.write('>' + txname + '\n' + str(CDSseq[50:]) + '\n')

			else:
				wrongends +=1
				continue

	print 'Had CDS seqs for {0} transcripts. Wrote seqs for {1} of them.  {2} were too short and {3} had incorrect terminal codons.'.format(txcounter, seqcounter, tooshort, wrongends)

if __name__ == '__main__':
	parser = argparse.ArgumentParser()
	parser.add_argument('--gff', type = str, help = 'Gff annotation of genome.')
	parser.add_argument('--genomefasta', type = str, help = 'Genome sequence in fasta format.')
	parser.add_argument('--outputfasta', type = str, help = 'Output fasta.')
	args = parser.parse_args()

	CDScoords = getCDScoords(args.gff)
	getSequences(CDScoords, args.genomefasta, args.outputfasta)