fastaaiv2/fastaai_preprocessor.py at main · KGerhardt/fastaaiv2 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
import sys
import os
import argparse
import multiprocessing
import sqlite3

from supports.agnostic_reader import agnostic_reader
from supports.fasta_importer import fasta_file

from supports.get_file_basename import get_basename

from supports.pyrodigal_manager import new_pyrodigal_manager
from supports.pyhmmer_manager import new_pyhmmer_manager
from supports.crystalizer import crystalizer
from supports.fasta_importer import fasta_file

class fastaai_preprocessor:
	def __init__(self, dbpath, genome_directory, module = "prokaryotes", output_nt = False, threads = 1, compress = True):
		self.dbpath = dbpath
		self.input_genomes = genome_directory
		self.module = module

		self.tax_by_genome = {}

		self.output_nt = output_nt

		self.threads = threads
		self.compress = compress

	def make_directories(self):
		if not os.path.exists("proteins_aa"):
			os.mkdir("proteins_aa")
		if not os.path.exists("proteins_nt"):
			os.mkdir("proteins_nt")
		if not os.path.exists("hmm_searches"):
			os.mkdir("hmm_searches")
		if not os.path.exists("crystals"):
			os.mkdir("crystals")

	def load_taxonomy(self, taxfile):
		self.tax_by_genome = {}
		with open(taxfile) as fh:
			for line in fh:
				segs = line.strip().split("\t")
				genome = segs[0]
				gtdb_fmt_tax_string = segs[1]
				self.tax_by_genome[genome] = gtdb_fmt_tax_string

	def run_preprocessing(self):
		genome_files = os.listdir(self.input_genomes)
		genome_files = [os.path.normpath(self.input_genomes + "/" + g) for g in genome_files]
		if self.threads > len(genome_files):
			self.threads = len(genome_files)

		for g in genome_files:
			bn = get_basename(g)
			if bn not in self.tax_by_genome:
				self.tax_by_genome[bn] = 'd__;p__;c__;o__;f__;g__;s__'

		self.make_directories()

		if self.module == "prokaryotes":
			pyhmmerer = new_pyhmmer_manager(self.compress)
			pyhmmerer.load_hmm_from_database(self.dbpath)

			preproc_args = [prokaryotic_input_file(input_genome = g,
													taxonomy = self.tax_by_genome[get_basename(g)],
													compress = self.compress,
													pyhmmer_manager = pyhmmerer,
													nucleotide_output = self.output_nt) for g in genome_files]

		if self.module == "eukaryotes":
			pass
			#Still working on this. Need to decide on metaeuk or AUGUSTUS. AUGUSTUS sucks to install.

		pool = multiprocessing.Pool(self.threads)
		for result in pool.imap_unordered(run_preprocessor, preproc_args):
			print(result)

		pool.close()
		pool.join()


class prokaryotic_input_file:
	def __init__(self,
				genome_type = "prokaryote",
				preprocessing_stage = "genome",
				add_rules = None,
				input_genome = None,
				input_protein_aa = None,
				input_protein_nt = None,
				input_hmm = None,
				input_crystal = None,
				taxonomy = 'd__;p__;c__;o__;f__;g__;s__',
				compress = False,
				nucleotide_output = False,
				prot_nt_out = "proteins_nt",
				prot_aa_out = "proteins_aa",
				hmm_out = "hmm_searches",
				crystal_out = "crystals",
				pyhmmer_manager = None
				):

		self.gtype = genome_type
		self.start_point = preprocessing_stage
		self.genome_class_rules = add_rules

		self.genome_file = input_genome
		self.protein_file_aa = input_protein_aa
		self.protein_file_nt = input_protein_nt

		self.hmm_file = input_hmm
		self.crystal_file = input_crystal

		self.name_base = get_basename(self.genome_file)

		self.genome = None
		self.protein = None
		self.protein_nt_ct = None
		self.hmm_best_hits = None
		self.hmm_acc_to_score = None
		self.crystal = None

		self.prot_aa_out_dir = prot_aa_out
		self.prot_nt_out_dir = prot_nt_out
		self.hmm_out_dir = hmm_out
		self.crystal_out_dir = crystal_out

		self.compress = compress
		self.output_nt = nucleotide_output

		self.pyhmmer_manager = pyhmmer_manager

		self.genome_length = 0
		self.num_proteins = 0
		self.num_scps = 0

		self.gtdb_tax_string = taxonomy

	def genome_to_protein(self):
		if self.genome_file is None and self.protein_file is None:
			print("FastAAI cannot proceed without either a genome or a protein file")
			return None

		if self.genome_file is not None:
			fr = fasta_file(self.genome_file)
			fr.read()
			for seq in fr.sequences:
				self.genome_length += len(fr.sequences[seq])

			self.genome = fr.sequences
			fr = None

		if self.protein_file_aa is None:
			self.protein_file_aa = os.path.normpath(self.prot_aa_out_dir + "/" +self.name_base+"_protein_aa.txt")
			if self.output_nt:
				self.protein_file_nt = os.path.normpath(self.prot_nt_out_dir + "/" +self.name_base+"_protein_nt.txt")
			else:
				self.protein_file_nt = None


		mn = new_pyrodigal_manager()
		self.protein, aa_filepath_copy = mn.run_in_mem(genome_dict = self.genome, compress = self.compress, outnt = self.protein_file_nt, outaa = self.protein_file_aa)
		self.protein_nt_ct = mn.protein_length_nt
		mn = None

	def protein_to_hmm(self):
		if self.protein is None:
			if self.protein_file is None:
				print("Cannot load a nonexistent protein file.")
			else:
				fr = fasta_file(self.protein_file)
				fr.read()
				self.protein = fr.seqeunces
				fr = None

		if self.protein is not None:
			self.pyhmmer_manager.convert_protein_seqs_in_mem(self.protein)
			self.pyhmmer_manager.search_protein()
			self.hmm = os.path.normpath(self.hmm_out_dir + "/" + self.name_base + "_hmmsearch.txt")
			self.pyhmmer_manager.to_hmm_file_pyhmmer_method(self.hmm)
			self.hmm_best_hits, self.hmm_acc_to_score = self.pyhmmer_manager.filter_to_best_hits()

	def run_crystalize(self):
		mn = crystalizer(taxonomy = self.gtdb_tax_string, compress = self.compress)
		mn.set_values_from_mem(basename = self.name_base,
								genlength = self.genome_length,
								p_to_acc = self.hmm_best_hits,
								acc_to_bh = self.hmm_acc_to_score,
								prot_seqdict = self.protein)

		self.crystal = mn.jsonify_loaded_data()
		if self.crystal_file is None:
			self.crystal_file = os.path.normpath(self.crystal_out_dir + "/" +self.name_base+"_fastaai_crystal.txt")

		mn.write_crystal(self.crystal_file)

	def run(self):
		self.genome_to_protein()
		self.protein_to_hmm()
		self.run_crystalize()
		return self.name_base


def hmm_preproc_initializer(hmm_database, do_compress = False):
	global pyhmmer_manager
	pyhmmer_manager = new_pyhmmer_manager(do_compress)
	pyhmmer_manager.load_hmm_from_database(hmm_database)
	return pyhmmer_manager

def run_preprocessor(preproc_obj):
	this_genome = preproc_obj.run()
	return this_genome


import argparse
def consumer_options():
	parser = argparse.ArgumentParser(description='Process genomes into FastAAI v2 crystals. Needs a database that has been initialized with fastaai2 init and a set of genomes')

	parser.add_argument('-d', '--database', dest='database', default = None,
						help='An initialized FastAAI v2 database')
	parser.add_argument('-g', '--genomes', dest='genome_dir', default = None,
						help='A directory containing genomes to preprocess')
	parser.add_argument('--taxonomy_file', dest='tax', default = None,
						help='A tab-separated file containing genome ID in column 1, GTDB-formatted taxonomy string in column 2. No header.')
	parser.add_argument('--eukaryotic', dest='is_euk', action = 'store_true',
						help='Flag to indicate that genomes are eukaryotic.')
	parser.add_argument('--output_nt', dest='nt', action = 'store_true',
						help='Flag for producing nucleotide sequences for predicted proteins in addition to amino acid sequences. These are not used by FastAAI, this is just for your convenience.')
	parser.add_argument('--compress', dest='compress', action = 'store_true',
						help='Flag for gzipping outputs. Modestly increases runtime.')

	parser.add_argument('-t', '--threads', dest = 'threads', default = 1, type = int,
						help='How many processes to use')


	args = parser.parse_known_args()
	return parser, vars(args[0])

def fastaai_preprocess():
	parser, args = consumer_options()
	if len(sys.argv) < 3:
		parser.print_help()
	else:
		db = args['database']
		genomes = args['genome_dir']
		tax_file = args['tax']
		euk = args['is_euk']
		do_nt = args['nt']
		comp = args['compress']
		threads = args['threads']

		if euk:
			org = 'eukaryotes'
		else:
			org = 'prokaryotes'

		if not os.path.exists(db):
			print("A FastAAI v2 database must already exist at path", db, "or this step cannot proceed.")
			print("Please check that a database already exists or make one with fastaai2 init")
		else:
			mn = fastaai_preprocessor(dbpath = db, genome_directory = genomes, module = org, output_nt = do_nt, threads = threads, compress = comp)
			mn.load_taxonomy(tax_file)
			mn.run_preprocessing()