Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions singlem/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -754,6 +754,7 @@ def add_prokaryotic_fraction_parser(name, description, deprecated=False):
current_default = '1e-20'
supplement_rare_group.add_argument('--hmmsearch-evalue', help='evalue for hmmsearch run on proteins to gather markers [default: {}]'.format(current_default), default=current_default)
supplement_rare_group.add_argument('--gene-definitions', help='Tab-separated file of genome_fasta<TAB>transcript_fasta<TAB>protein_fasta [default: undefined, call genes using Prodigal]')
supplement_rare_group.add_argument('--output-matched-protein-sequences', help='Write protein sequences matched by hmmsearch to this file')
supplement_rare_group.add_argument('--working-directory', help='working directory [default: use a temporary directory]')
supplement_rare_group.add_argument('--no-taxon-genome-lengths', help='Do not include taxon genome lengths in updated metapackage', action='store_true')
supplement_rare_group.add_argument('--ignore-taxonomy-database-incompatibility', help='Do not halt when the old metapackage is not the default metapackage.', action='store_true')
Expand Down Expand Up @@ -1522,6 +1523,7 @@ def add_prokaryotic_fraction_parser(name, description, deprecated=False):
skip_taxonomy_check=args.skip_taxonomy_check,
no_taxon_genome_lengths=args.no_taxon_genome_lengths,
gene_definitions=args.gene_definitions,
output_matched_protein_sequences=args.output_matched_protein_sequences,
ignore_taxonomy_database_incompatibility=args.ignore_taxonomy_database_incompatibility,
new_taxonomy_database_name=args.new_taxonomy_database_name,
new_taxonomy_database_version=args.new_taxonomy_database_version,
Expand Down
32 changes: 26 additions & 6 deletions singlem/supplement.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,8 @@ def generate_new_singlem_package(myargs):
return new_spkg_path


def run_hmmsearch_on_one_genome(lock, data, matched_transcripts_fna, working_directory, hmmsearch_evalue, concatenated_hmms):
def run_hmmsearch_on_one_genome(lock, data, matched_transcripts_fna, output_matched_protein_sequences,
working_directory, hmmsearch_evalue, concatenated_hmms):
total_num_transcripts = 0
failure_genomes = 0
num_transcriptomes = 0
Expand Down Expand Up @@ -411,6 +412,19 @@ def run_hmmsearch_on_one_genome(lock, data, matched_transcripts_fna, working_dir
new_name = genome_basename + '‡' + name # Use ‡ to separate genome and original ID, must be kept in check with elsewhere in the code
print('>' + new_name + '\n' + seq + '\n', file=f)
num_printed += 1

if output_matched_protein_sequences:
num_printed_proteins = 0
with open(tranp.protein_fasta) as g:
with lock:
with open(output_matched_protein_sequences, 'a') as f:
for name, seq, _ in SeqReader().readfq(g):
if name in matched_transcript_ids:
genome_basename = remove_extension(os.path.basename(genome))
new_name = genome_basename + '‡' + name # Use ‡ to separate genome and original ID, must be kept in check with elsewhere in the code
print('>' + new_name + '\n' + seq + '\n', file=f)
num_printed_proteins += 1
logging.debug("Printed {} proteins for {}".format(num_printed_proteins, genome))
logging.debug("Printed {} transcripts for {}".format(num_printed, genome))
if num_printed != len(matched_transcript_ids):
logging.error("Expected to print {} transcripts for {}, but only printed {}".format(
Expand All @@ -421,7 +435,7 @@ def run_hmmsearch_on_one_genome(lock, data, matched_transcripts_fna, working_dir


def gather_hmmsearch_results(num_threads, working_directory, old_metapackage, new_genome_transcripts_and_proteins,
hmmsearch_evalue):
hmmsearch_evalue, output_matched_protein_sequences):
# Run hmmsearch using a concatenated set of HMMs from each graftm package in the metapackage
# Create concatenated HMMs in working_directory/concatenated_alignment_hmms.hmm
concatenated_hmms = os.path.join(working_directory, 'concatenated_alignment_hmms.hmm')
Expand All @@ -441,6 +455,9 @@ def gather_hmmsearch_results(num_threads, working_directory, old_metapackage, ne
# Create a new file which is a concatenation of the transcripts we want to include
# Use a lock to prevent race conditions since each worker writes this this
matched_transcripts_fna = os.path.join(working_directory, 'matched_transcripts.fna')
if output_matched_protein_sequences:
with open(output_matched_protein_sequences, 'w'):
pass

total_num_transcripts = 0
total_failure_genomes = 0
Expand All @@ -456,7 +473,7 @@ def gather_hmmsearch_results(num_threads, working_directory, old_metapackage, ne
with get_context('spawn').Pool(num_threads) as pool:
map_result = pool.starmap(
run_hmmsearch_on_one_genome,
[(lock, data, matched_transcripts_fna, working_directory, hmmsearch_evalue, concatenated_hmms) for data in new_genome_transcripts_and_proteins.items()],
[(lock, data, matched_transcripts_fna, output_matched_protein_sequences, working_directory, hmmsearch_evalue, concatenated_hmms) for data in new_genome_transcripts_and_proteins.items()],
chunksize=1)

for (num_transcripts, failure_genomes, num_transcriptomes, num_found_transcripts) in map_result:
Expand All @@ -481,7 +498,7 @@ def gather_hmmsearch_results(num_threads, working_directory, old_metapackage, ne
def generate_new_metapackage(num_threads, working_directory, old_metapackage, new_genome_fasta_files,
new_taxonomies_file, new_genome_transcripts_and_proteins, hmmsearch_evalue,
checkm2_quality_file, no_taxon_genome_lengths, new_taxonomy_database_name,
new_taxonomy_database_version):
new_taxonomy_database_version, output_matched_protein_sequences):

# Add the new genome data to each singlem package
# For each package, the unaligned seqs are in the graftm package,
Expand Down Expand Up @@ -509,7 +526,8 @@ def generate_new_metapackage(num_threads, working_directory, old_metapackage, ne

logging.info("Gathering OTUs from new genomes ..")
matched_transcripts_fna = gather_hmmsearch_results(num_threads, working_directory, old_metapackage,
new_genome_transcripts_and_proteins, hmmsearch_evalue)
new_genome_transcripts_and_proteins, hmmsearch_evalue,
output_matched_protein_sequences)
if matched_transcripts_fna is None:
# No transcripts were found, so no new OTUs to add
return None
Expand Down Expand Up @@ -842,6 +860,7 @@ def supplement(self, **kwargs):
skip_taxonomy_check = kwargs.pop('skip_taxonomy_check')
no_taxon_genome_lengths = kwargs.pop('no_taxon_genome_lengths')
gene_definitions = kwargs.pop('gene_definitions')
output_matched_protein_sequences = kwargs.pop('output_matched_protein_sequences')
ignore_taxonomy_database_incompatibility = kwargs.pop('ignore_taxonomy_database_incompatibility')
new_taxonomy_database_name = kwargs.pop('new_taxonomy_database_name')
new_taxonomy_database_version = kwargs.pop('new_taxonomy_database_version')
Expand Down Expand Up @@ -969,7 +988,8 @@ def supplement(self, **kwargs):
new_genome_fasta_files, taxonomy_file,
genome_transcripts_and_proteins, hmmsearch_evalue,
checkm2_quality_file, no_taxon_genome_lengths,
new_taxonomy_database_name, new_taxonomy_database_version)
new_taxonomy_database_name, new_taxonomy_database_version,
output_matched_protein_sequences)

if new_metapackage is not None:
logging.info("Copying generated metapackage to {}".format(output_metapackage))
Expand Down
39 changes: 39 additions & 0 deletions test/test_supplement.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@

sys.path = [os.path.join(os.path.dirname(os.path.realpath(__file__)),'..')]+sys.path
from singlem.metapackage import Metapackage
from singlem.sequence_classes import SeqReader

path_to_script = 'singlem'
path_to_data = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'supplement'))
Expand Down Expand Up @@ -83,6 +84,44 @@ def test_defined_taxonomy(self):
self.assertEqual('test_supplement', mpkg.taxonomy_database_name())
self.assertEqual('1.0', mpkg.taxonomy_database_version())

def test_output_matched_protein_sequences(self):
    """Check that --output-matched-protein-sequences writes a valid FASTA.

    Runs `singlem supplement` on one genome with pre-called genes
    (--gene-definitions) and asserts that every record in the matched
    protein output is named '<genome_basename>‡<original_protein_id>',
    where the protein ID exists in the input protein FASTA.
    """
    with in_tempdir():
        gene_definitions = os.path.join(os.getcwd(), 'gene_definitions.tsv')
        genome_fasta = os.path.join(path_to_data, 'GCA_011373445.1_genomic.mutated93_ms.manually_added_nongaps.fna')
        transcript_fasta = os.path.join(path_to_data, 'GCA_011373445.1_genomic.mutated93_ms.manually_added_nongaps.fna.ffn')
        protein_fasta = os.path.join(path_to_data, 'GCA_011373445.1_genomic.mutated93_ms.manually_added_nongaps.fna.faa')
        # Gene definitions file format: genome_fasta<TAB>transcript_fasta<TAB>protein_fasta
        with open(gene_definitions, 'w') as f:
            f.write("genome_fasta\ttranscript_fasta\tprotein_fasta\n")
            f.write(f"{genome_fasta}\t{transcript_fasta}\t{protein_fasta}\n")

        matched_proteins = os.path.join(os.getcwd(), 'matched_proteins.faa')
        cmd = (
            f"{run} --ignore-taxonomy-database-incompatibility --no-taxon-genome-lengths "
            f"--no-dereplication --skip-taxonomy-check --hmmsearch-evalue 1e-5 --no-quality-filter "
            f"--new-genome-fasta-files {genome_fasta} --input-metapackage {path_to_data}/4.11.22seqs.gpkg.spkg.smpkg/ "
            f"--output-metapackage out.smpkg --new-fully-defined-taxonomies "
            f"{path_to_data}/GCA_011373445.1_genomic.mutated93_ms.manually_added_nongaps.fna.taxonomy "
            f"--new-taxonomy-database-name test_supplement --new-taxonomy-database-version 1.0 "
            f"--gene-definitions {gene_definitions} --output-matched-protein-sequences {matched_proteins}"
        )
        extern.run(cmd)

        # Collect the set of protein IDs present in the input FASTA so we can
        # verify that every matched record traces back to a real input protein.
        with open(protein_fasta) as f:
            input_protein_names = {name for name, _, _ in SeqReader().readfq(f)}

        # (Previously this was pre-initialised to [] and immediately
        # reassigned — the dead assignment has been removed.)
        with open(matched_proteins) as f:
            matched_records = list(SeqReader().readfq(f))

        # At least one marker protein must have been matched by hmmsearch.
        self.assertGreater(len(matched_records), 0)
        for name, _, _ in matched_records:
            # '‡' separates the genome basename from the original sequence ID;
            # split only on the first occurrence in case the ID itself contains it.
            genome_name, protein_name = name.split('‡', 1)
            self.assertEqual(
                genome_name,
                'GCA_011373445.1_genomic.mutated93_ms.manually_added_nongaps',
            )
            self.assertIn(protein_name, input_protein_names)

@pytest.mark.expensive
def test_auto_taxonomy(self):
with in_tempdir():
Expand Down
Loading