From 82c127aa83ec751033feb3c8b4796f38e4263abd Mon Sep 17 00:00:00 2001 From: Ben J Woodcroft Date: Thu, 8 Jan 2026 09:12:50 +1000 Subject: [PATCH 1/2] Add supplement output for matched proteins --- singlem/main.py | 2 ++ singlem/supplement.py | 32 ++++++++++++++++++++++++++------ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/singlem/main.py b/singlem/main.py index 63e77e34..b2f1759f 100755 --- a/singlem/main.py +++ b/singlem/main.py @@ -754,6 +754,7 @@ def add_prokaryotic_fraction_parser(name, description, deprecated=False): current_default = '1e-20' supplement_rare_group.add_argument('--hmmsearch-evalue', help='evalue for hmmsearch run on proteins to gather markers [default: {}]'.format(current_default), default=current_default) supplement_rare_group.add_argument('--gene-definitions', help='Tab-separated file of genome_fastatranscript_fastaprotein_fasta [default: undefined, call genes using Prodigal]') + supplement_rare_group.add_argument('--output-matched-protein-sequences', help='Write protein sequences matched by hmmsearch to this file') supplement_rare_group.add_argument('--working-directory', help='working directory [default: use a temporary directory]') supplement_rare_group.add_argument('--no-taxon-genome-lengths', help='Do not include taxon genome lengths in updated metapackage', action='store_true') supplement_rare_group.add_argument('--ignore-taxonomy-database-incompatibility', help='Do not halt when the old metapackage is not the default metapackage.', action='store_true') @@ -1522,6 +1523,7 @@ def add_prokaryotic_fraction_parser(name, description, deprecated=False): skip_taxonomy_check=args.skip_taxonomy_check, no_taxon_genome_lengths=args.no_taxon_genome_lengths, gene_definitions=args.gene_definitions, + output_matched_protein_sequences=args.output_matched_protein_sequences, ignore_taxonomy_database_incompatibility=args.ignore_taxonomy_database_incompatibility, new_taxonomy_database_name=args.new_taxonomy_database_name, new_taxonomy_database_version=args.new_taxonomy_database_version, diff --git a/singlem/supplement.py b/singlem/supplement.py index 4b90b206..a462d3f3 100755 --- a/singlem/supplement.py +++ b/singlem/supplement.py @@ -345,7 +345,8 @@ def generate_new_singlem_package(myargs): return new_spkg_path -def run_hmmsearch_on_one_genome(lock, data, matched_transcripts_fna, working_directory, hmmsearch_evalue, concatenated_hmms): +def run_hmmsearch_on_one_genome(lock, data, matched_transcripts_fna, output_matched_protein_sequences, + working_directory, hmmsearch_evalue, concatenated_hmms): total_num_transcripts = 0 failure_genomes = 0 num_transcriptomes = 0 @@ -411,6 +412,19 @@ def run_hmmsearch_on_one_genome(lock, data, matched_transcripts_fna, working_dir new_name = genome_basename + '‡' + name # Use ‡ to separate genome and original ID, must be kept in check with elsewhere in the code print('>' + new_name + '\n' + seq + '\n', file=f) num_printed += 1 + + if output_matched_protein_sequences: + num_printed_proteins = 0 + with open(tranp.protein_fasta) as g: + with lock: + with open(output_matched_protein_sequences, 'a') as f: + for name, seq, _ in SeqReader().readfq(g): + if name in matched_transcript_ids: + genome_basename = remove_extension(os.path.basename(genome)) + new_name = genome_basename + '‡' + name # Use ‡ to separate genome and original ID, must be kept in check with elsewhere in the code + print('>' + new_name + '\n' + seq + '\n', file=f) + num_printed_proteins += 1 + logging.debug("Printed {} proteins for {}".format(num_printed_proteins, genome)) logging.debug("Printed {} transcripts for {}".format(num_printed, genome)) if num_printed != len(matched_transcript_ids): logging.error("Expected to print {} transcripts for {}, but only printed {}".format( @@ -421,7 +435,7 @@ def run_hmmsearch_on_one_genome(lock, data, matched_transcripts_fna, working_dir def gather_hmmsearch_results(num_threads, working_directory, old_metapackage, new_genome_transcripts_and_proteins, - hmmsearch_evalue): + hmmsearch_evalue, output_matched_protein_sequences): # Run hmmsearch using a concatenated set of HMMs from each graftm package in the metapackage # Create concatenated HMMs in working_directory/concatenated_alignment_hmms.hmm concatenated_hmms = os.path.join(working_directory, 'concatenated_alignment_hmms.hmm') @@ -441,6 +455,9 @@ def gather_hmmsearch_results(num_threads, working_directory, old_metapackage, ne # Create a new file which is a concatenation of the transcripts we want to include # Use a lock to prevent race conditions since each worker writes this this matched_transcripts_fna = os.path.join(working_directory, 'matched_transcripts.fna') + if output_matched_protein_sequences: + with open(output_matched_protein_sequences, 'w'): + pass total_num_transcripts = 0 total_failure_genomes = 0 @@ -456,7 +473,7 @@ def gather_hmmsearch_results(num_threads, working_directory, old_metapackage, ne with get_context('spawn').Pool(num_threads) as pool: map_result = pool.starmap( run_hmmsearch_on_one_genome, - [(lock, data, matched_transcripts_fna, working_directory, hmmsearch_evalue, concatenated_hmms) for data in new_genome_transcripts_and_proteins.items()], + [(lock, data, matched_transcripts_fna, output_matched_protein_sequences, working_directory, hmmsearch_evalue, concatenated_hmms) for data in new_genome_transcripts_and_proteins.items()], chunksize=1) for (num_transcripts, failure_genomes, num_transcriptomes, num_found_transcripts) in map_result: @@ -481,7 +498,7 @@ def gather_hmmsearch_results(num_threads, working_directory, old_metapackage, ne def generate_new_metapackage(num_threads, working_directory, old_metapackage, new_genome_fasta_files, new_taxonomies_file, new_genome_transcripts_and_proteins, hmmsearch_evalue, checkm2_quality_file, no_taxon_genome_lengths, new_taxonomy_database_name, - new_taxonomy_database_version): + new_taxonomy_database_version, output_matched_protein_sequences): # Add the new genome data to each singlem package # For each package, the unaligned seqs are in the graftm package, @@ -509,7 +526,8 @@ def generate_new_metapackage(num_threads, working_directory, old_metapackage, ne logging.info("Gathering OTUs from new genomes ..") matched_transcripts_fna = gather_hmmsearch_results(num_threads, working_directory, old_metapackage, - new_genome_transcripts_and_proteins, hmmsearch_evalue) + new_genome_transcripts_and_proteins, hmmsearch_evalue, + output_matched_protein_sequences) if matched_transcripts_fna is None: # No transcripts were found, so no new OTUs to add return None @@ -842,6 +860,7 @@ def supplement(self, **kwargs): skip_taxonomy_check = kwargs.pop('skip_taxonomy_check') no_taxon_genome_lengths = kwargs.pop('no_taxon_genome_lengths') gene_definitions = kwargs.pop('gene_definitions') + output_matched_protein_sequences = kwargs.pop('output_matched_protein_sequences') ignore_taxonomy_database_incompatibility = kwargs.pop('ignore_taxonomy_database_incompatibility') new_taxonomy_database_name = kwargs.pop('new_taxonomy_database_name') new_taxonomy_database_version = kwargs.pop('new_taxonomy_database_version') @@ -969,7 +988,8 @@ def supplement(self, **kwargs): new_genome_fasta_files, taxonomy_file, genome_transcripts_and_proteins, hmmsearch_evalue, checkm2_quality_file, no_taxon_genome_lengths, - new_taxonomy_database_name, new_taxonomy_database_version) + new_taxonomy_database_name, new_taxonomy_database_version, + output_matched_protein_sequences) if new_metapackage is not None: logging.info("Copying generated metapackage to {}".format(output_metapackage)) From fc60e44abbdfafd1cec4a6055741a6a65749fc26 Mon Sep 17 00:00:00 2001 From: Ben J Woodcroft Date: Thu, 8 Jan 2026 11:01:29 +1000 Subject: [PATCH 2/2] Add test for matched protein output --- test/test_supplement.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/test/test_supplement.py b/test/test_supplement.py index 1dd8b05c..eb0c3e2a 100755 --- a/test/test_supplement.py +++ b/test/test_supplement.py @@ -32,6 +32,7 @@ sys.path = [os.path.join(os.path.dirname(os.path.realpath(__file__)),'..')]+sys.path from singlem.metapackage import Metapackage +from singlem.sequence_classes import SeqReader path_to_script = 'singlem' path_to_data = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'supplement')) @@ -83,6 +84,44 @@ def test_defined_taxonomy(self): self.assertEqual('test_supplement', mpkg.taxonomy_database_name()) self.assertEqual('1.0', mpkg.taxonomy_database_version()) + def test_output_matched_protein_sequences(self): + with in_tempdir(): + gene_definitions = os.path.join(os.getcwd(), 'gene_definitions.tsv') + genome_fasta = os.path.join(path_to_data, 'GCA_011373445.1_genomic.mutated93_ms.manually_added_nongaps.fna') + transcript_fasta = os.path.join(path_to_data, 'GCA_011373445.1_genomic.mutated93_ms.manually_added_nongaps.fna.ffn') + protein_fasta = os.path.join(path_to_data, 'GCA_011373445.1_genomic.mutated93_ms.manually_added_nongaps.fna.faa') + with open(gene_definitions, 'w') as f: + f.write("genome_fasta\ttranscript_fasta\tprotein_fasta\n") + f.write(f"{genome_fasta}\t{transcript_fasta}\t{protein_fasta}\n") + + matched_proteins = os.path.join(os.getcwd(), 'matched_proteins.faa') + cmd = ( + f"{run} --ignore-taxonomy-database-incompatibility --no-taxon-genome-lengths " + f"--no-dereplication --skip-taxonomy-check --hmmsearch-evalue 1e-5 --no-quality-filter " + f"--new-genome-fasta-files {genome_fasta} --input-metapackage {path_to_data}/4.11.22seqs.gpkg.spkg.smpkg/ " + f"--output-metapackage out.smpkg --new-fully-defined-taxonomies " + f"{path_to_data}/GCA_011373445.1_genomic.mutated93_ms.manually_added_nongaps.fna.taxonomy " + f"--new-taxonomy-database-name test_supplement --new-taxonomy-database-version 1.0 " + f"--gene-definitions {gene_definitions} --output-matched-protein-sequences {matched_proteins}" + ) + extern.run(cmd) + + with open(protein_fasta) as f: + input_protein_names = {name for name, _, _ in SeqReader().readfq(f)} + + matched_records = [] + with open(matched_proteins) as f: + matched_records = list(SeqReader().readfq(f)) + + self.assertGreater(len(matched_records), 0) + for name, _, _ in matched_records: + genome_name, protein_name = name.split('‡', 1) + self.assertEqual( + genome_name, + 'GCA_011373445.1_genomic.mutated93_ms.manually_added_nongaps', + ) + self.assertIn(protein_name, input_protein_names) + @pytest.mark.expensive def test_auto_taxonomy(self): with in_tempdir():