From 82c127aa83ec751033feb3c8b4796f38e4263abd Mon Sep 17 00:00:00 2001
From: Ben J Woodcroft <donttrustben@gmail.com>
Date: Thu, 8 Jan 2026 09:12:50 +1000
Subject: [PATCH 1/2] Add supplement output for matched proteins

---
 singlem/main.py       |  2 ++
 singlem/supplement.py | 32 ++++++++++++++++++++++++++------
 2 files changed, 28 insertions(+), 6 deletions(-)
diff --git a/singlem/main.py b/singlem/main.py
index 63e77e34..b2f1759f 100755
--- a/singlem/main.py
+++ b/singlem/main.py
@@ -754,6 +754,7 @@ def add_prokaryotic_fraction_parser(name, description, deprecated=False):
     current_default = '1e-20'
     supplement_rare_group.add_argument('--hmmsearch-evalue', help='evalue for hmmsearch run on proteins to gather markers [default: {}]'.format(current_default), default=current_default)
     supplement_rare_group.add_argument('--gene-definitions', help='Tab-separated file of genome_fasta<TAB>transcript_fasta<TAB>protein_fasta [default: undefined, call genes using Prodigal]')
+    supplement_rare_group.add_argument('--output-matched-protein-sequences', help='Write protein sequences matched by hmmsearch to this file')
     supplement_rare_group.add_argument('--working-directory', help='working directory [default: use a temporary directory]')
     supplement_rare_group.add_argument('--no-taxon-genome-lengths', help='Do not include taxon genome lengths in updated metapackage', action='store_true')
     supplement_rare_group.add_argument('--ignore-taxonomy-database-incompatibility', help='Do not halt when the old metapackage is not the default metapackage.', action='store_true')
@@ -1522,6 +1523,7 @@ def add_prokaryotic_fraction_parser(name, description, deprecated=False):
             skip_taxonomy_check=args.skip_taxonomy_check,
             no_taxon_genome_lengths=args.no_taxon_genome_lengths,
             gene_definitions=args.gene_definitions,
+            output_matched_protein_sequences=args.output_matched_protein_sequences,
             ignore_taxonomy_database_incompatibility=args.ignore_taxonomy_database_incompatibility,
             new_taxonomy_database_name=args.new_taxonomy_database_name,
             new_taxonomy_database_version=args.new_taxonomy_database_version,
diff --git a/singlem/supplement.py b/singlem/supplement.py
index 4b90b206..a462d3f3 100755
--- a/singlem/supplement.py
+++ b/singlem/supplement.py
@@ -345,7 +345,8 @@ def generate_new_singlem_package(myargs):
     return new_spkg_path
 
 
-def run_hmmsearch_on_one_genome(lock, data, matched_transcripts_fna, working_directory, hmmsearch_evalue, concatenated_hmms):
+def run_hmmsearch_on_one_genome(lock, data, matched_transcripts_fna, output_matched_protein_sequences,
+                               working_directory, hmmsearch_evalue, concatenated_hmms):
     total_num_transcripts = 0
     failure_genomes = 0
     num_transcriptomes = 0
@@ -411,6 +412,19 @@ def run_hmmsearch_on_one_genome(lock, data, matched_transcripts_fna, working_dir
                         new_name = genome_basename + '‡' + name # Use ‡ to separate genome and original ID, must be kept in check with elsewhere in the code
                         print('>' + new_name + '\n' + seq + '\n', file=f)
                         num_printed += 1
+
+    if output_matched_protein_sequences:
+        num_printed_proteins = 0
+        with open(tranp.protein_fasta) as g:
+            with lock:
+                with open(output_matched_protein_sequences, 'a') as f:
+                    for name, seq, _ in SeqReader().readfq(g):
+                        if name in matched_transcript_ids:
+                            genome_basename = remove_extension(os.path.basename(genome))
+                            new_name = genome_basename + '‡' + name # Use ‡ to separate genome and original ID, must be kept in check with elsewhere in the code
+                            print('>' + new_name + '\n' + seq + '\n', file=f)
+                            num_printed_proteins += 1
+        logging.debug("Printed {} proteins for {}".format(num_printed_proteins, genome))
     logging.debug("Printed {} transcripts for {}".format(num_printed, genome))
     if num_printed != len(matched_transcript_ids):
         logging.error("Expected to print {} transcripts for {}, but only printed {}".format(
@@ -421,7 +435,7 @@ def run_hmmsearch_on_one_genome(lock, data, matched_transcripts_fna, working_dir
 
 
 def gather_hmmsearch_results(num_threads, working_directory, old_metapackage, new_genome_transcripts_and_proteins,
-                             hmmsearch_evalue):
+                             hmmsearch_evalue, output_matched_protein_sequences):
     # Run hmmsearch using a concatenated set of HMMs from each graftm package in the metapackage
     # Create concatenated HMMs in working_directory/concatenated_alignment_hmms.hmm
     concatenated_hmms = os.path.join(working_directory, 'concatenated_alignment_hmms.hmm')
@@ -441,6 +455,9 @@ def gather_hmmsearch_results(num_threads, working_directory, old_metapackage, ne
     # Create a new file which is a concatenation of the transcripts we want to include
     # Use a lock to prevent race conditions since each worker writes this this
     matched_transcripts_fna = os.path.join(working_directory, 'matched_transcripts.fna')
+    if output_matched_protein_sequences:
+        with open(output_matched_protein_sequences, 'w'):
+            pass
 
     total_num_transcripts = 0
     total_failure_genomes = 0
@@ -456,7 +473,7 @@ def gather_hmmsearch_results(num_threads, working_directory, old_metapackage, ne
         with get_context('spawn').Pool(num_threads) as pool:
             map_result = pool.starmap(
                 run_hmmsearch_on_one_genome,
-                [(lock, data, matched_transcripts_fna, working_directory, hmmsearch_evalue, concatenated_hmms) for data in new_genome_transcripts_and_proteins.items()],
+                [(lock, data, matched_transcripts_fna, output_matched_protein_sequences, working_directory, hmmsearch_evalue, concatenated_hmms) for data in new_genome_transcripts_and_proteins.items()],
                 chunksize=1)
 
             for (num_transcripts, failure_genomes, num_transcriptomes, num_found_transcripts) in map_result:
@@ -481,7 +498,7 @@ def gather_hmmsearch_results(num_threads, working_directory, old_metapackage, ne
 def generate_new_metapackage(num_threads, working_directory, old_metapackage, new_genome_fasta_files,
                              new_taxonomies_file, new_genome_transcripts_and_proteins, hmmsearch_evalue,
                              checkm2_quality_file, no_taxon_genome_lengths, new_taxonomy_database_name,
-                             new_taxonomy_database_version):
+                             new_taxonomy_database_version, output_matched_protein_sequences):
 
     # Add the new genome data to each singlem package
     # For each package, the unaligned seqs are in the graftm package,
@@ -509,7 +526,8 @@ def generate_new_metapackage(num_threads, working_directory, old_metapackage, ne
 
     logging.info("Gathering OTUs from new genomes ..")
     matched_transcripts_fna = gather_hmmsearch_results(num_threads, working_directory, old_metapackage,
-                                                       new_genome_transcripts_and_proteins, hmmsearch_evalue)
+                                                       new_genome_transcripts_and_proteins, hmmsearch_evalue,
+                                                       output_matched_protein_sequences)
     if matched_transcripts_fna is None:
         # No transcripts were found, so no new OTUs to add
         return None
@@ -842,6 +860,7 @@ def supplement(self, **kwargs):
         skip_taxonomy_check = kwargs.pop('skip_taxonomy_check')
         no_taxon_genome_lengths = kwargs.pop('no_taxon_genome_lengths')
         gene_definitions = kwargs.pop('gene_definitions')
+        output_matched_protein_sequences = kwargs.pop('output_matched_protein_sequences')
         ignore_taxonomy_database_incompatibility = kwargs.pop('ignore_taxonomy_database_incompatibility')
         new_taxonomy_database_name = kwargs.pop('new_taxonomy_database_name')
         new_taxonomy_database_version = kwargs.pop('new_taxonomy_database_version')
@@ -969,7 +988,8 @@ def supplement(self, **kwargs):
                                                        new_genome_fasta_files, taxonomy_file,
                                                        genome_transcripts_and_proteins, hmmsearch_evalue,
                                                        checkm2_quality_file, no_taxon_genome_lengths,
-                                                       new_taxonomy_database_name, new_taxonomy_database_version)
+                                                       new_taxonomy_database_name, new_taxonomy_database_version,
+                                                       output_matched_protein_sequences)
 
             if new_metapackage is not None:
                 logging.info("Copying generated metapackage to {}".format(output_metapackage))

From fc60e44abbdfafd1cec4a6055741a6a65749fc26 Mon Sep 17 00:00:00 2001
From: Ben J Woodcroft <donttrustben@gmail.com>
Date: Thu, 8 Jan 2026 11:01:29 +1000
Subject: [PATCH 2/2] Add test for matched protein output

---
 test/test_supplement.py | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/test/test_supplement.py b/test/test_supplement.py
index 1dd8b05c..eb0c3e2a 100755
--- a/test/test_supplement.py
+++ b/test/test_supplement.py
@@ -32,6 +32,7 @@
 
 sys.path = [os.path.join(os.path.dirname(os.path.realpath(__file__)),'..')]+sys.path
 from singlem.metapackage import Metapackage
+from singlem.sequence_classes import SeqReader
 
 path_to_script = 'singlem'
 path_to_data = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'supplement'))
@@ -83,6 +84,44 @@ def test_defined_taxonomy(self):
             self.assertEqual('test_supplement', mpkg.taxonomy_database_name())
             self.assertEqual('1.0', mpkg.taxonomy_database_version())
 
+    def test_output_matched_protein_sequences(self):
+        with in_tempdir():
+            gene_definitions = os.path.join(os.getcwd(), 'gene_definitions.tsv')
+            genome_fasta = os.path.join(path_to_data, 'GCA_011373445.1_genomic.mutated93_ms.manually_added_nongaps.fna')
+            transcript_fasta = os.path.join(path_to_data, 'GCA_011373445.1_genomic.mutated93_ms.manually_added_nongaps.fna.ffn')
+            protein_fasta = os.path.join(path_to_data, 'GCA_011373445.1_genomic.mutated93_ms.manually_added_nongaps.fna.faa')
+            with open(gene_definitions, 'w') as f:
+                f.write("genome_fasta\ttranscript_fasta\tprotein_fasta\n")
+                f.write(f"{genome_fasta}\t{transcript_fasta}\t{protein_fasta}\n")
+
+            matched_proteins = os.path.join(os.getcwd(), 'matched_proteins.faa')
+            cmd = (
+                f"{run} --ignore-taxonomy-database-incompatibility --no-taxon-genome-lengths "
+                f"--no-dereplication --skip-taxonomy-check --hmmsearch-evalue 1e-5 --no-quality-filter "
+                f"--new-genome-fasta-files {genome_fasta} --input-metapackage {path_to_data}/4.11.22seqs.gpkg.spkg.smpkg/ "
+                f"--output-metapackage out.smpkg --new-fully-defined-taxonomies "
+                f"{path_to_data}/GCA_011373445.1_genomic.mutated93_ms.manually_added_nongaps.fna.taxonomy "
+                f"--new-taxonomy-database-name test_supplement --new-taxonomy-database-version 1.0 "
+                f"--gene-definitions {gene_definitions} --output-matched-protein-sequences {matched_proteins}"
+            )
+            extern.run(cmd)
+
+            with open(protein_fasta) as f:
+                input_protein_names = {name for name, _, _ in SeqReader().readfq(f)}
+
+            matched_records = []
+            with open(matched_proteins) as f:
+                matched_records = list(SeqReader().readfq(f))
+
+            self.assertGreater(len(matched_records), 0)
+            for name, _, _ in matched_records:
+                genome_name, protein_name = name.split('‡', 1)
+                self.assertEqual(
+                    genome_name,
+                    'GCA_011373445.1_genomic.mutated93_ms.manually_added_nongaps',
+                )
+                self.assertIn(protein_name, input_protein_names)
+
     @pytest.mark.expensive
     def test_auto_taxonomy(self):
         with in_tempdir():