Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/AGENTS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# AGENTS

- Files under `docs/tools/` and `docs/advanced/` are autogenerated; do not modify them directly.
- `docs/Installation.md` is autogenerated; make changes in `docs/Installation.md.in` instead.
4 changes: 3 additions & 1 deletion docs/preludes/pipe_prelude.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ To further convert the generated taxonomic profile to other formats that might b

## Algorithm details

**Details**: In its most common usage, the SingleM `pipe` subcommand takes as input raw metagenomic reads and outputs a taxonomic profile. It can also take as input whole genomes (or contigs), and can output a table of OTUs. Note that taxonomic profiles are generated from OTU tables, they are [not the same thing](/Glossary).
**Details**: In its most common usage, the SingleM `pipe` subcommand takes as input raw metagenomic reads and outputs a taxonomic profile. Note that taxonomic profiles are generated from OTU tables; they are [not the same thing](/Glossary).

It can also take as input whole genomes (or contigs) via `--genome-fasta-files`, `--genome-fasta-directory` or `--genome-fasta-list`. These options behave the same as providing sequences with `--forward`, except that different defaults for `--min-taxon-coverage` and `--min-orf-length` are used.

`pipe` performs three steps:

Expand Down
1 change: 0 additions & 1 deletion singlem/lyrebird.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,6 @@ def main():
singlem.pipe.SearchPipe().run(
sequences = args.forward,
reverse_read_files = args.reverse,
genomes = args.genome_fasta_files,
input_sra_files = args.sra_files,
otu_table = args.otu_table,
archive_otu_table = args.archive_otu_table,
Expand Down
17 changes: 8 additions & 9 deletions singlem/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,13 @@ def add_common_pipe_arguments(argument_group, extra_args=False):
sequence_input_group.add_argument('-f', '--genome-fasta-files',
nargs='+',
metavar='PATH',
help='Path(s) to FASTA files of each genome e.g. pathA/genome1.fna pathB/genome2.fa.')
help='Path(s) to genome FASTA files. These are processed like input given with --forward, but use higher default values for --min-taxon-coverage and --min-orf-length.')
sequence_input_group.add_argument('-d', '--genome-fasta-directory',
metavar='PATH',
help='Directory containing FASTA files of each genome.')
help='Directory containing genome FASTA files. Treated identically to --forward input with higher default values for --min-taxon-coverage and --min-orf-length.')
sequence_input_group.add_argument('--genome-fasta-list',
metavar='PATH',
help='File containing FASTA file paths, one per line.')
help='File containing genome FASTA paths, one per line. Behaviour matches --forward with higher default values for --min-taxon-coverage and --min-orf-length.')
sequence_input_group.add_argument('--sra-files',
nargs='+',
metavar='sra_file',
Expand Down Expand Up @@ -151,7 +151,7 @@ def add_less_common_pipe_arguments(argument_group, extra_args=False):
help='HMMSEARCH e-value cutoff to use for sequence gathering [default: %s]' % SearchPipe.DEFAULT_HMMSEARCH_EVALUE, default=SearchPipe.DEFAULT_HMMSEARCH_EVALUE)
argument_group.add_argument('--min-orf-length',
metavar='length',
help='When predicting ORFs require this many base pairs uninterrupted by a stop codon [default: %i when input is reads, %i when input is genomes]' % (SearchPipe.DEFAULT_MIN_ORF_LENGTH, SearchPipe.DEFAULT_GENOME_MIN_ORF_LENGTH),
help='When predicting ORFs require this many base pairs uninterrupted by a stop codon [default: %i for reads, %i for genomes]' % (SearchPipe.DEFAULT_MIN_ORF_LENGTH, SearchPipe.DEFAULT_GENOME_MIN_ORF_LENGTH),
type=int)
argument_group.add_argument('--restrict-read-length',
metavar='length',
Expand Down Expand Up @@ -333,11 +333,9 @@ def generate_streaming_otu_table_from_args(args,
def get_min_orf_length(args, subparser='pipe'):
if args.min_orf_length:
return args.min_orf_length
elif subparser=='pipe' and (args.forward or args.sra_files):
return SearchPipe.DEFAULT_MIN_ORF_LENGTH
elif subparser=='pipe' and args.genome_fasta_files:
elif subparser == 'pipe' and args.genome_fasta_files:
return SearchPipe.DEFAULT_GENOME_MIN_ORF_LENGTH
elif subparser=='renew':
elif subparser in ('pipe', 'renew'):
return SearchPipe.DEFAULT_MIN_ORF_LENGTH
else:
raise Exception("Programming error")
Expand All @@ -363,6 +361,8 @@ def parse_genome_fasta_files(args):
with open(args.genome_fasta_list) as f:
genomes.extend([line.strip() for line in f if line.strip()])
args.genome_fasta_files = genomes if genomes else None
if args.genome_fasta_files:
args.forward = args.genome_fasta_files
return args

def main():
Expand Down Expand Up @@ -768,7 +768,6 @@ def main():
singlem.pipe.SearchPipe().run(
sequences = args.forward,
reverse_read_files = args.reverse,
genomes = args.genome_fasta_files,
input_sra_files = args.sra_files,
read_chunk_size = args.read_chunk_size,
read_chunk_number = args.read_chunk_number,
Expand Down
48 changes: 2 additions & 46 deletions singlem/pipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
from .archive_otu_table import ArchiveOtuTable
from .taxonomy import *
from .otu_table_collection import StreamingOtuTableCollection, OtuTableCollection
from .genome_fasta_mux import GenomeFastaMux

from graftm.sequence_extractor import SequenceExtractor
from graftm.greengenes_taxonomy import GreenGenesTaxonomy
Expand Down Expand Up @@ -151,7 +150,6 @@ def run_to_otu_table(self, **kwargs):
input_sra_files = kwargs.pop('input_sra_files',None)
read_chunk_size = kwargs.pop('read_chunk_size', None)
read_chunk_number = kwargs.pop('read_chunk_number', None)
genome_fasta_files = kwargs.pop('genomes', None)
sleep_after_mkfifo = kwargs.pop('sleep_after_mkfifo', None)
num_threads = kwargs.pop('threads')
known_otu_tables = kwargs.pop('known_otu_tables', None)
Expand Down Expand Up @@ -205,8 +203,6 @@ def run_to_otu_table(self, **kwargs):
if diamond_prefilter_db:
hmms.set_prefilter_db_path(diamond_prefilter_db)

if genome_fasta_files and forward_read_files:
raise Exception("Cannot process reads and genomes in the same run")

analysing_pairs = reverse_read_files is not None
if analysing_pairs:
Expand Down Expand Up @@ -274,26 +270,6 @@ def run_to_otu_table(self, **kwargs):
os.mkdir(tempfile_directory)
tempfile.tempdir = tempfile_directory

#### Preprocess genomes into transcripts to speed the rest of the pipeline
if genome_fasta_files:
logging.info("Calling rough transcriptome of genome FASTA files")
genome_fasta_mux = GenomeFastaMux(genome_fasta_files) # to deal with mux and demux of sequence names

# Create a single tempfile with ORFs from all genomes, and then
# demultiplex later, because this saves a bunch of syscalls.
#
# Make a tempfile with delete=False because it is in a tmpdir already, and useful for debug to keep around with --working-directory
transcripts_path = tempfile.NamedTemporaryFile(prefix='singlem-genome-orfs', suffix='.fasta', delete=False)
transcripts_path.close()
for fasta in genome_fasta_files:
extern.run("orfm -c {} -m {} -t >(sed 's/>/>{}|/' >>{}) {} >/dev/null".format(
self._translation_table,
self._min_orf_length,
genome_fasta_mux.fasta_to_prefix(fasta),
transcripts_path.name,
fasta))
forward_read_files = [transcripts_path.name]

def return_cleanly():
if using_temporary_working_directory and not working_directory_dev_shm:
# Directly setting tempdir in this way is not recommended, but
Expand All @@ -315,9 +291,6 @@ def return_cleanly():
elif analysing_pairs:
logging.info("Using as input %i different pairs of sequence files e.g. %s & %s" % (
len(forward_read_files), forward_read_files[0], reverse_read_files[0]))
elif genome_fasta_files:
logging.info("Using as input %i different genomes e.g. %s" % (
len(genome_fasta_files), genome_fasta_files[0]))
else:
logging.info("Using as input %i different sequence files e.g. %s" % (
len(forward_read_files), forward_read_files[0]))
Expand Down Expand Up @@ -491,16 +464,6 @@ def finish_sra_extraction_processes(sra_extraction_processes, sra_extraction_com



#### Remove duplications which happen when OrfM hits the same sequence more than once.
if genome_fasta_files:
logging.info("Removing duplicate sequences from rough transcriptome ..")
for readset in extracted_reads:
# the read~gene format introduced for long read compatibility
# breaks genome_fasta_files mode, so need to remove it
for sequence in readset.unknown_sequences:
sequence.name = sequence.name.split('~')[0]
readset.remove_duplicate_sequences()

#### Extract diamond_taxonomy_assignment_performance_parameters from metapackage (v5 metapackages only)
if diamond_taxonomy_assignment_performance_parameters == None:
diamond_taxonomy_assignment_performance_parameters = metapackage_object.diamond_taxonomy_assignment_performance_parameters()
Expand All @@ -519,12 +482,8 @@ def finish_sra_extraction_processes(sra_extraction_processes, sra_extraction_com
known_taxes=known_taxes,
output_jplace=output_jplace,
assignment_singlem_db=assignment_singlem_db,
genome_fasta_files=genome_fasta_files
)

if genome_fasta_files:
otu_table_object = genome_fasta_mux.demux_otu_table(otu_table_object)

return_cleanly()
return otu_table_object

Expand All @@ -539,7 +498,6 @@ def assign_taxonomy_and_process(self, **kwargs):
known_taxes = kwargs.pop('known_taxes')
output_jplace = kwargs.pop('output_jplace')
assignment_singlem_db = kwargs.pop('assignment_singlem_db')
genome_fasta_files = kwargs.pop('genome_fasta_files', False)

if len(kwargs) > 0:
raise Exception("Unexpected arguments detected: %s" % kwargs)
Expand Down Expand Up @@ -577,8 +535,7 @@ def assign_taxonomy_and_process(self, **kwargs):
known_sequence_tax if known_sequence_taxonomy else None,
# outputs
otu_table_object,
package_to_taxonomy_bihash,
genome_fasta_files)
package_to_taxonomy_bihash)

return otu_table_object

Expand Down Expand Up @@ -620,8 +577,7 @@ def _process_taxonomically_assigned_reads(
known_sequence_tax,
# outputs
otu_table_object,
package_to_taxonomy_bihash,
genome_fasta_files):
package_to_taxonomy_bihash):

# To deal with paired reads, process each. Then exclude second reads
# from pairs where both match.
Expand Down
2 changes: 1 addition & 1 deletion test/test_pipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1194,7 +1194,7 @@ def test_exclude_off_target_hits(self):

def test_genome_input_dereplication(self):
expected = 'gene sample sequence num_hits coverage taxonomy\n' \
'4.12.22seqs GCA_000309865.1_genomic GATGGCGGTAAAGCCACTCCCGGCCCACCATTAGGTCCAGCAATCGGACCCCTAGGTATC 1 1.00 '
'4.12.22seqs GCA_000309865.1_genomic CCGGCTTTTCAGATCGCACCGGATCCAACAGTTGCATTCACAGTTGGCTATTTAGGAGTG 1 1.00 '
# ~/git/singlem/bin/singlem pipe --genome-fasta-files genomes/GCA_000309865.1_genomic.fna --singlem-package ../4.12.22seqs.spkg/ --otu-table /dev/stdout --no-assign-taxonomy --min-orf-length 96
cmd = '{} pipe --translation-table 11 --genome-fasta-files {} --singlem-package {} --otu-table /dev/stdout --no-assign-taxonomy --min-orf-length 96'.format(
path_to_script,
Expand Down