diff --git a/docs/AGENTS.md b/docs/AGENTS.md new file mode 100644 index 00000000..5685fca6 --- /dev/null +++ b/docs/AGENTS.md @@ -0,0 +1,4 @@ +# AGENTS + +- Files under `docs/tools/` and `docs/advanced/` are autogenerated; do not modify them directly. +- `docs/Installation.md` is autogenerated; make changes in `docs/Installation.md.in` instead. diff --git a/docs/preludes/pipe_prelude.md b/docs/preludes/pipe_prelude.md index 63493725..8fc1d8dc 100644 --- a/docs/preludes/pipe_prelude.md +++ b/docs/preludes/pipe_prelude.md @@ -7,7 +7,9 @@ To further convert the generated taxonomic profile to other formats that might b ## Algorithm details -**Details**: In its most common usage, the SingleM `pipe` subcommand takes as input raw metagenomic reads and outputs a taxonomic profile. It can also take as input whole genomes (or contigs), and can output a table of OTUs. Note that taxonomic profiles are generated from OTU tables, they are [not the same thing](/Glossary). +**Details**: In its most common usage, the SingleM `pipe` subcommand takes as input raw metagenomic reads and outputs a taxonomic profile. Note that taxonomic profiles are generated from OTU tables, they are [not the same thing](/Glossary). + +It can also take as input whole genomes (or contigs) via `--genome-fasta-files`, `--genome-fasta-directory` or `--genome-fasta-list`. These options behave the same as providing sequences with `--forward`, except that different defaults for `--min-taxon-coverage` and `--min-orf-length` are used. `pipe` performs three steps: diff --git a/singlem/lyrebird.py b/singlem/lyrebird.py index 95c71e37..2ad590f3 100644 --- a/singlem/lyrebird.py +++ b/singlem/lyrebird.py @@ -108,7 +108,6 @@ def main(): singlem.pipe.SearchPipe().run( sequences = args.forward, reverse_read_files = args.reverse, - genomes = args.genome_fasta_files, input_sra_files = args.sra_files, otu_table = args.otu_table, archive_otu_table = args.archive_otu_table, diff --git a/singlem/main.py b/singlem/main.py index 0d12d657..b70db235 100755 --- a/singlem/main.py +++ b/singlem/main.py @@ -80,13 +80,13 @@ def add_common_pipe_arguments(argument_group, extra_args=False): sequence_input_group.add_argument('-f', '--genome-fasta-files', nargs='+', metavar='PATH', - help='Path(s) to FASTA files of each genome e.g. pathA/genome1.fna pathB/genome2.fa.') + help='Path(s) to genome FASTA files. These are processed like input given with --forward, but use higher default values for --min-taxon-coverage and --min-orf-length.') sequence_input_group.add_argument('-d', '--genome-fasta-directory', metavar='PATH', - help='Directory containing FASTA files of each genome.') + help='Directory containing genome FASTA files. Treated identically to --forward input with higher default values for --min-taxon-coverage and --min-orf-length.') sequence_input_group.add_argument('--genome-fasta-list', metavar='PATH', - help='File containing FASTA file paths, one per line.') + help='File containing genome FASTA paths, one per line. Behaviour matches --forward with higher default values for --min-taxon-coverage and --min-orf-length.') sequence_input_group.add_argument('--sra-files', nargs='+', metavar='sra_file', @@ -151,7 +151,7 @@ def add_less_common_pipe_arguments(argument_group, extra_args=False): help='HMMSEARCH e-value cutoff to use for sequence gathering [default: %s]' % SearchPipe.DEFAULT_HMMSEARCH_EVALUE, default=SearchPipe.DEFAULT_HMMSEARCH_EVALUE) argument_group.add_argument('--min-orf-length', metavar='length', - help='When predicting ORFs require this many base pairs uninterrupted by a stop codon [default: %i when input is reads, %i when input is genomes]' % (SearchPipe.DEFAULT_MIN_ORF_LENGTH, SearchPipe.DEFAULT_GENOME_MIN_ORF_LENGTH), + help='When predicting ORFs require this many base pairs uninterrupted by a stop codon [default: %i for reads, %i for genomes]' % (SearchPipe.DEFAULT_MIN_ORF_LENGTH, SearchPipe.DEFAULT_GENOME_MIN_ORF_LENGTH), type=int) argument_group.add_argument('--restrict-read-length', metavar='length', @@ -333,11 +333,9 @@ def generate_streaming_otu_table_from_args(args, def get_min_orf_length(args, subparser='pipe'): if args.min_orf_length: return args.min_orf_length - elif subparser=='pipe' and (args.forward or args.sra_files): - return SearchPipe.DEFAULT_MIN_ORF_LENGTH - elif subparser=='pipe' and args.genome_fasta_files: + elif subparser == 'pipe' and args.genome_fasta_files: return SearchPipe.DEFAULT_GENOME_MIN_ORF_LENGTH - elif subparser=='renew': + elif subparser in ('pipe', 'renew'): return SearchPipe.DEFAULT_MIN_ORF_LENGTH else: raise Exception("Programming error") @@ -363,6 +361,8 @@ def parse_genome_fasta_files(args): with open(args.genome_fasta_list) as f: genomes.extend([line.strip() for line in f if line.strip()]) args.genome_fasta_files = genomes if genomes else None + if args.genome_fasta_files: + args.forward = args.genome_fasta_files return args def main(): @@ -768,7 +768,6 @@ def main(): singlem.pipe.SearchPipe().run( sequences = args.forward, reverse_read_files = args.reverse, - genomes = args.genome_fasta_files, input_sra_files = args.sra_files, read_chunk_size = args.read_chunk_size, read_chunk_number = args.read_chunk_number, diff --git a/singlem/pipe.py b/singlem/pipe.py index b1cb1184..26850830 100644 --- a/singlem/pipe.py +++ b/singlem/pipe.py @@ -24,7 +24,6 @@ from .archive_otu_table import ArchiveOtuTable from .taxonomy import * from .otu_table_collection import StreamingOtuTableCollection, OtuTableCollection -from .genome_fasta_mux import GenomeFastaMux from graftm.sequence_extractor import SequenceExtractor from graftm.greengenes_taxonomy import GreenGenesTaxonomy @@ -151,7 +150,6 @@ def run_to_otu_table(self, **kwargs): input_sra_files = kwargs.pop('input_sra_files',None) read_chunk_size = kwargs.pop('read_chunk_size', None) read_chunk_number = kwargs.pop('read_chunk_number', None) - genome_fasta_files = kwargs.pop('genomes', None) sleep_after_mkfifo = kwargs.pop('sleep_after_mkfifo', None) num_threads = kwargs.pop('threads') known_otu_tables = kwargs.pop('known_otu_tables', None) @@ -205,8 +203,6 @@ def run_to_otu_table(self, **kwargs): if diamond_prefilter_db: hmms.set_prefilter_db_path(diamond_prefilter_db) - if genome_fasta_files and forward_read_files: - raise Exception("Cannot process reads and genomes in the same run") analysing_pairs = reverse_read_files is not None if analysing_pairs: @@ -274,26 +270,6 @@ def run_to_otu_table(self, **kwargs): os.mkdir(tempfile_directory) tempfile.tempdir = tempfile_directory - #### Preprocess genomes into transcripts to speed the rest of the pipeline - if genome_fasta_files: - logging.info("Calling rough transcriptome of genome FASTA files") - genome_fasta_mux = GenomeFastaMux(genome_fasta_files) # to deal with mux and demux of sequence names - - # Create a single tempfile with ORFs from all genomes, and then - # demultiplex later, because this saves a bunch of syscalls. - # - # Make a tempfile with delete=False because it is in a tmpdir already, and useful for debug to keep around with --working-directory - transcripts_path = tempfile.NamedTemporaryFile(prefix='singlem-genome-orfs', suffix='.fasta', delete=False) - transcripts_path.close() - for fasta in genome_fasta_files: - extern.run("orfm -c {} -m {} -t >(sed 's/>/>{}|/' >>{}) {} >/dev/null".format( - self._translation_table, - self._min_orf_length, - genome_fasta_mux.fasta_to_prefix(fasta), - transcripts_path.name, - fasta)) - forward_read_files = [transcripts_path.name] - def return_cleanly(): if using_temporary_working_directory and not working_directory_dev_shm: # Directly setting tempdir in this way is not recommended, but @@ -315,9 +291,6 @@ def return_cleanly(): elif analysing_pairs: logging.info("Using as input %i different pairs of sequence files e.g. %s & %s" % ( len(forward_read_files), forward_read_files[0], reverse_read_files[0])) - elif genome_fasta_files: - logging.info("Using as input %i different genomes e.g. %s" % ( - len(genome_fasta_files), genome_fasta_files[0])) else: logging.info("Using as input %i different sequence files e.g. %s" % ( len(forward_read_files), forward_read_files[0])) @@ -491,16 +464,6 @@ def finish_sra_extraction_processes(sra_extraction_processes, sra_extraction_com - #### Remove duplications which happen when OrfM hits the same sequence more than once. - if genome_fasta_files: - logging.info("Removing duplicate sequences from rough transcriptome ..") - for readset in extracted_reads: - # the read~gene format introduced for long read compatibility - # breaks genome_fasta_files mode, so need to remove it - for sequence in readset.unknown_sequences: - sequence.name = sequence.name.split('~')[0] - readset.remove_duplicate_sequences() - #### Extract diamond_taxonomy_assignment_performance_parameters from metapackage (v5 metapackages only) if diamond_taxonomy_assignment_performance_parameters == None: diamond_taxonomy_assignment_performance_parameters = metapackage_object.diamond_taxonomy_assignment_performance_parameters() @@ -519,12 +482,8 @@ def finish_sra_extraction_processes(sra_extraction_processes, sra_extraction_com known_taxes=known_taxes, output_jplace=output_jplace, assignment_singlem_db=assignment_singlem_db, - genome_fasta_files=genome_fasta_files ) - if genome_fasta_files: - otu_table_object = genome_fasta_mux.demux_otu_table(otu_table_object) - return_cleanly() return otu_table_object @@ -539,7 +498,6 @@ def assign_taxonomy_and_process(self, **kwargs): known_taxes = kwargs.pop('known_taxes') output_jplace = kwargs.pop('output_jplace') assignment_singlem_db = kwargs.pop('assignment_singlem_db') - genome_fasta_files = kwargs.pop('genome_fasta_files', False) if len(kwargs) > 0: raise Exception("Unexpected arguments detected: %s" % kwargs) @@ -577,8 +535,7 @@ def assign_taxonomy_and_process(self, **kwargs): known_sequence_tax if known_sequence_taxonomy else None, # outputs otu_table_object, - package_to_taxonomy_bihash, - genome_fasta_files) + package_to_taxonomy_bihash) return otu_table_object @@ -620,8 +577,7 @@ def _process_taxonomically_assigned_reads( known_sequence_tax, # outputs otu_table_object, - package_to_taxonomy_bihash, - genome_fasta_files): + package_to_taxonomy_bihash): # To deal with paired reads, process each. Then exclude second reads # from pairs where both match. diff --git a/test/test_pipe.py b/test/test_pipe.py index 66fa9724..463bce66 100644 --- a/test/test_pipe.py +++ b/test/test_pipe.py @@ -1194,7 +1194,7 @@ def test_exclude_off_target_hits(self): def test_genome_input_dereplication(self): expected = 'gene sample sequence num_hits coverage taxonomy\n' \ - '4.12.22seqs GCA_000309865.1_genomic GATGGCGGTAAAGCCACTCCCGGCCCACCATTAGGTCCAGCAATCGGACCCCTAGGTATC 1 1.00 ' + '4.12.22seqs GCA_000309865.1_genomic CCGGCTTTTCAGATCGCACCGGATCCAACAGTTGCATTCACAGTTGGCTATTTAGGAGTG 1 1.00 ' # ~/git/singlem/bin/singlem pipe --genome-fasta-files genomes/GCA_000309865.1_genomic.fna --singlem-package ../4.12.22seqs.spkg/ --otu-table /dev/stdout --no-assign-taxonomy --min-orf-length 96 cmd = '{} pipe --translation-table 11 --genome-fasta-files {} --singlem-package {} --otu-table /dev/stdout --no-assign-taxonomy --min-orf-length 96'.format( path_to_script,