Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/AGENTS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# AGENTS

- Files under `docs/tools/` and `docs/advanced/` are autogenerated; do not modify them directly.
- `docs/Installation.md` is autogenerated; make changes in `docs/Installation.md.in` instead.
4 changes: 3 additions & 1 deletion docs/preludes/pipe_prelude.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ To further convert the generated taxonomic profile to other formats that might b

## Algorithm details

**Details**: In its most common usage, the SingleM `pipe` subcommand takes as input raw metagenomic reads and outputs a taxonomic profile. It can also take as input whole genomes (or contigs), and can output a table of OTUs. Note that taxonomic profiles are generated from OTU tables, they are [not the same thing](/Glossary).
**Details**: In its most common usage, the SingleM `pipe` subcommand takes as input raw metagenomic reads and outputs a taxonomic profile. Note that taxonomic profiles are generated from OTU tables; they are [not the same thing](/Glossary).

It can also take as input whole genomes (or contigs) via `--genome-fasta-files`, `--genome-fasta-directory` or `--genome-fasta-list`. These options behave the same as providing sequences with `--forward`, except that different defaults for `--min-taxon-coverage` and `--min-orf-length` are used.

`pipe` performs three steps:

Expand Down
1 change: 0 additions & 1 deletion singlem/lyrebird.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,6 @@ def main():
singlem.pipe.SearchPipe().run(
sequences = args.forward,
reverse_read_files = args.reverse,
genomes = args.genome_fasta_files,
input_sra_files = args.sra_files,
otu_table = args.otu_table,
archive_otu_table = args.archive_otu_table,
Expand Down
17 changes: 8 additions & 9 deletions singlem/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,13 @@ def add_common_pipe_arguments(argument_group, extra_args=False):
sequence_input_group.add_argument('-f', '--genome-fasta-files',
nargs='+',
metavar='PATH',
help='Path(s) to FASTA files of each genome e.g. pathA/genome1.fna pathB/genome2.fa.')
help='Path(s) to genome FASTA files. These are processed like input given with --forward, but use higher default values for --min-taxon-coverage and --min-orf-length.')
sequence_input_group.add_argument('-d', '--genome-fasta-directory',
metavar='PATH',
help='Directory containing FASTA files of each genome.')
help='Directory containing genome FASTA files. Treated identically to --forward input with higher default values for --min-taxon-coverage and --min-orf-length.')
sequence_input_group.add_argument('--genome-fasta-list',
metavar='PATH',
help='File containing FASTA file paths, one per line.')
help='File containing genome FASTA paths, one per line. Behaviour matches --forward with higher default values for --min-taxon-coverage and --min-orf-length.')
sequence_input_group.add_argument('--sra-files',
nargs='+',
metavar='sra_file',
Expand Down Expand Up @@ -151,7 +151,7 @@ def add_less_common_pipe_arguments(argument_group, extra_args=False):
help='HMMSEARCH e-value cutoff to use for sequence gathering [default: %s]' % SearchPipe.DEFAULT_HMMSEARCH_EVALUE, default=SearchPipe.DEFAULT_HMMSEARCH_EVALUE)
argument_group.add_argument('--min-orf-length',
metavar='length',
help='When predicting ORFs require this many base pairs uninterrupted by a stop codon [default: %i when input is reads, %i when input is genomes]' % (SearchPipe.DEFAULT_MIN_ORF_LENGTH, SearchPipe.DEFAULT_GENOME_MIN_ORF_LENGTH),
help='When predicting ORFs require this many base pairs uninterrupted by a stop codon [default: %i for reads, %i for genomes]' % (SearchPipe.DEFAULT_MIN_ORF_LENGTH, SearchPipe.DEFAULT_GENOME_MIN_ORF_LENGTH),
type=int)
argument_group.add_argument('--restrict-read-length',
metavar='length',
Expand Down Expand Up @@ -333,11 +333,9 @@ def generate_streaming_otu_table_from_args(args,
def get_min_orf_length(args, subparser='pipe'):
if args.min_orf_length:
return args.min_orf_length
elif subparser=='pipe' and (args.forward or args.sra_files):
return SearchPipe.DEFAULT_MIN_ORF_LENGTH
elif subparser=='pipe' and args.genome_fasta_files:
elif subparser == 'pipe' and args.genome_fasta_files:
return SearchPipe.DEFAULT_GENOME_MIN_ORF_LENGTH
elif subparser=='renew':
elif subparser in ('pipe', 'renew'):
return SearchPipe.DEFAULT_MIN_ORF_LENGTH
else:
raise Exception("Programming error")
Expand All @@ -363,6 +361,8 @@ def parse_genome_fasta_files(args):
with open(args.genome_fasta_list) as f:
genomes.extend([line.strip() for line in f if line.strip()])
args.genome_fasta_files = genomes if genomes else None
if args.genome_fasta_files:
args.forward = args.genome_fasta_files
return args

def main():
Expand Down Expand Up @@ -768,7 +768,6 @@ def main():
singlem.pipe.SearchPipe().run(
sequences = args.forward,
reverse_read_files = args.reverse,
genomes = args.genome_fasta_files,
input_sra_files = args.sra_files,
read_chunk_size = args.read_chunk_size,
read_chunk_number = args.read_chunk_number,
Expand Down
48 changes: 2 additions & 46 deletions singlem/pipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
from .archive_otu_table import ArchiveOtuTable
from .taxonomy import *
from .otu_table_collection import StreamingOtuTableCollection, OtuTableCollection
from .genome_fasta_mux import GenomeFastaMux

from graftm.sequence_extractor import SequenceExtractor
from graftm.greengenes_taxonomy import GreenGenesTaxonomy
Expand Down Expand Up @@ -151,7 +150,6 @@ def run_to_otu_table(self, **kwargs):
input_sra_files = kwargs.pop('input_sra_files',None)
read_chunk_size = kwargs.pop('read_chunk_size', None)
read_chunk_number = kwargs.pop('read_chunk_number', None)
genome_fasta_files = kwargs.pop('genomes', None)
sleep_after_mkfifo = kwargs.pop('sleep_after_mkfifo', None)
num_threads = kwargs.pop('threads')
known_otu_tables = kwargs.pop('known_otu_tables', None)
Expand Down Expand Up @@ -205,8 +203,6 @@ def run_to_otu_table(self, **kwargs):
if diamond_prefilter_db:
hmms.set_prefilter_db_path(diamond_prefilter_db)

if genome_fasta_files and forward_read_files:
raise Exception("Cannot process reads and genomes in the same run")

analysing_pairs = reverse_read_files is not None
if analysing_pairs:
Expand Down Expand Up @@ -274,26 +270,6 @@ def run_to_otu_table(self, **kwargs):
os.mkdir(tempfile_directory)
tempfile.tempdir = tempfile_directory

#### Preprocess genomes into transcripts to speed the rest of the pipeline
if genome_fasta_files:
logging.info("Calling rough transcriptome of genome FASTA files")
genome_fasta_mux = GenomeFastaMux(genome_fasta_files) # to deal with mux and demux of sequence names

# Create a single tempfile with ORFs from all genomes, and then
# demultiplex later, because this saves a bunch of syscalls.
#
# Make a tempfile with delete=False because it is in a tmpdir already, and useful for debug to keep around with --working-directory
transcripts_path = tempfile.NamedTemporaryFile(prefix='singlem-genome-orfs', suffix='.fasta', delete=False)
transcripts_path.close()
for fasta in genome_fasta_files:
extern.run("orfm -c {} -m {} -t >(sed 's/>/>{}|/' >>{}) {} >/dev/null".format(
self._translation_table,
self._min_orf_length,
genome_fasta_mux.fasta_to_prefix(fasta),
transcripts_path.name,
fasta))
forward_read_files = [transcripts_path.name]

def return_cleanly():
if using_temporary_working_directory and not working_directory_dev_shm:
# Directly setting tempdir in this way is not recommended, but
Expand All @@ -315,9 +291,6 @@ def return_cleanly():
elif analysing_pairs:
logging.info("Using as input %i different pairs of sequence files e.g. %s & %s" % (
len(forward_read_files), forward_read_files[0], reverse_read_files[0]))
elif genome_fasta_files:
logging.info("Using as input %i different genomes e.g. %s" % (
len(genome_fasta_files), genome_fasta_files[0]))
else:
logging.info("Using as input %i different sequence files e.g. %s" % (
len(forward_read_files), forward_read_files[0]))
Expand Down Expand Up @@ -491,16 +464,6 @@ def finish_sra_extraction_processes(sra_extraction_processes, sra_extraction_com



#### Remove duplications which happen when OrfM hits the same sequence more than once.
if genome_fasta_files:
logging.info("Removing duplicate sequences from rough transcriptome ..")
for readset in extracted_reads:
# the read~gene format introduced for long read compatibility
# breaks genome_fasta_files mode, so need to remove it
for sequence in readset.unknown_sequences:
sequence.name = sequence.name.split('~')[0]
readset.remove_duplicate_sequences()

#### Extract diamond_taxonomy_assignment_performance_parameters from metapackage (v5 metapackages only)
if diamond_taxonomy_assignment_performance_parameters == None:
diamond_taxonomy_assignment_performance_parameters = metapackage_object.diamond_taxonomy_assignment_performance_parameters()
Expand All @@ -519,12 +482,8 @@ def finish_sra_extraction_processes(sra_extraction_processes, sra_extraction_com
known_taxes=known_taxes,
output_jplace=output_jplace,
assignment_singlem_db=assignment_singlem_db,
genome_fasta_files=genome_fasta_files
)

if genome_fasta_files:
otu_table_object = genome_fasta_mux.demux_otu_table(otu_table_object)

return_cleanly()
return otu_table_object

Expand All @@ -539,7 +498,6 @@ def assign_taxonomy_and_process(self, **kwargs):
known_taxes = kwargs.pop('known_taxes')
output_jplace = kwargs.pop('output_jplace')
assignment_singlem_db = kwargs.pop('assignment_singlem_db')
genome_fasta_files = kwargs.pop('genome_fasta_files', False)

if len(kwargs) > 0:
raise Exception("Unexpected arguments detected: %s" % kwargs)
Expand Down Expand Up @@ -577,8 +535,7 @@ def assign_taxonomy_and_process(self, **kwargs):
known_sequence_tax if known_sequence_taxonomy else None,
# outputs
otu_table_object,
package_to_taxonomy_bihash,
genome_fasta_files)
package_to_taxonomy_bihash)

return otu_table_object

Expand Down Expand Up @@ -620,8 +577,7 @@ def _process_taxonomically_assigned_reads(
known_sequence_tax,
# outputs
otu_table_object,
package_to_taxonomy_bihash,
genome_fasta_files):
package_to_taxonomy_bihash):

# To deal with paired reads, process each. Then exclude second reads
# from pairs where both match.
Expand Down
2 changes: 1 addition & 1 deletion test/test_pipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1194,7 +1194,7 @@ def test_exclude_off_target_hits(self):

def test_genome_input_dereplication(self):
expected = 'gene sample sequence num_hits coverage taxonomy\n' \
'4.12.22seqs GCA_000309865.1_genomic GATGGCGGTAAAGCCACTCCCGGCCCACCATTAGGTCCAGCAATCGGACCCCTAGGTATC 1 1.00 '
'4.12.22seqs GCA_000309865.1_genomic CCGGCTTTTCAGATCGCACCGGATCCAACAGTTGCATTCACAGTTGGCTATTTAGGAGTG 1 1.00 '
# ~/git/singlem/bin/singlem pipe --genome-fasta-files genomes/GCA_000309865.1_genomic.fna --singlem-package ../4.12.22seqs.spkg/ --otu-table /dev/stdout --no-assign-taxonomy --min-orf-length 96
cmd = '{} pipe --translation-table 11 --genome-fasta-files {} --singlem-package {} --otu-table /dev/stdout --no-assign-taxonomy --min-orf-length 96'.format(
path_to_script,
Expand Down