From ad79996f373b1203f25a88e4f7a8d1e908555b86 Mon Sep 17 00:00:00 2001 From: Ben Woodcroft Date: Sat, 2 Aug 2025 09:33:13 +1000 Subject: [PATCH 1/8] Partially address https://github.com/BioContainers/containers/issues/602 Suggested by: @mboudet --- pixi.lock | 38 +++++++++++++++++++++++++++++++++----- pixi.toml | 2 ++ 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/pixi.lock b/pixi.lock index 98a3d3d4..9e76f55b 100644 --- a/pixi.lock +++ b/pixi.lock @@ -24,6 +24,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-checksums-0.1.18-h83b837d_6.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-crt-cpp-0.26.9-he3a8b3b_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-sdk-cpp-1.11.329-hba8bd5f_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bash-5.2.37-h4be8908_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/biom-format-2.1.16-py312h66e93f0_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/biopython-1.85-py312h66e93f0_1.conda - conda: https://conda.anaconda.org/bioconda/noarch/bird_tool_utils_python-0.4.1-pyhdfd78af_0.tar.bz2 @@ -43,6 +44,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/click-8.1.8-pyh707e725_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.3.2-py312h68727a3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/coreutils-9.5-hd590300_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/curl-8.8.0-he654da7_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/bioconda/linux-64/dashing-0.4.0-h735f0e5_3.tar.bz2 @@ -219,7 +221,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.13-h4ab18f5_6.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstandard-0.23.0-py312h66e93f0_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.6-ha6fb4c9_0.conda - - pypi: . + - pypi: ./ dev: channels: - url: https://conda.anaconda.org/conda-forge/ @@ -245,6 +247,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-checksums-0.1.18-h83b837d_6.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-crt-cpp-0.26.9-he3a8b3b_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-sdk-cpp-1.11.329-hba8bd5f_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bash-5.2.37-h4be8908_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/biom-format-2.1.16-py312h66e93f0_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/biopython-1.85-py312h66e93f0_1.conda - conda: https://conda.anaconda.org/bioconda/noarch/bird_tool_utils_python-0.4.1-pyhdfd78af_0.tar.bz2 @@ -264,6 +267,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/click-8.1.8-pyh707e725_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.3.2-py312h68727a3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/coreutils-9.5-hd590300_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/curl-8.8.0-he654da7_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/bioconda/linux-64/dashing-0.4.0-h735f0e5_3.tar.bz2 @@ -464,7 +468,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.13-h4ab18f5_6.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstandard-0.23.0-py312h66e93f0_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.6-ha6fb4c9_0.conda - - pypi: . + - pypi: ./ packages: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 sha256: fe51de6107f9edc7aa4f786a70f4a883943bc9d39b3bb7307c04c41410990726 @@ -690,6 +694,19 @@ packages: purls: [] size: 3638342 timestamp: 1716466127210 +- conda: https://conda.anaconda.org/conda-forge/linux-64/bash-5.2.37-h4be8908_0.conda + sha256: a0ce6ed2b346501be1fcae415e4df04618f822834902dc22174a350ae39c791e + md5: c918f7141733d412f5c579d07f437690 + depends: + - readline + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - readline >=8.2,<9.0a0 + license: GPL-3.0-or-later + license_family: GPL + purls: [] + size: 1929937 + timestamp: 1748631191479 - conda: https://conda.anaconda.org/conda-forge/linux-64/biom-format-2.1.16-py312h66e93f0_2.conda sha256: db6bf3fa7778c6d128ba92e883c1ce8ac8e5e5268b3a71b6d445fbaf91aa4f7e md5: 65e5dd22088478d2beca0ac782532868 @@ -932,6 +949,16 @@ packages: - pkg:pypi/contourpy?source=hash-mapping size: 276533 timestamp: 1744743235779 +- conda: https://conda.anaconda.org/conda-forge/linux-64/coreutils-9.5-hd590300_0.conda + sha256: 7cd3b0f55aa55bb27b045c30f32b3f6b874ecc006f3abcb274c71a3bcbacb358 + md5: 126d457e0e7a535278e808a7d8960015 + depends: + - libgcc-ng >=12 + license: GPL-3.0-or-later + license_family: GPL + purls: [] + size: 3014238 + timestamp: 1711655132451 - conda: https://conda.anaconda.org/conda-forge/linux-64/curl-8.8.0-he654da7_1.conda sha256: a7bb4f8d1cba26c238cd0469b7bdcdc032cf06b0ac0de09992638744eaab9954 md5: 78678b2ddfd9bd7c7861b8d2e3b7473b @@ -2825,7 +2852,8 @@ packages: - python license: BSD-3-Clause license_family: BSD - purls: [] + purls: + - pkg:pypi/pycparser?source=hash-mapping size: 110100 timestamp: 1733195786147 - conda: https://conda.anaconda.org/conda-forge/linux-64/pydantic-1.10.21-py312h30e4e5e_0.conda @@ -3163,9 +3191,9 @@ packages: - pkg:pypi/setuptools-scm?source=compressed-mapping size: 38426 timestamp: 1745450953205 -- pypi: . +- pypi: ./ name: singlem - version: 0.18.3 + version: 0.19.0 sha256: 02f7d50b8fb0afd29122360abd90e1d08fd3a7acfebb1c0e15daf6daa0aa187f requires_dist: - biopython==1.85 diff --git a/pixi.toml b/pixi.toml index 5d1c647b..932f73dd 100644 --- a/pixi.toml +++ b/pixi.toml @@ -11,6 +11,8 @@ scripts = ["admin/set_env_vars.sh"] [dependencies] python = ">=3.7" +coreutils = "*" # So sort --parallel works +bash = "*" diamond = ">=2.1.7,<2.1.11" # 2.1.11 segfaults (on some non-test data) biopython = "*" hmmer = "*" From 80727fba8ceea77c97d1dc378a1f51555eda327c Mon Sep 17 00:00:00 2001 From: Ben J Woodcroft Date: Thu, 14 Aug 2025 10:29:38 +1000 Subject: [PATCH 2/8] Move setup script to admin and add AGENTS instructions --- AGENTS.md | 5 +++++ admin/setup.sh | 25 +++++++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 AGENTS.md create mode 100755 admin/setup.sh diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000..c0374494 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,5 @@ +# AGENTS instructions + +Before running any commands or tests: +1. `source admin/setup.sh` (loads Pixi dev environment and environment variables) +2. Continue with your usual commands. diff --git a/admin/setup.sh b/admin/setup.sh new file mode 100755 index 00000000..b81f62b7 --- /dev/null +++ b/admin/setup.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# Activate the pixi dev environment for this repository. + +set -e + +# Resolve repository root directory +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +# pixi uses PIXI_PROJECT_ROOT for activation scripts. +export PIXI_PROJECT_ROOT="$REPO_ROOT" + +ENV_PATH="$REPO_ROOT/.pixi/envs/dev" +BIN_PATH="$ENV_PATH/bin" + +if [ -d "$BIN_PATH" ]; then + export PATH="$BIN_PATH:$PATH" + export CONDA_PREFIX="$ENV_PATH" +else + echo "Pixi dev environment not found. Run 'pixi install' to create it." >&2 + return 1 2>/dev/null || exit 1 +fi + +# Run additional activation script to set environment variables +# shellcheck source=/dev/null +. "$REPO_ROOT/admin/set_env_vars.sh" From 7787583ce2e6af2440a68320c4cf0a2fbfb8e943 Mon Sep 17 00:00:00 2001 From: Ben Woodcroft Date: Thu, 14 Aug 2025 13:41:06 +1000 Subject: [PATCH 3/8] Revert "Move setup script to admin and add AGENTS instructions" This reverts commit 80727fba8ceea77c97d1dc378a1f51555eda327c. --- AGENTS.md | 5 ----- admin/setup.sh | 25 ------------------------- 2 files changed, 30 deletions(-) delete mode 100644 AGENTS.md delete mode 100755 admin/setup.sh diff --git a/AGENTS.md b/AGENTS.md deleted file mode 100644 index c0374494..00000000 --- a/AGENTS.md +++ /dev/null @@ -1,5 +0,0 @@ -# AGENTS instructions - -Before running any commands or tests: -1. `source admin/setup.sh` (loads Pixi dev environment and environment variables) -2. Continue with your usual commands. diff --git a/admin/setup.sh b/admin/setup.sh deleted file mode 100755 index b81f62b7..00000000 --- a/admin/setup.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash -# Activate the pixi dev environment for this repository. - -set -e - -# Resolve repository root directory -REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" - -# pixi uses PIXI_PROJECT_ROOT for activation scripts. -export PIXI_PROJECT_ROOT="$REPO_ROOT" - -ENV_PATH="$REPO_ROOT/.pixi/envs/dev" -BIN_PATH="$ENV_PATH/bin" - -if [ -d "$BIN_PATH" ]; then - export PATH="$BIN_PATH:$PATH" - export CONDA_PREFIX="$ENV_PATH" -else - echo "Pixi dev environment not found. Run 'pixi install' to create it." >&2 - return 1 2>/dev/null || exit 1 -fi - -# Run additional activation script to set environment variables -# shellcheck source=/dev/null -. "$REPO_ROOT/admin/set_env_vars.sh" From b925fbb1d68e15a377139b1d4fde5d60a16c52a1 Mon Sep 17 00:00:00 2001 From: Ben J Woodcroft Date: Thu, 14 Aug 2025 13:58:22 +1000 Subject: [PATCH 4/8] feat: add flexible genome input options --- singlem/lyrebird.py | 3 ++- singlem/main.py | 32 +++++++++++++++++++++++++++++--- test/test_pipe.py | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 4 deletions(-) diff --git a/singlem/lyrebird.py b/singlem/lyrebird.py index 281ac542..95c71e37 100644 --- a/singlem/lyrebird.py +++ b/singlem/lyrebird.py @@ -22,7 +22,7 @@ import singlem import singlem.pipe as pipe -from singlem.main import add_common_pipe_arguments, add_less_common_pipe_arguments, validate_pipe_args, add_condense_arguments, generate_streaming_otu_table_from_args, get_min_orf_length, get_min_taxon_coverage +from singlem.main import add_common_pipe_arguments, add_less_common_pipe_arguments, validate_pipe_args, add_condense_arguments, generate_streaming_otu_table_from_args, get_min_orf_length, get_min_taxon_coverage, parse_genome_fasta_files from singlem.pipe import SearchPipe from singlem.condense import Condenser from singlem.metapackage import LYREBIRD_DATA_ENVIRONMENT_VARIABLE, CUSTOM_TAXONOMY_DATABASE_NAME @@ -86,6 +86,7 @@ def main(): add_condense_arguments(condense_parser) args = bird_argparser.parse_the_args() + parse_genome_fasta_files(args) if args.debug: loglevel = logging.DEBUG diff --git a/singlem/main.py b/singlem/main.py index 798287fb..783948d2 100755 --- a/singlem/main.py +++ b/singlem/main.py @@ -77,14 +77,24 @@ def add_common_pipe_arguments(argument_group, extra_args=False): nargs='+', metavar='sequence_file', help='reverse reads to be searched. Can be FASTA or FASTQ format, GZIP-compressed or not.') - sequence_input_group.add_argument('--genome-fasta-files', + sequence_input_group.add_argument('-f', '--genome-fasta-files', nargs='+', - metavar='sequence_file', - help='nucleotide genome sequence(s) to be searched') + metavar='PATH', + help='Path(s) to FASTA files of each genome e.g. pathA/genome1.fna pathB/genome2.fa.') + sequence_input_group.add_argument('-d', '--genome-fasta-directory', + metavar='PATH', + help='Directory containing FASTA files of each genome.') + sequence_input_group.add_argument('--genome-fasta-list', + metavar='PATH', + help='File containing FASTA file paths, one per line.') sequence_input_group.add_argument('--sra-files', nargs='+', metavar='sra_file', help='"sra" format files (usually from NCBI SRA) to be searched') + argument_group.add_argument('-x', '--genome-fasta-extension', + metavar='EXT', + help='File extension of genomes in the directory specified with -d/--genome-fasta-directory. [default: fna]', + default='fna') argument_group.add_argument('--read-chunk-size', type=int, metavar='num_reads', @@ -340,6 +350,21 @@ def get_min_taxon_coverage(args, subparser='pipe'): else: return CONDENSE_DEFAULT_MIN_TAXON_COVERAGE +def parse_genome_fasta_files(args): + genomes = [] + if getattr(args, 'genome_fasta_files', None): + genomes.extend(args.genome_fasta_files) + if getattr(args, 'genome_fasta_directory', None): + extension = getattr(args, 'genome_fasta_extension', 'fna') + for fn in sorted(os.listdir(args.genome_fasta_directory)): + if fn.endswith('.' + extension): + genomes.append(os.path.join(args.genome_fasta_directory, fn)) + if getattr(args, 'genome_fasta_list', None): + with open(args.genome_fasta_list) as f: + genomes.extend([line.strip() for line in f if line.strip()]) + args.genome_fasta_files = genomes if genomes else None + return args + def main(): bird_argparser = BirdArgparser( program='SingleM', @@ -721,6 +746,7 @@ def main(): supplement_rare_group.add_argument('--new-taxonomy-database-version', help='Version of the taxonomy database to use [default: None]') args = bird_argparser.parse_the_args() + parse_genome_fasta_files(args) if args.debug: loglevel = logging.DEBUG diff --git a/test/test_pipe.py b/test/test_pipe.py index 650e75f9..66fa9724 100644 --- a/test/test_pipe.py +++ b/test/test_pipe.py @@ -24,6 +24,7 @@ import unittest import os.path import tempfile +import shutil import extern import sys import json @@ -1002,6 +1003,41 @@ def test_genome(self): list([line.split("\t") for line in expected]), extern.run(cmd)) + def test_genome_directory_input(self): + expected = [ + "\t".join(self.headers), + 'S1.13.ribosomal_protein_S15P_S13e\tuap2\tAAGGATTTGAGTGCAAAAAGAGGACTCGATTTTACAGAGGCAAAGATAAGAAAACTTGGA\t1\t1.00\tRoot; d__Archaea; p__Halobacterota; c__Methanomicrobia; o__Methanomicrobiales; f__Methanocullaceae; g__Methanoculleus', + '' + ] + with tempfile.TemporaryDirectory() as d: + shutil.copy(os.path.join(path_to_data, 'uap2.fna'), os.path.join(d, 'uap2.fa')) + cmd = '{} pipe --genome-fasta-directory {} --genome-fasta-extension fa --singlem-packages {}/S1.13.chainsaw.gpkg.spkg --otu-table /dev/stdout --assignment-method diamond'.format( + path_to_script, + d, + path_to_data, + ) + self.assertEqualOtuTable( + list([line.split("\t") for line in expected]), + extern.run(cmd)) + + def test_genome_fasta_list_input(self): + expected = [ + "\t".join(self.headers), + 'S1.13.ribosomal_protein_S15P_S13e\tuap2\tAAGGATTTGAGTGCAAAAAGAGGACTCGATTTTACAGAGGCAAAGATAAGAAAACTTGGA\t1\t1.00\tRoot; d__Archaea; p__Halobacterota; c__Methanomicrobia; o__Methanomicrobiales; f__Methanocullaceae; g__Methanoculleus', + '' + ] + with tempfile.NamedTemporaryFile(mode='w', suffix='.lst') as lst: + lst.write(os.path.join(path_to_data, 'uap2.fna')) + lst.flush() + cmd = '{} pipe --genome-fasta-list {} --singlem-packages {}/S1.13.chainsaw.gpkg.spkg --otu-table /dev/stdout --assignment-method diamond'.format( + path_to_script, + lst.name, + path_to_data, + ) + self.assertEqualOtuTable( + list([line.split("\t") for line in expected]), + extern.run(cmd)) + def test_prefilter_piped_in_input(self): with tempfile.NamedTemporaryFile(suffix='.mkfifo.fna') as fifo: with tempfile.NamedTemporaryFile(suffix='.mkfifo.fna') as script: From 19b801c5d6fcc7a387563373b87cd18bce1bdaec Mon Sep 17 00:00:00 2001 From: Ben J Woodcroft Date: Sun, 17 Aug 2025 08:30:24 +1000 Subject: [PATCH 5/8] summarise: Fail when --krona used with taxonomic profile input (#242) * Warn when --krona used with taxonomic profiles * chore: drop generated docs --- singlem/main.py | 2 ++ test/test_summariser.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/singlem/main.py b/singlem/main.py index 783948d2..0d12d657 100755 --- a/singlem/main.py +++ b/singlem/main.py @@ -886,6 +886,8 @@ def main(): raise Exception("--collapse-paired-with-unpaired-archive-otu-table currently only works with archive tables") elif not len(args.input_archive_otu_tables) == 2: raise Exception("--collapse-paired-with-unpaired-archive-otu-table requires exactly two archive tables") + if args.krona and args.input_taxonomic_profiles: + raise Exception("--krona generates a krona file from an OTU table. Use --output-taxonomic-profile-krona when providing taxonomic profiles") if args.output_taxonomic_profile_krona and not args.input_taxonomic_profiles: raise Exception("--output-taxonomic-profile-krona requires --input-taxonomic-profiles to be defined") if args.output_archive_otu_table and not args.collapse_to_sample_name: diff --git a/test/test_summariser.py b/test/test_summariser.py index 7c63e956..7c463283 100644 --- a/test/test_summariser.py +++ b/test/test_summariser.py @@ -154,6 +154,11 @@ def test_krona(self): table_collection=table_collection) self.assertTrue(os.path.exists(os.path.join(tmp,'KronaOK.html'))) + def test_krona_with_taxonomic_profile_input(self): + with self.assertRaises(Exception) as cm: + extern.run(f'singlem summarise --input-taxonomic-profile {path_to_data}/summarise/profile1.tsv --krona /dev/null') + self.assertIn('--output-taxonomic-profile-krona', str(cm.exception)) + def test_wide_format(self): e = [['gene','sample','sequence','num_hits','coverage','taxonomy'], ['4.11.ribosomal_protein_L10','minimal','TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTA','2','4.88','Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Staphylococcaceae; g__Staphylococcus'], From 5919ddf0762f067e1647ce5120ed341f4e47baf4 Mon Sep 17 00:00:00 2001 From: Ben J Woodcroft Date: Wed, 20 Aug 2025 14:09:53 +1000 Subject: [PATCH 6/8] Make GTDB-Tk optional. Closes #245 Reported-by: @dutchscientist --- admin/build_docs.py | 1 + admin/environment.yml | 1 - admin/requirements.txt | 1 - docs/preludes/supplement_prelude.md | 16 ++++++++++++++++ docs/tools/supplement.md | 7 +++++++ pixi.lock | 6 ------ pixi.toml | 2 +- singlem/supplement.py | 2 ++ 8 files changed, 27 insertions(+), 9 deletions(-) create mode 100644 docs/preludes/supplement_prelude.md diff --git a/admin/build_docs.py b/admin/build_docs.py index 9afa7a41..7986ba4f 100755 --- a/admin/build_docs.py +++ b/admin/build_docs.py @@ -75,6 +75,7 @@ def get_version(relpath): 'appraise': 'INPUT OTU TABLE OPTIONS', 'seqs': 'OPTIONS', 'metapackage': 'OPTIONS', + 'supplement': 'OPTIONS', } logging.info("For ROFF for command {}, removing everything before '{}'".format( subcommand, splitters[subcommand])) diff --git a/admin/environment.yml b/admin/environment.yml index 0ceecee6..f0b8e9b5 100644 --- a/admin/environment.yml +++ b/admin/environment.yml @@ -13,7 +13,6 @@ dependencies: - fasttree=2.1.11 - galah=0.4.2 - graftm=0.15.1 -- gtdbtk=2.4.1 - hmmer=3.2.1 - jinja2=3.1.6 - krona=2.8.1 diff --git a/admin/requirements.txt b/admin/requirements.txt index e4f3140e..45ad4fd7 100644 --- a/admin/requirements.txt +++ b/admin/requirements.txt @@ -3,7 +3,6 @@ biopython==1.85 extern==0.4.1 fastalite graftm==0.15.1 -gtdbtk==2.4.1 Jinja2==3.1.6 pandas==2.2.3 pip==25.0.1 diff --git a/docs/preludes/supplement_prelude.md b/docs/preludes/supplement_prelude.md new file mode 100644 index 00000000..5446e6c8 --- /dev/null +++ b/docs/preludes/supplement_prelude.md @@ -0,0 +1,16 @@ +The SingleM `supplement` subcommand adds genomes to a SingleM metapackage. + +**TLDR**: Supplement a metapackage with new genomes like so: +``` +singlem supplement --new-genome-fasta-files \ + --output-metapackage +``` + +## GTDB-Tk +In order to add genomes to a metapackage, their taxonomy is required. SingleM `supplement` can generate this taxonomy for new genomes using GTDB-Tk. However, GTDB-Tk is not installed by default, and so it must be installed separately. + +For instance, if you are using conda to manage dependencies, use: +``` +conda install gtdbtk +``` +Note that the version of GTDB-Tk installed must match the GTDB release of the metapackage. The [GTDB-Tk documentation](https://ecogenomics.github.io/GTDBTk/installing/index.html) provides guidance on installation and compatibility. diff --git a/docs/tools/supplement.md b/docs/tools/supplement.md index 11374954..3f112f0d 100644 --- a/docs/tools/supplement.md +++ b/docs/tools/supplement.md @@ -3,6 +3,13 @@ title: SingleM supplement --- # singlem supplement +**TLDR**: Supplement a metapackage with new genomes like so: +``` +singlem supplement --new-genome-fasta-files \ + --output-metapackage +``` +GTDB-Tk may be needed to assign taxonomy to the new genomes. If required, install it separately (e.g. `conda install gtdbtk`) and ensure its version matches the GTDB release of the metapackage. + # DESCRIPTION Create a new metapackage from a vanilla one plus new genomes diff --git a/pixi.lock b/pixi.lock index 9e76f55b..8c5f828d 100644 --- a/pixi.lock +++ b/pixi.lock @@ -36,7 +36,6 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2025.1.31-hbcca054_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cached-property-1.5.2-hd8ed1ab_1.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/cached_property-1.5.2-pyha770c72_1.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/linux-64/capnproto-1.0.2-h2b92303_1.conda - conda: https://conda.anaconda.org/bioconda/linux-64/cd-hit-4.8.1-h43eeafb_11.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/certifi-2025.1.31-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-1.17.1-py312h06ac9bb_0.conda @@ -66,7 +65,6 @@ environments: - conda: https://conda.anaconda.org/bioconda/noarch/graftm-0.15.1-pyhdfd78af_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/greenlet-3.2.0-py312h2ec8cdc_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/gsl-2.7-he838d99_0.tar.bz2 - - conda: https://conda.anaconda.org/bioconda/noarch/gtdbtk-2.4.1-pyhdfd78af_1.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.2.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/h5py-3.13.0-nompi_py312hedeef09_100.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/hdf5-1.14.3-nompi_hdf9ad27_105.conda @@ -143,7 +141,6 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.9.4-hcb278e6_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/mafft-7.526-h4bc722e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py312h178313f_1.conda - - conda: https://conda.anaconda.org/bioconda/linux-64/mash-2.3-hc74b729_7.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.10.1-py312hd3ec401_0.conda - conda: https://conda.anaconda.org/bioconda/linux-64/mfqe-0.5.0-h7b50bb2_5.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/mpfr-4.2.1-h90cbb55_3.conda @@ -171,7 +168,6 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/pyarrow-16.1.0-py312h9cebb41_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pyarrow-core-16.1.0-py312h0983c49_2_cpu.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/pydantic-1.10.21-py312h30e4e5e_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pygtrie-2.5.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.2.3-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/bioconda/noarch/pyranges-0.1.4-pyhdfd78af_0.tar.bz2 @@ -191,7 +187,6 @@ environments: - conda: https://conda.anaconda.org/bioconda/noarch/seqmagick-0.8.6-pyhdfd78af_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-78.1.0-pyhff2d567_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/bioconda/linux-64/skani-0.2.2-ha6fb395_2.tar.bz2 - conda: https://conda.anaconda.org/bioconda/linux-64/smafa-0.8.0-hc1c3326_1.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/snappy-1.2.1-h8bd8927_1.conda - conda: https://conda.anaconda.org/bioconda/linux-64/sorted_nearest-0.0.39-py312h0fa9677_5.tar.bz2 @@ -3200,7 +3195,6 @@ packages: - extern==0.4.1 - fastalite - graftm==0.15.1 - - gtdbtk==2.4.1 - jinja2==3.1.6 - pandas==2.2.3 - pip==25.0.1 diff --git a/pixi.toml b/pixi.toml index 932f73dd..db4bbcfb 100644 --- a/pixi.toml +++ b/pixi.toml @@ -46,7 +46,6 @@ pyarrow = "*" galah = ">=0.4.0" sqlparse = "*" # Required indirectly (e.g. taxtastic) zenodo_backpack = ">=0.3.0" -gtdbtk = "==2.4.1" # Must match the version of GTDB release used to make the default metapackage. # Optional (commented out) # python-annoy = "*" # nmslib = "*" @@ -61,6 +60,7 @@ ipython = "*" toml = "*" python-build = "*" setuptools-scm = "*" +gtdbtk = "==2.4.1" # Must match the version of GTDB release used to make the default metapackage. [environments] dev = ["dev"] diff --git a/singlem/supplement.py b/singlem/supplement.py index 664e8385..296d5d7c 100755 --- a/singlem/supplement.py +++ b/singlem/supplement.py @@ -127,6 +127,8 @@ def generate_taxonomy_for_new_genomes(**kwargs): # run gtdbtk to temporary output directory gtdbtk_output = os.path.join(working_directory, 'gtdbtk_output') logging.info("Running GTDBtk to generate taxonomy for new genomes ..") + if not shutil.which("gtdbtk"): + raise Exception("gtdbtk is not installed by default; install it (e.g. 'conda install gtdbtk').") # logging.warning("mash_db used is specific to QUT's CMR cluster, will fix this in future") cmd = f'gtdbtk classify_wf --cpus {threads} --batchfile {batchfile.name} --out_dir {gtdbtk_output} --mash_db {working_directory}/gtdbtk_mash.msh' if pplacer_threads: From 6959b5dedb3901fa756861b605283ad585d5e30e Mon Sep 17 00:00:00 2001 From: Ben Woodcroft Date: Fri, 22 Aug 2025 15:16:53 +1000 Subject: [PATCH 7/8] actions: Pytest with -v. --- .github/workflows/test-singlem.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-singlem.yml b/.github/workflows/test-singlem.yml index 16abef66..ade7c57e 100644 --- a/.github/workflows/test-singlem.yml +++ b/.github/workflows/test-singlem.yml @@ -23,7 +23,7 @@ jobs: - name: Run tests with Pixi run: | - pixi run -e dev --frozen pytest test + pixi run -e dev --frozen pytest test -v # Run after removing lock file so dependences are unlocked pixi_test_dependencies_optimistic: @@ -55,4 +55,4 @@ jobs: - name: Run tests with Pixi run: | - pixi run -e dev pytest test + pixi run -e dev pytest test -v From e30f60c1147ed375e980ea618e155fe9f92b9dfc Mon Sep 17 00:00:00 2001 From: Ben J Woodcroft Date: Tue, 26 Aug 2025 20:22:28 +1000 Subject: [PATCH 8/8] Add docs AGENTS instructions --- docs/AGENTS.md | 4 +++ docs/preludes/pipe_prelude.md | 4 ++- singlem/lyrebird.py | 1 - singlem/main.py | 17 ++++++------- singlem/pipe.py | 48 ++--------------------------------- test/test_pipe.py | 2 +- 6 files changed, 18 insertions(+), 58 deletions(-) create mode 100644 docs/AGENTS.md diff --git a/docs/AGENTS.md b/docs/AGENTS.md new file mode 100644 index 00000000..5685fca6 --- /dev/null +++ b/docs/AGENTS.md @@ -0,0 +1,4 @@ +# AGENTS + +- Files under `docs/tools/` and `docs/advanced/` are autogenerated; do not modify them directly. +- `docs/Installation.md` is autogenerated; make changes in `docs/Installation.md.in` instead. diff --git a/docs/preludes/pipe_prelude.md b/docs/preludes/pipe_prelude.md index 63493725..8fc1d8dc 100644 --- a/docs/preludes/pipe_prelude.md +++ b/docs/preludes/pipe_prelude.md @@ -7,7 +7,9 @@ To further convert the generated taxonomic profile to other formats that might b ## Algorithm details -**Details**: In its most common usage, the SingleM `pipe` subcommand takes as input raw metagenomic reads and outputs a taxonomic profile. It can also take as input whole genomes (or contigs), and can output a table of OTUs. Note that taxonomic profiles are generated from OTU tables, they are [not the same thing](/Glossary). +**Details**: In its most common usage, the SingleM `pipe` subcommand takes as input raw metagenomic reads and outputs a taxonomic profile. Note that taxonomic profiles are generated from OTU tables, they are [not the same thing](/Glossary). + +It can also take as input whole genomes (or contigs) via `--genome-fasta-files`, `--genome-fasta-directory` or `--genome-fasta-list`. These options behave the same as providing sequences with `--forward`, except that different defaults for `--min-taxon-coverage` and `--min-orf-length` are used. `pipe` performs three steps: diff --git a/singlem/lyrebird.py b/singlem/lyrebird.py index 95c71e37..2ad590f3 100644 --- a/singlem/lyrebird.py +++ b/singlem/lyrebird.py @@ -108,7 +108,6 @@ def main(): singlem.pipe.SearchPipe().run( sequences = args.forward, reverse_read_files = args.reverse, - genomes = args.genome_fasta_files, input_sra_files = args.sra_files, otu_table = args.otu_table, archive_otu_table = args.archive_otu_table, diff --git a/singlem/main.py b/singlem/main.py index 0d12d657..b70db235 100755 --- a/singlem/main.py +++ b/singlem/main.py @@ -80,13 +80,13 @@ def add_common_pipe_arguments(argument_group, extra_args=False): sequence_input_group.add_argument('-f', '--genome-fasta-files', nargs='+', metavar='PATH', - help='Path(s) to FASTA files of each genome e.g. pathA/genome1.fna pathB/genome2.fa.') + help='Path(s) to genome FASTA files. These are processed like input given with --forward, but use higher default values for --min-taxon-coverage and --min-orf-length.') sequence_input_group.add_argument('-d', '--genome-fasta-directory', metavar='PATH', - help='Directory containing FASTA files of each genome.') + help='Directory containing genome FASTA files. Treated identically to --forward input with higher default values for --min-taxon-coverage and --min-orf-length.') sequence_input_group.add_argument('--genome-fasta-list', metavar='PATH', - help='File containing FASTA file paths, one per line.') + help='File containing genome FASTA paths, one per line. Behaviour matches --forward with higher default values for --min-taxon-coverage and --min-orf-length.') sequence_input_group.add_argument('--sra-files', nargs='+', metavar='sra_file', @@ -151,7 +151,7 @@ def add_less_common_pipe_arguments(argument_group, extra_args=False): help='HMMSEARCH e-value cutoff to use for sequence gathering [default: %s]' % SearchPipe.DEFAULT_HMMSEARCH_EVALUE, default=SearchPipe.DEFAULT_HMMSEARCH_EVALUE) argument_group.add_argument('--min-orf-length', metavar='length', - help='When predicting ORFs require this many base pairs uninterrupted by a stop codon [default: %i when input is reads, %i when input is genomes]' % (SearchPipe.DEFAULT_MIN_ORF_LENGTH, SearchPipe.DEFAULT_GENOME_MIN_ORF_LENGTH), + help='When predicting ORFs require this many base pairs uninterrupted by a stop codon [default: %i for reads, %i for genomes]' % (SearchPipe.DEFAULT_MIN_ORF_LENGTH, SearchPipe.DEFAULT_GENOME_MIN_ORF_LENGTH), type=int) argument_group.add_argument('--restrict-read-length', metavar='length', @@ -333,11 +333,9 @@ def generate_streaming_otu_table_from_args(args, def get_min_orf_length(args, subparser='pipe'): if args.min_orf_length: return args.min_orf_length - elif subparser=='pipe' and (args.forward or args.sra_files): - return SearchPipe.DEFAULT_MIN_ORF_LENGTH - elif subparser=='pipe' and args.genome_fasta_files: + elif subparser == 'pipe' and args.genome_fasta_files: return SearchPipe.DEFAULT_GENOME_MIN_ORF_LENGTH - elif subparser=='renew': + elif subparser in ('pipe', 'renew'): return SearchPipe.DEFAULT_MIN_ORF_LENGTH else: raise Exception("Programming error") @@ -363,6 +361,8 @@ def parse_genome_fasta_files(args): with open(args.genome_fasta_list) as f: genomes.extend([line.strip() for line in f if line.strip()]) args.genome_fasta_files = genomes if genomes else None + if args.genome_fasta_files: + args.forward = args.genome_fasta_files return args def main(): @@ -768,7 +768,6 @@ def main(): singlem.pipe.SearchPipe().run( sequences = args.forward, reverse_read_files = args.reverse, - genomes = args.genome_fasta_files, input_sra_files = args.sra_files, read_chunk_size = args.read_chunk_size, read_chunk_number = args.read_chunk_number, diff --git a/singlem/pipe.py b/singlem/pipe.py index b1cb1184..26850830 100644 --- a/singlem/pipe.py +++ b/singlem/pipe.py @@ -24,7 +24,6 @@ from .archive_otu_table import ArchiveOtuTable from .taxonomy import * from .otu_table_collection import StreamingOtuTableCollection, OtuTableCollection -from .genome_fasta_mux import GenomeFastaMux from graftm.sequence_extractor import SequenceExtractor from graftm.greengenes_taxonomy import GreenGenesTaxonomy @@ -151,7 +150,6 @@ def run_to_otu_table(self, **kwargs): input_sra_files = kwargs.pop('input_sra_files',None) read_chunk_size = kwargs.pop('read_chunk_size', None) read_chunk_number = kwargs.pop('read_chunk_number', None) - genome_fasta_files = kwargs.pop('genomes', None) sleep_after_mkfifo = kwargs.pop('sleep_after_mkfifo', None) num_threads = kwargs.pop('threads') known_otu_tables = kwargs.pop('known_otu_tables', None) @@ -205,8 +203,6 @@ def run_to_otu_table(self, **kwargs): if diamond_prefilter_db: hmms.set_prefilter_db_path(diamond_prefilter_db) - if genome_fasta_files and forward_read_files: - raise Exception("Cannot process reads and genomes in the same run") analysing_pairs = reverse_read_files is not None if analysing_pairs: @@ -274,26 +270,6 @@ def run_to_otu_table(self, **kwargs): os.mkdir(tempfile_directory) tempfile.tempdir = tempfile_directory - #### Preprocess genomes into transcripts to speed the rest of the pipeline - if genome_fasta_files: - logging.info("Calling rough transcriptome of genome FASTA files") - genome_fasta_mux = GenomeFastaMux(genome_fasta_files) # to deal with mux and demux of sequence names - - # Create a single tempfile with ORFs from all genomes, and then - # demultiplex later, because this saves a bunch of syscalls. - # - # Make a tempfile with delete=False because it is in a tmpdir already, and useful for debug to keep around with --working-directory - transcripts_path = tempfile.NamedTemporaryFile(prefix='singlem-genome-orfs', suffix='.fasta', delete=False) - transcripts_path.close() - for fasta in genome_fasta_files: - extern.run("orfm -c {} -m {} -t >(sed 's/>/>{}|/' >>{}) {} >/dev/null".format( - self._translation_table, - self._min_orf_length, - genome_fasta_mux.fasta_to_prefix(fasta), - transcripts_path.name, - fasta)) - forward_read_files = [transcripts_path.name] - def return_cleanly(): if using_temporary_working_directory and not working_directory_dev_shm: # Directly setting tempdir in this way is not recommended, but @@ -315,9 +291,6 @@ def return_cleanly(): elif analysing_pairs: logging.info("Using as input %i different pairs of sequence files e.g. %s & %s" % ( len(forward_read_files), forward_read_files[0], reverse_read_files[0])) - elif genome_fasta_files: - logging.info("Using as input %i different genomes e.g. %s" % ( - len(genome_fasta_files), genome_fasta_files[0])) else: logging.info("Using as input %i different sequence files e.g. %s" % ( len(forward_read_files), forward_read_files[0])) @@ -491,16 +464,6 @@ def finish_sra_extraction_processes(sra_extraction_processes, sra_extraction_com - #### Remove duplications which happen when OrfM hits the same sequence more than once. - if genome_fasta_files: - logging.info("Removing duplicate sequences from rough transcriptome ..") - for readset in extracted_reads: - # the read~gene format introduced for long read compatibility - # breaks genome_fasta_files mode, so need to remove it - for sequence in readset.unknown_sequences: - sequence.name = sequence.name.split('~')[0] - readset.remove_duplicate_sequences() - #### Extract diamond_taxonomy_assignment_performance_parameters from metapackage (v5 metapackages only) if diamond_taxonomy_assignment_performance_parameters == None: diamond_taxonomy_assignment_performance_parameters = metapackage_object.diamond_taxonomy_assignment_performance_parameters() @@ -519,12 +482,8 @@ def finish_sra_extraction_processes(sra_extraction_processes, sra_extraction_com known_taxes=known_taxes, output_jplace=output_jplace, assignment_singlem_db=assignment_singlem_db, - genome_fasta_files=genome_fasta_files ) - if genome_fasta_files: - otu_table_object = genome_fasta_mux.demux_otu_table(otu_table_object) - return_cleanly() return otu_table_object @@ -539,7 +498,6 @@ def assign_taxonomy_and_process(self, **kwargs): known_taxes = kwargs.pop('known_taxes') output_jplace = kwargs.pop('output_jplace') assignment_singlem_db = kwargs.pop('assignment_singlem_db') - genome_fasta_files = kwargs.pop('genome_fasta_files', False) if len(kwargs) > 0: raise Exception("Unexpected arguments detected: %s" % kwargs) @@ -577,8 +535,7 @@ def assign_taxonomy_and_process(self, **kwargs): known_sequence_tax if known_sequence_taxonomy else None, # outputs otu_table_object, - package_to_taxonomy_bihash, - genome_fasta_files) + package_to_taxonomy_bihash) return otu_table_object @@ -620,8 +577,7 @@ def _process_taxonomically_assigned_reads( known_sequence_tax, # outputs otu_table_object, - package_to_taxonomy_bihash, - genome_fasta_files): + package_to_taxonomy_bihash): # To deal with paired reads, process each. Then exclude second reads # from pairs where both match. diff --git a/test/test_pipe.py b/test/test_pipe.py index 66fa9724..463bce66 100644 --- a/test/test_pipe.py +++ b/test/test_pipe.py @@ -1194,7 +1194,7 @@ def test_exclude_off_target_hits(self): def test_genome_input_dereplication(self): expected = 'gene sample sequence num_hits coverage taxonomy\n' \ - '4.12.22seqs GCA_000309865.1_genomic GATGGCGGTAAAGCCACTCCCGGCCCACCATTAGGTCCAGCAATCGGACCCCTAGGTATC 1 1.00 ' + '4.12.22seqs GCA_000309865.1_genomic CCGGCTTTTCAGATCGCACCGGATCCAACAGTTGCATTCACAGTTGGCTATTTAGGAGTG 1 1.00 ' # ~/git/singlem/bin/singlem pipe --genome-fasta-files genomes/GCA_000309865.1_genomic.fna --singlem-package ../4.12.22seqs.spkg/ --otu-table /dev/stdout --no-assign-taxonomy --min-orf-length 96 cmd = '{} pipe --translation-table 11 --genome-fasta-files {} --singlem-package {} --otu-table /dev/stdout --no-assign-taxonomy --min-orf-length 96'.format( path_to_script,