diff --git a/.github/workflows/test-singlem.yml b/.github/workflows/test-singlem.yml index 16abef66..ade7c57e 100644 --- a/.github/workflows/test-singlem.yml +++ b/.github/workflows/test-singlem.yml @@ -23,7 +23,7 @@ jobs: - name: Run tests with Pixi run: | - pixi run -e dev --frozen pytest test + pixi run -e dev --frozen pytest test -v # Run after removing lock file so dependences are unlocked pixi_test_dependencies_optimistic: @@ -55,4 +55,4 @@ jobs: - name: Run tests with Pixi run: | - pixi run -e dev pytest test + pixi run -e dev pytest test -v diff --git a/admin/build_docs.py b/admin/build_docs.py index 9afa7a41..7986ba4f 100755 --- a/admin/build_docs.py +++ b/admin/build_docs.py @@ -75,6 +75,7 @@ def get_version(relpath): 'appraise': 'INPUT OTU TABLE OPTIONS', 'seqs': 'OPTIONS', 'metapackage': 'OPTIONS', + 'supplement': 'OPTIONS', } logging.info("For ROFF for command {}, removing everything before '{}'".format( subcommand, splitters[subcommand])) diff --git a/admin/environment.yml b/admin/environment.yml index 0ceecee6..f0b8e9b5 100644 --- a/admin/environment.yml +++ b/admin/environment.yml @@ -13,7 +13,6 @@ dependencies: - fasttree=2.1.11 - galah=0.4.2 - graftm=0.15.1 -- gtdbtk=2.4.1 - hmmer=3.2.1 - jinja2=3.1.6 - krona=2.8.1 diff --git a/admin/requirements.txt b/admin/requirements.txt index e4f3140e..45ad4fd7 100644 --- a/admin/requirements.txt +++ b/admin/requirements.txt @@ -3,7 +3,6 @@ biopython==1.85 extern==0.4.1 fastalite graftm==0.15.1 -gtdbtk==2.4.1 Jinja2==3.1.6 pandas==2.2.3 pip==25.0.1 diff --git a/docs/AGENTS.md b/docs/AGENTS.md new file mode 100644 index 00000000..5685fca6 --- /dev/null +++ b/docs/AGENTS.md @@ -0,0 +1,4 @@ +# AGENTS + +- Files under `docs/tools/` and `docs/advanced/` are autogenerated; do not modify them directly. +- `docs/Installation.md` is autogenerated; make changes in `docs/Installation.md.in` instead. diff --git a/docs/preludes/pipe_prelude.md b/docs/preludes/pipe_prelude.md index 63493725..8fc1d8dc 100644 --- a/docs/preludes/pipe_prelude.md +++ b/docs/preludes/pipe_prelude.md @@ -7,7 +7,9 @@ To further convert the generated taxonomic profile to other formats that might b ## Algorithm details -**Details**: In its most common usage, the SingleM `pipe` subcommand takes as input raw metagenomic reads and outputs a taxonomic profile. It can also take as input whole genomes (or contigs), and can output a table of OTUs. Note that taxonomic profiles are generated from OTU tables, they are [not the same thing](/Glossary). +**Details**: In its most common usage, the SingleM `pipe` subcommand takes as input raw metagenomic reads and outputs a taxonomic profile. Note that taxonomic profiles are generated from OTU tables, they are [not the same thing](/Glossary). + +It can also take as input whole genomes (or contigs) via `--genome-fasta-files`, `--genome-fasta-directory` or `--genome-fasta-list`. These options behave the same as providing sequences with `--forward`, except that different defaults for `--min-taxon-coverage` and `--min-orf-length` are used. `pipe` performs three steps: diff --git a/docs/preludes/supplement_prelude.md b/docs/preludes/supplement_prelude.md new file mode 100644 index 00000000..5446e6c8 --- /dev/null +++ b/docs/preludes/supplement_prelude.md @@ -0,0 +1,16 @@ +The SingleM `supplement` subcommand adds genomes to a SingleM metapackage. + +**TLDR**: Supplement a metapackage with new genomes like so: +``` +singlem supplement --new-genome-fasta-files \ + --output-metapackage +``` + +## GTDB-Tk +In order to add genomes to a metapackage, their taxonomy is required. SingleM `supplement` can generate this taxonomy for new genomes using GTDB-Tk. However, GTDB-Tk is not installed by default, and so it must be installed separately. + +For instance, if you are using conda to manage dependencies, use: +``` +conda install gtdbtk +``` +Note that the version of GTDB-Tk installed must match the GTDB release of the metapackage. The [GTDB-Tk documentation](https://ecogenomics.github.io/GTDBTk/installing/index.html) provides guidance on installation and compatibility. diff --git a/docs/tools/supplement.md b/docs/tools/supplement.md index 11374954..3f112f0d 100644 --- a/docs/tools/supplement.md +++ b/docs/tools/supplement.md @@ -3,6 +3,13 @@ title: SingleM supplement --- # singlem supplement +**TLDR**: Supplement a metapackage with new genomes like so: +``` +singlem supplement --new-genome-fasta-files \ + --output-metapackage +``` +GTDB-Tk may be needed to assign taxonomy to the new genomes. If required, install it separately (e.g. `conda install gtdbtk`) and ensure its version matches the GTDB release of the metapackage. + # DESCRIPTION Create a new metapackage from a vanilla one plus new genomes diff --git a/pixi.lock b/pixi.lock index 98a3d3d4..8c5f828d 100644 --- a/pixi.lock +++ b/pixi.lock @@ -24,6 +24,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-checksums-0.1.18-h83b837d_6.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-crt-cpp-0.26.9-he3a8b3b_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-sdk-cpp-1.11.329-hba8bd5f_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bash-5.2.37-h4be8908_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/biom-format-2.1.16-py312h66e93f0_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/biopython-1.85-py312h66e93f0_1.conda - conda: https://conda.anaconda.org/bioconda/noarch/bird_tool_utils_python-0.4.1-pyhdfd78af_0.tar.bz2 @@ -35,7 +36,6 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2025.1.31-hbcca054_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cached-property-1.5.2-hd8ed1ab_1.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/cached_property-1.5.2-pyha770c72_1.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/linux-64/capnproto-1.0.2-h2b92303_1.conda - conda: https://conda.anaconda.org/bioconda/linux-64/cd-hit-4.8.1-h43eeafb_11.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/certifi-2025.1.31-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-1.17.1-py312h06ac9bb_0.conda @@ -43,6 +43,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/click-8.1.8-pyh707e725_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.3.2-py312h68727a3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/coreutils-9.5-hd590300_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/curl-8.8.0-he654da7_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/bioconda/linux-64/dashing-0.4.0-h735f0e5_3.tar.bz2 @@ -64,7 +65,6 @@ environments: - conda: https://conda.anaconda.org/bioconda/noarch/graftm-0.15.1-pyhdfd78af_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/greenlet-3.2.0-py312h2ec8cdc_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/gsl-2.7-he838d99_0.tar.bz2 - - conda: https://conda.anaconda.org/bioconda/noarch/gtdbtk-2.4.1-pyhdfd78af_1.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.2.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/h5py-3.13.0-nompi_py312hedeef09_100.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/hdf5-1.14.3-nompi_hdf9ad27_105.conda @@ -141,7 +141,6 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.9.4-hcb278e6_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/mafft-7.526-h4bc722e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py312h178313f_1.conda - - conda: https://conda.anaconda.org/bioconda/linux-64/mash-2.3-hc74b729_7.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.10.1-py312hd3ec401_0.conda - conda: https://conda.anaconda.org/bioconda/linux-64/mfqe-0.5.0-h7b50bb2_5.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/mpfr-4.2.1-h90cbb55_3.conda @@ -169,7 +168,6 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/pyarrow-16.1.0-py312h9cebb41_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pyarrow-core-16.1.0-py312h0983c49_2_cpu.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/pydantic-1.10.21-py312h30e4e5e_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pygtrie-2.5.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.2.3-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/bioconda/noarch/pyranges-0.1.4-pyhdfd78af_0.tar.bz2 @@ -189,7 +187,6 @@ environments: - conda: https://conda.anaconda.org/bioconda/noarch/seqmagick-0.8.6-pyhdfd78af_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-78.1.0-pyhff2d567_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/bioconda/linux-64/skani-0.2.2-ha6fb395_2.tar.bz2 - conda: https://conda.anaconda.org/bioconda/linux-64/smafa-0.8.0-hc1c3326_1.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/snappy-1.2.1-h8bd8927_1.conda - conda: https://conda.anaconda.org/bioconda/linux-64/sorted_nearest-0.0.39-py312h0fa9677_5.tar.bz2 @@ -219,7 +216,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.13-h4ab18f5_6.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstandard-0.23.0-py312h66e93f0_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.6-ha6fb4c9_0.conda - - pypi: . + - pypi: ./ dev: channels: - url: https://conda.anaconda.org/conda-forge/ @@ -245,6 +242,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-checksums-0.1.18-h83b837d_6.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-crt-cpp-0.26.9-he3a8b3b_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-sdk-cpp-1.11.329-hba8bd5f_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bash-5.2.37-h4be8908_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/biom-format-2.1.16-py312h66e93f0_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/biopython-1.85-py312h66e93f0_1.conda - conda: https://conda.anaconda.org/bioconda/noarch/bird_tool_utils_python-0.4.1-pyhdfd78af_0.tar.bz2 @@ -264,6 +262,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/click-8.1.8-pyh707e725_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.3.2-py312h68727a3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/coreutils-9.5-hd590300_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/curl-8.8.0-he654da7_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/bioconda/linux-64/dashing-0.4.0-h735f0e5_3.tar.bz2 @@ -464,7 +463,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.13-h4ab18f5_6.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstandard-0.23.0-py312h66e93f0_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.6-ha6fb4c9_0.conda - - pypi: . + - pypi: ./ packages: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 sha256: fe51de6107f9edc7aa4f786a70f4a883943bc9d39b3bb7307c04c41410990726 @@ -690,6 +689,19 @@ packages: purls: [] size: 3638342 timestamp: 1716466127210 +- conda: https://conda.anaconda.org/conda-forge/linux-64/bash-5.2.37-h4be8908_0.conda + sha256: a0ce6ed2b346501be1fcae415e4df04618f822834902dc22174a350ae39c791e + md5: c918f7141733d412f5c579d07f437690 + depends: + - readline + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - readline >=8.2,<9.0a0 + license: GPL-3.0-or-later + license_family: GPL + purls: [] + size: 1929937 + timestamp: 1748631191479 - conda: https://conda.anaconda.org/conda-forge/linux-64/biom-format-2.1.16-py312h66e93f0_2.conda sha256: db6bf3fa7778c6d128ba92e883c1ce8ac8e5e5268b3a71b6d445fbaf91aa4f7e md5: 65e5dd22088478d2beca0ac782532868 @@ -932,6 +944,16 @@ packages: - pkg:pypi/contourpy?source=hash-mapping size: 276533 timestamp: 1744743235779 +- conda: https://conda.anaconda.org/conda-forge/linux-64/coreutils-9.5-hd590300_0.conda + sha256: 7cd3b0f55aa55bb27b045c30f32b3f6b874ecc006f3abcb274c71a3bcbacb358 + md5: 126d457e0e7a535278e808a7d8960015 + depends: + - libgcc-ng >=12 + license: GPL-3.0-or-later + license_family: GPL + purls: [] + size: 3014238 + timestamp: 1711655132451 - conda: https://conda.anaconda.org/conda-forge/linux-64/curl-8.8.0-he654da7_1.conda sha256: a7bb4f8d1cba26c238cd0469b7bdcdc032cf06b0ac0de09992638744eaab9954 md5: 78678b2ddfd9bd7c7861b8d2e3b7473b @@ -2825,7 +2847,8 @@ packages: - python license: BSD-3-Clause license_family: BSD - purls: [] + purls: + - pkg:pypi/pycparser?source=hash-mapping size: 110100 timestamp: 1733195786147 - conda: https://conda.anaconda.org/conda-forge/linux-64/pydantic-1.10.21-py312h30e4e5e_0.conda @@ -3163,16 +3186,15 @@ packages: - pkg:pypi/setuptools-scm?source=compressed-mapping size: 38426 timestamp: 1745450953205 -- pypi: . +- pypi: ./ name: singlem - version: 0.18.3 + version: 0.19.0 sha256: 02f7d50b8fb0afd29122360abd90e1d08fd3a7acfebb1c0e15daf6daa0aa187f requires_dist: - biopython==1.85 - extern==0.4.1 - fastalite - graftm==0.15.1 - - gtdbtk==2.4.1 - jinja2==3.1.6 - pandas==2.2.3 - pip==25.0.1 diff --git a/pixi.toml b/pixi.toml index 5d1c647b..db4bbcfb 100644 --- a/pixi.toml +++ b/pixi.toml @@ -11,6 +11,8 @@ scripts = ["admin/set_env_vars.sh"] [dependencies] python = ">=3.7" +coreutils = "*" # So sort --parallel works +bash = "*" diamond = ">=2.1.7,<2.1.11" # 2.1.11 segfaults (on some non-test data) biopython = "*" hmmer = "*" @@ -44,7 +46,6 @@ pyarrow = "*" galah = ">=0.4.0" sqlparse = "*" # Required indirectly (e.g. taxtastic) zenodo_backpack = ">=0.3.0" -gtdbtk = "==2.4.1" # Must match the version of GTDB release used to make the default metapackage. # Optional (commented out) # python-annoy = "*" # nmslib = "*" @@ -59,6 +60,7 @@ ipython = "*" toml = "*" python-build = "*" setuptools-scm = "*" +gtdbtk = "==2.4.1" # Must match the version of GTDB release used to make the default metapackage. [environments] dev = ["dev"] diff --git a/singlem/lyrebird.py b/singlem/lyrebird.py index 281ac542..2ad590f3 100644 --- a/singlem/lyrebird.py +++ b/singlem/lyrebird.py @@ -22,7 +22,7 @@ import singlem import singlem.pipe as pipe -from singlem.main import add_common_pipe_arguments, add_less_common_pipe_arguments, validate_pipe_args, add_condense_arguments, generate_streaming_otu_table_from_args, get_min_orf_length, get_min_taxon_coverage +from singlem.main import add_common_pipe_arguments, add_less_common_pipe_arguments, validate_pipe_args, add_condense_arguments, generate_streaming_otu_table_from_args, get_min_orf_length, get_min_taxon_coverage, parse_genome_fasta_files from singlem.pipe import SearchPipe from singlem.condense import Condenser from singlem.metapackage import LYREBIRD_DATA_ENVIRONMENT_VARIABLE, CUSTOM_TAXONOMY_DATABASE_NAME @@ -86,6 +86,7 @@ def main(): add_condense_arguments(condense_parser) args = bird_argparser.parse_the_args() + parse_genome_fasta_files(args) if args.debug: loglevel = logging.DEBUG @@ -107,7 +108,6 @@ def main(): singlem.pipe.SearchPipe().run( sequences = args.forward, reverse_read_files = args.reverse, - genomes = args.genome_fasta_files, input_sra_files = args.sra_files, otu_table = args.otu_table, archive_otu_table = args.archive_otu_table, diff --git a/singlem/main.py b/singlem/main.py index 798287fb..b70db235 100755 --- a/singlem/main.py +++ b/singlem/main.py @@ -77,14 +77,24 @@ def add_common_pipe_arguments(argument_group, extra_args=False): nargs='+', metavar='sequence_file', help='reverse reads to be searched. Can be FASTA or FASTQ format, GZIP-compressed or not.') - sequence_input_group.add_argument('--genome-fasta-files', + sequence_input_group.add_argument('-f', '--genome-fasta-files', nargs='+', - metavar='sequence_file', - help='nucleotide genome sequence(s) to be searched') + metavar='PATH', + help='Path(s) to genome FASTA files. These are processed like input given with --forward, but use higher default values for --min-taxon-coverage and --min-orf-length.') + sequence_input_group.add_argument('-d', '--genome-fasta-directory', + metavar='PATH', + help='Directory containing genome FASTA files. Treated identically to --forward input with higher default values for --min-taxon-coverage and --min-orf-length.') + sequence_input_group.add_argument('--genome-fasta-list', + metavar='PATH', + help='File containing genome FASTA paths, one per line. Behaviour matches --forward with higher default values for --min-taxon-coverage and --min-orf-length.') sequence_input_group.add_argument('--sra-files', nargs='+', metavar='sra_file', help='"sra" format files (usually from NCBI SRA) to be searched') + argument_group.add_argument('-x', '--genome-fasta-extension', + metavar='EXT', + help='File extension of genomes in the directory specified with -d/--genome-fasta-directory. [default: fna]', + default='fna') argument_group.add_argument('--read-chunk-size', type=int, metavar='num_reads', @@ -141,7 +151,7 @@ def add_less_common_pipe_arguments(argument_group, extra_args=False): help='HMMSEARCH e-value cutoff to use for sequence gathering [default: %s]' % SearchPipe.DEFAULT_HMMSEARCH_EVALUE, default=SearchPipe.DEFAULT_HMMSEARCH_EVALUE) argument_group.add_argument('--min-orf-length', metavar='length', - help='When predicting ORFs require this many base pairs uninterrupted by a stop codon [default: %i when input is reads, %i when input is genomes]' % (SearchPipe.DEFAULT_MIN_ORF_LENGTH, SearchPipe.DEFAULT_GENOME_MIN_ORF_LENGTH), + help='When predicting ORFs require this many base pairs uninterrupted by a stop codon [default: %i for reads, %i for genomes]' % (SearchPipe.DEFAULT_MIN_ORF_LENGTH, SearchPipe.DEFAULT_GENOME_MIN_ORF_LENGTH), type=int) argument_group.add_argument('--restrict-read-length', metavar='length', @@ -323,11 +333,9 @@ def generate_streaming_otu_table_from_args(args, def get_min_orf_length(args, subparser='pipe'): if args.min_orf_length: return args.min_orf_length - elif subparser=='pipe' and (args.forward or args.sra_files): - return SearchPipe.DEFAULT_MIN_ORF_LENGTH - elif subparser=='pipe' and args.genome_fasta_files: + elif subparser == 'pipe' and args.genome_fasta_files: return SearchPipe.DEFAULT_GENOME_MIN_ORF_LENGTH - elif subparser=='renew': + elif subparser in ('pipe', 'renew'): return SearchPipe.DEFAULT_MIN_ORF_LENGTH else: raise Exception("Programming error") @@ -340,6 +348,23 @@ def get_min_taxon_coverage(args, subparser='pipe'): else: return CONDENSE_DEFAULT_MIN_TAXON_COVERAGE +def parse_genome_fasta_files(args): + genomes = [] + if getattr(args, 'genome_fasta_files', None): + genomes.extend(args.genome_fasta_files) + if getattr(args, 'genome_fasta_directory', None): + extension = getattr(args, 'genome_fasta_extension', 'fna') + for fn in sorted(os.listdir(args.genome_fasta_directory)): + if fn.endswith('.' + extension): + genomes.append(os.path.join(args.genome_fasta_directory, fn)) + if getattr(args, 'genome_fasta_list', None): + with open(args.genome_fasta_list) as f: + genomes.extend([line.strip() for line in f if line.strip()]) + args.genome_fasta_files = genomes if genomes else None + if args.genome_fasta_files: + args.forward = args.genome_fasta_files + return args + def main(): bird_argparser = BirdArgparser( program='SingleM', @@ -721,6 +746,7 @@ def main(): supplement_rare_group.add_argument('--new-taxonomy-database-version', help='Version of the taxonomy database to use [default: None]') args = bird_argparser.parse_the_args() + parse_genome_fasta_files(args) if args.debug: loglevel = logging.DEBUG @@ -742,7 +768,6 @@ def main(): singlem.pipe.SearchPipe().run( sequences = args.forward, reverse_read_files = args.reverse, - genomes = args.genome_fasta_files, input_sra_files = args.sra_files, read_chunk_size = args.read_chunk_size, read_chunk_number = args.read_chunk_number, @@ -860,6 +885,8 @@ def main(): raise Exception("--collapse-paired-with-unpaired-archive-otu-table currently only works with archive tables") elif not len(args.input_archive_otu_tables) == 2: raise Exception("--collapse-paired-with-unpaired-archive-otu-table requires exactly two archive tables") + if args.krona and args.input_taxonomic_profiles: + raise Exception("--krona generates a krona file from an OTU table. Use --output-taxonomic-profile-krona when providing taxonomic profiles") if args.output_taxonomic_profile_krona and not args.input_taxonomic_profiles: raise Exception("--output-taxonomic-profile-krona requires --input-taxonomic-profiles to be defined") if args.output_archive_otu_table and not args.collapse_to_sample_name: diff --git a/singlem/pipe.py b/singlem/pipe.py index b1cb1184..26850830 100644 --- a/singlem/pipe.py +++ b/singlem/pipe.py @@ -24,7 +24,6 @@ from .archive_otu_table import ArchiveOtuTable from .taxonomy import * from .otu_table_collection import StreamingOtuTableCollection, OtuTableCollection -from .genome_fasta_mux import GenomeFastaMux from graftm.sequence_extractor import SequenceExtractor from graftm.greengenes_taxonomy import GreenGenesTaxonomy @@ -151,7 +150,6 @@ def run_to_otu_table(self, **kwargs): input_sra_files = kwargs.pop('input_sra_files',None) read_chunk_size = kwargs.pop('read_chunk_size', None) read_chunk_number = kwargs.pop('read_chunk_number', None) - genome_fasta_files = kwargs.pop('genomes', None) sleep_after_mkfifo = kwargs.pop('sleep_after_mkfifo', None) num_threads = kwargs.pop('threads') known_otu_tables = kwargs.pop('known_otu_tables', None) @@ -205,8 +203,6 @@ def run_to_otu_table(self, **kwargs): if diamond_prefilter_db: hmms.set_prefilter_db_path(diamond_prefilter_db) - if genome_fasta_files and forward_read_files: - raise Exception("Cannot process reads and genomes in the same run") analysing_pairs = reverse_read_files is not None if analysing_pairs: @@ -274,26 +270,6 @@ def run_to_otu_table(self, **kwargs): os.mkdir(tempfile_directory) tempfile.tempdir = tempfile_directory - #### Preprocess genomes into transcripts to speed the rest of the pipeline - if genome_fasta_files: - logging.info("Calling rough transcriptome of genome FASTA files") - genome_fasta_mux = GenomeFastaMux(genome_fasta_files) # to deal with mux and demux of sequence names - - # Create a single tempfile with ORFs from all genomes, and then - # demultiplex later, because this saves a bunch of syscalls. - # - # Make a tempfile with delete=False because it is in a tmpdir already, and useful for debug to keep around with --working-directory - transcripts_path = tempfile.NamedTemporaryFile(prefix='singlem-genome-orfs', suffix='.fasta', delete=False) - transcripts_path.close() - for fasta in genome_fasta_files: - extern.run("orfm -c {} -m {} -t >(sed 's/>/>{}|/' >>{}) {} >/dev/null".format( - self._translation_table, - self._min_orf_length, - genome_fasta_mux.fasta_to_prefix(fasta), - transcripts_path.name, - fasta)) - forward_read_files = [transcripts_path.name] - def return_cleanly(): if using_temporary_working_directory and not working_directory_dev_shm: # Directly setting tempdir in this way is not recommended, but @@ -315,9 +291,6 @@ def return_cleanly(): elif analysing_pairs: logging.info("Using as input %i different pairs of sequence files e.g. %s & %s" % ( len(forward_read_files), forward_read_files[0], reverse_read_files[0])) - elif genome_fasta_files: - logging.info("Using as input %i different genomes e.g. %s" % ( - len(genome_fasta_files), genome_fasta_files[0])) else: logging.info("Using as input %i different sequence files e.g. %s" % ( len(forward_read_files), forward_read_files[0])) @@ -491,16 +464,6 @@ def finish_sra_extraction_processes(sra_extraction_processes, sra_extraction_com - #### Remove duplications which happen when OrfM hits the same sequence more than once. - if genome_fasta_files: - logging.info("Removing duplicate sequences from rough transcriptome ..") - for readset in extracted_reads: - # the read~gene format introduced for long read compatibility - # breaks genome_fasta_files mode, so need to remove it - for sequence in readset.unknown_sequences: - sequence.name = sequence.name.split('~')[0] - readset.remove_duplicate_sequences() - #### Extract diamond_taxonomy_assignment_performance_parameters from metapackage (v5 metapackages only) if diamond_taxonomy_assignment_performance_parameters == None: diamond_taxonomy_assignment_performance_parameters = metapackage_object.diamond_taxonomy_assignment_performance_parameters() @@ -519,12 +482,8 @@ def finish_sra_extraction_processes(sra_extraction_processes, sra_extraction_com known_taxes=known_taxes, output_jplace=output_jplace, assignment_singlem_db=assignment_singlem_db, - genome_fasta_files=genome_fasta_files ) - if genome_fasta_files: - otu_table_object = genome_fasta_mux.demux_otu_table(otu_table_object) - return_cleanly() return otu_table_object @@ -539,7 +498,6 @@ def assign_taxonomy_and_process(self, **kwargs): known_taxes = kwargs.pop('known_taxes') output_jplace = kwargs.pop('output_jplace') assignment_singlem_db = kwargs.pop('assignment_singlem_db') - genome_fasta_files = kwargs.pop('genome_fasta_files', False) if len(kwargs) > 0: raise Exception("Unexpected arguments detected: %s" % kwargs) @@ -577,8 +535,7 @@ def assign_taxonomy_and_process(self, **kwargs): known_sequence_tax if known_sequence_taxonomy else None, # outputs otu_table_object, - package_to_taxonomy_bihash, - genome_fasta_files) + package_to_taxonomy_bihash) return otu_table_object @@ -620,8 +577,7 @@ def _process_taxonomically_assigned_reads( known_sequence_tax, # outputs otu_table_object, - package_to_taxonomy_bihash, - genome_fasta_files): + package_to_taxonomy_bihash): # To deal with paired reads, process each. Then exclude second reads # from pairs where both match. diff --git a/singlem/supplement.py b/singlem/supplement.py index 664e8385..296d5d7c 100755 --- a/singlem/supplement.py +++ b/singlem/supplement.py @@ -127,6 +127,8 @@ def generate_taxonomy_for_new_genomes(**kwargs): # run gtdbtk to temporary output directory gtdbtk_output = os.path.join(working_directory, 'gtdbtk_output') logging.info("Running GTDBtk to generate taxonomy for new genomes ..") + if not shutil.which("gtdbtk"): + raise Exception("gtdbtk is not installed by default; install it (e.g. 'conda install gtdbtk').") # logging.warning("mash_db used is specific to QUT's CMR cluster, will fix this in future") cmd = f'gtdbtk classify_wf --cpus {threads} --batchfile {batchfile.name} --out_dir {gtdbtk_output} --mash_db {working_directory}/gtdbtk_mash.msh' if pplacer_threads: diff --git a/test/test_pipe.py b/test/test_pipe.py index 650e75f9..463bce66 100644 --- a/test/test_pipe.py +++ b/test/test_pipe.py @@ -24,6 +24,7 @@ import unittest import os.path import tempfile +import shutil import extern import sys import json @@ -1002,6 +1003,41 @@ def test_genome(self): list([line.split("\t") for line in expected]), extern.run(cmd)) + def test_genome_directory_input(self): + expected = [ + "\t".join(self.headers), + 'S1.13.ribosomal_protein_S15P_S13e\tuap2\tAAGGATTTGAGTGCAAAAAGAGGACTCGATTTTACAGAGGCAAAGATAAGAAAACTTGGA\t1\t1.00\tRoot; d__Archaea; p__Halobacterota; c__Methanomicrobia; o__Methanomicrobiales; f__Methanocullaceae; g__Methanoculleus', + '' + ] + with tempfile.TemporaryDirectory() as d: + shutil.copy(os.path.join(path_to_data, 'uap2.fna'), os.path.join(d, 'uap2.fa')) + cmd = '{} pipe --genome-fasta-directory {} --genome-fasta-extension fa --singlem-packages {}/S1.13.chainsaw.gpkg.spkg --otu-table /dev/stdout --assignment-method diamond'.format( + path_to_script, + d, + path_to_data, + ) + self.assertEqualOtuTable( + list([line.split("\t") for line in expected]), + extern.run(cmd)) + + def test_genome_fasta_list_input(self): + expected = [ + "\t".join(self.headers), + 'S1.13.ribosomal_protein_S15P_S13e\tuap2\tAAGGATTTGAGTGCAAAAAGAGGACTCGATTTTACAGAGGCAAAGATAAGAAAACTTGGA\t1\t1.00\tRoot; d__Archaea; p__Halobacterota; c__Methanomicrobia; o__Methanomicrobiales; f__Methanocullaceae; g__Methanoculleus', + '' + ] + with tempfile.NamedTemporaryFile(mode='w', suffix='.lst') as lst: + lst.write(os.path.join(path_to_data, 'uap2.fna')) + lst.flush() + cmd = '{} pipe --genome-fasta-list {} --singlem-packages {}/S1.13.chainsaw.gpkg.spkg --otu-table /dev/stdout --assignment-method diamond'.format( + path_to_script, + lst.name, + path_to_data, + ) + self.assertEqualOtuTable( + list([line.split("\t") for line in expected]), + extern.run(cmd)) + def test_prefilter_piped_in_input(self): with tempfile.NamedTemporaryFile(suffix='.mkfifo.fna') as fifo: with tempfile.NamedTemporaryFile(suffix='.mkfifo.fna') as script: @@ -1158,7 +1194,7 @@ def test_exclude_off_target_hits(self): def test_genome_input_dereplication(self): expected = 'gene sample sequence num_hits coverage taxonomy\n' \ - '4.12.22seqs GCA_000309865.1_genomic GATGGCGGTAAAGCCACTCCCGGCCCACCATTAGGTCCAGCAATCGGACCCCTAGGTATC 1 1.00 ' + '4.12.22seqs GCA_000309865.1_genomic CCGGCTTTTCAGATCGCACCGGATCCAACAGTTGCATTCACAGTTGGCTATTTAGGAGTG 1 1.00 ' # ~/git/singlem/bin/singlem pipe --genome-fasta-files genomes/GCA_000309865.1_genomic.fna --singlem-package ../4.12.22seqs.spkg/ --otu-table /dev/stdout --no-assign-taxonomy --min-orf-length 96 cmd = '{} pipe --translation-table 11 --genome-fasta-files {} --singlem-package {} --otu-table /dev/stdout --no-assign-taxonomy --min-orf-length 96'.format( path_to_script, diff --git a/test/test_summariser.py b/test/test_summariser.py index 7c63e956..7c463283 100644 --- a/test/test_summariser.py +++ b/test/test_summariser.py @@ -154,6 +154,11 @@ def test_krona(self): table_collection=table_collection) self.assertTrue(os.path.exists(os.path.join(tmp,'KronaOK.html'))) + def test_krona_with_taxonomic_profile_input(self): + with self.assertRaises(Exception) as cm: + extern.run(f'singlem summarise --input-taxonomic-profile {path_to_data}/summarise/profile1.tsv --krona /dev/null') + self.assertIn('--output-taxonomic-profile-krona', str(cm.exception)) + def test_wide_format(self): e = [['gene','sample','sequence','num_hits','coverage','taxonomy'], ['4.11.ribosomal_protein_L10','minimal','TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTA','2','4.88','Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Staphylococcaceae; g__Staphylococcus'],