From 1a4fa514e4b4555f6041e3655cbabe7d325f1f06 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Sat, 27 Apr 2019 10:00:26 -0700 Subject: [PATCH 01/21] Add click option --pipeline --- hundo/hundo.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hundo/hundo.py b/hundo/hundo.py index d9718e5..676dc2c 100644 --- a/hundo/hundo.py +++ b/hundo/hundo.py @@ -31,7 +31,13 @@ def cli(obj): https://github.com/pnnl/hundo/issues """ - +@click.option( + "--pipeline", + type=str, + default="OTU", + show_default=True, + help="cluster reads into OTUs or denoise reads into ASVs. This also changes other parts of the pipeline. default:OTU", +) @cli.command("lca", short_help="runs LCA across aligned hits") @click.argument("fasta", type=click.Path(exists=True)) @click.argument("aligned_hits", type=click.File("r")) From 4ba7af26fd79d268356d7f6862fe32d26ca24de3 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Sat, 27 Apr 2019 10:46:30 -0700 Subject: [PATCH 02/21] Add basic ASV pipeline --- hundo/Snakefile | 357 ++++++++++++++++++++++++++++++------------------ 1 file changed, 226 insertions(+), 131 deletions(-) diff --git a/hundo/Snakefile b/hundo/Snakefile index fe417aa..b1bdacc 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -186,7 +186,9 @@ def get_run_reference_chimera_filter_inputs(wildcards): db = REFFASTA else: db = config.get("reference_chimera_filter") - if config.get("denovo_chimera_filter", True): + if config.get("pipeline", "ASV"): + fasta = "zOTUs-uchime3.fasta" + elif config.get("denovo_chimera_filter", True): fasta = "chimera-filtered_denovo.fasta" elif float(config.get("read_identity_requirement", 0.97)) * 100 < 99: fasta = "preclustered.fasta" @@ -583,124 +585,197 @@ rule dereplicate_sequences: --threads {threads} -log {log} --uc {output.uc} """ +if config.get("pipeline") == "OTU": + rule precluster_sequences: + input: + "dereplicated-sequences.fasta" + output: + fa = temp("preclustered.fasta"), + uc = temp("preclustered.uclust") + params: + id = min((100 - float(config.get("percent_of_allowable_difference", 3))) / 100, 1.00) + conda: + CONDAENV + log: + "logs/precluster.log" + threads: + config.get("threads", 1) + group: + "combined_group" + shell: + """ + vsearch --threads {threads} --cluster_size {input} \ + --id {params.id} --strand plus --sizein --sizeout \ + --centroids {output.fa} --uc {output.uc} + """ -rule precluster_sequences: - input: - "dereplicated-sequences.fasta" - output: - fa = temp("preclustered.fasta"), - uc = temp("preclustered.uclust") - params: - id = min((100 - float(config.get("percent_of_allowable_difference", 3))) / 100, 1.00) - conda: - CONDAENV - log: - "logs/precluster.log" - threads: - config.get("threads", 1) - group: - "combined_group" - shell: - """ - vsearch --threads {threads} --cluster_size {input} \ - --id {params.id} --strand plus --sizein --sizeout \ - --centroids {output.fa} --uc {output.uc} - """ + rule run_denovo_chimera_filter: + input: + "preclustered.fasta" + output: + temp("chimera-filtered_denovo.fasta") + conda: + CONDAENV + log: + "logs/chimera-filtered_denovo.log" + threads: + config.get("threads", 1) + group: + "combined_group" + shell: + """ + vsearch --uchime_denovo {input} --nonchimeras {output} \ + --strand plus --sizein --sizeout --threads {threads} \ + --log {log} + """ -rule run_denovo_chimera_filter: - input: - "preclustered.fasta" - output: - temp("chimera-filtered_denovo.fasta") - conda: - CONDAENV - log: - "logs/chimera-filtered_denovo.log" - threads: - config.get("threads", 1) - group: - 
"combined_group" - shell: - """ - vsearch --uchime_denovo {input} --nonchimeras {output} \ - --strand plus --sizein --sizeout --threads {threads} \ - --log {log} - """ + rule run_reference_chimera_filter: + input: + unpack(get_run_reference_chimera_filter_inputs) + output: + temp("chimera-filtered_reference.fasta") + conda: + CONDAENV + log: + "logs/chimera-filtered_reference.log" + threads: + config.get("threads", 1) + group: + "combined_group" + shell: + """ + vsearch --uchime_ref {input.fasta} --nonchimeras {output} \ + --strand plus --sizein --sizeout --db {input.db} \ + --threads {threads} --log {log} + """ -rule run_reference_chimera_filter: - input: - unpack(get_run_reference_chimera_filter_inputs) - output: - temp("chimera-filtered_reference.fasta") - conda: - CONDAENV - log: - "logs/chimera-filtered_reference.log" - threads: - config.get("threads", 1) - group: - "combined_group" - shell: - """ - vsearch --uchime_ref {input.fasta} --nonchimeras {output} \ - --strand plus --sizein --sizeout --db {input.db} \ - --threads {threads} --log {log} - """ + rule extract_filtered_sequences: + # Extract all non-chimeric, non-singleton sequences, dereplicated + input: + filter_fa = "dereplicated-sequences.fasta", + uc = "preclustered.uclust", + keep_fa = "chimera-filtered_reference.fasta" + output: + temp("nonchimeric-nonsingleton.fasta") + group: + "combined_group" + run: + filter_fastas(input.filter_fa, input.uc, input.keep_fa, output[0]) -rule extract_filtered_sequences: - # Extract all non-chimeric, non-singleton sequences, dereplicated - input: - filter_fa = "dereplicated-sequences.fasta", - uc = "preclustered.uclust", - keep_fa = "chimera-filtered_reference.fasta" - output: - temp("nonchimeric-nonsingleton.fasta") - group: - "combined_group" - run: - filter_fastas(input.filter_fa, input.uc, input.keep_fa, output[0]) + rule pull_seqs_from_samples_with_filtered_sequences: + # Extract all non-chimeric, non-singleton sequences in each sample + input: + filter_fa = "all-sequences.fasta", + uc = "dereplicated.uclust", + keep_fa = "nonchimeric-nonsingleton.fasta" + output: + temp("all-sequences_filtered.fasta") + group: + "combined_group" + run: + filter_fastas(input.filter_fa, input.uc, input.keep_fa, output[0]) -rule pull_seqs_from_samples_with_filtered_sequences: - # Extract all non-chimeric, non-singleton sequences in each sample - input: - filter_fa = "all-sequences.fasta", - uc = "dereplicated.uclust", - keep_fa = "nonchimeric-nonsingleton.fasta" - output: - temp("all-sequences_filtered.fasta") - group: - "combined_group" - run: - filter_fastas(input.filter_fa, input.uc, input.keep_fa, output[0]) + rule cluster_sequences: + input: + # get_cluster_sequences_input + "all-sequences_filtered.fasta" + output: + fa = "OTU.fasta" + params: + minsize = config.get("minimum_sequence_abundance", 2), + otu_id_pct = (100 - float(config.get("percent_of_allowable_difference", 3))) / 100. 
+ conda: + CONDAENV + log: + "logs/cluster_sequences.log" + threads: + config.get("threads", 1) + group: + "combined_group" + shell: + """ + vsearch --cluster_size {input} --centroids {output.fa} \ + --relabel OTU_ --id {params.otu_id_pct} --log {log} \ + --sizein --strand plus --threads {threads} + """ +# Pipeline for building ASVs +else: + rule denoise_sequences: + input: + "dereplicated-sequences.fasta" + output: + fa = temp("zOTUs-all.fasta"), + conda: + CONDAENV + log: + "logs/cluster-unoise.log" + threads: + config.get("threads", 1) + group: + "combined_group" + shell: + """ + vsearch --threads {threads} --cluster_unoise {input} \ + --strand plus --sizein --sizeout \ + --centroids {output.fa} --log {log} + """ -rule cluster_sequences: - input: - # get_cluster_sequences_input - "all-sequences_filtered.fasta" - output: - fa = "OTU.fasta" - params: - minsize = config.get("minimum_sequence_abundance", 2), - otu_id_pct = (100 - float(config.get("percent_of_allowable_difference", 3))) / 100. - conda: - CONDAENV - log: - "logs/cluster_sequences.log" - threads: - config.get("threads", 1) - group: - "combined_group" - shell: - """ - vsearch --cluster_size {input} --centroids {output.fa} \ - --relabel OTU_ --id {params.otu_id_pct} --log {log} \ - --sizein --strand plus --threads {threads} - """ + + rule run_denovo_chimera_filter: + input: + "zOTUs-all.fasta" + output: + temp("zOTUs-uchime3.fasta") + conda: + CONDAENV + log: + "logs/chimera-filtered_uchime3_denovo.log" + group: + "combined_group" + shell: + """ + vsearch --uchime3_denovo {input} --nonchimeras {output} \ + --strand plus --sizein --sizeout --log {log} + """ + + + rule run_reference_chimera_filter: + input: + unpack(get_run_reference_chimera_filter_inputs) + output: + temp("zOTUs-uchime3-uchime-ref.fasta") + conda: + CONDAENV + log: + "logs/chimera-filtered_reference.log" + threads: + config.get("threads", 1) + group: + "combined_group" + shell: + """ + vsearch --uchime_ref {input.fasta} --nonchimeras {output} \ + --strand plus --sizein --sizeout --db {input.db} \ + --threads {threads} --log {log} + """ + + + rule rename_zOTUs_to_OTUs: + input: + "zOTUs-uchime3-uchime-ref.fasta" + output: + "OTU.fasta" + group: + "combined_group" + shell: + """ + cp {input} {output} + """ if config.get("aligner") == "blast": @@ -779,29 +854,49 @@ rule compute_lca: {output.tsv} """ - -rule compile_counts: - input: - seqs = "all-sequences.fasta", - db = "OTU_tax.fasta" - output: - tsv = "OTU.txt" - params: - id_req = config.get("read_identity_requirement", 0.80), - minqt = config.get("vsearch_minqt", 0.80) - conda: - CONDAENV - threads: - config.get("threads", 1) - group: - "combined_group" - shell: - """ - vsearch --usearch_global {input.seqs} \ - --db {input.db} --id {params.id_req} \ - --otutabout {output.tsv} --sizein --strand plus \ - --threads {threads} - """ +if config.get("pipeline") == "OTU": + rule compile_counts: + input: + seqs = "all-sequences.fasta", + db = "OTU_tax.fasta" + output: + tsv = "OTU.txt" + params: + id_req = config.get("read_identity_requirement", 0.80), + minqt = config.get("vsearch_minqt", 0.80) + conda: + CONDAENV + threads: + config.get("threads", 1) + group: + "combined_group" + shell: + """ + vsearch --usearch_global {input.seqs} \ + --db {input.db} --id {params.id_req} \ + --otutabout {output.tsv} --sizein --strand plus \ + --threads {threads} + """ +else: + rule compile_counts: + input: + seqs = "all-sequences.fasta", + db = "OTU_tax.fasta" + output: + tsv = "OTU.txt" + conda: + CONDAENV + threads: + 
config.get("threads", 1) + group: + "combined_group" + shell: + """ + vsearch --usearch_global {input.seqs} \ + --db {input.db} --id 1.0 \ + --otutabout {output.tsv} --sizein --strand plus \ + --threads {threads} + """ rule create_biom_from_txt: From 35cd4611f9c0785b189937b798f8d8029a60f907 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Sat, 27 Apr 2019 15:40:55 -0700 Subject: [PATCH 03/21] bump version --- hundo/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hundo/__init__.py b/hundo/__init__.py index b7e1990..64c7d51 100644 --- a/hundo/__init__.py +++ b/hundo/__init__.py @@ -1 +1 @@ -__version__ = "1.2.5" +__version__ = "1.2.6" From f807168460cd16ebc04b5a0ee264ef90ff4b9ea1 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Sat, 27 Apr 2019 16:22:47 -0700 Subject: [PATCH 04/21] bump minor version, not patch version --- hundo/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hundo/__init__.py b/hundo/__init__.py index 64c7d51..67bc602 100644 --- a/hundo/__init__.py +++ b/hundo/__init__.py @@ -1 +1 @@ -__version__ = "1.2.6" +__version__ = "1.3.0" From 3ea993cbed82221d34d5c6ce4ad027949913fcc8 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Sat, 27 Apr 2019 16:23:42 -0700 Subject: [PATCH 05/21] Add pipline more places --- hundo/Snakefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hundo/Snakefile b/hundo/Snakefile index b1bdacc..9629376 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -186,7 +186,7 @@ def get_run_reference_chimera_filter_inputs(wildcards): db = REFFASTA else: db = config.get("reference_chimera_filter") - if config.get("pipeline", "ASV"): + if (config.get("pipeline") == "ASV"): fasta = "zOTUs-uchime3.fasta" elif config.get("denovo_chimera_filter", True): fasta = "chimera-filtered_denovo.fasta" @@ -993,7 +993,8 @@ rule aggregate_run_parameters: blast_minimum_bitscore = config.get("blast_minimum_bitscore"), blast_top_fraction = config.get("blast_top_fraction"), read_identity_requirement = config.get("read_identity_requirement"), - aligner = config.get("aligner") + aligner = config.get("aligner"), + pipeline = config.get("pipeline") run: with open(output.txt, "w") as fh: print("fastq_dir:", params.fastq_dir, sep=" ", file=fh) @@ -1014,6 +1015,7 @@ rule aggregate_run_parameters: print("percent_of_allowable_difference:", params.percent_of_allowable_difference, sep=" ", file=fh) print("reference_database:", params.reference_database, sep=" ", file=fh) print("aligner:", params.aligner, sep=" ", file=fh) + print("pipeline:", params.pipeline, sep=" ", file=fh) print("blast_minimum_bitscore:", params.blast_minimum_bitscore, sep=" ", file=fh) print("blast_top_fraction:", params.blast_top_fraction, sep=" ", file=fh) print("read_identity_requirement:", params.read_identity_requirement, sep=" ", file=fh) From fabdd9a8283ce4cecb6e68f066140787a25549b3 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Sat, 27 Apr 2019 16:24:37 -0700 Subject: [PATCH 06/21] Correct order of pipeline and add it more places --- hundo/hundo.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/hundo/hundo.py b/hundo/hundo.py index 676dc2c..0a129b9 100644 --- a/hundo/hundo.py +++ b/hundo/hundo.py @@ -31,13 +31,6 @@ def cli(obj): https://github.com/pnnl/hundo/issues """ -@click.option( - "--pipeline", - type=str, - default="OTU", - show_default=True, - help="cluster reads into OTUs or denoise reads into ASVs. This also changes other parts of the pipeline. 
default:OTU", -) @cli.command("lca", short_help="runs LCA across aligned hits") @click.argument("fasta", type=click.Path(exists=True)) @click.argument("aligned_hits", type=click.File("r")) @@ -351,6 +344,13 @@ def run_download(database_dir, jobs, reference_database, dryrun, snakemake_args) show_default=True, help="local aligner; `blast` is more sensitive while `vsearch` is much faster", ) +@click.option( + "--pipeline", + type=click.Choice(choices=["OTU", "ASV"]), + default="OTU", + show_default=True, + help="cluster to make OTUs or denoise to make ASVs. This also changes other parts of the pipeline", +) @click.option( "-t", "--threads", @@ -526,6 +526,7 @@ def run_annotate( dryrun, author, aligner, + pipeline, threads, database_dir, filter_adapters, @@ -609,6 +610,7 @@ def run_annotate( "read_identity_requirement={read_identity_requirement} " "prefilter_file_size={prefilter_file_size} " "aligner={aligner} " + "pipeline={pipeline} " "no_temp_declared={no_temp_declared} {add_args} " "{args}" ).format( @@ -642,6 +644,7 @@ def run_annotate( read_identity_requirement=read_identity_requirement, prefilter_file_size=prefilter_file_size, aligner=aligner, + pipeline=pipeline, no_temp_declared=no_temp_declared, add_args="" if snakemake_args and snakemake_args[0].startswith("-") else "--", args=" ".join(snakemake_args), From 93eb5ab8f2febf76cb2cdad34a8c4ccd152a4eb0 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Sat, 27 Apr 2019 16:30:19 -0700 Subject: [PATCH 07/21] Bump vsearch version to support --unoise --- hundo/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hundo/environment.yml b/hundo/environment.yml index d1294b2..0925c83 100644 --- a/hundo/environment.yml +++ b/hundo/environment.yml @@ -18,7 +18,7 @@ dependencies: - pandas=0.24.2 - pigz=2.3.4 - pyyaml>=3.12 - - vsearch=2.6.0 + - vsearch=2.13.0 - zip=3.0 - pip - pip: From 4db51299cc79761eddcb2f1bee71346dad8ec420 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Sat, 27 Apr 2019 16:48:39 -0700 Subject: [PATCH 08/21] Bump to more flexible version of vsearch and update Snakefile --- hundo/Snakefile | 6 +++--- hundo/environment.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hundo/Snakefile b/hundo/Snakefile index 9629376..9badf67 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -647,7 +647,7 @@ if config.get("pipeline") == "OTU": shell: """ vsearch --uchime_ref {input.fasta} --nonchimeras {output} \ - --strand plus --sizein --sizeout --db {input.db} \ + --sizein --sizeout --db {input.db} \ --threads {threads} --log {log} """ @@ -740,7 +740,7 @@ else: shell: """ vsearch --uchime3_denovo {input} --nonchimeras {output} \ - --strand plus --sizein --sizeout --log {log} + --sizein --sizeout --log {log} """ @@ -760,7 +760,7 @@ else: shell: """ vsearch --uchime_ref {input.fasta} --nonchimeras {output} \ - --strand plus --sizein --sizeout --db {input.db} \ + --sizein --sizeout --db {input.db} \ --threads {threads} --log {log} """ diff --git a/hundo/environment.yml b/hundo/environment.yml index 0925c83..c2d5c66 100644 --- a/hundo/environment.yml +++ b/hundo/environment.yml @@ -18,7 +18,7 @@ dependencies: - pandas=0.24.2 - pigz=2.3.4 - pyyaml>=3.12 - - vsearch=2.13.0 + - vsearch=2.13.1 - zip=3.0 - pip - pip: From ac52599e0cf7eeb7b8728b1ef0c4de2f92cba431 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Sat, 27 Apr 2019 17:24:21 -0700 Subject: [PATCH 09/21] Remove extra --strand argument --- hundo/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/hundo/Snakefile b/hundo/Snakefile index 9badf67..1b3f92e 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -626,7 +626,7 @@ if config.get("pipeline") == "OTU": shell: """ vsearch --uchime_denovo {input} --nonchimeras {output} \ - --strand plus --sizein --sizeout --threads {threads} \ + --sizein --sizeout --threads {threads} \ --log {log} """ From 0a81c9f90f5a3b328836cdc1b85efe8d05811588 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Mon, 29 Apr 2019 12:20:32 -0700 Subject: [PATCH 10/21] Use --relabel_md5 when building ASVs --- hundo/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hundo/Snakefile b/hundo/Snakefile index 1b3f92e..c0de8e0 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -721,7 +721,7 @@ else: shell: """ vsearch --threads {threads} --cluster_unoise {input} \ - --strand plus --sizein --sizeout \ + --strand plus --sizein --sizeout --relabel_md5 \ --centroids {output.fa} --log {log} """ From 82ce59a449e4d3b0723f2b14185d80d43ac1a747 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Tue, 28 May 2019 14:52:07 -0700 Subject: [PATCH 11/21] compile_counts using .99 instead of 1.0 --- hundo/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hundo/Snakefile b/hundo/Snakefile index c0de8e0..ac45764 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -893,7 +893,7 @@ else: shell: """ vsearch --usearch_global {input.seqs} \ - --db {input.db} --id 1.0 \ + --db {input.db} --id 0.99 \ --otutabout {output.tsv} --sizein --strand plus \ --threads {threads} """ From 9af03f36bfb2acd076592762799ee97ccb7dcfdd Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Tue, 28 May 2019 15:31:02 -0700 Subject: [PATCH 12/21] use ASV or OTU in full Snakefile' --- hundo/Snakefile | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/hundo/Snakefile b/hundo/Snakefile index ac45764..289bd70 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -769,7 +769,7 @@ else: input: "zOTUs-uchime3-uchime-ref.fasta" output: - "OTU.fasta" + "ASV.fasta" group: "combined_group" shell: @@ -781,7 +781,7 @@ else: if config.get("aligner") == "blast": rule run_aligner: input: - fasta = "OTU.fasta", + fasta = "%s.fasta" % config.get("pipeline"), # force download db = BLASTREF, db_fa = REFFASTA @@ -807,7 +807,7 @@ if config.get("aligner") == "blast": else: rule run_aligner: input: - fasta = "OTU.fasta", + fasta = "%s.fasta" % config.get("pipeline"), db = BLASTREF, db_fa = REFFASTA output: @@ -832,13 +832,13 @@ else: rule compute_lca: input: - fasta = "OTU.fasta", + fasta = "%s.fasta" % config.get("pipeline"), hits = "local-hits.txt", map = REFMAP, tre = REFTRE output: - fasta = "OTU_tax.fasta", - tsv = temp("OTU_assignments.tsv") + fasta = "%s_tax.fasta" % config.get("pipeline"), + tsv = temp("tax_assignments.tsv") params: min_score = config.get("blast_minimum_bitscore", 125), min_pid = config.get("min_pid", .85), @@ -881,9 +881,9 @@ else: rule compile_counts: input: seqs = "all-sequences.fasta", - db = "OTU_tax.fasta" + db = "ASV_tax.fasta" output: - tsv = "OTU.txt" + tsv = "ASV.txt" conda: CONDAENV threads: @@ -901,9 +901,9 @@ else: rule create_biom_from_txt: input: - "OTU.txt" + txt = "%s.txt" % config.get("pipeline") output: - "OTU.biom" + biom = "%s.biom" % config.get("pipeline") shadow: "shallow" conda: @@ -912,17 +912,17 @@ rule create_biom_from_txt: "combined_group" shell: ''' - sed 's|\"||g' {input} | sed 's|\,|\;|g' > OTU_converted.txt - biom convert -i OTU_converted.txt -o {output} --to-json \ + 
sed 's|\"||g' {input.txt} | sed 's|\,|\;|g' > featured_converted.txt + biom convert -i featured_converted.txt -o {output.biom} --to-json \ --process-obs-metadata sc_separated --table-type "OTU table" ''' rule multiple_align: input: - "OTU.fasta" + fasta = "%s.fasta" % config.get("pipeline") output: - "OTU_aligned.fasta" + msa = "%s_aligned.fasta" % config.get("pipeline") threads: # mafft (via conda) is single-threaded on macOS config.get("threads", 1) @@ -933,15 +933,15 @@ rule multiple_align: shell: """ mafft --auto --thread {threads} --quiet --maxiterate 1000 \ - --globalpair {input} > {output} + --globalpair {input.fasta} > {output.msa} """ rule newick_tree: input: - "OTU_aligned.fasta" + msa = "%s_aligned.fasta" % config.get("pipeline") output: - "OTU.tree" + tre = "%s.tree" % config.get("pipeline") threads: config.get("threads", 1) conda: @@ -952,13 +952,13 @@ rule newick_tree: "combined_group" shell: """ - export OMP_NUM_THREADS={threads} && FastTreeMP -nt -gamma -spr 4 -log {log} -quiet {input} > {output} || fasttree -nt -gamma -spr 4 -log {log} -quiet {input} > {output} + export OMP_NUM_THREADS={threads} && FastTreeMP -nt -gamma -spr 4 -log {log} -quiet {input.msa} > {output.tre} || fasttree -nt -gamma -spr 4 -log {log} -quiet {input.msa} > {output.tre} """ rule make_deliverables_archive: input: - "OTU.biom", "OTU.fasta", "OTU.tree", "OTU.txt" + "OTU.biom", "OTU.fasta", "OTU.tree", "OTU.txt", "ASV.biom", "ASV.fasta", "ASV.tree", "ASV.txt" output: "hundo_results.zip" conda: @@ -1028,8 +1028,9 @@ rule build_report: "scripts", "build_report.py" ), - biom = "OTU.biom", - txt = "OTU.txt", + biom = "%s.biom" % config.get("pipeline"), + txt = "%s.txt" % config.get("pipeline"), + pipeline = config.get("pipeline"), archive = "hundo_results.zip", params = "PARAMS.txt", env = CONDAENV, @@ -1067,6 +1068,7 @@ rule build_report: shell: """ python {input.report_script} --biom {input.biom} --txt {input.txt} \ + --pipeline {input.pipeline} \ --zip {input.archive} --env {input.env} --params {input.params} \ --eeplot 'logs/*_merged_filtered_eestats.txt' \ --r1quals 'logs/*_R1_eestats.txt' \ From 733d55d6aa8cc6d2ed0ba17ef5b9c60c9019df2e Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Tue, 28 May 2019 15:37:53 -0700 Subject: [PATCH 13/21] duplicate report for ASVs --- hundo/scripts/build_report_ASV.py | 630 ++++++++++++++++++++++++++++++ 1 file changed, 630 insertions(+) create mode 100644 hundo/scripts/build_report_ASV.py diff --git a/hundo/scripts/build_report_ASV.py b/hundo/scripts/build_report_ASV.py new file mode 100644 index 0000000..f0a1914 --- /dev/null +++ b/hundo/scripts/build_report_ASV.py @@ -0,0 +1,630 @@ +#!/usr/bin/env python +# coding=utf-8 + +import argparse +import base64 +import csv +import datetime +import io +import mimetypes +import os +import textwrap +from collections import defaultdict +from glob import glob + +import numpy as np +import pandas as pd +import plotly.figure_factory as ff +import plotly.graph_objs as go +from biom import parse_table +from docutils.core import publish_file +from plotly import offline + +import relatively + +STYLE = """ + + + + """ +SCRIPT = """ + + + + + + """ +BLUE = "rgba(31,119,180,0.3)" +RED = "rgba(214,39,40,0.3)" + + +def build_uri(file, file_type=None, default_encoding="utf8"): + fname = os.path.basename(file) + type, encoding = mimetypes.guess_type(file) + if file_type is None: + file_type = type + if encoding is not None: + default_encoding = encoding + with open(file, mode="rb") as fh: + fdata = base64.b64encode(fh.read()) + return 
f'data:{file_type};charset={default_encoding};filename={fname};base64,{fdata.decode("utf-8")}' + + +def parse_biom(biom): + with open(biom) as fh: + bt = parse_table(fh) + df = bt.to_dataframe() + otu_df = [ + ["Samples", "OTUs", "OTU Total Count", "OTU Table Density"], + [ + bt.length("sample"), + bt.length("observation"), + bt.sum(), + "{:f}".format(bt.get_table_density()), + ], + ] + otu_df = pd.DataFrame(otu_df) + # make the first row the column labels + otu_df.rename(columns=otu_df.iloc[0], inplace=True) + # drop the first row + otu_df = otu_df.reindex(otu_df.index.drop(0)) + # build the table + summary_table = otu_df.to_html( + index=False, + bold_rows=False, + classes=["table", "table-bordered"], + table_id="otuTable", + ) + # fix for rst + summary_table = summary_table.replace("\n", "\n" + 10 * " ") + return df, summary_table + + +def make_div(figure_or_data, include_plotlyjs=False, show_link=False, div_id=None): + div = offline.plot( + figure_or_data, + include_plotlyjs=include_plotlyjs, + show_link=show_link, + output_type="div", + ) + if ".then(function ()" in div: + div = f"""{div.partition(".then(function ()")[0]}""" + if div_id: + import re + + try: + existing_id = re.findall(r'id="(.*?)"|$', div)[0] + div = div.replace(existing_id, div_id) + except IndexError: + pass + return div.replace("\n", " ") + + +def build_summary_table(raw, biom_df, div_df, omitted=None): + # sample summary table + header = ["Sample", "Raw", "Filtered", "Merged", "In OTUs"] + count_data = [] + omit = [] + if omitted: + for sample_count in omitted.split(","): + sample, count = sample_count.split(":") + omit.append(sample) + count_data.append([sample, count]) + for count_file in raw: + sample_name = count_file.partition("logs/raw_counts/")[-1].partition( + "_R1.count" + )[0] + sd = [sample_name] + with open(count_file) as fh: + for line in fh: + sd.append(int(float(line.strip()))) + with open(count_file.replace("raw_counts/", "filtered_counts/")) as fh: + for line in fh: + sd.append(int(float(line.strip()))) + with open(count_file.replace("raw_counts/", "merged_counts/")) as fh: + for line in fh: + sd.append(int(float(line.strip()))) + sd.append(int(biom_df[sample_name].sum())) + count_data.append(sd) + df = pd.DataFrame(count_data, columns=header) + df = df.merge(div_df, how="outer", left_on="Sample", right_index=True) + df.sort_values(by="shannon", inplace=True) + # summary table + df_str = df.to_html( + index=False, + bold_rows=False, + classes=["table", "table-bordered"], + table_id="summaryTable", + ) + df_str = df_str.replace("\n", "\n" + 10 * " ") + df = df.reset_index() + # pull out the omitted samples + keep = [idx for idx, sample in df.Sample.items() if sample not in omit] + df = df.iloc[keep] + df.sort_values(by="Raw", inplace=True) + plot_data = [] + visible = [True] * 4 + if len(df.Sample) > 10: + visible = [True] + ["legendonly"] * 3 + for i, metric in enumerate(header[1:]): + plot_data.append( + go.Bar(x=df.Sample, y=df[metric], name=metric, visible=visible[i]) + ) + layout = dict( + title="Counts Per Sample By Stage", + barmode="group", + height=500, + yaxis={"title": "Count"}, + showlegend=True, + hovermode="x", + bargroupgap=0.1, + legend={"orientation": "h", "x": 0, "y": 1.06}, + ) + fig = go.Figure(data=plot_data, layout=layout) + return df_str, make_div(fig, div_id="countSummaryPlot") + + +def build_quality_plot(r1_quals): + # quality plot + raw_qual_stats = defaultdict(lambda: defaultdict(list)) + for r1_ee_file in r1_quals: + sample_name = 
r1_ee_file.partition("logs/")[-1].partition("_R1")[0] + r2_ee_file = "_R2".join(r1_ee_file.rsplit("_R1", 1)) + with open(r1_ee_file) as fh: + reader = csv.DictReader(fh, delimiter="\t") + for row in reader: + raw_qual_stats["R1"][sample_name].append(float(row["Mean_Q"])) + with open(r2_ee_file) as fh: + reader = csv.DictReader(fh, delimiter="\t") + for row in reader: + q = float(row["Mean_Q"]) + raw_qual_stats["R2"][sample_name].append(q) + data = [] + for read_index, sample_data in raw_qual_stats.items(): + color = BLUE if read_index == "R1" else RED + for sample_name, sample_stats in sample_data.items(): + data.append( + go.Scatter( + x=list(range(1, len(sample_stats))), + y=sample_stats, + name=sample_name, + text=sample_name, + hoverinfo="text+x+y", + legendgroup=read_index, + mode="lines", + line=dict(color=color, dash="solid"), + ) + ) + layout = go.Layout( + title="Mean Quality Scores for R1 and R2", + xaxis={"title": "Position"}, + yaxis={"title": "Quality (Phred score)"}, + hovermode="closest", + showlegend=False, + autosize=True, + annotations=[ + dict( + x=0, + y=1.1, + xref="paper", + yref="paper", + text="Forward", + showarrow=False, + font=dict(size=16, color="#ffffff"), + align="left", + borderpad=4, + bgcolor="#1f77b4", + ), + dict( + x=0.15, + y=1.1, + xref="paper", + yref="paper", + text="Reverse", + showarrow=False, + font=dict(size=16, color="#ffffff"), + align="left", + borderpad=4, + bgcolor="#d62728", + ), + ], + ) + fig = go.Figure(data=data, layout=layout) + return make_div(fig, div_id="qualityScorePlot") + + +def build_taxonomy_plot(txt, value_cols, height=900): + df = pd.read_csv(txt, sep="\t") + levels = ["kingdom", "phylum", "class", "order", "family", "genus", "species"] + df[levels] = df["taxonomy"].str.split(",", expand=True) + # remove "k__", etc., from taxonomies + for level in levels: + df[level] = df[level].str.replace(f"{level[0]}__", "") + hierarchy = ["phylum", "class", "order"] + df[hierarchy] = df[hierarchy].fillna("NA") + df[value_cols] = df[value_cols].fillna(0) + df = df[hierarchy + value_cols] + dfs = relatively.get_dfs_across_hierarchy( + df, hierarchy, value_cols, reorder=None, dependent="; " + ) + fig = relatively.get_abundance_figure_from_dfs( + dfs, hierarchy, "Assigned Taxonomy Per Sample", height=height + ) + return make_div(fig, div_id="taxonomyPlot") + + +def build_ee_plot(eeplot): + # merged reads quality plot + ee_stats = defaultdict(list) + for stats_file in eeplot: + sample_name = stats_file.partition("logs/")[-1].partition("_merged")[0] + with open(stats_file) as fh: + reader = csv.DictReader(fh, delimiter="\t") + for row in reader: + ee_stats[sample_name].append(row["Mean_EE"]) + data = [] + for sample_name, sample_stats in ee_stats.items(): + data.append( + go.Scatter( + x=list(range(1, len(sample_stats))), + y=sample_stats, + name=sample_name, + text=sample_name, + hoverinfo="text+x+y", + mode="lines", + line=dict(color=BLUE, dash="solid"), + ) + ) + layout = go.Layout( + title="Mean Expected Error Rate Across Merged Reads", + yaxis={"title": "Expected Error"}, + xaxis={"title": "Base Position"}, + hovermode="closest", + showlegend=False, + ) + fig = go.Figure(data=data, layout=layout) + return make_div(fig, div_id="eePlot") + + +def format_from_file(txt): + ret = "" + with open(txt) as fh: + for i, line in enumerate(fh): + if i == 0: + ret += line + else: + ret += " " * 4 + line + return ret + + +def get_metadata(author=None, version=None, sep=" // "): + md = [datetime.date.today().isoformat()] + if version: + md.insert(0, 
version) + if author: + md.insert(0, author) + return sep.join(md) + + +def build_attachment(zip): + data = build_uri(zip) + name = os.path.basename(zip) + return f"""{name}""" + + +def main( + biom, + txt, + zip, + env, + params, + eeplot, + r1quals, + raw, + omitted, + html, + author, + version, + samples, +): + # resolve the file patterns + eeplot = glob(eeplot, recursive=False) + r1quals = glob(r1quals, recursive=False) + raw = glob(raw, recursive=False) + + biom_df, otu_summary_table = parse_biom(biom) + biom_df = biom_df.to_dense() + div_idx = relatively.calculate_diversity( + biom_df, + biom_df.index, + biom_df.columns, + index=["shannon", "simpson", "invsimpson"], + ) + sample_summary_table, count_summary_plot = build_summary_table( + raw, biom_df, div_idx, omitted + ) + sample_order = div_idx.sort_values(by="shannon").index.tolist() + quality_plot = build_quality_plot(r1quals) + taxonomy_plot = build_taxonomy_plot(txt, sample_order, 800) + ee_plot = build_ee_plot(eeplot) + params_str = format_from_file(params) + conda_str = format_from_file(env) + metadata = get_metadata(author, version) + attachment = build_attachment(zip) + samples_attachment = build_attachment(samples) + report_str = f""" + +.. raw:: html + + {STYLE} + {SCRIPT} + +============================================================= +hundo_ - Experiment Summary +============================================================= + +.. contents:: + :backlinks: none + :depth: 2 + +Summary +------- + +Sequence Counts +*************** + +.. raw:: html + +
+ {sample_summary_table} +
+ {count_summary_plot} + +Sequence Quality +**************** + +.. raw:: html + + {quality_plot} + {ee_plot} + +OTUs +**** + +.. raw:: html + +
+ {otu_summary_table} +
+ +Taxonomy +******** + +Samples are ordered from least diverse to most. + +.. raw:: html + + {taxonomy_plot} + +Methods +------- + +``hundo`` wraps a number of functions and its exact steps are +dependent upon a given user's configuration. + +Paired-end sequence reads are quality trimmed and can be trimmed of +adapters and filtered of contaminant sequences using BBDuk2 of the +BBTools_ package. Passing reads are merged using +VSEARCH_ then aggregated into a single FASTA file with headers +describing the origin and count of the sequence. + +Prior to creating clusters, reads are filtered again based on expected +error rates: + +To create clusters, the aggregated, merged reads are dereplicated to +remove singletons by default using VSEARCH. Sequences are preclustered +into centroids using VSEARCH to accelerate chimera filtering. Chimera +filtering is completed in two steps: *de novo* and then reference +based. The reference by default is the entire annotation database. +Following chimera filtering, sequences are placed into clusters using +distance-based, greedy cluster with VSEARCH based on the allowable +percent difference of the configuration. + +After OTU sequences have been determined, BLAST_ or VSEARCH is used to align +sequences to the reference database. Reference databases for 16S were curated +by the CREST_ team and hundo incorporates the CREST LCA method. ITS databases +are maintained by UNITE_. + +Counts are assigned to OTUs using the global alignment method of +VSEARCH, which outputs the final OTU table as a tab-delimited text +file. The Biom_ command line tool is used to convert the tab table +to biom. + +Multiple alignment of sequences is completed using MAFFT_. +A tree based on the aligned sequences is built using FastTree2_. + +This workflow is built using Snakemake_ and makes use of Bioconda_ +to install its dependencies. + +.. _BBTools: https://sourceforge.net/projects/bbmap/ +.. _VSEARCH: https://github.com/torognes/vsearch +.. _BLAST: https://blast.ncbi.nlm.nih.gov/Blast.cgi +.. _CREST: https://github.com/lanzen/CREST +.. _UNITE: https://unite.ut.ee/ +.. _Biom: http://biom-format.org/ +.. _MAFFT: https://mafft.cbrc.jp/alignment/software/ +.. _FastTree2: http://www.microbesonline.org/fasttree/ +.. _Snakemake: https://snakemake.readthedocs.io/en/stable/ +.. _Bioconda: https://bioconda.github.io/ + +Configuration +------------- + +:: + + {params_str} + + +Execution Environment +--------------------- + +:: + + {conda_str} + + +Output Files +------------ + +Not all files written by the workflow are contained within the +Downloads section of this page to minimize the size of the this +document. Other output files are described and are written to the +results directory. + + +Attached +******** + +The zip archive contains the following files: + +OTU.biom +```````` + +Biom table with raw counts per sample and their associated +taxonomic assignment formatted to be compatible with downstream tools +like phyloseq_. + +OTU.fasta +````````` + +Representative DNA sequences of each OTU. + +OTU.tree +```````` + +Newick tree representation of aligned OTU sequences. + +OTU.txt +``````` + +Tab-delimited text table with columns OTU ID, a column for each sample, +and taxonomy assignment in the final column as a comma delimited list. + + +Other Result Files +****************** + +Other files that may be needed, but that are not included in the +attached archive include: + +OTU_aligned.fasta +````````````````` + +OTU sequences after alignment using MAFFT. 
+ +all-sequences.fasta +``````````````````` + +Quality-controlled, dereplicated DNA sequences of all samples. The +header of each record identifies the sample of origin and the count +resulting from dereplication. + +blast-hits.txt +`````````````` + +The BLAST assignments per OTU sequence. + +Downloads +--------- + +.. _hundo: https://github.com/pnnl/hundo +.. _phyloseq: https://joey711.github.io/phyloseq/ + + +.. container:: + :name: attachments + + + .. container:: + :name: archive + + archive: + .. raw:: html + + {attachment} + + .. container:: + :name: samples + + samples: + .. raw:: html + + {samples_attachment} + + +.. container:: + :name: metadata + + {metadata} + +""" + with open(html, "w") as fh: + publish_file( + source=io.StringIO(report_str), + destination=fh, + writer_name="html", + settings_overrides={"stylesheet_path": ""}, + ) + + +if __name__ == "__main__": + p = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + p.add_argument("--biom") + p.add_argument("--txt") + p.add_argument("--zip") + p.add_argument("--env") + p.add_argument("--params") + p.add_argument("--eeplot") + p.add_argument("--r1quals") + p.add_argument("--raw") + p.add_argument("--omitted") + p.add_argument("--html") + p.add_argument("--author") + p.add_argument("--version") + p.add_argument("--samples") + args = p.parse_args() + main( + args.biom, + args.txt, + args.zip, + args.env, + args.params, + args.eeplot, + args.r1quals, + args.raw, + args.omitted, + args.html, + args.author, + args.version, + args.samples, + ) From 895c307aeb67dbeabb905ca3b26809397599671b Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Tue, 28 May 2019 15:54:10 -0700 Subject: [PATCH 14/21] update new report to discribe ASVs --- hundo/scripts/build_report_ASV.py | 51 +++++++++++++++---------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/hundo/scripts/build_report_ASV.py b/hundo/scripts/build_report_ASV.py index f0a1914..b1d549a 100644 --- a/hundo/scripts/build_report_ASV.py +++ b/hundo/scripts/build_report_ASV.py @@ -73,7 +73,7 @@ def parse_biom(biom): bt = parse_table(fh) df = bt.to_dataframe() otu_df = [ - ["Samples", "OTUs", "OTU Total Count", "OTU Table Density"], + ["Samples", "ASVs", "ASV Total Count", "ASV Table Density"], [ bt.length("sample"), bt.length("observation"), @@ -120,7 +120,7 @@ def make_div(figure_or_data, include_plotlyjs=False, show_link=False, div_id=Non def build_summary_table(raw, biom_df, div_df, omitted=None): # sample summary table - header = ["Sample", "Raw", "Filtered", "Merged", "In OTUs"] + header = ["Sample", "Raw", "Filtered", "Merged", "In ASVs"] count_data = [] omit = [] if omitted: @@ -406,7 +406,7 @@ def main( {quality_plot} {ee_plot} -OTUs +ASVs **** .. raw:: html @@ -428,7 +428,8 @@ def main( ------- ``hundo`` wraps a number of functions and its exact steps are -dependent upon a given user's configuration. +dependent upon a given user's configuration. In this case, the +ASV pipeline was used. Paired-end sequence reads are quality trimmed and can be trimmed of adapters and filtered of contaminant sequences using BBDuk2 of the @@ -437,26 +438,22 @@ def main( describing the origin and count of the sequence. Prior to creating clusters, reads are filtered again based on expected -error rates: +error rates. -To create clusters, the aggregated, merged reads are dereplicated to -remove singletons by default using VSEARCH. Sequences are preclustered -into centroids using VSEARCH to accelerate chimera filtering. 
Chimera -filtering is completed in two steps: *de novo* and then reference +In order to create ASVs, the aggregated, merged reads are dereplicated +and denoised using cluster_unoise as implemented in VSEARCH_. Chimera +filtering is completed in two steps: uchime3 *de novo* and then reference based. The reference by default is the entire annotation database. -Following chimera filtering, sequences are placed into clusters using -distance-based, greedy cluster with VSEARCH based on the allowable -percent difference of the configuration. -After OTU sequences have been determined, BLAST_ or VSEARCH is used to align +After ASV sequences have been determined, BLAST_ or VSEARCH is used to align sequences to the reference database. Reference databases for 16S were curated by the CREST_ team and hundo incorporates the CREST LCA method. ITS databases are maintained by UNITE_. -Counts are assigned to OTUs using the global alignment method of -VSEARCH, which outputs the final OTU table as a tab-delimited text -file. The Biom_ command line tool is used to convert the tab table -to biom. +Counts are assigned to ASVs using the global alignment method of +VSEARCH at 99% identity, which outputs the final ASV table as a +tab-delimited text file. The Biom_ command line tool is used to convert the +tab table to biom. Multiple alignment of sequences is completed using MAFFT_. A tree based on the aligned sequences is built using FastTree2_. @@ -505,27 +502,27 @@ def main( The zip archive contains the following files: -OTU.biom +ASV.biom ```````` Biom table with raw counts per sample and their associated taxonomic assignment formatted to be compatible with downstream tools like phyloseq_. -OTU.fasta +ASV.fasta ````````` -Representative DNA sequences of each OTU. +Representative DNA sequences of each ASV. -OTU.tree +ASV.tree ```````` -Newick tree representation of aligned OTU sequences. +Newick tree representation of aligned ASV sequences. -OTU.txt +ASV.txt ``````` -Tab-delimited text table with columns OTU ID, a column for each sample, +Tab-delimited text table with columns ASV ID, a column for each sample, and taxonomy assignment in the final column as a comma delimited list. @@ -535,10 +532,10 @@ def main( Other files that may be needed, but that are not included in the attached archive include: -OTU_aligned.fasta +ASV_aligned.fasta ````````````````` -OTU sequences after alignment using MAFFT. +ASV sequences after alignment using MAFFT. all-sequences.fasta ``````````````````` @@ -550,7 +547,7 @@ def main( blast-hits.txt `````````````` -The BLAST assignments per OTU sequence. +The BLAST assignments per ASV sequence. 
Downloads --------- From e00131a922d0fd5e015815e51dd6572e3e280340 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Tue, 28 May 2019 15:57:32 -0700 Subject: [PATCH 15/21] switch report based on config.get("pipeline") --- hundo/Snakefile | 2 +- hundo/scripts/{build_report.py => build_report_OTU.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename hundo/scripts/{build_report.py => build_report_OTU.py} (100%) diff --git a/hundo/Snakefile b/hundo/Snakefile index 289bd70..d01f604 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -1026,7 +1026,7 @@ rule build_report: report_script = os.path.join( os.path.dirname(os.path.abspath(workflow.snakefile)), "scripts", - "build_report.py" + "build_report_%s.py" % config.get("pipeline") ), biom = "%s.biom" % config.get("pipeline"), txt = "%s.txt" % config.get("pipeline"), diff --git a/hundo/scripts/build_report.py b/hundo/scripts/build_report_OTU.py similarity index 100% rename from hundo/scripts/build_report.py rename to hundo/scripts/build_report_OTU.py From 0c717f511471a75c8c474ec325077222153e1ef8 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Tue, 28 May 2019 16:07:40 -0700 Subject: [PATCH 16/21] strip size from ASV headers --- hundo/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hundo/Snakefile b/hundo/Snakefile index d01f604..d22316c 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -760,7 +760,7 @@ else: shell: """ vsearch --uchime_ref {input.fasta} --nonchimeras {output} \ - --sizein --sizeout --db {input.db} \ + --xsize --db {input.db} \ --threads {threads} --log {log} """ From 47423f5c36b31b82b059ff93d84a920bedc39b2c Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Tue, 28 May 2019 16:17:14 -0700 Subject: [PATCH 17/21] bump vsearch to include gzip support --- hundo/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hundo/environment.yml b/hundo/environment.yml index c2d5c66..b357281 100644 --- a/hundo/environment.yml +++ b/hundo/environment.yml @@ -18,7 +18,7 @@ dependencies: - pandas=0.24.2 - pigz=2.3.4 - pyyaml>=3.12 - - vsearch=2.13.1 + - vsearch=2.13.4 - zip=3.0 - pip - pip: From 5d6e2120fcb2a64ffb54d46c9537b36dcf47a3d3 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Tue, 28 May 2019 16:47:34 -0700 Subject: [PATCH 18/21] troubleshoot results.zip and build_report --- hundo/Snakefile | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/hundo/Snakefile b/hundo/Snakefile index d22316c..5772c5a 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -958,7 +958,10 @@ rule newick_tree: rule make_deliverables_archive: input: - "OTU.biom", "OTU.fasta", "OTU.tree", "OTU.txt", "ASV.biom", "ASV.fasta", "ASV.tree", "ASV.txt" + biom = "%s.biom" % config.get("pipeline"), + fasta = "%s.fasta" % config.get("pipeline"), + tree = "%s.tree" % config.get("pipeline"), + txt = "%s.txt" % config.get("pipeline") output: "hundo_results.zip" conda: @@ -966,7 +969,7 @@ rule make_deliverables_archive: group: "combined_group" shell: - "zip {output} {input}" + "zip {output} {input.biom} {input.fasta} {input.tree} {input.txt} " rule aggregate_run_parameters: @@ -1026,11 +1029,13 @@ rule build_report: report_script = os.path.join( os.path.dirname(os.path.abspath(workflow.snakefile)), "scripts", - "build_report_%s.py" % config.get("pipeline") - ), + "build_report_OTU.py" + ) if config.get("pipeline") == "OTU" else os.path.join( + os.path.dirname(os.path.abspath(workflow.snakefile)), + "scripts", + "build_report_ASV.py"), biom = "%s.biom" % 
config.get("pipeline"), txt = "%s.txt" % config.get("pipeline"), - pipeline = config.get("pipeline"), archive = "hundo_results.zip", params = "PARAMS.txt", env = CONDAENV, @@ -1068,7 +1073,6 @@ rule build_report: shell: """ python {input.report_script} --biom {input.biom} --txt {input.txt} \ - --pipeline {input.pipeline} \ --zip {input.archive} --env {input.env} --params {input.params} \ --eeplot 'logs/*_merged_filtered_eestats.txt' \ --r1quals 'logs/*_R1_eestats.txt' \ From f3506d3d0e95464ee19d530fa98781e671d9c49a Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Wed, 29 May 2019 09:55:41 -0700 Subject: [PATCH 19/21] update manifest to include reports --- MANIFEST.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index 837f7d0..6f9d61d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ include hundo/Snakefile include hundo/environment.yml -include hundo/scripts/build_report.py +include hundo/scripts/build_report_OTU.py +include hundo/scripts/build_report_ASV.py From 08392b876100fe29b74bb8b6b6486c508f622a75 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Mon, 18 Mar 2024 14:53:01 -0400 Subject: [PATCH 20/21] unpin bzip2 to updata vsearch --- hundo/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hundo/environment.yml b/hundo/environment.yml index 62af040..1920728 100644 --- a/hundo/environment.yml +++ b/hundo/environment.yml @@ -9,7 +9,7 @@ dependencies: - biom-format=2.1.7 - biopython=1.74 - blast=2.9.0 - - bzip2=1.0.6 + - bzip2>1.0.6 - click=6.7 - docutils=0.14 - mafft=7.313 From 327838244f0903922794a0c97659fa2dbcb7374c Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Mon, 18 Mar 2024 14:53:26 -0400 Subject: [PATCH 21/21] Update zenodo.org download URL --- hundo/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hundo/Snakefile b/hundo/Snakefile index 5772c5a..6a0c10a 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -311,7 +311,7 @@ rule download_reference_data: output: "%s/{filename}" % config.get("database_dir", ".") run: - shell("curl 'https://zenodo.org/record/1043977/files/{wildcards.filename}' -s > {output}") + shell("curl 'https://zenodo.org/records/1043977/files/{wildcards.filename}' -s > {output}") if not REFERENCES[os.path.basename(output[0])] == md5(output[0]): raise OSError(2, "Invalid checksum", output[0])