From 1a4fa514e4b4555f6041e3655cbabe7d325f1f06 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Sat, 27 Apr 2019 10:00:26 -0700 Subject: [PATCH 01/21] Add click option --pipeline --- hundo/hundo.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hundo/hundo.py b/hundo/hundo.py index d9718e5..676dc2c 100644 --- a/hundo/hundo.py +++ b/hundo/hundo.py @@ -31,7 +31,13 @@ def cli(obj): https://github.com/pnnl/hundo/issues """ - +@click.option( + "--pipeline", + type=str, + default="OTU", + show_default=True, + help="cluster reads into OTUs or denoise reads into ASVs. This also changes other parts of the pipeline. default:OTU", +) @cli.command("lca", short_help="runs LCA across aligned hits") @click.argument("fasta", type=click.Path(exists=True)) @click.argument("aligned_hits", type=click.File("r")) From 4ba7af26fd79d268356d7f6862fe32d26ca24de3 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Sat, 27 Apr 2019 10:46:30 -0700 Subject: [PATCH 02/21] Add basic ASV pipeline --- hundo/Snakefile | 357 ++++++++++++++++++++++++++++++------------------ 1 file changed, 226 insertions(+), 131 deletions(-) diff --git a/hundo/Snakefile b/hundo/Snakefile index fe417aa..b1bdacc 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -186,7 +186,9 @@ def get_run_reference_chimera_filter_inputs(wildcards): db = REFFASTA else: db = config.get("reference_chimera_filter") - if config.get("denovo_chimera_filter", True): + if config.get("pipeline", "ASV"): + fasta = "zOTUs-uchime3.fasta" + elif config.get("denovo_chimera_filter", True): fasta = "chimera-filtered_denovo.fasta" elif float(config.get("read_identity_requirement", 0.97)) * 100 < 99: fasta = "preclustered.fasta" @@ -583,124 +585,197 @@ rule dereplicate_sequences: --threads {threads} -log {log} --uc {output.uc} """ +if config.get("pipeline") == "OTU": + rule precluster_sequences: + input: + "dereplicated-sequences.fasta" + output: + fa = temp("preclustered.fasta"), + uc = temp("preclustered.uclust") + params: + id = min((100 - float(config.get("percent_of_allowable_difference", 3))) / 100, 1.00) + conda: + CONDAENV + log: + "logs/precluster.log" + threads: + config.get("threads", 1) + group: + "combined_group" + shell: + """ + vsearch --threads {threads} --cluster_size {input} \ + --id {params.id} --strand plus --sizein --sizeout \ + --centroids {output.fa} --uc {output.uc} + """ -rule precluster_sequences: - input: - "dereplicated-sequences.fasta" - output: - fa = temp("preclustered.fasta"), - uc = temp("preclustered.uclust") - params: - id = min((100 - float(config.get("percent_of_allowable_difference", 3))) / 100, 1.00) - conda: - CONDAENV - log: - "logs/precluster.log" - threads: - config.get("threads", 1) - group: - "combined_group" - shell: - """ - vsearch --threads {threads} --cluster_size {input} \ - --id {params.id} --strand plus --sizein --sizeout \ - --centroids {output.fa} --uc {output.uc} - """ + rule run_denovo_chimera_filter: + input: + "preclustered.fasta" + output: + temp("chimera-filtered_denovo.fasta") + conda: + CONDAENV + log: + "logs/chimera-filtered_denovo.log" + threads: + config.get("threads", 1) + group: + "combined_group" + shell: + """ + vsearch --uchime_denovo {input} --nonchimeras {output} \ + --strand plus --sizein --sizeout --threads {threads} \ + --log {log} + """ -rule run_denovo_chimera_filter: - input: - "preclustered.fasta" - output: - temp("chimera-filtered_denovo.fasta") - conda: - CONDAENV - log: - "logs/chimera-filtered_denovo.log" - threads: - config.get("threads", 1) - group: - 
"combined_group" - shell: - """ - vsearch --uchime_denovo {input} --nonchimeras {output} \ - --strand plus --sizein --sizeout --threads {threads} \ - --log {log} - """ + rule run_reference_chimera_filter: + input: + unpack(get_run_reference_chimera_filter_inputs) + output: + temp("chimera-filtered_reference.fasta") + conda: + CONDAENV + log: + "logs/chimera-filtered_reference.log" + threads: + config.get("threads", 1) + group: + "combined_group" + shell: + """ + vsearch --uchime_ref {input.fasta} --nonchimeras {output} \ + --strand plus --sizein --sizeout --db {input.db} \ + --threads {threads} --log {log} + """ -rule run_reference_chimera_filter: - input: - unpack(get_run_reference_chimera_filter_inputs) - output: - temp("chimera-filtered_reference.fasta") - conda: - CONDAENV - log: - "logs/chimera-filtered_reference.log" - threads: - config.get("threads", 1) - group: - "combined_group" - shell: - """ - vsearch --uchime_ref {input.fasta} --nonchimeras {output} \ - --strand plus --sizein --sizeout --db {input.db} \ - --threads {threads} --log {log} - """ + rule extract_filtered_sequences: + # Extract all non-chimeric, non-singleton sequences, dereplicated + input: + filter_fa = "dereplicated-sequences.fasta", + uc = "preclustered.uclust", + keep_fa = "chimera-filtered_reference.fasta" + output: + temp("nonchimeric-nonsingleton.fasta") + group: + "combined_group" + run: + filter_fastas(input.filter_fa, input.uc, input.keep_fa, output[0]) -rule extract_filtered_sequences: - # Extract all non-chimeric, non-singleton sequences, dereplicated - input: - filter_fa = "dereplicated-sequences.fasta", - uc = "preclustered.uclust", - keep_fa = "chimera-filtered_reference.fasta" - output: - temp("nonchimeric-nonsingleton.fasta") - group: - "combined_group" - run: - filter_fastas(input.filter_fa, input.uc, input.keep_fa, output[0]) + rule pull_seqs_from_samples_with_filtered_sequences: + # Extract all non-chimeric, non-singleton sequences in each sample + input: + filter_fa = "all-sequences.fasta", + uc = "dereplicated.uclust", + keep_fa = "nonchimeric-nonsingleton.fasta" + output: + temp("all-sequences_filtered.fasta") + group: + "combined_group" + run: + filter_fastas(input.filter_fa, input.uc, input.keep_fa, output[0]) -rule pull_seqs_from_samples_with_filtered_sequences: - # Extract all non-chimeric, non-singleton sequences in each sample - input: - filter_fa = "all-sequences.fasta", - uc = "dereplicated.uclust", - keep_fa = "nonchimeric-nonsingleton.fasta" - output: - temp("all-sequences_filtered.fasta") - group: - "combined_group" - run: - filter_fastas(input.filter_fa, input.uc, input.keep_fa, output[0]) + rule cluster_sequences: + input: + # get_cluster_sequences_input + "all-sequences_filtered.fasta" + output: + fa = "OTU.fasta" + params: + minsize = config.get("minimum_sequence_abundance", 2), + otu_id_pct = (100 - float(config.get("percent_of_allowable_difference", 3))) / 100. 
+ conda: + CONDAENV + log: + "logs/cluster_sequences.log" + threads: + config.get("threads", 1) + group: + "combined_group" + shell: + """ + vsearch --cluster_size {input} --centroids {output.fa} \ + --relabel OTU_ --id {params.otu_id_pct} --log {log} \ + --sizein --strand plus --threads {threads} + """ +# Pipeline for building ASVs +else: + rule denoise_sequences: + input: + "dereplicated-sequences.fasta" + output: + fa = temp("zOTUs-all.fasta"), + conda: + CONDAENV + log: + "logs/cluster-unoise.log" + threads: + config.get("threads", 1) + group: + "combined_group" + shell: + """ + vsearch --threads {threads} --cluster_unoise {input} \ + --strand plus --sizein --sizeout \ + --centroids {output.fa} --log {log} + """ -rule cluster_sequences: - input: - # get_cluster_sequences_input - "all-sequences_filtered.fasta" - output: - fa = "OTU.fasta" - params: - minsize = config.get("minimum_sequence_abundance", 2), - otu_id_pct = (100 - float(config.get("percent_of_allowable_difference", 3))) / 100. - conda: - CONDAENV - log: - "logs/cluster_sequences.log" - threads: - config.get("threads", 1) - group: - "combined_group" - shell: - """ - vsearch --cluster_size {input} --centroids {output.fa} \ - --relabel OTU_ --id {params.otu_id_pct} --log {log} \ - --sizein --strand plus --threads {threads} - """ + + rule run_denovo_chimera_filter: + input: + "zOTUs-all.fasta" + output: + temp("zOTUs-uchime3.fasta") + conda: + CONDAENV + log: + "logs/chimera-filtered_uchime3_denovo.log" + group: + "combined_group" + shell: + """ + vsearch --uchime3_denovo {input} --nonchimeras {output} \ + --strand plus --sizein --sizeout --log {log} + """ + + + rule run_reference_chimera_filter: + input: + unpack(get_run_reference_chimera_filter_inputs) + output: + temp("zOTUs-uchime3-uchime-ref.fasta") + conda: + CONDAENV + log: + "logs/chimera-filtered_reference.log" + threads: + config.get("threads", 1) + group: + "combined_group" + shell: + """ + vsearch --uchime_ref {input.fasta} --nonchimeras {output} \ + --strand plus --sizein --sizeout --db {input.db} \ + --threads {threads} --log {log} + """ + + + rule rename_zOTUs_to_OTUs: + input: + "zOTUs-uchime3-uchime-ref.fasta" + output: + "OTU.fasta" + group: + "combined_group" + shell: + """ + cp {input} {output} + """ if config.get("aligner") == "blast": @@ -779,29 +854,49 @@ rule compute_lca: {output.tsv} """ - -rule compile_counts: - input: - seqs = "all-sequences.fasta", - db = "OTU_tax.fasta" - output: - tsv = "OTU.txt" - params: - id_req = config.get("read_identity_requirement", 0.80), - minqt = config.get("vsearch_minqt", 0.80) - conda: - CONDAENV - threads: - config.get("threads", 1) - group: - "combined_group" - shell: - """ - vsearch --usearch_global {input.seqs} \ - --db {input.db} --id {params.id_req} \ - --otutabout {output.tsv} --sizein --strand plus \ - --threads {threads} - """ +if config.get("pipeline") == "OTU": + rule compile_counts: + input: + seqs = "all-sequences.fasta", + db = "OTU_tax.fasta" + output: + tsv = "OTU.txt" + params: + id_req = config.get("read_identity_requirement", 0.80), + minqt = config.get("vsearch_minqt", 0.80) + conda: + CONDAENV + threads: + config.get("threads", 1) + group: + "combined_group" + shell: + """ + vsearch --usearch_global {input.seqs} \ + --db {input.db} --id {params.id_req} \ + --otutabout {output.tsv} --sizein --strand plus \ + --threads {threads} + """ +else: + rule compile_counts: + input: + seqs = "all-sequences.fasta", + db = "OTU_tax.fasta" + output: + tsv = "OTU.txt" + conda: + CONDAENV + threads: + 
config.get("threads", 1) + group: + "combined_group" + shell: + """ + vsearch --usearch_global {input.seqs} \ + --db {input.db} --id 1.0 \ + --otutabout {output.tsv} --sizein --strand plus \ + --threads {threads} + """ rule create_biom_from_txt: From 35cd4611f9c0785b189937b798f8d8029a60f907 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Sat, 27 Apr 2019 15:40:55 -0700 Subject: [PATCH 03/21] bump version --- hundo/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hundo/__init__.py b/hundo/__init__.py index b7e1990..64c7d51 100644 --- a/hundo/__init__.py +++ b/hundo/__init__.py @@ -1 +1 @@ -__version__ = "1.2.5" +__version__ = "1.2.6" From f807168460cd16ebc04b5a0ee264ef90ff4b9ea1 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Sat, 27 Apr 2019 16:22:47 -0700 Subject: [PATCH 04/21] bump minor version, not patch version --- hundo/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hundo/__init__.py b/hundo/__init__.py index 64c7d51..67bc602 100644 --- a/hundo/__init__.py +++ b/hundo/__init__.py @@ -1 +1 @@ -__version__ = "1.2.6" +__version__ = "1.3.0" From 3ea993cbed82221d34d5c6ce4ad027949913fcc8 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Sat, 27 Apr 2019 16:23:42 -0700 Subject: [PATCH 05/21] Add pipline more places --- hundo/Snakefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hundo/Snakefile b/hundo/Snakefile index b1bdacc..9629376 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -186,7 +186,7 @@ def get_run_reference_chimera_filter_inputs(wildcards): db = REFFASTA else: db = config.get("reference_chimera_filter") - if config.get("pipeline", "ASV"): + if (config.get("pipeline") == "ASV"): fasta = "zOTUs-uchime3.fasta" elif config.get("denovo_chimera_filter", True): fasta = "chimera-filtered_denovo.fasta" @@ -993,7 +993,8 @@ rule aggregate_run_parameters: blast_minimum_bitscore = config.get("blast_minimum_bitscore"), blast_top_fraction = config.get("blast_top_fraction"), read_identity_requirement = config.get("read_identity_requirement"), - aligner = config.get("aligner") + aligner = config.get("aligner"), + pipeline = config.get("pipeline") run: with open(output.txt, "w") as fh: print("fastq_dir:", params.fastq_dir, sep=" ", file=fh) @@ -1014,6 +1015,7 @@ rule aggregate_run_parameters: print("percent_of_allowable_difference:", params.percent_of_allowable_difference, sep=" ", file=fh) print("reference_database:", params.reference_database, sep=" ", file=fh) print("aligner:", params.aligner, sep=" ", file=fh) + print("pipeline:", params.pipeline, sep=" ", file=fh) print("blast_minimum_bitscore:", params.blast_minimum_bitscore, sep=" ", file=fh) print("blast_top_fraction:", params.blast_top_fraction, sep=" ", file=fh) print("read_identity_requirement:", params.read_identity_requirement, sep=" ", file=fh) From fabdd9a8283ce4cecb6e68f066140787a25549b3 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Sat, 27 Apr 2019 16:24:37 -0700 Subject: [PATCH 06/21] Correct order of pipeline and add it more places --- hundo/hundo.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/hundo/hundo.py b/hundo/hundo.py index 676dc2c..0a129b9 100644 --- a/hundo/hundo.py +++ b/hundo/hundo.py @@ -31,13 +31,6 @@ def cli(obj): https://github.com/pnnl/hundo/issues """ -@click.option( - "--pipeline", - type=str, - default="OTU", - show_default=True, - help="cluster reads into OTUs or denoise reads into ASVs. This also changes other parts of the pipeline. 
default:OTU", -) @cli.command("lca", short_help="runs LCA across aligned hits") @click.argument("fasta", type=click.Path(exists=True)) @click.argument("aligned_hits", type=click.File("r")) @@ -351,6 +344,13 @@ def run_download(database_dir, jobs, reference_database, dryrun, snakemake_args) show_default=True, help="local aligner; `blast` is more sensitive while `vsearch` is much faster", ) +@click.option( + "--pipeline", + type=click.Choice(choices=["OTU", "ASV"]), + default="OTU", + show_default=True, + help="cluster to make OTUs or denoise to make ASVs. This also changes other parts of the pipeline", +) @click.option( "-t", "--threads", @@ -526,6 +526,7 @@ def run_annotate( dryrun, author, aligner, + pipeline, threads, database_dir, filter_adapters, @@ -609,6 +610,7 @@ def run_annotate( "read_identity_requirement={read_identity_requirement} " "prefilter_file_size={prefilter_file_size} " "aligner={aligner} " + "pipeline={pipeline} " "no_temp_declared={no_temp_declared} {add_args} " "{args}" ).format( @@ -642,6 +644,7 @@ def run_annotate( read_identity_requirement=read_identity_requirement, prefilter_file_size=prefilter_file_size, aligner=aligner, + pipeline=pipeline, no_temp_declared=no_temp_declared, add_args="" if snakemake_args and snakemake_args[0].startswith("-") else "--", args=" ".join(snakemake_args), From 93eb5ab8f2febf76cb2cdad34a8c4ccd152a4eb0 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Sat, 27 Apr 2019 16:30:19 -0700 Subject: [PATCH 07/21] Bump vsearch version to support --unoise --- hundo/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hundo/environment.yml b/hundo/environment.yml index d1294b2..0925c83 100644 --- a/hundo/environment.yml +++ b/hundo/environment.yml @@ -18,7 +18,7 @@ dependencies: - pandas=0.24.2 - pigz=2.3.4 - pyyaml>=3.12 - - vsearch=2.6.0 + - vsearch=2.13.0 - zip=3.0 - pip - pip: From 4db51299cc79761eddcb2f1bee71346dad8ec420 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Sat, 27 Apr 2019 16:48:39 -0700 Subject: [PATCH 08/21] Bump to more flexible version of vsearch and update Snakefile --- hundo/Snakefile | 6 +++--- hundo/environment.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hundo/Snakefile b/hundo/Snakefile index 9629376..9badf67 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -647,7 +647,7 @@ if config.get("pipeline") == "OTU": shell: """ vsearch --uchime_ref {input.fasta} --nonchimeras {output} \ - --strand plus --sizein --sizeout --db {input.db} \ + --sizein --sizeout --db {input.db} \ --threads {threads} --log {log} """ @@ -740,7 +740,7 @@ else: shell: """ vsearch --uchime3_denovo {input} --nonchimeras {output} \ - --strand plus --sizein --sizeout --log {log} + --sizein --sizeout --log {log} """ @@ -760,7 +760,7 @@ else: shell: """ vsearch --uchime_ref {input.fasta} --nonchimeras {output} \ - --strand plus --sizein --sizeout --db {input.db} \ + --sizein --sizeout --db {input.db} \ --threads {threads} --log {log} """ diff --git a/hundo/environment.yml b/hundo/environment.yml index 0925c83..c2d5c66 100644 --- a/hundo/environment.yml +++ b/hundo/environment.yml @@ -18,7 +18,7 @@ dependencies: - pandas=0.24.2 - pigz=2.3.4 - pyyaml>=3.12 - - vsearch=2.13.0 + - vsearch=2.13.1 - zip=3.0 - pip - pip: From ac52599e0cf7eeb7b8728b1ef0c4de2f92cba431 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Sat, 27 Apr 2019 17:24:21 -0700 Subject: [PATCH 09/21] Remove extra --strand argument --- hundo/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/hundo/Snakefile b/hundo/Snakefile index 9badf67..1b3f92e 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -626,7 +626,7 @@ if config.get("pipeline") == "OTU": shell: """ vsearch --uchime_denovo {input} --nonchimeras {output} \ - --strand plus --sizein --sizeout --threads {threads} \ + --sizein --sizeout --threads {threads} \ --log {log} """ From 0a81c9f90f5a3b328836cdc1b85efe8d05811588 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Mon, 29 Apr 2019 12:20:32 -0700 Subject: [PATCH 10/21] Use --relabel_md5 when building ASVs --- hundo/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hundo/Snakefile b/hundo/Snakefile index 1b3f92e..c0de8e0 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -721,7 +721,7 @@ else: shell: """ vsearch --threads {threads} --cluster_unoise {input} \ - --strand plus --sizein --sizeout \ + --strand plus --sizein --sizeout --relabel_md5 \ --centroids {output.fa} --log {log} """ From 82ce59a449e4d3b0723f2b14185d80d43ac1a747 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Tue, 28 May 2019 14:52:07 -0700 Subject: [PATCH 11/21] compile_counts using .99 instead of 1.0 --- hundo/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hundo/Snakefile b/hundo/Snakefile index c0de8e0..ac45764 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -893,7 +893,7 @@ else: shell: """ vsearch --usearch_global {input.seqs} \ - --db {input.db} --id 1.0 \ + --db {input.db} --id 0.99 \ --otutabout {output.tsv} --sizein --strand plus \ --threads {threads} """ From 9af03f36bfb2acd076592762799ee97ccb7dcfdd Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Tue, 28 May 2019 15:31:02 -0700 Subject: [PATCH 12/21] use ASV or OTU in full Snakefile' --- hundo/Snakefile | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/hundo/Snakefile b/hundo/Snakefile index ac45764..289bd70 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -769,7 +769,7 @@ else: input: "zOTUs-uchime3-uchime-ref.fasta" output: - "OTU.fasta" + "ASV.fasta" group: "combined_group" shell: @@ -781,7 +781,7 @@ else: if config.get("aligner") == "blast": rule run_aligner: input: - fasta = "OTU.fasta", + fasta = "%s.fasta" % config.get("pipeline"), # force download db = BLASTREF, db_fa = REFFASTA @@ -807,7 +807,7 @@ if config.get("aligner") == "blast": else: rule run_aligner: input: - fasta = "OTU.fasta", + fasta = "%s.fasta" % config.get("pipeline"), db = BLASTREF, db_fa = REFFASTA output: @@ -832,13 +832,13 @@ else: rule compute_lca: input: - fasta = "OTU.fasta", + fasta = "%s.fasta" % config.get("pipeline"), hits = "local-hits.txt", map = REFMAP, tre = REFTRE output: - fasta = "OTU_tax.fasta", - tsv = temp("OTU_assignments.tsv") + fasta = "%s_tax.fasta" % config.get("pipeline"), + tsv = temp("tax_assignments.tsv") params: min_score = config.get("blast_minimum_bitscore", 125), min_pid = config.get("min_pid", .85), @@ -881,9 +881,9 @@ else: rule compile_counts: input: seqs = "all-sequences.fasta", - db = "OTU_tax.fasta" + db = "ASV_tax.fasta" output: - tsv = "OTU.txt" + tsv = "ASV.txt" conda: CONDAENV threads: @@ -901,9 +901,9 @@ else: rule create_biom_from_txt: input: - "OTU.txt" + txt = "%s.txt" % config.get("pipeline") output: - "OTU.biom" + biom = "%s.biom" % config.get("pipeline") shadow: "shallow" conda: @@ -912,17 +912,17 @@ rule create_biom_from_txt: "combined_group" shell: ''' - sed 's|\"||g' {input} | sed 's|\,|\;|g' > OTU_converted.txt - biom convert -i OTU_converted.txt -o {output} --to-json \ + 
sed 's|\"||g' {input.txt} | sed 's|\,|\;|g' > featured_converted.txt + biom convert -i featured_converted.txt -o {output.biom} --to-json \ --process-obs-metadata sc_separated --table-type "OTU table" ''' rule multiple_align: input: - "OTU.fasta" + fasta = "%s.fasta" % config.get("pipeline") output: - "OTU_aligned.fasta" + msa = "%s_aligned.fasta" % config.get("pipeline") threads: # mafft (via conda) is single-threaded on macOS config.get("threads", 1) @@ -933,15 +933,15 @@ rule multiple_align: shell: """ mafft --auto --thread {threads} --quiet --maxiterate 1000 \ - --globalpair {input} > {output} + --globalpair {input.fasta} > {output.msa} """ rule newick_tree: input: - "OTU_aligned.fasta" + msa = "%s_aligned.fasta" % config.get("pipeline") output: - "OTU.tree" + tre = "%s.tree" % config.get("pipeline") threads: config.get("threads", 1) conda: @@ -952,13 +952,13 @@ rule newick_tree: "combined_group" shell: """ - export OMP_NUM_THREADS={threads} && FastTreeMP -nt -gamma -spr 4 -log {log} -quiet {input} > {output} || fasttree -nt -gamma -spr 4 -log {log} -quiet {input} > {output} + export OMP_NUM_THREADS={threads} && FastTreeMP -nt -gamma -spr 4 -log {log} -quiet {input.msa} > {output.tre} || fasttree -nt -gamma -spr 4 -log {log} -quiet {input.msa} > {output.tre} """ rule make_deliverables_archive: input: - "OTU.biom", "OTU.fasta", "OTU.tree", "OTU.txt" + "OTU.biom", "OTU.fasta", "OTU.tree", "OTU.txt", "ASV.biom", "ASV.fasta", "ASV.tree", "ASV.txt" output: "hundo_results.zip" conda: @@ -1028,8 +1028,9 @@ rule build_report: "scripts", "build_report.py" ), - biom = "OTU.biom", - txt = "OTU.txt", + biom = "%s.biom" % config.get("pipeline"), + txt = "%s.txt" % config.get("pipeline"), + pipeline = config.get("pipeline"), archive = "hundo_results.zip", params = "PARAMS.txt", env = CONDAENV, @@ -1067,6 +1068,7 @@ rule build_report: shell: """ python {input.report_script} --biom {input.biom} --txt {input.txt} \ + --pipeline {input.pipeline} \ --zip {input.archive} --env {input.env} --params {input.params} \ --eeplot 'logs/*_merged_filtered_eestats.txt' \ --r1quals 'logs/*_R1_eestats.txt' \ From 733d55d6aa8cc6d2ed0ba17ef5b9c60c9019df2e Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Tue, 28 May 2019 15:37:53 -0700 Subject: [PATCH 13/21] duplicate report for ASVs --- hundo/scripts/build_report_ASV.py | 630 ++++++++++++++++++++++++++++++ 1 file changed, 630 insertions(+) create mode 100644 hundo/scripts/build_report_ASV.py diff --git a/hundo/scripts/build_report_ASV.py b/hundo/scripts/build_report_ASV.py new file mode 100644 index 0000000..f0a1914 --- /dev/null +++ b/hundo/scripts/build_report_ASV.py @@ -0,0 +1,630 @@ +#!/usr/bin/env python +# coding=utf-8 + +import argparse +import base64 +import csv +import datetime +import io +import mimetypes +import os +import textwrap +from collections import defaultdict +from glob import glob + +import numpy as np +import pandas as pd +import plotly.figure_factory as ff +import plotly.graph_objs as go +from biom import parse_table +from docutils.core import publish_file +from plotly import offline + +import relatively + +STYLE = """ + + + + """ +SCRIPT = """ + + + + + + """ +BLUE = "rgba(31,119,180,0.3)" +RED = "rgba(214,39,40,0.3)" + + +def build_uri(file, file_type=None, default_encoding="utf8"): + fname = os.path.basename(file) + type, encoding = mimetypes.guess_type(file) + if file_type is None: + file_type = type + if encoding is not None: + default_encoding = encoding + with open(file, mode="rb") as fh: + fdata = base64.b64encode(fh.read()) + return 
f'data:{file_type};charset={default_encoding};filename={fname};base64,{fdata.decode("utf-8")}' + + +def parse_biom(biom): + with open(biom) as fh: + bt = parse_table(fh) + df = bt.to_dataframe() + otu_df = [ + ["Samples", "OTUs", "OTU Total Count", "OTU Table Density"], + [ + bt.length("sample"), + bt.length("observation"), + bt.sum(), + "{:f}".format(bt.get_table_density()), + ], + ] + otu_df = pd.DataFrame(otu_df) + # make the first row the column labels + otu_df.rename(columns=otu_df.iloc[0], inplace=True) + # drop the first row + otu_df = otu_df.reindex(otu_df.index.drop(0)) + # build the table + summary_table = otu_df.to_html( + index=False, + bold_rows=False, + classes=["table", "table-bordered"], + table_id="otuTable", + ) + # fix for rst + summary_table = summary_table.replace("\n", "\n" + 10 * " ") + return df, summary_table + + +def make_div(figure_or_data, include_plotlyjs=False, show_link=False, div_id=None): + div = offline.plot( + figure_or_data, + include_plotlyjs=include_plotlyjs, + show_link=show_link, + output_type="div", + ) + if ".then(function ()" in div: + div = f"""{div.partition(".then(function ()")[0]}""" + if div_id: + import re + + try: + existing_id = re.findall(r'id="(.*?)"|$', div)[0] + div = div.replace(existing_id, div_id) + except IndexError: + pass + return div.replace("\n", " ") + + +def build_summary_table(raw, biom_df, div_df, omitted=None): + # sample summary table + header = ["Sample", "Raw", "Filtered", "Merged", "In OTUs"] + count_data = [] + omit = [] + if omitted: + for sample_count in omitted.split(","): + sample, count = sample_count.split(":") + omit.append(sample) + count_data.append([sample, count]) + for count_file in raw: + sample_name = count_file.partition("logs/raw_counts/")[-1].partition( + "_R1.count" + )[0] + sd = [sample_name] + with open(count_file) as fh: + for line in fh: + sd.append(int(float(line.strip()))) + with open(count_file.replace("raw_counts/", "filtered_counts/")) as fh: + for line in fh: + sd.append(int(float(line.strip()))) + with open(count_file.replace("raw_counts/", "merged_counts/")) as fh: + for line in fh: + sd.append(int(float(line.strip()))) + sd.append(int(biom_df[sample_name].sum())) + count_data.append(sd) + df = pd.DataFrame(count_data, columns=header) + df = df.merge(div_df, how="outer", left_on="Sample", right_index=True) + df.sort_values(by="shannon", inplace=True) + # summary table + df_str = df.to_html( + index=False, + bold_rows=False, + classes=["table", "table-bordered"], + table_id="summaryTable", + ) + df_str = df_str.replace("\n", "\n" + 10 * " ") + df = df.reset_index() + # pull out the omitted samples + keep = [idx for idx, sample in df.Sample.items() if sample not in omit] + df = df.iloc[keep] + df.sort_values(by="Raw", inplace=True) + plot_data = [] + visible = [True] * 4 + if len(df.Sample) > 10: + visible = [True] + ["legendonly"] * 3 + for i, metric in enumerate(header[1:]): + plot_data.append( + go.Bar(x=df.Sample, y=df[metric], name=metric, visible=visible[i]) + ) + layout = dict( + title="Counts Per Sample By Stage", + barmode="group", + height=500, + yaxis={"title": "Count"}, + showlegend=True, + hovermode="x", + bargroupgap=0.1, + legend={"orientation": "h", "x": 0, "y": 1.06}, + ) + fig = go.Figure(data=plot_data, layout=layout) + return df_str, make_div(fig, div_id="countSummaryPlot") + + +def build_quality_plot(r1_quals): + # quality plot + raw_qual_stats = defaultdict(lambda: defaultdict(list)) + for r1_ee_file in r1_quals: + sample_name = 
r1_ee_file.partition("logs/")[-1].partition("_R1")[0] + r2_ee_file = "_R2".join(r1_ee_file.rsplit("_R1", 1)) + with open(r1_ee_file) as fh: + reader = csv.DictReader(fh, delimiter="\t") + for row in reader: + raw_qual_stats["R1"][sample_name].append(float(row["Mean_Q"])) + with open(r2_ee_file) as fh: + reader = csv.DictReader(fh, delimiter="\t") + for row in reader: + q = float(row["Mean_Q"]) + raw_qual_stats["R2"][sample_name].append(q) + data = [] + for read_index, sample_data in raw_qual_stats.items(): + color = BLUE if read_index == "R1" else RED + for sample_name, sample_stats in sample_data.items(): + data.append( + go.Scatter( + x=list(range(1, len(sample_stats))), + y=sample_stats, + name=sample_name, + text=sample_name, + hoverinfo="text+x+y", + legendgroup=read_index, + mode="lines", + line=dict(color=color, dash="solid"), + ) + ) + layout = go.Layout( + title="Mean Quality Scores for R1 and R2", + xaxis={"title": "Position"}, + yaxis={"title": "Quality (Phred score)"}, + hovermode="closest", + showlegend=False, + autosize=True, + annotations=[ + dict( + x=0, + y=1.1, + xref="paper", + yref="paper", + text="Forward", + showarrow=False, + font=dict(size=16, color="#ffffff"), + align="left", + borderpad=4, + bgcolor="#1f77b4", + ), + dict( + x=0.15, + y=1.1, + xref="paper", + yref="paper", + text="Reverse", + showarrow=False, + font=dict(size=16, color="#ffffff"), + align="left", + borderpad=4, + bgcolor="#d62728", + ), + ], + ) + fig = go.Figure(data=data, layout=layout) + return make_div(fig, div_id="qualityScorePlot") + + +def build_taxonomy_plot(txt, value_cols, height=900): + df = pd.read_csv(txt, sep="\t") + levels = ["kingdom", "phylum", "class", "order", "family", "genus", "species"] + df[levels] = df["taxonomy"].str.split(",", expand=True) + # remove "k__", etc., from taxonomies + for level in levels: + df[level] = df[level].str.replace(f"{level[0]}__", "") + hierarchy = ["phylum", "class", "order"] + df[hierarchy] = df[hierarchy].fillna("NA") + df[value_cols] = df[value_cols].fillna(0) + df = df[hierarchy + value_cols] + dfs = relatively.get_dfs_across_hierarchy( + df, hierarchy, value_cols, reorder=None, dependent="; " + ) + fig = relatively.get_abundance_figure_from_dfs( + dfs, hierarchy, "Assigned Taxonomy Per Sample", height=height + ) + return make_div(fig, div_id="taxonomyPlot") + + +def build_ee_plot(eeplot): + # merged reads quality plot + ee_stats = defaultdict(list) + for stats_file in eeplot: + sample_name = stats_file.partition("logs/")[-1].partition("_merged")[0] + with open(stats_file) as fh: + reader = csv.DictReader(fh, delimiter="\t") + for row in reader: + ee_stats[sample_name].append(row["Mean_EE"]) + data = [] + for sample_name, sample_stats in ee_stats.items(): + data.append( + go.Scatter( + x=list(range(1, len(sample_stats))), + y=sample_stats, + name=sample_name, + text=sample_name, + hoverinfo="text+x+y", + mode="lines", + line=dict(color=BLUE, dash="solid"), + ) + ) + layout = go.Layout( + title="Mean Expected Error Rate Across Merged Reads", + yaxis={"title": "Expected Error"}, + xaxis={"title": "Base Position"}, + hovermode="closest", + showlegend=False, + ) + fig = go.Figure(data=data, layout=layout) + return make_div(fig, div_id="eePlot") + + +def format_from_file(txt): + ret = "" + with open(txt) as fh: + for i, line in enumerate(fh): + if i == 0: + ret += line + else: + ret += " " * 4 + line + return ret + + +def get_metadata(author=None, version=None, sep=" // "): + md = [datetime.date.today().isoformat()] + if version: + md.insert(0, 
version) + if author: + md.insert(0, author) + return sep.join(md) + + +def build_attachment(zip): + data = build_uri(zip) + name = os.path.basename(zip) + return f"""{name}""" + + +def main( + biom, + txt, + zip, + env, + params, + eeplot, + r1quals, + raw, + omitted, + html, + author, + version, + samples, +): + # resolve the file patterns + eeplot = glob(eeplot, recursive=False) + r1quals = glob(r1quals, recursive=False) + raw = glob(raw, recursive=False) + + biom_df, otu_summary_table = parse_biom(biom) + biom_df = biom_df.to_dense() + div_idx = relatively.calculate_diversity( + biom_df, + biom_df.index, + biom_df.columns, + index=["shannon", "simpson", "invsimpson"], + ) + sample_summary_table, count_summary_plot = build_summary_table( + raw, biom_df, div_idx, omitted + ) + sample_order = div_idx.sort_values(by="shannon").index.tolist() + quality_plot = build_quality_plot(r1quals) + taxonomy_plot = build_taxonomy_plot(txt, sample_order, 800) + ee_plot = build_ee_plot(eeplot) + params_str = format_from_file(params) + conda_str = format_from_file(env) + metadata = get_metadata(author, version) + attachment = build_attachment(zip) + samples_attachment = build_attachment(samples) + report_str = f""" + +.. raw:: html + + {STYLE} + {SCRIPT} + +============================================================= +hundo_ - Experiment Summary +============================================================= + +.. contents:: + :backlinks: none + :depth: 2 + +Summary +------- + +Sequence Counts +*************** + +.. raw:: html + +
+ {sample_summary_table} +
+ {count_summary_plot} + +Sequence Quality +**************** + +.. raw:: html + + {quality_plot} + {ee_plot} + +OTUs +**** + +.. raw:: html + +
+ {otu_summary_table} +
+ +Taxonomy +******** + +Samples are ordered from least diverse to most. + +.. raw:: html + + {taxonomy_plot} + +Methods +------- + +``hundo`` wraps a number of functions and its exact steps are +dependent upon a given user's configuration. + +Paired-end sequence reads are quality trimmed and can be trimmed of +adapters and filtered of contaminant sequences using BBDuk2 of the +BBTools_ package. Passing reads are merged using +VSEARCH_ then aggregated into a single FASTA file with headers +describing the origin and count of the sequence. + +Prior to creating clusters, reads are filtered again based on expected +error rates: + +To create clusters, the aggregated, merged reads are dereplicated to +remove singletons by default using VSEARCH. Sequences are preclustered +into centroids using VSEARCH to accelerate chimera filtering. Chimera +filtering is completed in two steps: *de novo* and then reference +based. The reference by default is the entire annotation database. +Following chimera filtering, sequences are placed into clusters using +distance-based, greedy cluster with VSEARCH based on the allowable +percent difference of the configuration. + +After OTU sequences have been determined, BLAST_ or VSEARCH is used to align +sequences to the reference database. Reference databases for 16S were curated +by the CREST_ team and hundo incorporates the CREST LCA method. ITS databases +are maintained by UNITE_. + +Counts are assigned to OTUs using the global alignment method of +VSEARCH, which outputs the final OTU table as a tab-delimited text +file. The Biom_ command line tool is used to convert the tab table +to biom. + +Multiple alignment of sequences is completed using MAFFT_. +A tree based on the aligned sequences is built using FastTree2_. + +This workflow is built using Snakemake_ and makes use of Bioconda_ +to install its dependencies. + +.. _BBTools: https://sourceforge.net/projects/bbmap/ +.. _VSEARCH: https://github.com/torognes/vsearch +.. _BLAST: https://blast.ncbi.nlm.nih.gov/Blast.cgi +.. _CREST: https://github.com/lanzen/CREST +.. _UNITE: https://unite.ut.ee/ +.. _Biom: http://biom-format.org/ +.. _MAFFT: https://mafft.cbrc.jp/alignment/software/ +.. _FastTree2: http://www.microbesonline.org/fasttree/ +.. _Snakemake: https://snakemake.readthedocs.io/en/stable/ +.. _Bioconda: https://bioconda.github.io/ + +Configuration +------------- + +:: + + {params_str} + + +Execution Environment +--------------------- + +:: + + {conda_str} + + +Output Files +------------ + +Not all files written by the workflow are contained within the +Downloads section of this page to minimize the size of the this +document. Other output files are described and are written to the +results directory. + + +Attached +******** + +The zip archive contains the following files: + +OTU.biom +```````` + +Biom table with raw counts per sample and their associated +taxonomic assignment formatted to be compatible with downstream tools +like phyloseq_. + +OTU.fasta +````````` + +Representative DNA sequences of each OTU. + +OTU.tree +```````` + +Newick tree representation of aligned OTU sequences. + +OTU.txt +``````` + +Tab-delimited text table with columns OTU ID, a column for each sample, +and taxonomy assignment in the final column as a comma delimited list. + + +Other Result Files +****************** + +Other files that may be needed, but that are not included in the +attached archive include: + +OTU_aligned.fasta +````````````````` + +OTU sequences after alignment using MAFFT. 
+ +all-sequences.fasta +``````````````````` + +Quality-controlled, dereplicated DNA sequences of all samples. The +header of each record identifies the sample of origin and the count +resulting from dereplication. + +blast-hits.txt +`````````````` + +The BLAST assignments per OTU sequence. + +Downloads +--------- + +.. _hundo: https://github.com/pnnl/hundo +.. _phyloseq: https://joey711.github.io/phyloseq/ + + +.. container:: + :name: attachments + + + .. container:: + :name: archive + + archive: + .. raw:: html + + {attachment} + + .. container:: + :name: samples + + samples: + .. raw:: html + + {samples_attachment} + + +.. container:: + :name: metadata + + {metadata} + +""" + with open(html, "w") as fh: + publish_file( + source=io.StringIO(report_str), + destination=fh, + writer_name="html", + settings_overrides={"stylesheet_path": ""}, + ) + + +if __name__ == "__main__": + p = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + p.add_argument("--biom") + p.add_argument("--txt") + p.add_argument("--zip") + p.add_argument("--env") + p.add_argument("--params") + p.add_argument("--eeplot") + p.add_argument("--r1quals") + p.add_argument("--raw") + p.add_argument("--omitted") + p.add_argument("--html") + p.add_argument("--author") + p.add_argument("--version") + p.add_argument("--samples") + args = p.parse_args() + main( + args.biom, + args.txt, + args.zip, + args.env, + args.params, + args.eeplot, + args.r1quals, + args.raw, + args.omitted, + args.html, + args.author, + args.version, + args.samples, + ) From 895c307aeb67dbeabb905ca3b26809397599671b Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Tue, 28 May 2019 15:54:10 -0700 Subject: [PATCH 14/21] update new report to discribe ASVs --- hundo/scripts/build_report_ASV.py | 51 +++++++++++++++---------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/hundo/scripts/build_report_ASV.py b/hundo/scripts/build_report_ASV.py index f0a1914..b1d549a 100644 --- a/hundo/scripts/build_report_ASV.py +++ b/hundo/scripts/build_report_ASV.py @@ -73,7 +73,7 @@ def parse_biom(biom): bt = parse_table(fh) df = bt.to_dataframe() otu_df = [ - ["Samples", "OTUs", "OTU Total Count", "OTU Table Density"], + ["Samples", "ASVs", "ASV Total Count", "ASV Table Density"], [ bt.length("sample"), bt.length("observation"), @@ -120,7 +120,7 @@ def make_div(figure_or_data, include_plotlyjs=False, show_link=False, div_id=Non def build_summary_table(raw, biom_df, div_df, omitted=None): # sample summary table - header = ["Sample", "Raw", "Filtered", "Merged", "In OTUs"] + header = ["Sample", "Raw", "Filtered", "Merged", "In ASVs"] count_data = [] omit = [] if omitted: @@ -406,7 +406,7 @@ def main( {quality_plot} {ee_plot} -OTUs +ASVs **** .. raw:: html @@ -428,7 +428,8 @@ def main( ------- ``hundo`` wraps a number of functions and its exact steps are -dependent upon a given user's configuration. +dependent upon a given user's configuration. In this case, the +ASV pipeline was used. Paired-end sequence reads are quality trimmed and can be trimmed of adapters and filtered of contaminant sequences using BBDuk2 of the @@ -437,26 +438,22 @@ def main( describing the origin and count of the sequence. Prior to creating clusters, reads are filtered again based on expected -error rates: +error rates. -To create clusters, the aggregated, merged reads are dereplicated to -remove singletons by default using VSEARCH. Sequences are preclustered -into centroids using VSEARCH to accelerate chimera filtering. 
Chimera -filtering is completed in two steps: *de novo* and then reference +In order to create ASVs, the aggregated, merged reads are dereplicated +and denoised using cluster_unoise as implemented in VSEARCH_. Chimera +filtering is completed in two steps: uchime3 *de novo* and then reference based. The reference by default is the entire annotation database. -Following chimera filtering, sequences are placed into clusters using -distance-based, greedy cluster with VSEARCH based on the allowable -percent difference of the configuration. -After OTU sequences have been determined, BLAST_ or VSEARCH is used to align +After ASV sequences have been determined, BLAST_ or VSEARCH is used to align sequences to the reference database. Reference databases for 16S were curated by the CREST_ team and hundo incorporates the CREST LCA method. ITS databases are maintained by UNITE_. -Counts are assigned to OTUs using the global alignment method of -VSEARCH, which outputs the final OTU table as a tab-delimited text -file. The Biom_ command line tool is used to convert the tab table -to biom. +Counts are assigned to ASVs using the global alignment method of +VSEARCH at 99% identity, which outputs the final ASV table as a +tab-delimited text file. The Biom_ command line tool is used to convert the +tab table to biom. Multiple alignment of sequences is completed using MAFFT_. A tree based on the aligned sequences is built using FastTree2_. @@ -505,27 +502,27 @@ def main( The zip archive contains the following files: -OTU.biom +ASV.biom ```````` Biom table with raw counts per sample and their associated taxonomic assignment formatted to be compatible with downstream tools like phyloseq_. -OTU.fasta +ASV.fasta ````````` -Representative DNA sequences of each OTU. +Representative DNA sequences of each ASV. -OTU.tree +ASV.tree ```````` -Newick tree representation of aligned OTU sequences. +Newick tree representation of aligned ASV sequences. -OTU.txt +ASV.txt ``````` -Tab-delimited text table with columns OTU ID, a column for each sample, +Tab-delimited text table with columns ASV ID, a column for each sample, and taxonomy assignment in the final column as a comma delimited list. @@ -535,10 +532,10 @@ def main( Other files that may be needed, but that are not included in the attached archive include: -OTU_aligned.fasta +ASV_aligned.fasta ````````````````` -OTU sequences after alignment using MAFFT. +ASV sequences after alignment using MAFFT. all-sequences.fasta ``````````````````` @@ -550,7 +547,7 @@ def main( blast-hits.txt `````````````` -The BLAST assignments per OTU sequence. +The BLAST assignments per ASV sequence. 
Downloads --------- From e00131a922d0fd5e015815e51dd6572e3e280340 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Tue, 28 May 2019 15:57:32 -0700 Subject: [PATCH 15/21] switch report based on config.get("pipeline") --- hundo/Snakefile | 2 +- hundo/scripts/{build_report.py => build_report_OTU.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename hundo/scripts/{build_report.py => build_report_OTU.py} (100%) diff --git a/hundo/Snakefile b/hundo/Snakefile index 289bd70..d01f604 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -1026,7 +1026,7 @@ rule build_report: report_script = os.path.join( os.path.dirname(os.path.abspath(workflow.snakefile)), "scripts", - "build_report.py" + "build_report_%s.py" % config.get("pipeline") ), biom = "%s.biom" % config.get("pipeline"), txt = "%s.txt" % config.get("pipeline"), diff --git a/hundo/scripts/build_report.py b/hundo/scripts/build_report_OTU.py similarity index 100% rename from hundo/scripts/build_report.py rename to hundo/scripts/build_report_OTU.py From 0c717f511471a75c8c474ec325077222153e1ef8 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Tue, 28 May 2019 16:07:40 -0700 Subject: [PATCH 16/21] strip size from ASV headers --- hundo/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hundo/Snakefile b/hundo/Snakefile index d01f604..d22316c 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -760,7 +760,7 @@ else: shell: """ vsearch --uchime_ref {input.fasta} --nonchimeras {output} \ - --sizein --sizeout --db {input.db} \ + --xsize --db {input.db} \ --threads {threads} --log {log} """ From 47423f5c36b31b82b059ff93d84a920bedc39b2c Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Tue, 28 May 2019 16:17:14 -0700 Subject: [PATCH 17/21] bump vsearch to include gzip support --- hundo/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hundo/environment.yml b/hundo/environment.yml index c2d5c66..b357281 100644 --- a/hundo/environment.yml +++ b/hundo/environment.yml @@ -18,7 +18,7 @@ dependencies: - pandas=0.24.2 - pigz=2.3.4 - pyyaml>=3.12 - - vsearch=2.13.1 + - vsearch=2.13.4 - zip=3.0 - pip - pip: From 5d6e2120fcb2a64ffb54d46c9537b36dcf47a3d3 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Tue, 28 May 2019 16:47:34 -0700 Subject: [PATCH 18/21] troubleshoot results.zip and build_report --- hundo/Snakefile | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/hundo/Snakefile b/hundo/Snakefile index d22316c..5772c5a 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -958,7 +958,10 @@ rule newick_tree: rule make_deliverables_archive: input: - "OTU.biom", "OTU.fasta", "OTU.tree", "OTU.txt", "ASV.biom", "ASV.fasta", "ASV.tree", "ASV.txt" + biom = "%s.biom" % config.get("pipeline"), + fasta = "%s.fasta" % config.get("pipeline"), + tree = "%s.tree" % config.get("pipeline"), + txt = "%s.txt" % config.get("pipeline") output: "hundo_results.zip" conda: @@ -966,7 +969,7 @@ rule make_deliverables_archive: group: "combined_group" shell: - "zip {output} {input}" + "zip {output} {input.biom} {input.fasta} {input.tree} {input.txt} " rule aggregate_run_parameters: @@ -1026,11 +1029,13 @@ rule build_report: report_script = os.path.join( os.path.dirname(os.path.abspath(workflow.snakefile)), "scripts", - "build_report_%s.py" % config.get("pipeline") - ), + "build_report_OTU.py" + ) if config.get("pipeline") == "OTU" else os.path.join( + os.path.dirname(os.path.abspath(workflow.snakefile)), + "scripts", + "build_report_ASV.py"), biom = "%s.biom" % 
config.get("pipeline"), txt = "%s.txt" % config.get("pipeline"), - pipeline = config.get("pipeline"), archive = "hundo_results.zip", params = "PARAMS.txt", env = CONDAENV, @@ -1068,7 +1073,6 @@ rule build_report: shell: """ python {input.report_script} --biom {input.biom} --txt {input.txt} \ - --pipeline {input.pipeline} \ --zip {input.archive} --env {input.env} --params {input.params} \ --eeplot 'logs/*_merged_filtered_eestats.txt' \ --r1quals 'logs/*_R1_eestats.txt' \ From f3506d3d0e95464ee19d530fa98781e671d9c49a Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Wed, 29 May 2019 09:55:41 -0700 Subject: [PATCH 19/21] update manifest to include reports --- MANIFEST.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index 837f7d0..6f9d61d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ include hundo/Snakefile include hundo/environment.yml -include hundo/scripts/build_report.py +include hundo/scripts/build_report_OTU.py +include hundo/scripts/build_report_ASV.py From 08392b876100fe29b74bb8b6b6486c508f622a75 Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Mon, 18 Mar 2024 14:53:01 -0400 Subject: [PATCH 20/21] unpin bzip2 to updata vsearch --- hundo/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hundo/environment.yml b/hundo/environment.yml index 62af040..1920728 100644 --- a/hundo/environment.yml +++ b/hundo/environment.yml @@ -9,7 +9,7 @@ dependencies: - biom-format=2.1.7 - biopython=1.74 - blast=2.9.0 - - bzip2=1.0.6 + - bzip2>1.0.6 - click=6.7 - docutils=0.14 - mafft=7.313 From 327838244f0903922794a0c97659fa2dbcb7374c Mon Sep 17 00:00:00 2001 From: Colin Brislawn Date: Mon, 18 Mar 2024 14:53:26 -0400 Subject: [PATCH 21/21] Update zenodo.org download URL --- hundo/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hundo/Snakefile b/hundo/Snakefile index 5772c5a..6a0c10a 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -311,7 +311,7 @@ rule download_reference_data: output: "%s/{filename}" % config.get("database_dir", ".") run: - shell("curl 'https://zenodo.org/record/1043977/files/{wildcards.filename}' -s > {output}") + shell("curl 'https://zenodo.org/records/1043977/files/{wildcards.filename}' -s > {output}") if not REFERENCES[os.path.basename(output[0])] == md5(output[0]): raise OSError(2, "Invalid checksum", output[0])