From a93a6c306033dbf226525a7f4d656a747aeed325 Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Thu, 10 Oct 2024 15:06:29 -0400 Subject: [PATCH 001/196] Fixed render_r1_r2 function(s) in Snakefiles - Removed the unused r1-only=False parameter in the render_r1_r2() function in both the rnaseq and chipseq Snakefiles\n- Changed the name of 'r1_only' function to 'render_r1_only' in both Snakefiles to make the name more intuitive and updated the rest of the files accordingly --- workflows/chipseq/Snakefile | 8 ++++---- workflows/rnaseq/Snakefile | 11 +++++------ 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index f278b896..90c84d28 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -70,10 +70,10 @@ if config.get('merged_bigwigs', None): final_targets.extend(utils.flatten(c.targets['merged_bigwig'])) -def render_r1_r2(pattern, r1_only=False): +def render_r1_r2(pattern): return expand(pattern, sample='{sample}', n=c.n) -def r1_only(pattern): +def render_r1_only(pattern): return expand(pattern, sample='{sample}', n=1) rule targets: @@ -133,7 +133,7 @@ if 'Run' in c.sampletable.columns and sum(c.sampletable['Run'].str.startswith('S output: fastq=render_r1_r2(c.patterns['fastq']) log: - r1_only(c.patterns['fastq'])[0] + '.log' + render_r1_only(c.patterns['fastq'])[0] + '.log' params: is_paired=c.is_paired, sampletable=_st, @@ -337,7 +337,7 @@ rule fastq_screen: """ input: **fastq_screen_references(), - fastq=r1_only(rules.cutadapt.output.fastq), + fastq=render_r1_only(rules.cutadapt.output.fastq), output: txt=c.patterns['fastq_screen'] log: diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index f9a11c4d..d9e7f692 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -64,10 +64,9 @@ if config.get('merged_bigwigs', None): final_targets.extend(utils.flatten(c.targets['merged_bigwig'])) -def render_r1_r2(pattern, r1_only=False): +def 
render_r1_r2(pattern): return expand(pattern, sample='{sample}', n=c.n) - -def r1_only(pattern): +def render_r1_only(pattern): return expand(pattern, sample='{sample}', n=1) rule targets: @@ -126,7 +125,7 @@ if 'Run' in c.sampletable.columns and sum(c.sampletable['Run'].str.startswith('S output: fastq=render_r1_r2(c.patterns['fastq']) log: - r1_only(c.patterns['fastq'])[0] + '.log' + render_r1_only(c.patterns['fastq'])[0] + '.log' params: is_paired=c.is_paired, sampletable=_st, @@ -472,7 +471,7 @@ rule rRNA: Map reads with bowtie2 to the rRNA reference """ input: - fastq=r1_only(c.patterns['cutadapt']), + fastq=render_r1_only(c.patterns['cutadapt']), index=[c.refdict[c.organism][config['rrna']['tag']]['bowtie2']] output: bam=temporary(c.patterns['rrna']['bam']) @@ -569,7 +568,7 @@ rule fastq_screen: """ input: **fastq_screen_references(), - fastq=r1_only(rules.cutadapt.output.fastq), + fastq=render_r1_only(rules.cutadapt.output.fastq), output: txt=c.patterns['fastq_screen'] log: From d0a0300ede9d14f8dda6bc114fc88ea6df5275cd Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 13 Oct 2024 10:23:19 -0400 Subject: [PATCH 002/196] add newline back in --- workflows/rnaseq/Snakefile | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index d9e7f692..1cde537a 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -66,6 +66,7 @@ if config.get('merged_bigwigs', None): def render_r1_r2(pattern): return expand(pattern, sample='{sample}', n=c.n) + def render_r1_only(pattern): return expand(pattern, sample='{sample}', n=1) From 7d92555d67f8a4808670d866be0b239819ca79d6 Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Wed, 16 Oct 2024 10:59:43 -0400 Subject: [PATCH 003/196] Make strand_arg a param Move `strand_arg` assignment from the `run` block to the `params` block so that `--rerun-trigger` will detect changes to strandedness configuration and re-run those rules --- workflows/rnaseq/Snakefile | 66 
++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 1cde537a..f4245e30 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -598,12 +598,7 @@ rule featurecounts: resources: mem_mb=gb(16), runtime=autobump(hours=2) - run: - # NOTE: By default, we use -p for paired-end - p_arg = '' - if c.is_paired: - p_arg = '-p --countReadPairs ' - + params: strand_arg = helpers.strand_arg_lookup( c, { 'unstranded': '-s0 ', @@ -611,10 +606,14 @@ rule featurecounts: 'fr-secondstrand': '-s1 ', } ) - + run: + # NOTE: By default, we use -p for paired-end + p_arg = '' + if c.is_paired: + p_arg = '-p --countReadPairs ' shell( 'featureCounts ' - '{strand_arg} ' + '{params.strand_arg} ' '{p_arg} ' '-T {threads} ' '-a {input.annotation} ' @@ -769,15 +768,8 @@ rule collectrnaseqmetrics: # NOTE: Be careful with the memory here; make sure you have enough # and/or it matches the resources you're requesting in the cluster # config. 
- java_args='-Xmx20g' - # java_args='-Xmx2g' # [TEST SETTINGS -1] - log: - c.patterns['collectrnaseqmetrics']['metrics'] + '.log' - threads: 1 - resources: - mem_mb=gb(32), - runtime=autobump(hours=2) - run: + java_args='-Xmx20g', + # java_args='-Xmx2g', # [TEST SETTINGS -1] strand_arg = helpers.strand_arg_lookup( c, { 'unstranded': 'STRAND=NONE ', @@ -785,11 +777,18 @@ rule collectrnaseqmetrics: 'fr-secondstrand': 'STRAND=FIRST_READ_TRANSCRIPTION_STRAND ', } ) + log: + c.patterns['collectrnaseqmetrics']['metrics'] + '.log' + threads: 1 + resources: + mem_mb=gb(32), + runtime=autobump(hours=2) + run: shell( 'picard ' '{params.java_args} ' 'CollectRnaSeqMetrics ' - '{strand_arg} ' + '{params.strand_arg} ' 'VALIDATION_STRINGENCY=LENIENT ' 'REF_FLAT={input.refflat} ' 'INPUT={input.bam} ' @@ -870,7 +869,14 @@ rule kallisto: c.patterns['kallisto'] params: index_dir=os.path.dirname(c.refdict[c.organism][config['kallisto']['tag']]['kallisto']), - outdir=os.path.dirname(c.patterns['kallisto']) + outdir=os.path.dirname(c.patterns['kallisto']), + strand_arg = helpers.strand_arg_lookup( + c, { + 'unstranded': '', + 'fr-firststrand': '--rf-stranded', + 'fr-secondstrand': '--fr-stranded', + } + ) log: c.patterns['kallisto'] + '.log' threads: @@ -887,15 +893,6 @@ rule kallisto: # and standard deviation here se_args = '--single --fragment-length 300 --sd 20 ' assert len(input.fastq) == 1 - - strand_arg = helpers.strand_arg_lookup( - c, { - 'unstranded': '', - 'fr-firststrand': '--rf-stranded', - 'fr-secondstrand': '--fr-stranded', - } - ) - shell( 'kallisto quant ' '--index {input.index} ' @@ -905,7 +902,7 @@ rule kallisto: '--bias ' '--threads {threads} ' '{se_args} ' - '{strand_arg} ' + '{params.strand_arg} ' '{input.fastq} ' '&> {log}' ) @@ -987,7 +984,7 @@ rule bigwig_neg: runtime=autobump(hours=2) log: c.patterns['bigwig']['neg'] + '.log' - run: + params: strand_arg = helpers.strand_arg_lookup( c, { 'unstranded': '', @@ -995,13 +992,14 @@ rule bigwig_neg: 'fr-secondstrand': 
'--filterRNAstrand forward ', } ) + run: shell( 'bamCoverage ' '--bam {input.bam} ' '-o {output} ' '-p {threads} ' '{BAMCOVERAGE_ARGS} ' - '{strand_arg} ' + '{params.strand_arg} ' '&> {log}' ) @@ -1020,8 +1018,7 @@ rule bigwig_pos: runtime=autobump(hours=2) log: c.patterns['bigwig']['pos'] + '.log' - - run: + params: strand_arg = helpers.strand_arg_lookup( c, { 'unstranded': '', @@ -1029,13 +1026,14 @@ rule bigwig_pos: 'fr-secondstrand': '--filterRNAstrand reverse ', } ) + run: shell( 'bamCoverage ' '--bam {input.bam} ' '-o {output} ' '-p {threads} ' '{BAMCOVERAGE_ARGS} ' - '{strand_arg} ' + '{params.strand_arg} ' '&> {log}' ) From b808d10c046b3f8722f0f6d29695ea73e8b60d24 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Thu, 24 Oct 2024 13:49:19 -0400 Subject: [PATCH 004/196] add Plodia interpunctella reference config (#417) Add lib.postprocess.utils.extract_from_zip function, used for extracting -- and then immediately gzipping -- a file from within a downloaded zip. Include reference config for Plodia interpunctella --- .../Plodia_interpunctella.yaml | 41 ++++++++++++++++++ lib/postprocess/utils.py | 43 +++++++++++++++++-- 2 files changed, 81 insertions(+), 3 deletions(-) create mode 100644 include/reference_configs/Plodia_interpunctella.yaml diff --git a/include/reference_configs/Plodia_interpunctella.yaml b/include/reference_configs/Plodia_interpunctella.yaml new file mode 100644 index 00000000..214e907f --- /dev/null +++ b/include/reference_configs/Plodia_interpunctella.yaml @@ -0,0 +1,41 @@ +references: + plodia: + ilPloInte3.2: + genome: + url: 'https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/GCF_027563975.2/download?include_annotation_type=GENOME_FASTA' + postprocess: + function: 'lib.postprocess.utils.extract_from_zip' + kwargs: + path_in_zip: 'ncbi_dataset/data/GCF_027563975.2/GCF_027563975.2_ilPloInte3.2_genomic.fna' + indexes: + - 'hisat2' + - 'bowtie2' + - 'star' + + annotation: + url: 
"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/GCF_027563975.2/download?include_annotation_type=GENOME_GTF" + postprocess: + function: 'lib.postprocess.utils.extract_from_zip' + kwargs: + path_in_zip: "ncbi_dataset/data/GCF_027563975.2/genomic.gtf" + conversions: + - 'refflat' + - 'bed12' + + transcriptome: + indexes: + - 'salmon' + - 'kallisto' + + rRNA: + genome: + url: + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' + indexes: + - 'hisat2' + - 'bowtie2' + - 'star' + postprocess: + function: 'lib.common.filter_fastas' + args: 'Plodia interpunctella' diff --git a/lib/postprocess/utils.py b/lib/postprocess/utils.py index abb87288..16010e14 100644 --- a/lib/postprocess/utils.py +++ b/lib/postprocess/utils.py @@ -1,12 +1,49 @@ import sys import os -import pandas as pd -import gzip import re +import gzip +import zipfile +import shutil +import tempfile +import pandas as pd + here = os.path.dirname(os.path.abspath(__file__)) -sys.path.insert(0, os.path.join(here, '../../lib')) +sys.path.insert(0, os.path.join(here, "../../lib")) from common import openfile + + +def extract_from_zip(tmpfiles, outfile, path_in_zip): + """ + Parameters + ---------- + + tmpfiles : list + One-item list containing zip file + + outfile : str + gzipped output file to create + + path_in_zip : str + Path within zipfile to extract. You can identify the path using unzip + -l x.zip from bash. 
+ """ + assert len(tmpfiles) == 1, f"expected single zip file, got {tmpfiles}" + + extraction_dir = tempfile.mkdtemp() + + with zipfile.ZipFile(tmpfiles[0], "r") as z: + z.extract(path_in_zip, path=extraction_dir) + + full_path_to_extracted = os.path.join(extraction_dir, path_in_zip) + + with open(full_path_to_extracted, "rb") as fin: + with gzip.open(outfile, "wb") as fout: + shutil.copyfileobj(fin, fout) + + shutil.rmtree(extraction_dir) + + def match_gtf_9th(tmpfiles, outfile, strmatch, optstrand = "None"): """ Matches string to the 9th field of GTF and an optional strand that defaults to None; From 47b379ad126ee63e5259f74c4dc7c06454e4dc8c Mon Sep 17 00:00:00 2001 From: Nicholas Johnson Date: Wed, 4 Dec 2024 18:09:26 -0500 Subject: [PATCH 005/196] Update plotting.R (#423) Just a small mistake Co-authored-by: Ryan Dale --- lib/lcdbwf/R/plotting.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/lcdbwf/R/plotting.R b/lib/lcdbwf/R/plotting.R index 9e7bc8e5..f4aa9c41 100644 --- a/lib/lcdbwf/R/plotting.R +++ b/lib/lcdbwf/R/plotting.R @@ -268,7 +268,7 @@ vargenes_heatmap <- function(rld, cols_for_grouping, n=50){ mat <- mat - rowMeans(mat) df <- as.data.frame(colData(rld)[, cols_for_grouping]) rownames(df) <- colnames(rld) - colnames(df) <- cols.for.grouping + colnames(df) <- cols_for_grouping pheatmap(mat, annotation_col=df, cluster_cols=TRUE) } From aeca4c2dfb5ee76221e1f7280986e94cba6488b3 Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Fri, 13 Dec 2024 22:05:16 -0500 Subject: [PATCH 006/196] Change SRA fastq directory (#418) * Change SRA fastq directory Change the directory where SRA fastq files are downloaded and add the 'orig_filename' column to the config object for each sample so that the rest of the workflow works correctly * Make code more elegant Change a nested for-loop implementation in patters_targets.py to a more elegant one-line solution and clean up some code in Snakefile * improve helper.fill_patterns add check when combining 
by `zip` to ensure values are all same length add more doctests --------- Co-authored-by: Ryan Dale --- lib/helpers.py | 34 ++++++++++---- lib/patterns_targets.py | 14 +++++- workflows/rnaseq/Snakefile | 49 ++++++++++---------- workflows/rnaseq/config/rnaseq_patterns.yaml | 1 + 4 files changed, 62 insertions(+), 36 deletions(-) diff --git a/lib/helpers.py b/lib/helpers.py index 053bca2b..4723286c 100644 --- a/lib/helpers.py +++ b/lib/helpers.py @@ -34,22 +34,31 @@ def detect_layout(sampletable): def fill_patterns(patterns, fill, combination=product): """ - Fills in a dictionary of patterns with the dictionary or DataFrame `fill`. + Fills in a dictionary of patterns with the dictionary `fill`. >>> patterns = dict(a='{sample}_R{N}.fastq') - >>> fill = dict(sample=['one', 'two'], N=[1, 2]) + >>> fill = dict(sample=['one', 'two', 'three'], N=[1, 2]) >>> sorted(fill_patterns(patterns, fill)['a']) - ['one_R1.fastq', 'one_R2.fastq', 'two_R1.fastq', 'two_R2.fastq'] + ['one_R1.fastq', 'one_R2.fastq', 'three_R1.fastq', 'three_R2.fastq', 'two_R1.fastq', 'two_R2.fastq'] + + If using `zip` as a combination, checks to ensure all values in `fill` are + the same length to avoid truncated output. + + This fails: >>> patterns = dict(a='{sample}_R{N}.fastq') - >>> fill = dict(sample=['one', 'two'], N=[1, 2]) - >>> sorted(fill_patterns(patterns, fill, zip)['a']) - ['one_R1.fastq', 'two_R2.fastq'] + >>> fill = dict(sample=['one', 'two', 'three'], N=[1, 2]) + >>> sorted(fill_patterns(patterns, fill, zip)['a']) # doctest: +IGNORE_EXCEPTION_DETAIL + Traceback (most recent call last): + ... 
+ ValueError: {'sample': ['one', 'two', 'three'], 'N': [1, 2]} does not have the same number of entries for each key + + But this works: >>> patterns = dict(a='{sample}_R{N}.fastq') - >>> fill = pd.DataFrame({'sample': ['one', 'two'], 'N': [1, 2]}) - >>> sorted(fill_patterns(patterns, fill)['a']) - ['one_R1.fastq', 'two_R2.fastq'] + >>> fill = dict(sample=['one', 'one', 'two', 'two', 'three', 'three'], N=[1, 2, 1, 2, 1, 2]) + >>> sorted(fill_patterns(patterns, fill, zip)['a']) + ['one_R1.fastq', 'one_R2.fastq', 'three_R1.fastq', 'three_R2.fastq', 'two_R1.fastq', 'two_R2.fastq'] """ # In recent Snakemake versions (e.g., this happens in 5.4.5) file patterns @@ -64,12 +73,17 @@ def fill_patterns(patterns, fill, combination=product): # # expand('x', zip, d=[1,2,3]) == ['x', 'x', 'x'] + if combination == zip: + lengths = set([len(v) for v in fill.values()]) + if len(lengths) != 1: + raise ValueError(f"{fill} does not have the same number of entries for each key") + def update(d, u, c): for k, v in u.items(): if isinstance(v, collections.abc.Mapping): r = update(d.get(k, {}), v, c) d[k] = r - else: + else: # not a dictionary, so we're at a leaf if isinstance(fill, pd.DataFrame): d[k] = list(set(expand(u[k], zip, **fill.to_dict("list")))) else: diff --git a/lib/patterns_targets.py b/lib/patterns_targets.py index 542d4116..ec62d513 100644 --- a/lib/patterns_targets.py +++ b/lib/patterns_targets.py @@ -9,6 +9,7 @@ from . import common from . import chipseq from . 
import helpers +from snakemake.io import expand HERE = os.path.abspath(os.path.dirname(__file__)) @@ -80,6 +81,10 @@ def __init__(self, config, patterns, workdir=None): self.n = [1, 2] else: self.n = [1] + if 'Run' in self.sampletable.columns and sum(self.sampletable['Run'].str.startswith('SRR')) > 0: + self.is_sra = True + else: + self.is_sra = False helpers.preflight(self.config) @@ -107,7 +112,14 @@ def __init__(self, config, patterns, workdir=None): self.fill = dict(sample=self.samples, n=self.n) self.patterns_by_aggregation = self.patterns.pop('patterns_by_aggregate', None) - self.targets = helpers.fill_patterns(self.patterns, self.fill, zip) + self.targets = helpers.fill_patterns(self.patterns, self.fill) + + # If the sampletable is from an sra metadata table, then we need to set the value of + # 'orig_filename' for each of the samples to where the fastq was downloaded + if self.is_sra: + self.sampletable['orig_filename'] = expand(self.patterns["sra_fastq"], sample=self.samples, n=1) + if self.is_paired: + self.sampletable['orig_filename_R2'] = expand(self.patterns["sra_fastq"], sample=self.samples, n=2) # Then the aggregation if self.patterns_by_aggregation is not None and 'merged_bigwigs' in self.config: diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index f4245e30..e979cfdc 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -76,6 +76,30 @@ rule targets: """ input: final_targets +if c.is_sra: + + # Convert the sampletable to be indexed by the first column, for + # convenience in generating the input/output filenames. 
+ _st = c.sampletable.set_index(c.sampletable.columns[0]) + + rule fastq_dump: + output: + fastq=render_r1_r2(c.patterns['sra_fastq']) + log: + render_r1_only(c.patterns['sra_fastq'])[0] + '.log' + params: + is_paired=c.is_paired, + sampletable=_st, + # limit = 100000, # [TEST SETTINGS] + resources: + mem_mb=gb(1), + disk_mb=autobump(gb=1), + runtime=autobump(hours=2) + conda: + '../../wrappers/wrappers/fastq-dump/environment.yaml' + script: + wrapper_for('fastq-dump/wrapper.py') + if 'orig_filename' in c.sampletable.columns: localrules: symlinks, symlink_targets @@ -115,31 +139,6 @@ if 'orig_filename' in c.sampletable.columns: rule symlink_targets: input: c.targets['fastq'] - -if 'Run' in c.sampletable.columns and sum(c.sampletable['Run'].str.startswith('SRR')) > 0: - - # Convert the sampletable to be indexed by the first column, for - # convenience in generating the input/output filenames. - _st = c.sampletable.set_index(c.sampletable.columns[0]) - - rule fastq_dump: - output: - fastq=render_r1_r2(c.patterns['fastq']) - log: - render_r1_only(c.patterns['fastq'])[0] + '.log' - params: - is_paired=c.is_paired, - sampletable=_st, - # limit = 100000, # [TEST SETTINGS] - resources: - mem_mb=gb(1), - disk_mb=autobump(gb=1), - runtime=autobump(hours=2) - conda: - '../../wrappers/wrappers/fastq-dump/environment.yaml' - script: - wrapper_for('fastq-dump/wrapper.py') - # This can be set at the command line with --config strand_check_reads=1000 config.setdefault('strand_check_reads', 1e5) diff --git a/workflows/rnaseq/config/rnaseq_patterns.yaml b/workflows/rnaseq/config/rnaseq_patterns.yaml index 5379d0dc..92b2a534 100644 --- a/workflows/rnaseq/config/rnaseq_patterns.yaml +++ b/workflows/rnaseq/config/rnaseq_patterns.yaml @@ -3,6 +3,7 @@ strand_check: bam: 'strand_check/{sample}/{sample}.strandedness.bam' tsv: 'strand_check/{sample}/{sample}.strandedness' fastq: 'data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz' +sra_fastq: 
'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz' cutadapt: 'data/rnaseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz' bam: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam' fastqc: From a2e5448d017063308052fb865d239755bf36a410 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Tue, 31 Dec 2024 21:19:11 -0500 Subject: [PATCH 007/196] mambaforge -> miniforge --- .circleci/config.yml | 58 ++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 35 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index da38e059..4219875a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -28,7 +28,7 @@ variables: save_cache: key: v5-{{ checksum "env.yml" }}-{{ checksum "env-r.yml" }} paths: - - /opt/mambaforge + - /opt/miniforge # this file is created by sra-tools upon installation by conda, and so # needs to be included in the cache otherwise fastq-dump thinks it's @@ -73,7 +73,7 @@ variables: # Note that if we don't escape \$PATH, we'll be stuck with the exact # PATH defined here, which will break anything needing conda envs. - echo "export PATH=\$PATH:/opt/mambaforge/bin" >> $BASH_ENV + echo "export PATH=\$PATH:/opt/miniforge/bin" >> $BASH_ENV source $BASH_ENV @@ -85,28 +85,16 @@ variables: command: | source $BASH_ENV echo $PATH - # /opt/mambaforge will only exist if there was a cache restore; otherwise we'll make it here. + # /opt/miniforge will only exist if there was a cache restore; otherwise we'll make it here. # - # Use mambaforge which comes with mamba. - if [ ! -e /opt/mambaforge ]; then - curl -L https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh > mambaforge.sh - bash mambaforge.sh -b -p /opt/mambaforge - source "/opt/mambaforge/etc/profile.d/conda.sh" - source "/opt/mambaforge/etc/profile.d/mamba.sh" + if [ ! 
-e /opt/miniforge ]; then + curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" + bash Miniforge3-$(uname)-$(uname -m).sh -b -p /opt/miniforge + source "/opt/miniforge/etc/profile.d/conda.sh" conda activate which conda - which mamba - mamba --version - - # Note that mambaforge doesn't come with the defaults channel, but - # we're adding it here at the beginning to simulate what most users - # probably have locally (and following the bioconda docs). Using - # strict channel priority means we should [theoretically] never - # pull packages from defaults because they all exist on - # conda-forge. - conda config --system --add channels defaults - + conda --version conda config --system --add channels bioconda conda config --system --add channels conda-forge conda config --system --set channel_priority strict @@ -115,10 +103,10 @@ variables: # https://docs.conda.io/projects/conda-build/en/latest/resources/link-scripts.html, # post-link scripts should not depend on any installed or # to-be-installed conda packages...but they do. 
- mamba install -n base r-base yq + conda install -n base r-base yq - time mamba env create -n $LCDBWF_ENV --file env.yml - time mamba env create -n $LCDBWF_ENV_R --file env-r.yml + time conda env create -n $LCDBWF_ENV --file env.yml + time conda env create -n $LCDBWF_ENV_R --file env-r.yml fi # -------------------------------------------------------------------------- @@ -127,7 +115,7 @@ variables: run: name: Download example data command: | - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV conda info --envs conda config --show @@ -172,7 +160,7 @@ variables: run: name: Run pytest suite and testthat suite command: | - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV # run unit tests and doctests for the modules in lib test/lcdb-wf-test unit_tests --pytest @@ -194,7 +182,7 @@ variables: name: chipseq workflow command: | cd $DEPLOY/workflows/chipseq - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV $DEPLOY/test/lcdb-wf-test chipseq --run-workflow --use-conda -j2 -k -p -r $DEPLOY/test/lcdb-wf-test chipseq --trackhub @@ -208,7 +196,7 @@ variables: name: chipseq misc command: | cd $DEPLOY/workflows/chipseq - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV ./run_test.sh --use-conda -j2 -k -p -r \ @@ -237,7 +225,7 @@ variables: run: name: references workflow command: | - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=config/config.yaml -j2 -p -r -k --orig $ORIG @@ -248,7 +236,7 @@ variables: name: rnaseq workflow command: | cd $DEPLOY - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV 
$DEPLOY/test/lcdb-wf-test rnaseq --run-workflow -n $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --use-conda -j2 -k -p -r --orig $ORIG @@ -276,7 +264,7 @@ variables: command: | ORIG=$(pwd) cd $DEPLOY - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV # Check the help for test/lcdb-wf-test to see what args these @@ -299,7 +287,7 @@ variables: name: colocalization workflow command: | cd $DEPLOY/workflows/colocalization - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV $DEPLOY/test/lcdb-wf-test colocalization --run-workflow -k -r -p -j2 --use-conda --orig $ORIG @@ -438,9 +426,9 @@ jobs: - run: name: Install sphinx command: | - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate lcdb-wf-test - mamba install -y sphinx make yaml + conda install -y sphinx make yaml - run: name: OK for unknown github host command: mkdir -p ~/.ssh/ && echo -e "Host github.com\n\tStrictHostKeyChecking no\n" > ~/.ssh/config @@ -450,7 +438,7 @@ jobs: - run: name: Build and upload docs command: | - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate lcdb-wf-test ci/build-docs.sh - store_artifacts: @@ -466,7 +454,7 @@ jobs: - run: name: Report environment command: | - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda env export -n lcdb-wf-test > /tmp/env.yaml conda env export -n lcdb-wf-test-r > /tmp/env-r.yaml - store_artifacts: From 4487d9067924f6aec484898be9717d343ad6f28b Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Tue, 31 Dec 2024 22:00:54 -0500 Subject: [PATCH 008/196] latest ubuntu for testing --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4219875a..582a9842 100644 --- a/.circleci/config.yml 
+++ b/.circleci/config.yml @@ -5,7 +5,7 @@ variables: # default settings for all steps defaults: &defaults docker: - - image: ubuntu:20.04 + - image: ubuntu:latest # -------------------------------------------------------------------------- # The caching dramatically speeds up testing time, because we can do the From 836fff09239408a7496b177cfc8eef25f11777f6 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Tue, 31 Dec 2024 22:02:03 -0500 Subject: [PATCH 009/196] https for downloading chainfile --- workflows/external/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/external/Snakefile b/workflows/external/Snakefile index 9f8308c9..c6dd34b0 100644 --- a/workflows/external/Snakefile +++ b/workflows/external/Snakefile @@ -24,7 +24,7 @@ rule download_chainfile: output: 'data/dm3ToDm6.over.chain.gz' shell: 'wget -O- ' - 'http://hgdownload.cse.ucsc.edu/goldenPath/dm3/liftOver/dm3ToDm6.over.chain.gz ' + 'https://hgdownload.cse.ucsc.edu/goldenPath/dm3/liftOver/dm3ToDm6.over.chain.gz ' '> {output}' From 09dedd783973a07fd850552baf02c8878c2a440f Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Tue, 31 Dec 2024 22:16:53 -0500 Subject: [PATCH 010/196] noninteractive apt install --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 582a9842..df941a5a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -48,7 +48,7 @@ variables: name: Set path command: | # x11-utils required to avoid R::png() segfaulting - apt update && apt install -y \ + DEBIAN_FRONTEND=noninteractive apt update && apt install -y \ curl \ git \ locales \ From b6c663a2daea5e336e655590c1d192ab791ac39a Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Tue, 31 Dec 2024 23:02:28 -0500 Subject: [PATCH 011/196] noninteractive apt install --- .circleci/config.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index df941a5a..d351ba7e 
100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -48,7 +48,8 @@ variables: name: Set path command: | # x11-utils required to avoid R::png() segfaulting - DEBIAN_FRONTEND=noninteractive apt update && apt install -y \ + export DEBIAN_FRONTEND=noninteractive + apt update && apt install -y \ curl \ git \ locales \ From 2fc5d71455a2da0da7212449ff1981c72ea4ba8a Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Wed, 1 Jan 2025 10:51:25 -0500 Subject: [PATCH 012/196] debug url --- workflows/external/Snakefile | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/workflows/external/Snakefile b/workflows/external/Snakefile index c6dd34b0..48ddf73b 100644 --- a/workflows/external/Snakefile +++ b/workflows/external/Snakefile @@ -23,10 +23,7 @@ rule download_chainfile: """ output: 'data/dm3ToDm6.over.chain.gz' shell: - 'wget -O- ' - 'https://hgdownload.cse.ucsc.edu/goldenPath/dm3/liftOver/dm3ToDm6.over.chain.gz ' - '> {output}' - + 'curl -L -v https://hgdownload.cse.ucsc.edu/goldenPath/dm3/liftOver/dm3ToDm6.over.chain.gz -o {output}' rule beds: """ From a4703987ecd08e384d98033f77f72c6dcc5ad5cb Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Wed, 1 Jan 2025 11:25:12 -0500 Subject: [PATCH 013/196] for test "external" data, do not do liftover ucsc might be blocking circle-ci given the licenseing requirements of chainfiles --- workflows/external/Snakefile | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/workflows/external/Snakefile b/workflows/external/Snakefile index 48ddf73b..79c3d1e2 100644 --- a/workflows/external/Snakefile +++ b/workflows/external/Snakefile @@ -16,35 +16,14 @@ rule targets: input: list(modencode.keys()), - -rule download_chainfile: - """ - Download the chainfile we need for liftover - """ - output: 'data/dm3ToDm6.over.chain.gz' - shell: - 'curl -L -v https://hgdownload.cse.ucsc.edu/goldenPath/dm3/liftOver/dm3ToDm6.over.chain.gz -o {output}' - rule beds: """ - Download URLs, get rid of 
"track" lines, and then prepare them for liftover + Download URLs, get rid of "track" lines. """ - output: temporary('data/{factor}_{celltype}.bed.dm3') + output: 'data/{factor}_{celltype}.bed' run: - key = str(output[0]).replace('.dm3', '') + key = str(output[0]) url = modencode[key] - shell( - 'wget -O - "{url}" | grep -v "track" > {output}') - -rule liftover: - """ - Perform the liftover - """ - input: - bed='{prefix}.dm3', - chainfile=rules.download_chainfile.output - output: '{prefix}' - shell: - 'liftOver {input.bed} {input.chainfile} {output} {output}.unmapped' + shell('wget -O - "{url}" | grep -v "track" > {output}') # vim: ft=python From ed9161d20ae05fa8e47c2d0d7e8daee3db9ef819 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Wed, 1 Jan 2025 11:26:07 -0500 Subject: [PATCH 014/196] remove support for GAT this tool was last updated in 2017, and has incompatibilites with recent numpy. --- workflows/colocalization/Snakefile | 36 +----------------------------- 1 file changed, 1 insertion(+), 35 deletions(-) diff --git a/workflows/colocalization/Snakefile b/workflows/colocalization/Snakefile index ac0b0413..cb5a7991 100644 --- a/workflows/colocalization/Snakefile +++ b/workflows/colocalization/Snakefile @@ -64,29 +64,22 @@ if ADD_CHIPSEQ_PEAKS: config['beds'][key] = fn -# Number of shufflings for GAT -# N = 100 [TEST_SETTINGS +1] -N = 10000 - targets = expand( '{outdir}/{algorithm}/{domain}/{query}/{query}_vs_{reference}.txt', outdir=config['output'], domain=config['domains'].keys(), query=config['beds'].keys(), reference=config['beds'].keys(), - algorithm=['IntervalStats', 'GAT', 'jaccard', 'fisher'], + algorithm=['IntervalStats', 'jaccard', 'fisher'], ) # Currently-supported options {algorithm: (possible values)} # IntervalStats: (f_05, f_01, f_001) -# GAT: (l2fold, fractions) # jaccard: (jaccard) # fisher: (pval) pattern = '{outdir}/{algorithm}/{domain}/{value}_heatmap.pdf' targets += expand(pattern, outdir=config['output'], domain=config['domains'], 
algorithm='IntervalStats', value=['f_01']) -targets += expand(pattern, outdir=config['output'], domain=config['domains'], - algorithm='GAT', value=['l2fold']) targets += expand(pattern, outdir=config['output'], domain=config['domains'], algorithm='jaccard', value=['jaccard']) targets += expand(pattern, outdir=config['output'], domain=config['domains'], @@ -216,33 +209,6 @@ rule intervalstats: df.to_csv(str(output[0]), sep='\t', index=False) -rule gat: - input: - domain=lambda wc: config['domains'][getattr(wc, 'domain')], - query=lambda wc: config['beds'][getattr(wc, 'query')], - reference=lambda wc: config['beds'][getattr(wc, 'reference')], - output: '{outdir}/GAT/{domain}/{query}/{query}_vs_{reference}.txt' - run: - shell('cut -f1,2,3 {input.query} > {output}.query.tmp') - shell('cut -f1,2,3 {input.reference} > {output}.reference.tmp') - if os.stat(output[0] + '.query.tmp').st_size == 0: - shell('touch {output}') - else: - shell( - 'gat-run.py ' - '--ignore-segment-tracks ' - '--annotations {output}.reference.tmp ' - '--segments {output}.query.tmp ' - '--workspace {input.domain} ' - '--counter nucleotide-overlap ' - '--num-samples {N} ' - '--output-counts-pattern {output}.%s.counts ' - '--log {output}.log ' - '--stdout {output} ' - ) - shell('rm {output}.query.tmp {output}.reference.tmp') - - rule heatmap: input: expand( From becdf2168daf748a43f04a649ced0d9a7cb8e415 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Thu, 2 Jan 2025 21:23:37 +0000 Subject: [PATCH 015/196] GAT no longer used, remove from requirements --- include/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/include/requirements.txt b/include/requirements.txt index 6001f6d5..dc7d4d23 100644 --- a/include/requirements.txt +++ b/include/requirements.txt @@ -7,7 +7,6 @@ deeptools fastq-screen fastqc font-ttf-dejavu-sans-mono -gat gffread gffutils hisat2 From dfaec3e9defaec1dce6cba38981080e072dfc20e Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Thu, 2 Jan 2025 21:23:50 +0000 Subject: 
[PATCH 016/196] don't pin python --- include/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/requirements.txt b/include/requirements.txt index dc7d4d23..4e2b155e 100644 --- a/include/requirements.txt +++ b/include/requirements.txt @@ -26,7 +26,7 @@ pyfaidx pysam pytest pytest-xdist -python>=3.10 +python rseqc # earlier versions of salmon can segfault on Slurm From b65f4cd61ebd411db108ef2deae67f9c8e92c3fa Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Thu, 2 Jan 2025 21:23:58 +0000 Subject: [PATCH 017/196] pin snakemake >8 --- include/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/requirements.txt b/include/requirements.txt index 4e2b155e..fd8df8be 100644 --- a/include/requirements.txt +++ b/include/requirements.txt @@ -34,7 +34,7 @@ salmon>=1.10.1 samtools seaborn -snakemake-minimal +snakemake>8 sra-tools star subread From 0f81f076a05fff5eb0169a574016cf415f5ca775 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Thu, 2 Jan 2025 21:24:50 +0000 Subject: [PATCH 018/196] update env.yml --- env.yml | 612 +++++++++++++++++++++++++++----------------------------- 1 file changed, 296 insertions(+), 316 deletions(-) diff --git a/env.yml b/env.yml index 5b656720..02f0f695 100644 --- a/env.yml +++ b/env.yml @@ -5,380 +5,360 @@ dependencies: - _libgcc_mutex=0.1 - _openmp_mutex=4.5 - _r-mutex=1.0.1 - - alsa-lib=1.2.3.2 - - amply=0.1.5 + - alabaster=1.0.0 + - alsa-lib=1.2.13 + - amply=0.1.6 + - annotated-types=0.7.0 - appdirs=1.4.4 - - argcomplete=3.0.8 - - argh=0.27.2 - - asttokens=2.2.1 - - attr=2.5.1 - - attrs=23.1.0 - - backcall=0.2.0 - - backports=1.0 - - backports.functools_lru_cache=1.6.4 - - bedtools=2.31.0 - - binutils_impl_linux-64=2.39 - - binutils_linux-64=2.39 - - biopython=1.81 - - boost-cpp=1.74.0 + - argcomplete=3.5.2 + - argh=0.31.3 + - argparse-dataclass=2.0.0 + - asttokens=3.0.0 + - attrs=24.3.0 + - babel=2.16.0 + - beautifulsoup4=4.12.3 + - bedtools=2.31.1 + - 
binutils_impl_linux-64=2.43 + - biopython=1.84 + - boost-cpp=1.85.0 - bowtie=1.3.1 - - bowtie2=2.5.1 - - brotli=1.0.9 - - brotli-bin=1.0.9 - - brotlipy=0.7.0 - - bwidget=1.9.14 - - bx-python=0.9.0 + - bowtie2=2.5.4 + - brotli=1.1.0 + - brotli-bin=1.1.0 + - brotli-python=1.1.0 + - bwidget=1.10.1 + - bx-python=0.13.0 - bzip2=1.0.8 - - c-ares=1.18.1 - - ca-certificates=2023.5.7 - - cairo=1.16.0 - - certifi=2023.5.7 - - cffi=1.15.1 - - charset-normalizer=3.1.0 - - click=8.1.3 - - coin-or-cbc=2.10.10 - - coin-or-cgl=0.60.7 - - coin-or-clp=1.17.8 - - coin-or-osi=0.108.8 - - coin-or-utils=2.11.9 - - coincbc=2.10.10 + - c-ares=1.34.4 + - ca-certificates=2024.12.14 + - cairo=1.18.2 + - certifi=2024.12.14 + - cffi=1.17.1 + - charset-normalizer=3.4.0 + - click=8.1.8 + - coin-or-cbc=2.10.12 + - coin-or-cgl=0.60.9 + - coin-or-clp=1.17.10 + - coin-or-osi=0.108.11 + - coin-or-utils=2.11.12 + - coincbc=2.10.12 - colorama=0.4.6 - coloredlogs=15.0.1 - colormath=3.0.0 - - configargparse=1.5.3 + - conda-inject=1.3.2 + - configargparse=1.7 - connection_pool=0.0.3 - - contourpy=1.0.7 - - cryptography=39.0.0 - - curl=7.86.0 - - cutadapt=4.4 - - cycler=0.11.0 + - contourpy=1.3.1 + - curl=8.11.1 + - cutadapt=5.0 + - cycler=0.12.1 - datrie=0.8.2 - - dbus=1.13.6 - decorator=5.1.1 - - deeptools=3.5.2 + - deeptools=3.5.5 - deeptoolsintervals=0.1.9 - - dnaio=0.10.0 - - docutils=0.20.1 - - dpath=2.1.5 - - epic2=0.0.52 - - exceptiongroup=1.1.1 - - execnet=1.9.0 - - executing=1.2.0 - - expat=2.5.0 - - fastq-screen=0.15.3 + - dnaio=1.2.2 + - docutils=0.21.2 + - dpath=2.2.0 + - eido=0.2.4 + - et_xmlfile=2.0.0 + - exceptiongroup=1.2.2 + - execnet=2.1.1 + - executing=2.1.0 + - expat=2.6.4 + - fastq-screen=0.16.0 - fastqc=0.12.1 - - fftw=3.3.10 - - filelock=3.12.0 - font-ttf-dejavu-sans-mono=2.37 - font-ttf-inconsolata=3.000 - font-ttf-source-code-pro=2.038 - font-ttf-ubuntu=0.83 - - fontconfig=2.14.2 + - fontconfig=2.15.0 - fonts-conda-ecosystem=1 - fonts-conda-forge=1 - - fonttools=4.39.4 + - 
fonttools=4.55.3 - freetype=2.12.1 - fribidi=1.0.10 - - future=0.18.3 - - gat=1.3.6 - - gcc_impl_linux-64=10.4.0 - - gcc_linux-64=10.4.0 - - gettext=0.21.1 + - gcc_impl_linux-64=14.2.0 - gffread=0.12.7 - - gffutils=0.11.1 - - gfortran_impl_linux-64=10.4.0 - - gfortran_linux-64=10.4.0 - - giflib=5.2.1 - - gitdb=4.0.10 - - gitpython=3.1.31 - - glib=2.74.1 - - glib-tools=2.74.1 - - gmp=6.2.1 + - gffutils=0.13 + - gfortran_impl_linux-64=14.2.0 + - giflib=5.2.2 + - gitdb=4.0.11 + - gitpython=3.1.43 - graphite2=1.3.13 - - gsl=2.7 - - gst-plugins-base=1.18.5 - - gstreamer=1.20.3 - - gxx_impl_linux-64=10.4.0 - - gxx_linux-64=10.4.0 - - harfbuzz=4.2.0 - - hdf5=1.12.1 + - gsl=1.16 + - gxx_impl_linux-64=14.2.0 + - h2=4.1.0 + - harfbuzz=10.1.0 + - hdf5=1.14.3 - hisat2=2.2.1 - - htslib=1.16 + - hpack=4.0.0 + - html5lib=1.1 + - htslib=1.21 - humanfriendly=10.0 - - icu=69.1 - - idna=3.4 - - importlib-metadata=6.6.0 - - importlib_resources=5.12.0 + - humanize=4.11.0 + - hyperframe=6.0.1 + - icu=75.1 + - idna=3.10 + - imagesize=1.4.1 + - immutables=0.21 + - importlib-metadata=8.5.0 + - importlib_resources=6.4.5 - iniconfig=2.0.0 - intervalstats=1.01 - - ipython=8.13.2 - - isa-l=2.30.0 - - jack=1.9.18 - - jedi=0.18.2 - - jinja2=3.1.2 - - jpeg=9e - - jsonschema=4.17.3 - - jupyter_core=5.3.0 - - kallisto=0.48.0 - - kernel-headers_linux-64=2.6.32 + - ipython=8.31.0 + - isa-l=2.31.0 + - jedi=0.19.2 + - jinja2=3.1.5 + - jsonschema=4.23.0 + - jsonschema-specifications=2024.10.1 + - jupyter_core=5.7.2 + - kaleido-core=0.2.1 + - kallisto=0.51.1 + - kernel-headers_linux-64=3.10.0 - keyutils=1.6.1 - - kiwisolver=1.4.4 - - krb5=1.19.3 - - lcms2=2.14 - - ld_impl_linux-64=2.39 + - kiwisolver=1.4.7 + - krb5=1.21.3 + - lcms2=2.16 + - ld_impl_linux-64=2.43 - lerc=4.0.0 + - libaec=1.1.3 - libblas=3.9.0 - - libbrotlicommon=1.0.9 - - libbrotlidec=1.0.9 - - libbrotlienc=1.0.9 - - libcap=2.64 + - libboost=1.85.0 + - libboost-devel=1.85.0 + - libboost-headers=1.85.0 + - libbrotlicommon=1.1.0 + - 
libbrotlidec=1.1.0 + - libbrotlienc=1.1.0 - libcblas=3.9.0 - - libclang=13.0.1 - libcups=2.3.3 - - libcurl=7.86.0 - - libdb=6.2.32 - - libdeflate=1.13 + - libcurl=8.11.1 + - libdeflate=1.23 - libedit=3.1.20191231 - libev=4.33 - - libevent=2.1.10 - - libexpat=2.5.0 + - libexpat=2.6.4 - libffi=3.4.2 - - libflac=1.3.4 - - libgcc-devel_linux-64=10.4.0 - - libgcc-ng=12.2.0 + - libgcc=14.2.0 + - libgcc-devel_linux-64=14.2.0 + - libgcc-ng=14.2.0 - libgd=2.3.3 - - libgfortran-ng=12.2.0 - - libgfortran5=12.2.0 - - libglib=2.74.1 - - libgomp=12.2.0 - - libhwloc=2.8.0 + - libgfortran=14.2.0 + - libgfortran-ng=14.2.0 + - libgfortran5=14.2.0 + - libglib=2.82.2 + - libgomp=14.2.0 + - libhwloc=2.11.2 - libiconv=1.17 - libjemalloc=5.3.0 + - libjpeg-turbo=3.0.0 - liblapack=3.9.0 - liblapacke=3.9.0 - - libllvm13=13.0.1 - - libnghttp2=1.51.0 - - libnsl=2.0.0 - - libogg=1.3.4 - - libopenblas=0.3.21 - - libopus=1.3.1 - - libpng=1.6.39 - - libpq=14.5 - - libsanitizer=10.4.0 - - libsndfile=1.0.31 - - libsqlite=3.41.2 - - libssh2=1.10.0 - - libstdcxx-devel_linux-64=10.4.0 - - libstdcxx-ng=12.2.0 - - libtiff=4.4.0 - - libtool=2.4.7 - - libudev1=253 + - liblzma=5.6.3 + - liblzma-devel=5.6.3 + - libnghttp2=1.64.0 + - libnsl=2.0.1 + - libopenblas=0.3.28 + - libopenssl-static=3.4.0 + - libpng=1.6.44 + - libsanitizer=14.2.0 + - libsqlite=3.47.2 + - libssh2=1.11.1 + - libstdcxx=14.2.0 + - libstdcxx-devel_linux-64=14.2.0 + - libstdcxx-ng=14.2.0 + - libtiff=4.7.0 - libuuid=2.38.1 - - libvorbis=1.3.7 - - libwebp=1.2.4 - - libwebp-base=1.2.4 - - libxcb=1.13 - - libxkbcommon=1.0.3 - - libxml2=2.9.14 - - libzlib=1.2.13 - - lzo=2.10 - - lzstring=1.0.4 - - make=4.3 - - markdown=3.4.3 - - markdown-it-py=2.2.0 - - markupsafe=2.1.2 - - matplotlib=3.7.1 - - matplotlib-base=3.7.1 - - matplotlib-inline=0.1.6 - - mdurl=0.1.0 - - multiqc=1.14 + - libwebp-base=1.5.0 + - libxcb=1.17.0 + - libxcrypt=4.4.36 + - libxml2=2.13.5 + - libzlib=1.3.1 + - logmuse=0.2.8 + - logomaker=0.8 + - make=4.4.1 + - markdown=3.6 + - 
markdown-it-py=3.0.0 + - markupsafe=3.0.2 + - mathjax=2.7.7 + - matplotlib-base=3.10.0 + - matplotlib-inline=0.1.7 + - mdurl=0.1.2 + - multiqc=1.26 - munkres=1.1.4 - - mysql-common=8.0.32 - mysql-connector-c=6.1.11 - - mysql-libs=8.0.32 - natsort=8.4.0 - - nbformat=5.8.0 - - ncbi-vdb=3.0.2 - - ncurses=6.3 - - networkx=3.1 - - nspr=4.35 - - nss=3.89 - - numpy=1.23.5 - - openjdk=11.0.1 - - openjpeg=2.5.0 - - openssl=1.1.1t - - ossuuid=1.6.2 - - packaging=23.1 - - pandas=2.0.1 - - pandoc=3.1.2 - - pango=1.50.7 - - parso=0.8.3 - - patsy=0.5.3 + - nbformat=5.10.4 + - ncurses=6.5 + - networkx=3.4.2 + - nspr=4.36 + - nss=3.107 + - numpy=2.2.1 + - numpydoc=1.8.0 + - openjdk=23.0.1 + - openjpeg=2.5.3 + - openpyxl=3.1.5 + - openssl=3.4.0 + - packaging=24.2 + - pandas=2.2.3 + - pandoc=3.6.1 + - pango=1.54.0 + - parso=0.8.4 + - patsy=1.0.1 - pbzip2=1.1.13 - - pcre2=10.37 + - pcre2=10.44 + - pephubclient=0.4.4 + - peppy=0.40.7 - perl=5.32.1 - - perl-alien-build=2.48 - - perl-alien-libxml2=0.17 - - perl-business-isbn=3.007 - - perl-business-isbn-data=20210112.006 - - perl-capture-tiny=0.48 - - perl-carp=1.50 - - perl-constant=1.33 - - perl-data-dumper=2.183 - - perl-encode=3.19 - - perl-exporter=5.74 - - perl-extutils-makemaker=7.70 - - perl-ffi-checklib=0.28 - - perl-file-chdir=0.1011 - - perl-file-path=2.18 - - perl-file-temp=0.2304 - - perl-file-which=1.24 - - perl-gd=2.76 + - perl-gd=2.56 - perl-gdgraph=1.54 - perl-gdtextutil=0.86 - - perl-importer=0.026 - - perl-mime-base64=3.16 - - perl-parent=0.241 - - perl-path-tiny=0.124 - - perl-pathtools=3.75 - - perl-scope-guard=0.21 - - perl-storable=3.15 - - perl-sub-info=0.002 - - perl-term-table=0.016 - - perl-test-fatal=0.016 - - perl-test-warnings=0.031 - - perl-test2-suite=0.000145 - - perl-try-tiny=0.31 - - perl-uri=5.17 - - perl-xml-libxml=2.0207 - - perl-xml-namespacesupport=1.12 - - perl-xml-sax=1.02 - - perl-xml-sax-base=1.09 - - pexpect=4.8.0 + - pexpect=4.9.0 - picard=2.27.5 - pickleshare=0.7.5 - - pigz=2.6 - - 
pillow=9.2.0 - - pip=23.1.2 - - pixman=0.40.0 + - pigz=2.8 + - pillow=11.0.0 + - pip=24.3.1 + - pixman=0.44.2 - pkgutil-resolve-name=1.3.10 - - plac=1.3.5 - - platformdirs=3.5.1 - - plotly=5.14.1 - - pluggy=1.0.0 - - pooch=1.7.0 - - preseq=3.2.0 - - prompt-toolkit=3.0.38 - - prompt_toolkit=3.0.38 - - psutil=5.9.5 + - plac=1.4.3 + - platformdirs=4.3.6 + - plotly=5.24.1 + - pluggy=1.5.0 + - preseq=2.0.2 + - prompt-toolkit=3.0.48 + - psutil=6.1.1 - pthread-stubs=0.4 - ptyprocess=0.7.0 - - pulp=2.7.0 - - pulseaudio=14.0 - - pure_eval=0.2.2 + - pulp=2.8.0 + - pure_eval=0.2.3 - py2bit=0.3.0 - - pybedtools=0.9.0 - - pybigwig=0.3.18 - - pycparser=2.21 - - pyfaidx=0.7.2.1 - - pygments=2.15.1 - - pyopenssl=23.1.1 - - pyparsing=3.0.9 - - pyqt=5.15.4 - - pyqt5-sip=12.9.0 - - pyrsistent=0.19.3 - - pysam=0.20.0 + - pyaml-env=1.2.1 + - pybedtools=0.11.0 + - pybigwig=0.3.23 + - pycparser=2.22 + - pydantic=2.10.4 + - pydantic-core=2.27.2 + - pyfaidx=0.8.1.3 + - pygments=2.18.0 + - pyparsing=3.2.1 + - pysam=0.22.1 - pysocks=1.7.1 - - pytest=7.3.1 - - pytest-xdist=3.2.1 - - python=3.10.8 - - python-dateutil=2.8.2 - - python-fastjsonschema=2.16.3 - - python-isal=1.1.0 - - python-lzo=1.14 - - python-tzdata=2023.3 - - python_abi=3.10 - - pytz=2023.3 + - pytest=8.3.4 + - pytest-xdist=3.6.1 + - python=3.12.8 + - python-dateutil=2.9.0.post0 + - python-fastjsonschema=2.21.1 + - python-isal=1.7.1 + - python-kaleido=0.2.1 + - python-tzdata=2024.2 + - python-zlib-ng=0.5.1 + - python_abi=3.12 + - pytz=2024.1 - pyvcf3=1.0.3 - - pyyaml=6.0 - - qt-main=5.15.2 - - r-base=4.1.3 + - pyyaml=6.0.2 + - qhull=2020.2 + - r-base=4.2.3 - readline=8.2 - - requests=2.29.0 + - referencing=0.35.1 + - requests=2.32.3 - reretry=0.11.8 - - rich=13.3.5 - - rich-click=1.6.1 - - rseqc=5.0.1 - - salmon=1.10.1 - - samtools=1.16.1 - - scipy=1.10.1 - - seaborn=0.12.2 - - seaborn-base=0.12.2 + - rich=13.9.4 + - rich-click=1.8.5 + - rpds-py=0.22.3 + - rseqc=5.0.4 + - salmon=1.10.3 + - samtools=1.21 + - scipy=1.14.1 + - 
seaborn=0.13.2 + - seaborn-base=0.13.2 - sed=4.8 - - setuptools=67.7.2 - - simplejson=3.19.1 - - sip=6.5.1 - - six=1.16.0 - - smart_open=6.3.0 - - smmap=3.0.5 - - snakemake-minimal=7.25.3 + - setuptools=75.6.0 + - shellingham=1.5.4 + - simplejson=3.19.3 + - six=1.17.0 + - slack-sdk=3.34.0 + - slack_sdk=3.34.0 + - smart_open=7.1.0 + - smmap=5.0.0 + - snakemake=8.26.0 + - snakemake-interface-common=1.17.4 + - snakemake-interface-executor-plugins=9.3.3 + - snakemake-interface-report-plugins=1.1.0 + - snakemake-interface-storage-plugins=3.3.0 + - snakemake-minimal=8.26.0 + - snowballstemmer=2.2.0 + - soupsieve=2.5 - spectra=0.0.11 - - sqlite=3.41.2 - - sra-tools=3.0.3 - - stack_data=0.6.2 - - star=2.7.10b - - statsmodels=0.14.0 - - stopit=1.1.2 - - subread=2.0.3 - - sysroot_linux-64=2.12 + - sphinx=8.1.3 + - sphinxcontrib-applehelp=2.0.0 + - sphinxcontrib-devhelp=2.0.0 + - sphinxcontrib-htmlhelp=2.1.0 + - sphinxcontrib-jsmath=1.0.1 + - sphinxcontrib-qthelp=2.0.0 + - sphinxcontrib-serializinghtml=1.1.10 + - sqlite=3.47.2 + - sra-tools=2.9.6 + - stack_data=0.6.3 + - star=2.7.11b + - statsmodels=0.14.4 + - subread=2.0.8 + - sysroot_linux-64=2.17 - tabulate=0.9.0 - - tbb=2021.7.0 - - tenacity=8.2.2 - - throttler=1.2.1 - - tk=8.6.12 + - tbb=2022.0.0 + - tenacity=9.0.0 + - throttler=1.2.2 + - tk=8.6.13 - tktable=2.10 - - toml=0.10.2 - - tomli=2.0.1 - - toposort=1.10 - - tornado=6.3.2 - - trackhub=0.2.4 - - traitlets=5.9.0 - - typing-extensions=4.5.0 - - typing_extensions=4.5.0 - - tzdata=2023c - - ucsc-bedgraphtobigwig=377 - - ucsc-bedsort=377 - - ucsc-bedtobigbed=377 - - ucsc-bigwigmerge=377 - - ucsc-fetchchromsizes=377 - - ucsc-genepredtobed=377 - - ucsc-gtftogenepred=377 - - ucsc-liftover=377 - - ucsc-oligomatch=377 - - ucsc-twobittofa=377 - - ucsc-wigtobigwig=377 - - unicodedata2=15.0.0 - - urllib3=1.26.15 - - wcwidth=0.2.6 - - wheel=0.40.0 - - wrapt=1.15.0 - - xopen=1.7.0 - - xorg-kbproto=1.0.7 - - xorg-libice=1.0.10 - - xorg-libsm=1.2.3 - - xorg-libx11=1.8.4 - - 
xorg-libxau=1.0.9 - - xorg-libxdmcp=1.1.3 - - xorg-libxext=1.3.4 - - xorg-libxrender=0.9.10 - - xorg-libxt=1.2.1 - - xorg-renderproto=0.11.1 - - xorg-xextproto=7.3.0 - - xorg-xproto=7.0.31 - - xz=5.2.6 + - tomli=2.2.1 + - tqdm=4.67.1 + - trackhub=1.0 + - traitlets=5.14.3 + - typeguard=4.4.1 + - typer=0.15.1 + - typer-slim=0.15.1 + - typer-slim-standard=0.15.1 + - typing-extensions=4.12.2 + - typing_extensions=4.12.2 + - tzdata=2024b + - ubiquerg=0.8.0 + - ucsc-bedgraphtobigwig=472 + - ucsc-bedsort=469 + - ucsc-bedtobigbed=473 + - ucsc-bigwigmerge=469 + - ucsc-fetchchromsizes=469 + - ucsc-genepredtobed=469 + - ucsc-gtftogenepred=469 + - ucsc-liftover=469 + - ucsc-oligomatch=469 + - ucsc-stringify=472 + - ucsc-twobittofa=472 + - ucsc-wigtobigwig=472 + - unicodedata2=15.1.0 + - urllib3=2.3.0 + - veracitools=0.1.3 + - wcwidth=0.2.13 + - webencodings=0.5.1 + - wheel=0.45.1 + - wrapt=1.17.0 + - xopen=2.0.2 + - xorg-libice=1.1.2 + - xorg-libsm=1.2.5 + - xorg-libx11=1.8.10 + - xorg-libxau=1.0.12 + - xorg-libxdmcp=1.1.5 + - xorg-libxext=1.3.6 + - xorg-libxfixes=6.0.1 + - xorg-libxi=1.8.2 + - xorg-libxrandr=1.5.4 + - xorg-libxrender=0.9.12 + - xorg-libxt=1.3.1 + - xorg-libxtst=1.2.5 + - xz=5.6.3 + - xz-gpl-tools=5.6.3 + - xz-tools=5.6.3 - yaml=0.2.5 - - yte=1.5.1 - - zipp=3.15.0 - - zlib=1.2.13 - - zstandard=0.19.0 - - zstd=1.5.2 + - yte=1.5.5 + - zipp=3.21.0 + - zlib=1.3.1 + - zlib-ng=2.2.3 + - zstandard=0.23.0 + - zstd=1.5.6 From f039b64128c26035540d55cc2dc898999ee34152 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Thu, 2 Jan 2025 21:25:20 +0000 Subject: [PATCH 019/196] update snakefiles and lib to reflect changes in snakemake 8 --- lib/helpers.py | 4 ++-- workflows/chipseq/Snakefile | 4 +++- workflows/references/Snakefile | 4 +++- workflows/rnaseq/Snakefile | 8 +++++--- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/lib/helpers.py b/lib/helpers.py index 4723286c..9e0d6323 100644 --- a/lib/helpers.py +++ b/lib/helpers.py @@ -3,7 +3,7 @@ from itertools 
import product import pandas as pd from snakemake.shell import shell -from snakemake.io import expand, regex +from snakemake.io import expand, regex_from_filepattern from lib import common @@ -118,7 +118,7 @@ def extract_wildcards(pattern, target): >>> assert extract_wildcards(pattern, target) == expected >>> assert extract_wildcards(pattern, 'asdf') is None """ - m = re.compile(regex(pattern)).match(target) + m = re.compile(regex_from_filepattern(pattern)).match(target) if m: return m.groupdict() diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 90c84d28..2b5fc485 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -1,5 +1,4 @@ import sys -sys.path.insert(0, srcdir('../..')) import os from textwrap import dedent import yaml @@ -7,6 +6,9 @@ import tempfile import pandas as pd import numpy as np import pybedtools + +HERE = str(Path(workflow.snakefile).parent) +sys.path.insert(0, HERE + "/../..") from lib import common, utils, helpers, aligners, chipseq from lib.patterns_targets import ChIPSeqConfig from lib.utils import autobump, gb, hours diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile index d6bc9d0f..815d00c6 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -1,12 +1,14 @@ import os import sys -sys.path.insert(0, srcdir('../..')) import gzip import yaml import importlib import tempfile import pandas from snakemake.utils import makedirs + +HERE = str(Path(workflow.snakefile).parent) +sys.path.insert(0, HERE + "/../..") from lib.imports import resolve_name from lib import utils from lib.utils import autobump, gb, hours diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index e979cfdc..0ac150b8 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -1,11 +1,13 @@ -import sys - -sys.path.insert(0, srcdir('../..')) import os +import sys +from pathlib import Path from textwrap import dedent import yaml import tempfile 
import pandas as pd + +HERE = str(Path(workflow.snakefile).parent) +sys.path.insert(0, HERE + "/../..") from lib import common, utils, helpers, aligners from lib.utils import autobump, gb, hours from lib.patterns_targets import RNASeqConfig From bec163d1b3855cb396362614bd3b00dd3c7434d4 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Thu, 2 Jan 2025 21:25:40 +0000 Subject: [PATCH 020/196] rm --bias for kallisto, which was causing segfaults --- workflows/rnaseq/Snakefile | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 0ac150b8..47c2a324 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -900,7 +900,6 @@ rule kallisto: '--output-dir {params.outdir} ' '--threads {threads} ' '--bootstrap-samples 100 ' - '--bias ' '--threads {threads} ' '{se_args} ' '{params.strand_arg} ' From cc310fb83e4d681ac550ea5f5d8543ca8da50911 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Thu, 2 Jan 2025 21:49:19 +0000 Subject: [PATCH 021/196] update test args -r --> --reason for snakemake 8 --- .circleci/config.yml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index d351ba7e..af992587 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -185,7 +185,7 @@ variables: cd $DEPLOY/workflows/chipseq source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test chipseq --run-workflow --use-conda -j2 -k -p -r + $DEPLOY/test/lcdb-wf-test chipseq --run-workflow --use-conda -j2 -k -p --reason $DEPLOY/test/lcdb-wf-test chipseq --trackhub # -------------------------------------------------------------------------- @@ -200,7 +200,7 @@ variables: source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - ./run_test.sh --use-conda -j2 -k -p -r \ + ./run_test.sh --use-conda -j2 -k -p --reason \ --configfile $ORIG/test/test_configs/test_chipseq_regression.yaml \ --config 
sampletable=$ORIG/test/test_configs/chipseq_one_run.tsv \ merged_bigwigs="{}" \ @@ -228,7 +228,7 @@ variables: command: | source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=config/config.yaml -j2 -p -r -k --orig $ORIG + $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=config/config.yaml -j2 -p --reason -k --orig $ORIG # -------------------------------------------------------------------------- # Standard RNA-seq workflow @@ -240,7 +240,7 @@ variables: source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow -n - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --use-conda -j2 -k -p -r --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --use-conda -j2 -k -p --reason --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --trackhub --orig $ORIG @@ -272,12 +272,12 @@ variables: # provide; some of them use the --until argument to restrict the # rules that are run. Note the use of --orig $ORIG to use the test # configs from the original clone rather than the deployed directory. 
- $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-pe -k -r -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-se -k -r -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --strandedness-pe -k -r -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-2pass -k -r -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-1pass -k -r -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --pe -k -r -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-pe -k --reason -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-se -k --reason -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --strandedness-pe -k --reason -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-2pass -k --reason -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-1pass -k --reason -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --pe -k --reason -p -j2 --use-conda --orig $ORIG @@ -290,7 +290,7 @@ variables: cd $DEPLOY/workflows/colocalization source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test colocalization --run-workflow -k -r -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test colocalization --run-workflow -k --reason -p -j2 --use-conda --orig $ORIG # -------------------------------------------------------------------------- # Syntax note: All of the steps above, with their "&step-name" labels, can be From 54514e9642448c5b907582acb20f604c4acf3974 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Thu, 2 Jan 2025 22:31:39 +0000 Subject: [PATCH 022/196] rm --reason for snakemake 8 --- .circleci/config.yml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) 
diff --git a/.circleci/config.yml b/.circleci/config.yml index af992587..50b1051a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -185,7 +185,7 @@ variables: cd $DEPLOY/workflows/chipseq source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test chipseq --run-workflow --use-conda -j2 -k -p --reason + $DEPLOY/test/lcdb-wf-test chipseq --run-workflow --use-conda -j2 -k -p $DEPLOY/test/lcdb-wf-test chipseq --trackhub # -------------------------------------------------------------------------- @@ -200,7 +200,7 @@ variables: source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - ./run_test.sh --use-conda -j2 -k -p --reason \ + ./run_test.sh --use-conda -j2 -k -p \ --configfile $ORIG/test/test_configs/test_chipseq_regression.yaml \ --config sampletable=$ORIG/test/test_configs/chipseq_one_run.tsv \ merged_bigwigs="{}" \ @@ -228,7 +228,7 @@ variables: command: | source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=config/config.yaml -j2 -p --reason -k --orig $ORIG + $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=config/config.yaml -j2 -p -k --orig $ORIG # -------------------------------------------------------------------------- # Standard RNA-seq workflow @@ -240,7 +240,7 @@ variables: source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow -n - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --use-conda -j2 -k -p --reason --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --use-conda -j2 -k -p --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --trackhub --orig $ORIG @@ -272,12 +272,12 @@ variables: # provide; some of them use the --until argument to restrict the # rules that are run. Note the use of --orig $ORIG to use the test # configs from the original clone rather than the deployed directory. 
- $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-pe -k --reason -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-se -k --reason -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --strandedness-pe -k --reason -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-2pass -k --reason -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-1pass -k --reason -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --pe -k --reason -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-pe -k -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-se -k -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --strandedness-pe -k -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-2pass -k -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-1pass -k -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --pe -k -p -j2 --use-conda --orig $ORIG @@ -290,7 +290,7 @@ variables: cd $DEPLOY/workflows/colocalization source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test colocalization --run-workflow -k --reason -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test colocalization --run-workflow -k -p -j2 --use-conda --orig $ORIG # -------------------------------------------------------------------------- # Syntax note: All of the steps above, with their "&step-name" labels, can be From 06c147b97e2df018114a2ff852b4e1850f1d5b73 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 3 Jan 2025 02:38:38 +0000 Subject: [PATCH 023/196] disable colocalization workflow --- .circleci/config.yml | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git 
a/.circleci/config.yml b/.circleci/config.yml index 50b1051a..02b27915 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -140,7 +140,7 @@ variables: cp $ORIG/workflows/rnaseq/run_test.sh $DEPLOY/workflows/rnaseq/run_test.sh cp $ORIG/workflows/rnaseq/run_downstream_test.sh $DEPLOY/workflows/rnaseq/run_downstream_test.sh cp $ORIG/workflows/references/run_test.sh $DEPLOY/workflows/references/run_test.sh - cp $ORIG/workflows/colocalization/run_test.sh $DEPLOY/workflows/colocalization/run_test.sh + # cp $ORIG/workflows/colocalization/run_test.sh $DEPLOY/workflows/colocalization/run_test.sh mkdir $DEPLOY/ci mkdir $DEPLOY/test @@ -399,14 +399,14 @@ jobs: - *get-data - *rnaseq-misc-step - colocalization: - <<: *defaults - steps: - - checkout - - *restore_cache - - *set-path - - *get-data - - *colocalization-step + # colocalization: + # <<: *defaults + # steps: + # - checkout + # - *restore_cache + # - *set-path + # - *get-data + # - *colocalization-step references: <<: *defaults @@ -493,10 +493,10 @@ workflows: requires: - initial-setup - pytest - - colocalization: - requires: - - initial-setup - - pytest + # - colocalization: + # requires: + # - initial-setup + # - pytest - build-docs: requires: - initial-setup @@ -507,4 +507,4 @@ workflows: - chipseq - chipseq-misc - references - - colocalization + # - colocalization From bea0910394da3e82bfc3cb8a0b76a57943cd4c62 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 3 Jan 2025 19:22:22 +0000 Subject: [PATCH 024/196] delete lots of stuff --- lib/aligners.py | 85 --- lib/common.py | 914 ------------------------ lib/helpers.py | 219 ------ lib/imports.py | 22 - lib/postprocess/adapters.py | 6 - lib/postprocess/dicty.py | 18 - lib/postprocess/hg19.py | 3 - lib/postprocess/hg38.py | 14 - workflows/references/Snakefile | 369 ---------- workflows/references/config/config.yaml | 6 - workflows/references/run_test.sh | 3 - 11 files changed, 1659 deletions(-) delete mode 100644 lib/aligners.py delete mode 100644 
lib/common.py delete mode 100644 lib/helpers.py delete mode 100644 lib/imports.py delete mode 100644 lib/postprocess/adapters.py delete mode 100644 lib/postprocess/dicty.py delete mode 100644 lib/postprocess/hg19.py delete mode 100644 lib/postprocess/hg38.py delete mode 100644 workflows/references/Snakefile delete mode 100644 workflows/references/config/config.yaml delete mode 100755 workflows/references/run_test.sh diff --git a/lib/aligners.py b/lib/aligners.py deleted file mode 100644 index 62fe58a5..00000000 --- a/lib/aligners.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -Helper functions for working with aligners within Snakefiles -""" - - -def hisat2_index_from_prefix(prefix): - """ - Given a prefix, return a list of the corresponding hisat2 index files. - """ - return ['{prefix}.{n}.ht2'.format(prefix=prefix, n=n) for n in range(1, 9)] - - -def prefix_from_hisat2_index(index_files): - """ - Given a list of index files for hisat2, return the corresponding prefix. - """ - if isinstance(index_files, str): - return '.'.join(index_files.split('.')[:-2]) - else: - prefixes = list( - set( - map( - lambda x: '.'.join(x.split('.')[:-2]), index_files) - ) - ) - if len(prefixes) != 1: - raise ValueError( - "More than one prefix detected from '{0}'".format(prefixes) - ) - return prefixes[0] - - -def bowtie2_index_from_prefix(prefix): - """ - Given a prefix, return a list of the corresponding bowtie2 index files. - """ - return ( - [ - '{prefix}.{n}.bt2'.format(prefix=prefix, n=n) - for n in range(1, 5) - ] + [ - '{prefix}.rev.{n}.bt2'.format(prefix=prefix, n=n) - for n in range(1, 3) - ] - ) - - -def prefix_from_bowtie2_index(index_files): - """ - Given a list of index files for bowtie2, return the corresponding prefix. 
- """ - if isinstance(index_files, str): - return '.'.join(index_files.replace('.rev', '').split('.')[:-2]) - else: - prefixes = list( - set( - map( - lambda x: '.'.join(x.replace('.rev', '').split('.')[:-2]), - index_files) - ) - ) - if len(prefixes) != 1: - raise ValueError( - "More than one prefix detected from '{0}'".format(prefixes) - ) - return prefixes[0] - -def fastq_arg_from_input(fastqs): - """ - Prepares the correct input FASTQ arguments for bowtie2 and HISAT2 based on - whether or not the sample is paired-end. - - Parameters - ---------- - fastqs : list-like - List or snakemake.input object containing fastq filenames. - """ - - if isinstance(fastqs, str) or len(fastqs) == 1: - fastqs = '-U {0} '.format(fastqs) - else: - assert len(fastqs) == 2 - fastqs = '-1 {0} -2 {1} '.format(*fastqs) - return fastqs - diff --git a/lib/common.py b/lib/common.py deleted file mode 100644 index 829cc129..00000000 --- a/lib/common.py +++ /dev/null @@ -1,914 +0,0 @@ -import glob -import subprocess -import time -import os -import warnings -import urllib.request as request -import contextlib -import yaml -import pandas -from Bio import SeqIO -import gzip -import binascii -from lib.imports import resolve_name -from lib import aligners -from lib import utils -from snakemake.shell import shell -from snakemake.io import expand - -# List of possible keys in config that are to be interpreted as paths -PATH_KEYS = [ - 'references_dir', - 'sampletable', - 'sample_dir', - 'aggregation_dir', - 'merged_dir', - 'peaks_dir', - 'hub_config', -] - - -def _is_gzipped(fn): - """ - Filename-independent method of checking if a file is gzipped or not. Uses - the magic number. - - xref https://stackoverflow.com/a/47080739 - """ - with open(fn, 'rb') as f: - return binascii.hexlify(f.read(2)) == b'1f8b' - - -def openfile(tmp, mode): - """ - Returns an open file handle; auto-detects gzipped files. 
- """ - if _is_gzipped(tmp): - return gzip.open(tmp, mode) - else: - return open(tmp, mode) - - -def resolve_config(config, workdir=None): - """ - Finds the config file. - - Parameters - ---------- - config : str, dict - If str, assume it's a YAML file and parse it; otherwise pass through - - workdir : str - Optional location to specify relative location of all paths in `config` - """ - if isinstance(config, str): - config = yaml.load(open(config), Loader=yaml.FullLoader) - - def rel(pth): - if workdir is None or os.path.isabs(pth): - return pth - return os.path.join(workdir, pth) - for key in PATH_KEYS: - if key in config: - config[key] = rel(config[key]) - return config - - -def gzipped(tmpfiles, outfile): - """ - Cat-and-gzip a list of uncompressed files into a compressed output file. - """ - with gzip.open(outfile, 'wt') as fout: - for f in tmpfiles: - with open(f) as infile: - for line in infile: - fout.write(line) - - -def cat(tmpfiles, outfile): - """ - Simple concatenation of files. - - Note that gzipped files can be concatenated as-is without un- and re- - compressing. - """ - shell('cat {tmpfiles} > {outfile}') - - -def filter_fastas(tmpfiles, outfile, pattern): - """ - Extract records from fasta file(s) given a search pattern. - - Given input gzipped FASTAs, create a new gzipped fasta containing only - records whose description matches `pattern`. 
- - Parameters - ---------- - tmpfiles : list - gzipped fasta files to look through - - outfile : str - gzipped output fastq file - - pattern : str - Look for this string in each record's description - - """ - def gen(): - for tmp in tmpfiles: - handle = gzip.open(tmp, 'rt') - parser = SeqIO.parse(handle, 'fasta') - for rec in parser: - if pattern not in rec.description: - continue - rec.seq = rec.seq.back_transcribe() - rec.description = rec.name - yield rec - - with gzip.open(outfile, 'wt') as fout: - SeqIO.write(gen(), fout, 'fasta') - - -def twobit_to_fasta(tmpfiles, outfile): - """ - Converts .2bit files to fasta. - - Parameters - ---------- - tmpfiles : list - 2bit files to convert - - outfile : str - gzipped output fastq file - """ - # Note that twoBitToFa doesn't support multiple input files, but we want to - # support them with this function - lookup = {i: i + '.fa' for i in tmpfiles} - for i in tmpfiles: - fn = lookup[i] - shell('twoBitToFa {i} {fn}') - - # Make sure we retain the order of the originally-provided files from the - # config when concatenating. - fastas = [lookup[i] for i in tmpfiles] - shell('cat {fastas} | gzip -c > {outfile}') - shell('rm {fastas}') - - -def download_and_postprocess(outfile, config, organism, tag, type_): - """ - Given an output file, figure out what to do based on the config. - - See notes below for details. - - Parameters - ---------- - outfile : str - - config : dict - - organism : str - Which organism to use. Must be a key in the "references" section of the - config. - - tag : str - Which tag for the organism to use. Must be a tag for the organism in - the config - - type_ : str - A supported references type (gtf, fasta) to use. 
- - Notes - ----- - - This function: - - - uses `organism`, `tag`, `type_` as a key into the config dict to - figure out: - - - what postprocessing function (if any) was specified along with - its optional args - - the URL[s] to download - - - resolves the name of the postprocessing function (if provided) and - imports it - - downloads the URL[s] to tempfile[s] - - calls the imported postprocessing function using the tempfile[s] and - outfile plus any additional specified arguments. - - - The postprocessing function must have one of the following signatures, - where `infiles` contains the list of temporary files downloaded from the - URL or URLs specified, and `outfile` is a gzipped file expected to be - created by the function:: - - def func(infiles, outfile): - pass - - or:: - - def func(infiles, outfile, *args): - pass - - or:: - - def func(infiles, outfile, *args, **kwargs): - pass - - - The function is specified as a string that resolves to an importable - function, e.g., `postprocess: lib.postprocess.dm6.fix` will call a function - called `fix` in the file `lib/postprocess/dm6.py`. - - If the contents of `postprocess:` is a dict, it must have at least the key - `function`, and optionally `args` and/or `kwargs` keys. The `function` key - indicates the importable path to the function. `args` can be a string - or list of arguments that will be provided as additional args to a function - with the second kind of signature above. If `kwargs` is provided, it is - a dict that is passed to the function with the third kind of signature - above. For example:: - - postprocess: - function: lib.postprocess.dm6.fix - args: - - True - - 3 - - or:: - - postprocess: - function: lib.postprocess.dm6.fix - args: - - True - - 3 - kwargs: - skip: exon - - """ - - def default_postprocess(origfn, newfn): - """ - If no other postprocess function is defined, then simply move the - original to the new. 
- """ - shell("mv {origfn} {newfn}") - - block = config['references'][organism][tag][type_] - - # postprocess can be missing, in which case we use the default above - post_process = block.get('postprocess', None) - - if not isinstance(post_process, list): - post_process = [post_process] - - funcs = [] - func_tmpfiles = [] - for i, post_process_block in enumerate(post_process): - if post_process_block is None: - func = default_postprocess - args = () - kwargs = {} - name = None - - # postprocess can have a single string value (indicating the function) or - # it can be a dict with keys "function" and optionally "args". The value of - # "args" can be a string or a list. - else: - if isinstance(post_process_block, dict): - name = post_process_block.get('function', post_process) - args = post_process_block.get('args', ()) - kwargs = post_process_block.get('kwargs', {}) - if isinstance(args, str): - args = (args,) - elif isinstance(post_process_block, str): - name = post_process_block - args = () - kwargs = {} - - # In the special case where there is kwarg beginning and ending - # with "__", this can be a dotted function name so it will be - # resolved here as well and passed along to the postprocessing - # function. - # - # This makes it possible to do things like add ERCC annotations on - # the end of other annotations that themselves need to be - # post-processed. 
- for kw in kwargs: - if kw.startswith('__') and kw.endswith('__'): - kwargs[kw] = resolve_name(kwargs[kw]) - - # import the function - func = resolve_name(name) - - tmp_outfile = f'{outfile}.{i}.{name}.tmp' - func_tmpfiles.append(tmp_outfile) - funcs.append([func, args, kwargs, tmp_outfile]) - - # The last func's outfile should be the final outfile - funcs[-1][-1] = outfile - - # as described in the docstring above, functions are to assume a list of - # urls - urls = block['url'] - if isinstance(urls, str): - urls = [urls] - - # Download tempfiles into reasonably-named filenames - tmpfiles = ['{0}.{1}.tmp'.format(outfile, i) for i in range(len(urls))] - tmpinputfiles = tmpfiles - try: - for url, tmpfile in zip(urls, tmpfiles): - if url.startswith('file:'): - url = url.replace('file://', '') - shell('cp {url} {tmpfile} 2> {outfile}.log') - else: - shell("wget {url} -O- > {tmpfile} 2> {outfile}.log") - - for func, args, kwargs, outfile in funcs: - func(tmpinputfiles, outfile, *args, **kwargs) - tmpinputfiles = [outfile] - - except Exception as e: - raise e - finally: - for i in tmpfiles + func_tmpfiles: - if os.path.exists(i): - shell('rm {i}') - - -def references_dict(config): - """ - Transforms the references section of the config file. - - The references section of the config file is designed to be human-editable, - and to only need the URL(s). User-specified indexes, conversions, and - post-processing functions can also be added. - - For example, the config might say:: - - human: - gencode: - fasta: - indexes: - - hisat2 - - In this function, we need to convert that "indexes: [hisat2]" into the full - path of the hisat2 index that can be used as input for a Snakemake rule. In - this example, in the dictionary returned below we can then get that path - with `d['human']['gencode']['hisat2']`, or more generally, - `d[organism][tag][type]`. 
- - Parameters - ---------- - config : dict - - Notes - ----- - - The config file is designed to be easy to edit and use from the user's - standpoint. But it's not so great for practical usage. Here we convert the - config file which has the format:: - - ... references_dir: "/data" - ... references: - ... dm6: - ... r6-11: - ... metadata: - ... reference_genome_build: 'dm6' - ... reference_effective_genome_count: 1.2e7 - ... reference_effective_genome_proportion: 0.97 - ... genome: - ... url: "" - ... indexes: - ... - bowtie2 - ... - hisat2 - ... annotation: - ... url: "" - ... conversions: - ... - refflat - ... transcriptome: - ... indexes: - ... - salmon - - To this format:: - - ... 'dm6': { - ... 'r6-11': { - ... 'annotation': '/data/dm6/r6-11/annotation/dm6_r6-11.gtf', - ... 'bowtie2': '/data/dm6/r6-11/genome/bowtie2/dm6_r6-11.1.bt2', - ... 'bowtie2_fasta': '/data/dm6/r6-11/genome/bowtie2/dm6_r6-11.fasta', - ... 'chromsizes': '/data/dm6/r6-11/genome/dm6_r6-11.chromsizes', - ... 'genome': '/data/dm6/r6-11/genome/dm6_r6-11.fasta', - ... 'hisat2': '/data/dm6/r6-11/genome/hisat2/dm6_r6-11.1.ht2', - ... 'hisat2_fasta': '/data/dm6/r6-11/genome/hisat2/dm6_r6-11.fasta', - ... 'refflat': '/data/dm6/r6-11/annotation/dm6_r6-11.refflat', - ... 'salmon': '/data/dm6/r6-11/transcriptome/salmon/dm6_r6-11/versionInfo.json', - ... 'salmon_fasta': '/data/dm6/r6-11/transcriptome/salmon/dm6_r6-11.fasta', - ... 'transcriptome': '/data/dm6/r6-11/transcriptome/dm6_r6-11.fasta', - ... }, - ... } - - """ - if isinstance(config, str): - config = yaml.load(open(config), Loader=yaml.FullLoader) - - references_dir = get_references_dir(config) - - # Map "indexes" value to a pattern specific to each index. 
- index_extensions = { - 'bowtie2': aligners.bowtie2_index_from_prefix('')[0], - 'hisat2': aligners.hisat2_index_from_prefix('')[0], - 'star': '/Genome', - - # Notes on salmon indexing: - # - pre-1.0 versions had hash.bin - # - post-1.0 versions do not have hash.bin but do have several other - # different .bin files - # - both appear to have versionInfo.json - # - # In order to support both, we use a filename found in common between - # the version. - 'salmon': '/versionInfo.json', - 'kallisto': '/transcripts.idx', - } - - conversion_extensions = { - - 'intergenic': '.intergenic.gtf', - 'refflat': '.refflat', - 'gffutils': '.gtf.db', - 'bed12': '.bed12', - 'genelist': '.genelist', - 'annotation_hub': '.{keytype}.csv', - 'mappings': '.mapping.tsv.gz', - } - - d = {} - conversion_kwargs = {} - - merged_references = config['references'] - - type_extensions = { - 'genome': 'fasta', - 'annotation': 'gtf', - 'transcriptome': 'fasta' - } - - for organism in merged_references.keys(): - d[organism] = {} - for tag in merged_references[organism].keys(): - e = {} - for type_, block in merged_references[organism][tag].items(): - if type_ == 'metadata': - continue - try: - type_extension = type_extensions[type_] - - except KeyError: - raise ValueError( - - "KeyError: " + type_ + "\n" - "\nConfig file format has changed:\n" - " - 'fasta:' -> 'genome:'\n" - " - 'gtf:' -> 'annotation:'\n" - " - new 'transcriptome:' section\n" - "\nSee docs for details\n\n" - - ) - e[type_] = ( - '{references_dir}/' - '{organism}/' - '{tag}/' - '{type_}/' - '{organism}_{tag}.{type_extension}'.format(**locals()) - ) - - # Add conversions if specified. - if type_ == 'annotation': - conversions = block.get('conversions', []) - for conversion in conversions: - kwargs = {} - if isinstance(conversion, dict): - # if conversion is specified as dict, we assume - # that there is only one key, and that key is the - # actual name of the conversion; the corresponding - # value will be kwargs. This is used e.g. 
for - # gffutils conversion which often need some - # tweaking of args depending on the gtf format. - assert len(list(conversion.keys())) == 1 - kwargs = list(conversion.values())[0] - conversion = list(conversion.keys())[0] - - # While the full set of columns for annotation hub are - # not known in advance, we can assume at least the - # keytype provided will be an output file. Fill that in - # here. - if conversion == 'annotation_hub': - keytype = kwargs['keytype'] - ext = conversion_extensions[conversion].format(keytype=keytype) - else: - ext = conversion_extensions[conversion] - output = ( - '{references_dir}/' - '{organism}/' - '{tag}/' - '{type_}/' - '{organism}_{tag}{ext}'.format(**locals()) - ) - e[conversion] = output - - conversion_kwargs[output] = kwargs - - if type_ in ['genome', 'transcriptome']: - # Add indexes if specified - indexes = block.get('indexes', []) - for index in indexes: - ext = index_extensions[index] - - e[index] = ( - '{references_dir}/{organism}/{tag}/{type_}/{index}/{organism}_{tag}{ext}' - .format(**locals()) - ) - - # Each index will get the original fasta symlinked over - # to its directory - e[index + '_fasta'] = ( - '{references_dir}/{organism}/{tag}/{type_}/{index}/{organism}_{tag}.fasta' - .format(**locals()) - ) - - # Only makes sense to have chromsizes for genome fasta, not transcriptome. - if type_ == 'genome': - e['chromsizes'] = ( - '{references_dir}/' - '{organism}/' - '{tag}/' - '{type_}/' - '{organism}_{tag}.chromsizes'.format(**locals()) - ) - d[organism][tag] = e - return d, conversion_kwargs - - -def get_references_dir(config): - """ - Identify the references directory based on config and env vars. - - Returns the references dir, preferring the value of an existing environment - variable `REFERENCES_DIR` over the config entry "references_dir". Raise an - error if either can't be found. 
- - Parameters - ---------- - config : dict - """ - config = resolve_config(config) - references_dir = os.environ.get( - 'REFERENCES_DIR', config.get('references_dir', None)) - if references_dir is None: - raise ValueError('No references dir specified') - return references_dir - - -def get_sampletable(config): - """ - Return samples and pandas.DataFrame of parsed sampletable. - - Returns the sample IDs and the parsed sampletable from the file specified - in the config. - - The sample IDs are assumed to be the first column of the sampletable. - - Parameters - ---------- - config : dict - """ - config = resolve_config(config) - sampletable = pandas.read_csv(config['sampletable'], comment="#", sep='\t') - samples = sampletable.iloc[:, 0] - return samples, sampletable - - -def get_techreps(sampletable, label): - """ - Return all sample IDs for which the "label" column is `label`. - """ - # since we're not requiring a name but we want to use `loc` - first_col = sampletable.columns[0] - result = list(sampletable.loc[sampletable['label'] == label, first_col]) - - # If we're using a ChIP-seq-like sampletable we can provide a more - # informative error message. - - is_chipseq = 'antibody' in sampletable.columns - if is_chipseq: - err = (""" - No technical replicates found for label '{}'. Check the ChIP-seq config - file to ensure the peak-calling section only specifies values from the - sampletable's "label" column.""".format(label) - ) - else: - err = "No technical replicates found for label '{}'.".format(label) - - if len(result) == 0: - raise ValueError(err) - - return result - - -def load_config(config, missing_references_ok=False): - """ - Loads the config. - - Resolves any included references directories/files and runs the deprecation - handler. - """ - if isinstance(config, str): - config = yaml.load(open(config), Loader=yaml.FullLoader) - - # Here we populate a list of reference sections. 
Items later on the list - # will have higher priority - includes = config.get('include_references', []) - for i in includes: - if not os.path.exists(i): - raise ValueError("include_references: '{}' does not exist".format(i)) - reference_sections = [] - - # First the directories. Directories that come earlier lose to those that - # come later. - for dirname in filter(os.path.isdir, includes): - # Note we're looking recursively for .yaml and .yml, so very large - # reference directories are possible - for fn in glob.glob(os.path.join(dirname, '**/*.y?ml'), - recursive=True): - refs = yaml.load(open(fn), Loader=yaml.FullLoader).get('references', None) - if refs is None: - if not missing_references_ok: - raise ValueError("No 'references:' section in {0}".format(fn)) - else: - reference_sections.append(refs) - - # Now the files - for fn in filter(os.path.isfile, includes): - refs = yaml.load(open(fn), Loader=yaml.FullLoader).get('references', None) - if refs is None: - if not missing_references_ok: - raise ValueError("No 'references:' section in {0}".format(fn)) - else: - reference_sections.append(refs) - - # The last thing we include is the references section as written in the - # config, which wins over all. - reference_sections.append(config.get('references', {})) - - merged_references = {} - for ref in reference_sections: - for organism in ref.keys(): - org_dict = merged_references.get(organism, {}) - for tag in ref[organism].keys(): - org_dict[tag] = ref[organism][tag] - merged_references[organism] = org_dict - config['references'] = merged_references - - # Run the deprecation handler on the final config - config = deprecation_handler(config) - - return config - - -def deprecation_handler(config): - """ - Checks the config to see if anything has been deprecated. - - Also makes any fixes that can be done automatically. 
- """ - if 'assembly' in config: - config['organism'] = config['assembly'] - warnings.warn( - "'assembly' should be replaced with 'organism' in config files. " - "As a temporary measure, a new 'organism' key has been added with " - "the value of 'assembly'", - DeprecationWarning) - - for org, block1 in config.get('references', {}).items(): - for tag, block2 in block1.items(): - gtf_conversions = block2.get('gtf', {}).get('conversions', []) - for c in gtf_conversions: - if isinstance(c, dict) and 'annotation_hub' in c: - warnings.warn( - "You may want to try the 'mappings' conversion rather " - "than 'annotation_hub' since it works directly off " - "the GTF file rather than assuming concordance between " - "GTF and AnnoationHub instances", - DeprecationWarning) - - return config - - -def is_paired_end(sampletable, sample): - """ - Inspects the sampletable to see if the sample is paired-end or not - - Parameters - ---------- - sampletable : pandas.DataFrame - Contains a "layout" or "LibraryLayout" column (but not both). If the - lowercase value is "pe" or "paired", consider the sample paired-end. - Otherwise consider single-end. - - sample : str - Assumed to be found in the first column of `sampletable` - """ - # We can't fall back to detecting PE based on two fastq files provided for - # each sample when it's an SRA sampletable (which only has SRR accessions). - # - # So detect first detect if SRA sampletable based on presence of "Run" - # column and all values of that column starting with "SRR", and then raise - # an error if the Layout column does not exist. - - if "Run" in sampletable.columns: - if all(sampletable["Run"].str.startswith("SRR")): - if "Layout" not in sampletable.columns and "layout" not in sampletable.columns: - raise ValueError( - "Sampletable appears to be SRA, but no 'Layout' column " - "found. 
This is required to specify single- or paired-end " - "libraries.") - - row = sampletable.set_index(sampletable.columns[0]).loc[sample] - if 'orig_filename_R2' in row: - return True - if 'layout' in row and 'LibraryLayout' in row: - raise ValueError("Expecting column 'layout' or 'LibraryLayout', " - "not both") - try: - return row['layout'].lower() in ['pe', 'paired'] - except KeyError: - pass - try: - return row['LibraryLayout'].lower() in ['pe', 'paired'] - except KeyError: - pass - return False - - -def fill_r1_r2(sampletable, pattern, r1_only=False): - """ - Returns a function intended to be used as a rule's input function. - - The returned function, when provided with wildcards, will return one or two - rendered versions of a pattern depending on SE or PE respectively. - Specifically, given a pattern (which is expected to contain a placeholder - for "{sample}" and "{n}"), look up in the sampletable whether or not it is - paired-end. - - Parameters - ---------- - - sampletable : pandas.DataFrame - Contains a "layout" column with either "SE" or "PE", or "LibraryLayout" - column with "SINGLE" or "PAIRED". If column does not exist, assume SE. - - pattern : str - Must contain at least a "{sample}" placeholder. - - r1_only : bool - If True, then only return the file for R1 even if PE is configured. - """ - def func(wc): - try: - wc.sample - except AttributeError: - raise ValueError( - 'Need "{{sample}}" in pattern ' - '"{pattern}"'.format(pattern=pattern)) - n = [1] - if is_paired_end(sampletable, wc.sample) and not r1_only: - n = [1, 2] - res = expand(pattern, sample=wc.sample, n=n) - return res - return func - - -def pluck(obj, kv): - """ - For a given dict or list that somewhere contains keys `kv`, return the - values of those keys. 
- - Named after the dplyr::pluck, and implemented based on - https://stackoverflow.com/a/1987195 - """ - if isinstance(obj, list): - for i in obj: - for x in pluck(i, kv): - yield x - elif isinstance(obj, dict): - if kv in obj: - yield obj[kv] - for j in obj.values(): - for x in pluck(j, kv): - yield x - - -def check_url(url, verbose=False): - """ - Try to open -- and then immediately close -- a URL. - - Any exceptions can be handled upstream. - - """ - - # Some notes here: - # - # - A pure python implementation isn't great because urlopen seems to - # cache or hold sessions open or something. EBI servers reject responses - # because too many clients are connected. This doesn't happen using curl. - # - # - Using the requests module doesn't help, because urls can be ftp:// and - # requests doesn't support that. - # - # - Similarly, using asyncio and aiohttp works great for https, but not - # ftp (I couldn't get aioftp to work properly). - # - # - Not all servers support --head. An example of this is - # https://www-s.nist.gov/srmors/certificates/documents/SRM2374_Sequence_v1.FASTA. - # - # - Piping curl to head using the -c arg to use bytes seems to work. - # However, we need to set pipefail (otherwise because head exits 0 the - # whole thing exits 0). And in that case, we expect curl to exit every - # time with exit code 23, which is "failed to write output", because of - # the broken pipe. This is handled below. 
- # - if verbose: - print(f'Checking {url}') - - # Notes on curl args: - # - # --max-time to allow the server some seconds to respond - # --retry to allow multiple tries if transient errors (4xx for FTP, 5xx for HTTP) are found - # --silent to not print anything - # --fail to return non-zero exit codes for 404 (default is exit 0 on hitting 404) - # - # Need to run through bash explicitly to get the pipefail option, which in - # turn means running with shell=True - proc = subprocess.run(f'/bin/bash -o pipefail -c "curl --retry 3 --max-time 10 --silent --fail {url} | head -c 10 > /dev/null"', shell=True) - return proc - - -def check_urls(config, verbose=False): - """ - Given a config filename or existing object, extract the URLs and check - them. - - Parameters - ---------- - - config : str or dict - Config object to inspect - - verbose : bool - Print which URL is being checked - - wait : int - Number of seconds to wait in between checking URLs, to avoid - too-many-connection issues - """ - config = load_config(config, missing_references_ok=True) - failures = [] - urls = list(set(utils.flatten(pluck(config, 'url')))) - for url in urls: - if url.startswith('file://'): - continue - - res = check_url(url, verbose=verbose) - - # we expect exit code 23 because we're triggering SIGPIPE with the - # "|head -c" above. - if res.returncode and res.returncode != 23: - failures.append(f'FAIL with exit code {res.returncode}. Command was: {res.args}') - if failures: - output = '\n '.join(failures) - raise ValueError(f'Found problematic URLs. See https://ec.haxx.se/usingcurl/usingcurl-returns for explanation of exit codes.\n {output}') - - -def check_all_urls_found(verbose=True): - """ - Recursively loads all references that can be included and checks them. - Reports out if there are any failures. 
- """ - check_urls({'include_references': [ - 'include/reference_configs', - 'test/test_configs', - 'workflows/rnaseq/config', - 'workflows/chipseq/config', - 'workflows/references/config', - ]}, verbose=verbose) - - -def gff2gtf(gff, gtf): - """ - Converts a gff file to a gtf format using the gffread function from Cufflinks - """ - if _is_gzipped(gff[0]): - shell('gzip -d -S .gz.0.tmp {gff} -c | gffread - -T -o- | gzip -c > {gtf}') - else: - shell('gffread {gff} -T -o- | gzip -c > {gtf}') diff --git a/lib/helpers.py b/lib/helpers.py deleted file mode 100644 index 9e0d6323..00000000 --- a/lib/helpers.py +++ /dev/null @@ -1,219 +0,0 @@ -import collections -import re -from itertools import product -import pandas as pd -from snakemake.shell import shell -from snakemake.io import expand, regex_from_filepattern -from lib import common - - -class ConfigurationError(Exception): - pass - - -def detect_layout(sampletable): - """ - Identifies whether a sampletable represents single-end or paired-end reads. - - Raises NotImplementedError if there's a mixture. - """ - is_pe = [common.is_paired_end(sampletable, s) for s in sampletable.iloc[:, 0]] - if all(is_pe): - return "PE" - elif not any(is_pe): - return "SE" - else: - p = sampletable.iloc[is_pe, 0].to_list() - s = sampletable.iloc[[not i for i in is_pe], 0].to_list() - if len(p) > len(s): - report = f"SE samples: {s}" - else: - report = f"PE samples: {p}" - raise ValueError(f"Only a single layout (SE or PE) is supported. {report}") - - -def fill_patterns(patterns, fill, combination=product): - """ - Fills in a dictionary of patterns with the dictionary `fill`. 
- - >>> patterns = dict(a='{sample}_R{N}.fastq') - >>> fill = dict(sample=['one', 'two', 'three'], N=[1, 2]) - >>> sorted(fill_patterns(patterns, fill)['a']) - ['one_R1.fastq', 'one_R2.fastq', 'three_R1.fastq', 'three_R2.fastq', 'two_R1.fastq', 'two_R2.fastq'] - - If using `zip` as a combination, checks to ensure all values in `fill` are - the same length to avoid truncated output. - - This fails: - - >>> patterns = dict(a='{sample}_R{N}.fastq') - >>> fill = dict(sample=['one', 'two', 'three'], N=[1, 2]) - >>> sorted(fill_patterns(patterns, fill, zip)['a']) # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - ValueError: {'sample': ['one', 'two', 'three'], 'N': [1, 2]} does not have the same number of entries for each key - - But this works: - - >>> patterns = dict(a='{sample}_R{N}.fastq') - >>> fill = dict(sample=['one', 'one', 'two', 'two', 'three', 'three'], N=[1, 2, 1, 2, 1, 2]) - >>> sorted(fill_patterns(patterns, fill, zip)['a']) - ['one_R1.fastq', 'one_R2.fastq', 'three_R1.fastq', 'three_R2.fastq', 'two_R1.fastq', 'two_R2.fastq'] - - """ - # In recent Snakemake versions (e.g., this happens in 5.4.5) file patterns - # with no wildcards in them are removed from expand when `zip` is used as - # the combination function. 
- # - # For example, in 5.4.5: - # - # expand('x', zip, d=[1,2,3]) == [] - # - # But in 4.4.0: - # - # expand('x', zip, d=[1,2,3]) == ['x', 'x', 'x'] - - if combination == zip: - lengths = set([len(v) for v in fill.values()]) - if len(lengths) != 1: - raise ValueError(f"{fill} does not have the same number of entries for each key") - - def update(d, u, c): - for k, v in u.items(): - if isinstance(v, collections.abc.Mapping): - r = update(d.get(k, {}), v, c) - d[k] = r - else: # not a dictionary, so we're at a leaf - if isinstance(fill, pd.DataFrame): - d[k] = list(set(expand(u[k], zip, **fill.to_dict("list")))) - else: - d[k] = list(set(expand(u[k], c, **fill))) - if not d[k]: - d[k] = [u[k]] - return d - - d = {} - return update(d, patterns, combination) - - -def extract_wildcards(pattern, target): - """ - Return a dictionary of wildcards and values identified from `target`. - - Returns None if the regex match failed. - - Parameters - ---------- - pattern : str - Snakemake-style filename pattern, e.g. ``{output}/{sample}.bam``. - - target : str - Filename from which to extract wildcards, e.g., ``data/a.bam``. - - Examples - -------- - >>> pattern = '{output}/{sample}.bam' - >>> target = 'data/a.bam' - >>> expected = {'output': 'data', 'sample': 'a'} - >>> assert extract_wildcards(pattern, target) == expected - >>> assert extract_wildcards(pattern, 'asdf') is None - """ - m = re.compile(regex_from_filepattern(pattern)).match(target) - if m: - return m.groupdict() - - -def rscript(string, scriptname, log=None): - """ - Saves the string as `scriptname` and then runs it - - Parameters - ---------- - string : str - Filled-in template to be written as R script - - scriptname : str - File to save script to - - log : str - File to redirect stdout and stderr to. If None, no redirection occurs. 
- """ - with open(scriptname, "w") as fout: - fout.write(string) - if log: - _log = "> {0} 2>&1".format(log) - else: - _log = "" - shell("Rscript {scriptname} {_log}") - - -def check_unique_fn(df): - """ - Raises an error if the fastq filenames are not unique - """ - fns = df["orig_filename"] - if "orig_filename_R2" in df.columns: - fns = pd.concat([fns, df["orig_filename_R2"]]) - if len(fns.unique()) < len(fns): - raise ValueError("Fastq filenames non unique, check the sampletable\n") - - -def check_unique_samplename(df): - """ - Raises an error if the samplenames are not unique - """ - ns = df.index - if len(ns.unique()) < len(ns): - raise ConfigurationError("Samplenames non unique, check the sampletable\n") - - -def preflight(config): - """ - Performs verifications on config and sampletable files - - Parameters - ---------- - config: yaml config object - """ - sampletable = pd.read_table(config["sampletable"], index_col=0, comment="#") - check_unique_samplename(sampletable) - if "orig_filename" in sampletable.columns: - check_unique_fn(sampletable) - - -def rnaseq_preflight(c): - if "kallisto" not in c.config: - raise ConfigurationError( - """ - Starting in v1.8, an additional 'kallisto' argument is expected - in the config file. Note that in the future this may be - automatically included, but for now please add the following to the - config, where 'tagname' is the tag for the reference of interest: - - kallisto: - tag: "tagname" - """ - ) - - -def chipseq_preflight(c): - pass - - -def strand_arg_lookup(config, lookup): - """ - Given a config object and lookup dictionary, confirm that the config has - correctly specified strandedness and then return the value for that key. - """ - if not config.stranded: - raise ConfigurationError( - "Starting in v1.8, 'stranded' is required in the config file. " - "Values can be 'unstranded', 'fr-firststrand' (R1 aligns antisense to original transcript), " - "or 'fr-secondstrand' (R1 aligns sense to original transcript). 
If you are not sure, " - "run the workflow with only the 'strand_check' rule, like " - "'snakemake -j 5 strand_check'." - ) - if config.stranded not in lookup: - keys = list(lookup.keys()) - raise KeyError(f"'{config.stranded}' not one of {keys}") - return lookup[config.stranded] diff --git a/lib/imports.py b/lib/imports.py deleted file mode 100644 index f790ef6f..00000000 --- a/lib/imports.py +++ /dev/null @@ -1,22 +0,0 @@ -def resolve_name(name): - """ - Imports a specific object from a dotted path and returns just that object. - - From nose.utils.resolve_name (with the logging parts taken out) which in - turn is from unittest.TestLoader.loadTestByName - """ - parts = name.split('.') - parts_copy = parts[:] - while parts_copy: - try: - module = __import__('.'.join(parts_copy)) - break - except ImportError: - del parts_copy[-1] - if not parts_copy: - raise - parts = parts[1:] - obj = module - for part in parts: - obj = getattr(obj, part) - return obj diff --git a/lib/postprocess/adapters.py b/lib/postprocess/adapters.py deleted file mode 100644 index 1d8ab7ab..00000000 --- a/lib/postprocess/adapters.py +++ /dev/null @@ -1,6 +0,0 @@ -from snakemake.shell import shell - -def fasta_postprocess(origfn, newfn): - shell( - "gzip -c {origfn} > {newfn} " - "&& rm {origfn}") diff --git a/lib/postprocess/dicty.py b/lib/postprocess/dicty.py deleted file mode 100644 index 237cbbdd..00000000 --- a/lib/postprocess/dicty.py +++ /dev/null @@ -1,18 +0,0 @@ -from Bio import SeqIO -import gzip -from snakemake.shell import shell - -def rrna_postprocess(tmpfiles, outfile): - def gen(): - for tmp in tmpfiles: - handle = gzip.open(tmp, 'rt') - parser = SeqIO.parse(handle, 'fasta') - for rec in parser: - if 'Dictyostelium discoideum' not in rec.description: - continue - rec.seq = rec.seq.back_transcribe() - rec.description = rec.name - yield rec - - with gzip.open(outfile, 'wt') as fout: - SeqIO.write(gen(), fout, 'fasta') diff --git a/lib/postprocess/hg19.py b/lib/postprocess/hg19.py 
deleted file mode 100644 index 8d042432..00000000 --- a/lib/postprocess/hg19.py +++ /dev/null @@ -1,3 +0,0 @@ -from snakemake.shell import shell -def plus_lncrna_fasta_postprocess(tmpfiles, outfile): - shell('cat {tmpfiles} > {outfile}') diff --git a/lib/postprocess/hg38.py b/lib/postprocess/hg38.py deleted file mode 100644 index d21f54ad..00000000 --- a/lib/postprocess/hg38.py +++ /dev/null @@ -1,14 +0,0 @@ -import pybedtools -import gzip -from snakemake.shell import shell -import os - - -def strip_ensembl_version(infiles, outfile): - def transform(f): - f.attrs['gene_id'] = f.attrs['gene_id'].split('.')[0] - return f - with gzip.open(outfile, 'wt') as fout: - for infile in infiles: - for feature in pybedtools.BedTool(infile): - fout.write(str(transform(feature))) diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile deleted file mode 100644 index 815d00c6..00000000 --- a/workflows/references/Snakefile +++ /dev/null @@ -1,369 +0,0 @@ -import os -import sys -import gzip -import yaml -import importlib -import tempfile -import pandas -from snakemake.utils import makedirs - -HERE = str(Path(workflow.snakefile).parent) -sys.path.insert(0, HERE + "/../..") -from lib.imports import resolve_name -from lib import utils -from lib.utils import autobump, gb, hours -from lib import aligners, helpers -from lib import common - -# Note: when running this workflow on its own (say, to generate all references -# ahead of time) you wil need to provide a config file from the command line. -# -# Otherwise, this file is expected to be `include:`ed into other workflows, -# which will have their own config files. 
- -config = common.load_config(config) - -references_dir = common.get_references_dir(config) -refdict, conversion_kwargs = common.references_dict(config) - -makedirs([references_dir, os.path.join(references_dir, 'logs')]) - -localrules: symlink_fasta_to_index_dir - -wildcard_constraints: - _type="genome|transcriptome|annotation", - _ext="fasta|gtf" - - -rule all_references: - input: utils.flatten(refdict) - - -rule download_and_process: - """Downloads the configured URL, applies any configured post-processing, and - saves the resulting gzipped file to *.fasta.gz or *.gtf.gz. - """ - output: - temporary('{references_dir}/{organism}/{tag}/{_type}/{organism}_{tag}.{_ext}.gz') - run: - common.download_and_postprocess(output[0], config, wildcards.organism, wildcards.tag, wildcards._type) - - -rule unzip: - """Generic rule to unzip files as needed, for example when building - indexes. - """ - input: - rules.download_and_process.output - output: - protected('{references_dir}/{organism}/{tag}/{_type}/{organism}_{tag}.{_ext}') - wildcard_constraints: - _type="genome|annotation" - log: - '{references_dir}/logs/{organism}/{tag}/{_type}/{organism}_{tag}.{_ext}.log' - shell: 'gunzip -c {input} > {output}' - - -rule bowtie2_index: - """ - Build bowtie2 index - """ - input: - '{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta' - output: - protected(aligners.bowtie2_index_from_prefix('{references_dir}/{organism}/{tag}/genome/bowtie2/{organism}_{tag}')) - log: - '{references_dir}/logs/{organism}/{tag}/genome/bowtie2/{organism}_{tag}.log' - resources: - runtime=autobump(hours=8), - mem_mb=autobump(gb=32), - disk_mb=autobump(gb=50) - run: - prefix = aligners.prefix_from_bowtie2_index(output) - shell( - 'bowtie2-build ' - '{input} ' - '{prefix} ' - '&> {log}') - - -rule star_index: - input: - fasta='{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta', - gtf='{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf', - output: - 
protected('{references_dir}/{organism}/{tag}/genome/star/{organism}_{tag}/Genome') - log: - '{references_dir}/{organism}/{tag}/genome/star/{organism}_{tag}/Genome.log' - threads: - 8 - resources: - runtime=autobump(hours=8), - mem_mb=gb(64) - run: - genomedir = os.path.dirname(output[0]) - shell('rm -r {genomedir}') - shell('mkdir -p {genomedir}') - shell( - 'STAR ' - '--runMode genomeGenerate ' - '--runThreadN {threads} ' - '--genomeDir {genomedir} ' - '--genomeFastaFiles {input.fasta} ' - - # NOTE: GTF is optional - '--sjdbGTFfile {input.gtf} ' - - # NOTE: STAR docs say that 100 should work well. - '--sjdbOverhang 100 ' - - # NOTE: for small genomes, may need to scale this down to - # min(14, log2(GenomeLength) / 2 - 1) - # --genomeSAindexNbases 14 - '&> {log}' - ) - # STAR writes a hard-coded Log.out file to the current working - # directory. So put that on the end of the log file for the rule and - # then clean up. - shell('cat {genomedir}/Log.out >> {log} && rm {genomedir}/Log.out') - - -rule hisat2_index: - """ - Build HISAT2 index - """ - input: - '{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta' - output: - protected(aligners.hisat2_index_from_prefix('{references_dir}/{organism}/{tag}/genome/hisat2/{organism}_{tag}')) - log: - '{references_dir}/logs/{organism}/{tag}/genome/hisat2/{organism}_{tag}.log' - resources: - runtime=autobump(hours=8), - mem_mb=gb(32), - disk_mb=gb(50) - run: - prefix = aligners.prefix_from_hisat2_index(output) - shell( - 'hisat2-build ' - '{input} ' - '{prefix} ' - '&> {log}') - - -rule symlink_fasta_to_index_dir: - """Aligners often want the reference fasta in the same dir as the index, so - this makes the appropriate symlink - """ - input: - fasta='{references_dir}/{organism}/{tag}/{_type}/{organism}_{tag}.fasta' - output: - '{references_dir}/{organism}/{tag}/{_type}/{index}/{organism}_{tag}.fasta' - resources: - runtime=hours(1) - log: - 
'{references_dir}/logs/{organism}/{tag}/{_type}/{index}/{organism}_{tag}.fasta.log' - run: - utils.make_relative_symlink(input[0], output[0]) - - -rule transcriptome_fasta: - input: - fasta='{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta', - gtf='{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf' - output: - protected('{references_dir}/{organism}/{tag}/transcriptome/{organism}_{tag}.fasta') - resources: - runtime=hours(1) - shell: - 'gffread {input.gtf} -w {output} -g {input.fasta}' - - -rule salmon_index: - "Build salmon index" - output: - protected('{references_dir}/{organism}/{tag}/transcriptome/salmon/{organism}_{tag}/versionInfo.json') - input: - fasta='{references_dir}/{organism}/{tag}/transcriptome/{organism}_{tag}.fasta' - log: - '{references_dir}/logs/{organism}/{tag}/transcriptome/salmon/{organism}_{tag}.log' - params: - outdir='{references_dir}/{organism}/{tag}/transcriptome/salmon/{organism}_{tag}' - resources: - mem_mb=gb(32), - runtime=hours(2) - shell: - 'salmon index ' - '--transcripts {input.fasta} ' - '--index {params.outdir} ' - '&> {log}' - - -rule kallisto_index: - "Build kallisto index" - output: - index=protected('{references_dir}/{organism}/{tag}/transcriptome/kallisto/{organism}_{tag}/transcripts.idx') - input: - fasta='{references_dir}/{organism}/{tag}/transcriptome/{organism}_{tag}.fasta' - log: - '{references_dir}/logs/{organism}/{tag}/transcriptome/kallisto/{organism}_{tag}.log' - resources: - runtime=hours(2), - mem_mb=gb(32), - shell: - 'kallisto index ' - '--index {output.index} ' - '{input.fasta} ' - '&> {log}' - - -rule conversion_refflat: - """Converts a GTF into refFlat format - """ - input: - '{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf' - output: - protected('{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.refflat') - log: - '{references_dir}/logs/{organism}/{tag}/annotation/{organism}_{tag}.refflat.log' - resources: - runtime=hours(2), - mem_mb=gb(2) - shell: 
- 'gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp ' - '''&& awk '{{print $1"\t"$0}}' {output}.tmp > {output} ''' - '&& rm {output}.tmp ' - - -rule conversion_bed12: - input: - '{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf' - output: - protected('{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.bed12') - resources: - runtime=hours(2), - mem_mb=gb(2) - shell: - 'gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp ' - '&& genePredToBed {output}.tmp {output} ' - '&& rm {output}.tmp' - -rule conversion_gffutils: - """Converts a GTF into a gffutils sqlite3 database - """ - input: - gtf='{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf' - output: - db=protected('{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf.db') - log: - '{references_dir}/logs/{organism}/{tag}/annotation/{organism}_{tag}.gtf.db.log' - resources: - runtime=hours(2), - mem_mb=gb(4) - run: - import gffutils - kwargs = conversion_kwargs[output[0]] - fd, tmpdb = tempfile.mkstemp(suffix='.db', prefix='gffutils_') - db = gffutils.create_db(data=input.gtf, dbfn=tmpdb, **kwargs) - shell('mv {tmpdb} {output.db}') - - -rule chromsizes: - """Creates a chromsizes table from fasta - """ - input: - '{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta' - output: - protected('{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.chromsizes') - log: - '{references_dir}/logs/{organism}/{tag}/genome/{organism}_{tag}.fasta.log' - params: - # NOTE: Be careful with the memory here; make sure you have enough - # and/or it matches the resources you're requesting in the cluster - # config. 
- java_args='-Xmx20g' - # java_args='-Xmx2g' # [TEST SETTINGS -1] - resources: - mem_mb=gb(24), - runtime=hours(2) - shell: - 'export LC_COLLATE=C; ' - 'rm -f {output}.tmp ' - '&& picard ' - '{params.java_args} ' - 'CreateSequenceDictionary R={input} O={output}.tmp &> {log} ' - '&& grep "^@SQ" {output}.tmp ' - '''| awk '{{print $2, $3}}' ''' - '| sed "s/SN://g;s/ LN:/\\t/g" ' - '| sort -k1,1 > {output} ' - '&& rm -f {output}.tmp ' - - -rule genelist: - """Creates a list of unique gene names in the GTF - """ - input: - gtf='{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf' - output: - protected('{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.genelist') - resources: - runtime=hours(1), - mem_mb=gb(2) - run: - attribute = conversion_kwargs[output[0]]['gene_id'] - import gffutils - genes = set() - for feature in gffutils.DataIterator(input.gtf): - genes.update(feature.attributes[attribute]) - with open(output[0], 'w') as fout: - for feature in sorted(list(set(genes))): - fout.write(feature + '\n') - - -rule mappings: - """ - Creates gzipped TSV mapping between attributes in the GTF. 
- """ - input: - gtf='{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf' - output: - protected('{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.mapping.tsv.gz') - params: - include_featuretypes=lambda wildcards, output: conversion_kwargs[output[0]].get('include_featuretypes', []) - resources: - runtime=hours(2), - mem_mb=gb(2) - run: - import gffutils - - # Will want to change the setting back to what it was originally when - # we're done - orig_setting = gffutils.constants.always_return_list - gffutils.constants.always_return_list = False - - include_featuretypes = params.include_featuretypes - - res = [] - for f in gffutils.DataIterator(input[0]): - - ft = f.featuretype - - if include_featuretypes and (ft not in include_featuretypes): - continue - - d = dict(f.attributes) - d['__featuretype__'] = ft - res.append(d) - - df = pandas.DataFrame(res) - - # Depending on how many attributes there were and the - # include_featuretypes settings, this may take a while. - df = df.drop_duplicates() - - df.to_csv(output[0], sep='\t', index=False, compression='gzip') - - # Restore original setting - gffutils.constants.always_return_list = orig_setting - -# vim: ft=python diff --git a/workflows/references/config/config.yaml b/workflows/references/config/config.yaml deleted file mode 100644 index 49618dcd..00000000 --- a/workflows/references/config/config.yaml +++ /dev/null @@ -1,6 +0,0 @@ -references_dir: 'references_dir' - -# See the reference config files in the top level of the repo, -# include/reference_configs, for inspiration for more species. 
-include_references: - - '../../include/reference_configs/test.yaml' diff --git a/workflows/references/run_test.sh b/workflows/references/run_test.sh deleted file mode 100755 index 7aacb413..00000000 --- a/workflows/references/run_test.sh +++ /dev/null @@ -1,3 +0,0 @@ -set -e -python -m doctest ../../ci/preprocessor.py -python ../../ci/preprocessor.py Snakefile > Snakefile.test && snakemake -s Snakefile.test "$@" From 060c2f8c057c19c1644c406d0032b8971a17b9a0 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 3 Jan 2025 19:22:37 +0000 Subject: [PATCH 025/196] add new references.smk --- lib/postprocess/merge.py | 32 ---- rules/references.smk | 322 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 322 insertions(+), 32 deletions(-) delete mode 100644 lib/postprocess/merge.py create mode 100644 rules/references.smk diff --git a/lib/postprocess/merge.py b/lib/postprocess/merge.py deleted file mode 100644 index c3d1686e..00000000 --- a/lib/postprocess/merge.py +++ /dev/null @@ -1,32 +0,0 @@ -import os -from snakemake.shell import shell -from ..imports import resolve_name - -def file_merge(origfns, newfn, *args): - tmpfiles = ['{0}.{1}.sub.tmp'.format(newfn, i) for i in range(len(origfns))] - try: - for origfn, tmpfile, ppfunc in zip(origfns, tmpfiles, args): - print(ppfunc) - func = resolve_name(ppfunc) - func(origfn, tmpfile) - - if os.path.exists(newfn): - shell('rm {newfn}') - - if newfn.endswith('.gz'): - fn = newfn.replace('.gz', '') - for tmpfile in tmpfiles: - shell("gunzip -c {tmpfile} >> {fn}") - shell("gzip {fn}") - else: - for tmpfile in tmpfiles: - shell("cat {tmpfile} >> {newfn}") - - except Exception as e: - raise e - - finally: - for i in tmpfiles: - if os.path.exists(i): - shell('rm {i}') - diff --git a/rules/references.smk b/rules/references.smk new file mode 100644 index 00000000..f4b104ff --- /dev/null +++ b/rules/references.smk @@ -0,0 +1,322 @@ +import os +import sys +import pandas + +HERE = str(Path(workflow.snakefile).parent) 
+sys.path.insert(0, HERE + "/../..") +from lib.utils import autobump, gb, hours +from lib import utils + +def default_postprocess(origfn, newfn): + shell("mv {origfn} {newfn}") + +rule fasta: + output: + temporary('references/genome.fa.gz') + run: + utils.download_and_postprocess( + urls=config['fasta']['url'], + postprocess=config['fasta'].get('postprocess', None), + outfile=output[0], + log=log + ) + + +rule gtf: + output: + temporary('references/annotation.gtf.gz') + run: + utils.download_and_postprocess( + urls=config['gtf']['url'], + postprocess=config['gtf'].get('postprocess', None), + outfile=output[0], + log=log + ) + + +rule rrna: + output: + temporary('references/rrna.fa.gz') + run: + utils.download_and_postprocess( + urls=config['rrna']['url'], + postprocess=config['rrna'].get('postprocess', None), + outfile=output[0], + log=log + ) + + +rule unzip: + input: + "references/{prefix}.gz" + output: + "references/{prefix}" + shell: 'gunzip -c {input} > {output}' + + +rule bowtie2_index: + input: + "references/{label}.fa", + output: + multiext( + "references/bowtie2/{label}", + ".1.bt2", + ".2.bt2", + ".3.bt2", + ".4.bt2", + ".rev.1.bt2", + ".rev.2.bt2", + ".fa", + ), + log: + "references/logs/bowtie2_{label}.log" + resources: + runtime=autobump(hours=8), + mem_mb=autobump(gb=32), + disk_mb=autobump(gb=50) + threads: + 8 + run: + index = os.path.commonprefix(output).rstrip(".") + shell( + "bowtie2-build" + " --threads {threads}" + " {input}" + " {index}" + " &> {log}" + ) + utils.make_relative_symlink(input[0], output[-1]) + + +rule star_index: + input: + fasta='references/genome.fa', + gtf='references/annotation.gtf', + output: + protected('references/star/Genome') + log: + 'references/logs/star.log' + threads: + 8 + resources: + runtime=autobump(hours=8), + mem_mb=gb(64) + run: + genomedir = os.path.dirname(output[0]) + shell('rm -r {genomedir}') + shell('mkdir -p {genomedir}') + shell( + 'STAR ' + '--runMode genomeGenerate ' + '--runThreadN {threads} ' + 
'--genomeDir {genomedir} ' + '--genomeFastaFiles {input.fasta} ' + + # NOTE: GTF is optional + '--sjdbGTFfile {input.gtf} ' + + # NOTE: STAR docs say that 100 should work well. + '--sjdbOverhang 100 ' + + # NOTE: for small genomes, may need to scale this down to + # min(14, log2(GenomeLength) / 2 - 1) + # --genomeSAindexNbases 14 + '&> {log}' + ) + # STAR writes a hard-coded Log.out file to the current working + # directory. So put that on the end of the log file for the rule and + # then clean up. + shell('cat {genomedir}/Log.out >> {log} && rm {genomedir}/Log.out') + shell("ln -s {input.fasta} {genomedir}") + +rule hisat2_index: + input: + "references/genome.fa", + output: + multiext( + "references/hisat2/genome", + ".1.ht2", + ".2.ht2", + ".3.ht2", + ".4.ht2", + ".5.ht2", + ".6.ht2", + ".7.ht2", + ".8.ht2", + ".fa", + ) + log: + "references/logs/hisat2.log" + resources: + runtime=autobump(hours=8), + mem_mb=autobump(gb=32), + disk_mb=autobump(gb=50) + threads: + 8 + run: + index = os.path.commonprefix(output).rstrip(".") + shell( + "hisat2-build" + " --threads {threads}" + " {input}" + " {index}" + " &> {log}" + ) + shell("ln -s {input} {output[-1]}") + + + +rule transcriptome_fasta: + input: + fasta='references/genome.fa', + gtf='references/annotation.gtf', + output: + 'references/transcriptome.fa' + resources: + runtime=hours(1) + shell: + 'gffread {input.gtf} -w {output} -g {input.fasta}' + + +rule salmon_index: + input: + 'references/transcriptome.fa' + output: + 'references/salmon/versionInfo.json' + log: + 'references/logs/salmon.log' + params: + outdir='references/salmon' + resources: + mem_mb=gb(32), + runtime=hours(2) + run: + outdir = os.path.dirname(output[0]) + shell( + 'salmon index ' + '--transcripts {input} ' + '--index {outdir} ' + '&> {log}' + ) + + +rule kallisto_index: + output: + 'references/kallisto/transcripts.idx', + input: + 'references/genome.fa' + log: + 'references/logs/kallisto.log' + resources: + runtime=hours(2), + mem_mb=gb(32), + 
shell: + 'kallisto index ' + '--index {output} ' + '{input} ' + '&> {log}' + + +rule conversion_refflat: + input: + 'references/annotation.gtf' + output: + protected('references/annotation.refflat') + log: + 'references/logs/annotation.refflat.log' + resources: + runtime=hours(2), + mem_mb=gb(2) + shell: + 'gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp ' + '''&& awk '{{print $1"\t"$0}}' {output}.tmp > {output} ''' + '&& rm {output}.tmp ' + + +rule conversion_bed12: + input: + 'references/annotation.gtf' + output: + protected('references/annotation.bed12') + resources: + runtime=hours(2), + mem_mb=gb(2) + shell: + 'gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp ' + '&& genePredToBed {output}.tmp {output} ' + '&& rm {output}.tmp' + + +rule chromsizes: + input: + 'references/genome.fa' + output: + protected('references/genome.chromsizes') + log: + 'references/logs/genome.chromsizes.log' + params: + # NOTE: Be careful with the memory here; make sure you have enough + # and/or it matches the resources you're requesting + java_args='-Xmx20g' + # java_args='-Xmx2g' # [TEST SETTINGS -1] + resources: + mem_mb=gb(24), + runtime=hours(2) + shell: + 'export LC_COLLATE=C; ' + 'rm -f {output}.tmp ' + '&& picard ' + '{params.java_args} ' + 'CreateSequenceDictionary R={input} O={output}.tmp &> {log} ' + '&& grep "^@SQ" {output}.tmp ' + '''| awk '{{print $2, $3}}' ''' + '| sed "s/SN://g;s/ LN:/\\t/g" ' + '| sort -k1,1 > {output} ' + '&& rm -f {output}.tmp ' + + +rule mappings: + """ + Creates gzipped TSV mapping between attributes in the GTF. 
+ """ + input: + gtf='references/annotation.gtf' + output: + protected('references/annotation.mapping.tsv.gz') + params: + include_featuretypes=lambda wildcards, output: conversion_kwargs[output[0]].get('include_featuretypes', []) + resources: + runtime=hours(2), + mem_mb=gb(2) + run: + import gffutils + + # Will want to change the setting back to what it was originally when + # we're done + orig_setting = gffutils.constants.always_return_list + gffutils.constants.always_return_list = False + + include_featuretypes = params.include_featuretypes + + res = [] + for f in gffutils.DataIterator(input[0]): + + ft = f.featuretype + + if include_featuretypes and (ft not in include_featuretypes): + continue + + d = dict(f.attributes) + d['__featuretype__'] = ft + res.append(d) + + df = pandas.DataFrame(res) + + # Depending on how many attributes there were and the + # include_featuretypes settings, this may take a while. + df = df.drop_duplicates() + + df.to_csv(output[0], sep='\t', index=False, compression='gzip') + + # Restore original setting + gffutils.constants.always_return_list = orig_setting From 8bb7398f5a4f3d6c5e1b06e933745011ac250521 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 3 Jan 2025 19:24:00 +0000 Subject: [PATCH 026/196] simplify config --- workflows/rnaseq/config/config.yaml | 68 ++++++++--------------------- 1 file changed, 18 insertions(+), 50 deletions(-) diff --git a/workflows/rnaseq/config/config.yaml b/workflows/rnaseq/config/config.yaml index 7b0db18d..2cbd3d66 100644 --- a/workflows/rnaseq/config/config.yaml +++ b/workflows/rnaseq/config/config.yaml @@ -1,59 +1,27 @@ +fasta: + url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" + postprocess: 'lib.utils.gzipped' + +gtf: + url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/annotation/dm6.small.gtf" + postprocess: 'lib.utils.gzipped' + +rrna: + url: + - 
'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' + postprocess: + function: 'lib.utils.filter_fastas' + args: 'Drosophila melanogaster' + + sampletable: 'config/sampletable.tsv' patterns: 'config/rnaseq_patterns.yaml' -# Which key in the `references` dict below to use -organism: 'dmel' - -# If not specified here, use the environment variable REFERENCES_DIR. -references_dir: 'references_data' - # See https://rnabio.org/module-09-appendix/0009/12/01/StrandSettings for more info. stranded: 'fr-firststrand' # for dUTP libraries # 'fr-secondstrand' # for ligation libraries # 'unstranded' # for libraries without strand specificity -aligner: - index: 'star' - tag: 'test' - -rrna: - index: 'bowtie2' - tag: 'rRNA' - -gtf: - tag: "test" - -salmon: - tag: "test" - -kallisto: - tag: "test" - -fastq_screen: - - label: rRNA - organism: dmel - tag: test - - label: Fly - organism: dmel - tag: test - -merged_bigwigs: - control_pos: - pos: - - sample1 - - sample2 - treatment_all: - pos: - - sample3 - - sample4 - neg: - - sample3 - - sample4 - -# See the reference config files in the top level of the repo, -# include/reference_configs, for inspiration for more species. 
- -include_references: - - '../../include/reference_configs/test.yaml' - - '../../include/reference_configs/Drosophila_melanogaster.yaml' +aligner: 'star' From 79081fdb041ebb462e4b535abdc9c9da0bed3050 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 3 Jan 2025 19:24:11 +0000 Subject: [PATCH 027/196] utils, common, and helpers are all now in utils --- lib/utils.py | 897 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 878 insertions(+), 19 deletions(-) diff --git a/lib/utils.py b/lib/utils.py index 3c280890..fd8c4dba 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -1,9 +1,47 @@ -import os -import contextlib +import binascii import collections +import contextlib +import gzip +import os +import re +import subprocess +import warnings from collections.abc import Iterable +from itertools import product + +import pandas +import pandas as pd +import yaml +from Bio import SeqIO +from snakemake.io import expand, regex_from_filepattern from snakemake.shell import shell +# Small helper functions + + +def resolve_name(name): + """ + Imports a specific object from a dotted path and returns just that object. 
+ + From nose.utils.resolve_name (with the logging parts taken out) which in + turn is from unittest.TestLoader.loadTestByName + """ + parts = name.split(".") + parts_copy = parts[:] + while parts_copy: + try: + module = __import__(".".join(parts_copy)) + break + except ImportError: + del parts_copy[-1] + if not parts_copy: + raise + parts = parts[1:] + obj = module + for part in parts: + obj = getattr(obj, part) + return obj + @contextlib.contextmanager def temp_env(env): @@ -52,22 +90,19 @@ def gen(): def test_flatten(): - assert ( - sorted( - flatten( - { - "a": { - "b": { - "c": ["a", "b", "c"], - }, + assert sorted( + flatten( + { + "a": { + "b": { + "c": ["a", "b", "c"], }, - "x": ["e", "f", "g"], - "y": {"z": "d"}, - } - ) + }, + "x": ["e", "f", "g"], + "y": {"z": "d"}, + } ) - == ["a", "b", "c", "d", "e", "f", "g"] - ) + ) == ["a", "b", "c", "d", "e", "f", "g"] assert flatten("a", True) == "a" assert flatten(["a"], True) == "a" @@ -171,7 +206,7 @@ def boolean_labels(names, idx, mapping={True: "AND", False: "NOT"}, strip="AND_" a_AND_b_AND_c_NOT_d_AND_e """ s = [] - for i, (n, x) in enumerate(zip(names, idx)): + for n, x in zip(names, idx): s.append(mapping[x] + "_" + n) s = "_".join(s) if s.startswith(strip): @@ -191,7 +226,188 @@ def make_relative_symlink(target, linkname): linkbase = os.path.basename(linkname) if not os.path.exists(linkdir): shell("mkdir -p {linkdir}") - shell("cd {linkdir}; ln -sf {relative_target} {linkbase}") + shell(f"cd {linkdir}; ln -sf {relative_target} {linkbase}") + + +def extract_wildcards(pattern, target): + """ + Return a dictionary of wildcards and values identified from `target`. + + Returns None if the regex match failed. + + Parameters + ---------- + pattern : str + Snakemake-style filename pattern, e.g. ``{output}/{sample}.bam``. + + target : str + Filename from which to extract wildcards, e.g., ``data/a.bam``. 
+ + Examples + -------- + >>> pattern = '{output}/{sample}.bam' + >>> target = 'data/a.bam' + >>> expected = {'output': 'data', 'sample': 'a'} + >>> assert extract_wildcards(pattern, target) == expected + >>> assert extract_wildcards(pattern, 'asdf') is None + """ + m = re.compile(regex_from_filepattern(pattern)).match(target) + if m: + return m.groupdict() + + +def _is_gzipped(fn): + """ + Filename-independent method of checking if a file is gzipped or not. Uses + the magic number. + + xref https://stackoverflow.com/a/47080739 + """ + with open(fn, "rb") as f: + return binascii.hexlify(f.read(2)) == b"1f8b" + + +def openfile(tmp, mode): + """ + Returns an open file handle; auto-detects gzipped files. + """ + if _is_gzipped(tmp): + return gzip.open(tmp, mode) + else: + return open(tmp, mode) + + +def gzipped(tmpfiles, outfile): + """ + Cat-and-gzip a list of uncompressed files into a compressed output file. + """ + with gzip.open(outfile, "wt") as fout: + for f in tmpfiles: + with open(f) as infile: + for line in infile: + fout.write(line) + + +def cat(tmpfiles, outfile): + """ + Simple concatenation of files. + + Note that gzipped files can be concatenated as-is without un- and re- + compressing. + """ + shell(f"cat {tmpfiles} > {outfile}") + + +def is_paired_end(sampletable, sample): + """ + Inspects the sampletable to see if the sample is paired-end or not + + Parameters + ---------- + sampletable : pandas.DataFrame + Contains a "layout" or "LibraryLayout" column (but not both). If the + lowercase value is "pe" or "paired", consider the sample paired-end. + Otherwise consider single-end. + + sample : str + Assumed to be found in the first column of `sampletable` + """ + # We can't fall back to detecting PE based on two fastq files provided for + # each sample when it's an SRA sampletable (which only has SRR accessions). 
+ # + # So detect first detect if SRA sampletable based on presence of "Run" + # column and all values of that column starting with "SRR", and then raise + # an error if the Layout column does not exist. + + if "Run" in sampletable.columns: + if all(sampletable["Run"].str.startswith("SRR")): + if ( + "Layout" not in sampletable.columns + and "layout" not in sampletable.columns + ): + raise ValueError( + "Sampletable appears to be SRA, but no 'Layout' column " + "found. This is required to specify single- or paired-end " + "libraries." + ) + + row = sampletable.set_index(sampletable.columns[0]).loc[sample] + if "orig_filename_R2" in row: + return True + if "layout" in row and "LibraryLayout" in row: + raise ValueError("Expecting column 'layout' or 'LibraryLayout', " "not both") + try: + return row["layout"].lower() in ["pe", "paired"] + except KeyError: + pass + try: + return row["LibraryLayout"].lower() in ["pe", "paired"] + except KeyError: + pass + return False + + +def fill_r1_r2(sampletable, pattern, r1_only=False): + """ + Returns a function intended to be used as a rule's input function. + + The returned function, when provided with wildcards, will return one or two + rendered versions of a pattern depending on SE or PE respectively. + Specifically, given a pattern (which is expected to contain a placeholder + for "{sample}" and "{n}"), look up in the sampletable whether or not it is + paired-end. + + Parameters + ---------- + + sampletable : pandas.DataFrame + Contains a "layout" column with either "SE" or "PE", or "LibraryLayout" + column with "SINGLE" or "PAIRED". If column does not exist, assume SE. + + pattern : str + Must contain at least a "{sample}" placeholder. + + r1_only : bool + If True, then only return the file for R1 even if PE is configured. 
+ """ + + def func(wc): + try: + wc.sample + except AttributeError: + raise ValueError( + 'Need "{{sample}}" in pattern ' '"{pattern}"'.format(pattern=pattern) + ) + n = [1] + if is_paired_end(sampletable, wc.sample) and not r1_only: + n = [1, 2] + res = expand(pattern, sample=wc.sample, n=n) + return res + + return func + + +def pluck(obj, kv): + """ + For a given dict or list that somewhere contains keys `kv`, return the + values of those keys. + + Named after the dplyr::pluck, and implemented based on + https://stackoverflow.com/a/1987195 + """ + if isinstance(obj, list): + for i in obj: + for x in pluck(i, kv): + yield x + elif isinstance(obj, dict): + if kv in obj: + yield obj[kv] + for j in obj.values(): + for x in pluck(j, kv): + yield x + + +# Functions for conveniently working with resources def autobump(*args, **kwargs): @@ -308,7 +524,7 @@ def autobump(*args, **kwargs): raise ValueError(f"Unhandled args and kwargs: {args}, {kwargs}") def f(wildcards, attempt): - return baseline_converted + (attempt - 1) * increment_converted + return baseline_converted + (attempt - 1) * increment_converted return f @@ -319,3 +535,646 @@ def gb(size_in_gb): def hours(time_in_hours): return time_in_hours * 60 + + +# Config parsing and handling + + +class ConfigurationError(Exception): + pass + + +def detect_layout(sampletable): + """ + Identifies whether a sampletable represents single-end or paired-end reads. + + Raises NotImplementedError if there's a mixture. + """ + is_pe = [is_paired_end(sampletable, s) for s in sampletable.iloc[:, 0]] + if all(is_pe): + return "PE" + elif not any(is_pe): + return "SE" + else: + p = sampletable.iloc[is_pe, 0].to_list() + s = sampletable.iloc[[not i for i in is_pe], 0].to_list() + if len(p) > len(s): + report = f"SE samples: {s}" + else: + report = f"PE samples: {p}" + raise ValueError(f"Only a single layout (SE or PE) is supported. 
{report}") + + +def fill_patterns(patterns, fill, combination=product): + """ + Fills in a dictionary of patterns with the dictionary `fill`. + + >>> patterns = dict(a='{sample}_R{N}.fastq') + >>> fill = dict(sample=['one', 'two', 'three'], N=[1, 2]) + >>> sorted(fill_patterns(patterns, fill)['a']) + ['one_R1.fastq', 'one_R2.fastq', 'three_R1.fastq', 'three_R2.fastq', 'two_R1.fastq', 'two_R2.fastq'] + + If using `zip` as a combination, checks to ensure all values in `fill` are + the same length to avoid truncated output. + + This fails: + + >>> patterns = dict(a='{sample}_R{N}.fastq') + >>> fill = dict(sample=['one', 'two', 'three'], N=[1, 2]) + >>> sorted(fill_patterns(patterns, fill, zip)['a']) # doctest: +IGNORE_EXCEPTION_DETAIL + Traceback (most recent call last): + ... + ValueError: {'sample': ['one', 'two', 'three'], 'N': [1, 2]} does not have the same number of entries for each key + + But this works: + + >>> patterns = dict(a='{sample}_R{N}.fastq') + >>> fill = dict(sample=['one', 'one', 'two', 'two', 'three', 'three'], N=[1, 2, 1, 2, 1, 2]) + >>> sorted(fill_patterns(patterns, fill, zip)['a']) + ['one_R1.fastq', 'one_R2.fastq', 'three_R1.fastq', 'three_R2.fastq', 'two_R1.fastq', 'two_R2.fastq'] + + """ + # In recent Snakemake versions (e.g., this happens in 5.4.5) file patterns + # with no wildcards in them are removed from expand when `zip` is used as + # the combination function. 
+ # + # For example, in 5.4.5: + # + # expand('x', zip, d=[1,2,3]) == [] + # + # But in 4.4.0: + # + # expand('x', zip, d=[1,2,3]) == ['x', 'x', 'x'] + + if combination == zip: + lengths = set([len(v) for v in fill.values()]) + if len(lengths) != 1: + raise ValueError( + f"{fill} does not have the same number of entries for each key" + ) + + def update(d, u, c): + for k, v in u.items(): + if isinstance(v, collections.abc.Mapping): + r = update(d.get(k, {}), v, c) + d[k] = r + else: # not a dictionary, so we're at a leaf + if isinstance(fill, pd.DataFrame): + d[k] = list(set(expand(u[k], zip, **fill.to_dict("list")))) + else: + d[k] = list(set(expand(u[k], c, **fill))) + if not d[k]: + d[k] = [u[k]] + return d + + d = {} + return update(d, patterns, combination) + + +def rscript(string, scriptname, log=None): + """ + Saves the string as `scriptname` and then runs it + + Parameters + ---------- + string : str + Filled-in template to be written as R script + + scriptname : str + File to save script to + + log : str + File to redirect stdout and stderr to. If None, no redirection occurs. 
+ """ + with open(scriptname, "w") as fout: + fout.write(string) + if log: + _log = "> {0} 2>&1".format(log) + else: + _log = "" + shell("Rscript {scriptname} {_log}") + + +def check_unique_fn(df): + """ + Raises an error if the fastq filenames are not unique + """ + fns = df["orig_filename"] + if "orig_filename_R2" in df.columns: + fns = pd.concat([fns, df["orig_filename_R2"]]) + if len(fns.unique()) < len(fns): + raise ValueError("Fastq filenames non unique, check the sampletable\n") + + +def check_unique_samplename(df): + """ + Raises an error if the samplenames are not unique + """ + ns = df.index + if len(ns.unique()) < len(ns): + raise ConfigurationError("Samplenames non unique, check the sampletable\n") + + +def preflight(config): + """ + Performs verifications on config and sampletable files + + Parameters + ---------- + config: yaml config object + """ + sampletable = pd.read_table(config["sampletable"], index_col=0, comment="#") + check_unique_samplename(sampletable) + if "orig_filename" in sampletable.columns: + check_unique_fn(sampletable) + + +def rnaseq_preflight(c): + pass + + +def chipseq_preflight(c): + pass + + +def strand_arg_lookup(config, lookup): + """ + Given a config object and lookup dictionary, confirm that the config has + correctly specified strandedness and then return the value for that key. + """ + if not config.stranded: + raise ConfigurationError( + "Starting in v1.8, 'stranded' is required in the config file. " + "Values can be 'unstranded', 'fr-firststrand' (R1 aligns antisense to original transcript), " + "or 'fr-secondstrand' (R1 aligns sense to original transcript). If you are not sure, " + "run the workflow with only the 'strand_check' rule, like " + "'snakemake -j 5 strand_check'." 
+    )
+    if config.stranded not in lookup:
+        keys = list(lookup.keys())
+        raise KeyError(f"'{config.stranded}' not one of {keys}")
+    return lookup[config.stranded]
+
+
+def filter_fastas(tmpfiles, outfile, pattern):
+    """
+    Extract records from fasta file(s) given a search pattern.
+
+    Given input gzipped FASTAs, create a new gzipped fasta containing only
+    records whose description matches `pattern`.
+
+    Parameters
+    ----------
+    tmpfiles : list
+        gzipped fasta files to look through
+
+    outfile : str
+        gzipped output fasta file
+
+    pattern : str
+        Look for this string in each record's description
+
+    """
+
+    def gen():
+        for tmp in tmpfiles:
+            handle = gzip.open(tmp, "rt")
+            parser = SeqIO.parse(handle, "fasta")
+            for rec in parser:
+                if pattern not in rec.description:
+                    continue
+                rec.seq = rec.seq.back_transcribe()
+                rec.description = rec.name
+                yield rec
+
+    with gzip.open(outfile, "wt") as fout:
+        SeqIO.write(gen(), fout, "fasta")
+
+
+def twobit_to_fasta(tmpfiles, outfile):
+    """
+    Converts .2bit files to fasta.
+
+    Parameters
+    ----------
+    tmpfiles : list
+        2bit files to convert
+
+    outfile : str
+        gzipped output fasta file
+    """
+    # Note that twoBitToFa doesn't support multiple input files, but we want to
+    # support them with this function
+    lookup = {i: i + ".fa" for i in tmpfiles}
+    for i in tmpfiles:
+        fn = lookup[i]
+        shell("twoBitToFa {i} {fn}")
+
+    # Make sure we retain the order of the originally-provided files from the
+    # config when concatenating.
+    fastas = [lookup[i] for i in tmpfiles]
+    shell("cat {fastas} | gzip -c > {outfile}")
+    shell("rm {fastas}")
+
+
+def download_and_postprocess(urls, postprocess, outfile, log):
+    """
+    Many reference files cannot be used as-is and need to be modified.
+
+    This function supports providing one or more URLs, and any postprocess
+    functions to get the reference files usable.
+
+    Parameters
+    ----------
+    urls : str or list
+        URL(s) to download. Can be a list, in which case they will be concatenated.
+ + postprocess : str | dict | list | None + Postprocessing config. See below for details. + + outfile : str + Output filename to save final output. Expected to be gzipped. + + log : str + Log filename that will accumulate all logs + + Notes + ----- + + This function: + + - downloads the URL[s] to tempfile[s] + - resolves the name of the postprocessing function(s) if provided and + imports it + - calls the imported postprocessing function using the tempfile[s] and + outfile plus any additional specified arguments. + + The postprocessing function must have one of the following signatures, + where `infiles` contains the list of temporary files downloaded from the + URL or URLs specified, and `outfile` is a gzipped file expected to be + created by the function:: + + def func(infiles, outfile): + pass + + or:: + + def func(infiles, outfile, *args): + pass + + or:: + + def func(infiles, outfile, *args, **kwargs): + pass + + + The function is specified as a string that resolves to an importable + function, e.g., `postprocess: lib.postprocess.dm6.fix` will call a function + called `fix` in the file `lib/postprocess/dm6.py`. + + If the contents of `postprocess:` is a dict, it must have at least the key + `function`, and optionally `args` and/or `kwargs` keys. The `function` key + indicates the importable path to the function. `args` can be a string + or list of arguments that will be provided as additional args to a function + with the second kind of signature above. If `kwargs` is provided, it is + a dict that is passed to the function with the third kind of signature + above. 
For example:: + + postprocess: + function: lib.postprocess.dm6.fix + args: + - True + - 3 + + or:: + + postprocess: + function: lib.postprocess.dm6.fix + args: + - True + - 3 + kwargs: + skip: exon + + """ + + def default_postprocess(origfn, newfn): + shell("mv {origfn} {newfn}") + + if not isinstance(postprocess, list): + postprocess = [postprocess] + + # Will contain tuples of (func, args, kwargs, tmp_outfile) + funcs = [] + + # It is possible to chain multiple postprocessing functions together by + # providing them as a list. + # + # postprocess = [ + # + # "lib.func1", + # + # { + # "function": "lib.func2", + # "args": (True, True), + # }, + # + # { + # "function": "lib.func3", + # "args": (1, 2), + # "kwargs": {"gzipped": True), + # }, + # + # ] + # + for i, postprocess_i in enumerate(postprocess): + + if postprocess_i is None: + func = default_postprocess + args = () + kwargs = {} + name = None + + # postprocess can have a single string value indicating the function or + # it can be a dict with keys "function" and optionally "args". The value of + # "args" can be a string or a list. + else: + if isinstance(postprocess_i, dict): + name = postprocess_i.get("function", postprocess) + args = postprocess_i.get("args", ()) + kwargs = postprocess_i.get("kwargs", {}) + if isinstance(args, str): + args = (args,) + elif isinstance(postprocess_i, str): + name = postprocess_i + args = () + kwargs = {} + + else: + raise ValueError( + f"Unhandled type of postprocessing configuration: {postprocess_i}" + ) + + # In the special case where there is kwarg beginning and ending + # with "__", this can be a dotted function name so it will be + # resolved here as well and passed along to the postprocessing + # function. + # + # This makes it possible to do things like add ERCC annotations on + # the end of other annotations that themselves need to be + # post-processed. 
+ for kw in kwargs: + if kw.startswith("__") and kw.endswith("__"): + kwargs[kw] = resolve_name(kwargs[kw]) + + # import the function + func = resolve_name(name) + + tmp_outfile = f"{outfile}.{i}.{name}.tmp" + funcs.append([func, args, kwargs, tmp_outfile]) + + # The last func's outfile should be the final outfile + funcs[-1][-1] = outfile + + # as described in the docstring above, functions are to assume a list of + # urls + if isinstance(urls, str): + urls = [urls] + + # Download into reasonably-named temp filenames + downloaded_tmpfiles = [f"{outfile}.{i}.tmp" for i in range(len(urls))] + + # For the first postprocess, its input will be all the downloaded files. + postprocess_input = downloaded_tmpfiles + try: + # Copy (if local URI) or download into the specified temp files + for url, tmpfile in zip(urls, downloaded_tmpfiles): + if url.startswith("file:"): + url = url.replace("file://", "") + shell("cp {url} {tmpfile} 2> {log}") + else: + shell("wget {url} -O- > {tmpfile} 2> {log}") + + for func, args, kwargs, tmp_outfile in funcs: + func( + # all downloaded files (if the first postprocess), or the + # output of the last postprocess + postprocess_input, + # the temp output for just this postprocess + tmp_outfile, + *args, + **kwargs, + ) + + # We want the next postprocess to use the output of what we just + # ran; as documented above the input files are expected to be in + # a list. + postprocess_input = [tmp_outfile] + + except Exception as e: + raise e + finally: + to_delete = downloaded_tmpfiles + + # all but the last postprocess func output (the last one is the final + # output that we want to keep!) + to_delete += [i[-1] for i in funcs[:-1]] + + for i in to_delete: + if os.path.exists(i): + shell("rm {i}") + if not _is_gzipped(outfile): + raise ValueError(f"{outfile} does not appear to be gzipped.") + + +def get_sampletable(config): + """ + Return samples and pandas.DataFrame of parsed sampletable. 
+ + Returns the sample IDs and the parsed sampletable from the file specified + in the config. + + The sample IDs are assumed to be the first column of the sampletable. + + Parameters + ---------- + config : dict + """ + sampletable = pandas.read_csv(config["sampletable"], comment="#", sep="\t") + samples = sampletable.iloc[:, 0] + return samples, sampletable + + +def get_techreps(sampletable, label): + """ + Return all sample IDs for which the "label" column is `label`. + """ + # since we're not requiring a name but we want to use `loc` + first_col = sampletable.columns[0] + result = list(sampletable.loc[sampletable["label"] == label, first_col]) + + # If we're using a ChIP-seq-like sampletable we can provide a more + # informative error message. + + is_chipseq = "antibody" in sampletable.columns + if is_chipseq: + err = """ + No technical replicates found for label '{}'. Check the ChIP-seq config + file to ensure the peak-calling section only specifies values from the + sampletable's "label" column.""".format( + label + ) + else: + err = "No technical replicates found for label '{}'.".format(label) + + if len(result) == 0: + raise ValueError(err) + + return result + + +def deprecation_handler(config): + """ + Checks the config to see if anything has been deprecated. + + Also makes any fixes that can be done automatically. + """ + if "assembly" in config: + config["organism"] = config["assembly"] + warnings.warn( + "'assembly' should be replaced with 'organism' in config files. 
" + "As a temporary measure, a new 'organism' key has been added with " + "the value of 'assembly'", + DeprecationWarning, + ) + + for org, block1 in config.get("references", {}).items(): + for tag, block2 in block1.items(): + gtf_conversions = block2.get("gtf", {}).get("conversions", []) + for c in gtf_conversions: + if isinstance(c, dict) and "annotation_hub" in c: + warnings.warn( + "You may want to try the 'mappings' conversion rather " + "than 'annotation_hub' since it works directly off " + "the GTF file rather than assuming concordance between " + "GTF and AnnoationHub instances", + DeprecationWarning, + ) + + return config + + +def check_url(url, verbose=False): + """ + Try to open -- and then immediately close -- a URL. + + Any exceptions can be handled upstream. + + """ + + # Some notes here: + # + # - A pure python implementation isn't great because urlopen seems to + # cache or hold sessions open or something. EBI servers reject responses + # because too many clients are connected. This doesn't happen using curl. + # + # - Using the requests module doesn't help, because urls can be ftp:// and + # requests doesn't support that. + # + # - Similarly, using asyncio and aiohttp works great for https, but not + # ftp (I couldn't get aioftp to work properly). + # + # - Not all servers support --head. An example of this is + # https://www-s.nist.gov/srmors/certificates/documents/SRM2374_Sequence_v1.FASTA. + # + # - Piping curl to head using the -c arg to use bytes seems to work. + # However, we need to set pipefail (otherwise because head exits 0 the + # whole thing exits 0). And in that case, we expect curl to exit every + # time with exit code 23, which is "failed to write output", because of + # the broken pipe. This is handled below. 
+ # + if verbose: + print(f"Checking {url}") + + # Notes on curl args: + # + # --max-time to allow the server some seconds to respond + # --retry to allow multiple tries if transient errors (4xx for FTP, 5xx for HTTP) are found + # --silent to not print anything + # --fail to return non-zero exit codes for 404 (default is exit 0 on hitting 404) + # + # Need to run through bash explicitly to get the pipefail option, which in + # turn means running with shell=True + proc = subprocess.run( + f'/bin/bash -o pipefail -c "curl --retry 3 --max-time 10 --silent --fail {url} | head -c 10 > /dev/null"', + shell=True, + ) + return proc + + +def check_urls(config, verbose=False): + """ + Given a config filename or existing object, extract the URLs and check + them. + + Parameters + ---------- + + config : str or dict + Config object to inspect + + verbose : bool + Print which URL is being checked + + wait : int + Number of seconds to wait in between checking URLs, to avoid + too-many-connection issues + """ + failures = [] + urls = list(set(utils.flatten(pluck(config, "url")))) + for url in urls: + if url.startswith("file://"): + continue + + res = check_url(url, verbose=verbose) + + # we expect exit code 23 because we're triggering SIGPIPE with the + # "|head -c" above. + if res.returncode and res.returncode != 23: + failures.append( + f"FAIL with exit code {res.returncode}. Command was: {res.args}" + ) + if failures: + output = "\n ".join(failures) + raise ValueError( + f"Found problematic URLs. See https://ec.haxx.se/usingcurl/usingcurl-returns for explanation of exit codes.\n {output}" + ) + + +def check_all_urls_found(verbose=True): + """ + Recursively loads all references that can be included and checks them. + Reports out if there are any failures. 
+ """ + check_urls( + { + "include_references": [ + "include/reference_configs", + "test/test_configs", + "workflows/rnaseq/config", + "workflows/chipseq/config", + "workflows/references/config", + ] + }, + verbose=verbose, + ) + + +def gff2gtf(gff, gtf): + """ + Converts a gff file to a gtf format using the gffread function from Cufflinks + """ + if _is_gzipped(gff[0]): + shell("gzip -d -S .gz.0.tmp {gff} -c | gffread - -T -o- | gzip -c > {gtf}") + else: + shell("gffread {gff} -T -o- | gzip -c > {gtf}") From 8337b98654604d38bbbeae64b5e381cc267aa6a8 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 3 Jan 2025 19:24:34 +0000 Subject: [PATCH 028/196] cleanup patterns_targets --- lib/patterns_targets.py | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/lib/patterns_targets.py b/lib/patterns_targets.py index ec62d513..08fedb26 100644 --- a/lib/patterns_targets.py +++ b/lib/patterns_targets.py @@ -6,9 +6,8 @@ import os import collections import yaml -from . import common +from . import utils from . import chipseq -from . import helpers from snakemake.io import expand HERE = os.path.abspath(os.path.dirname(__file__)) @@ -53,11 +52,7 @@ def __init__(self, config, patterns, workdir=None): patterns = os.path.join(workdir, patterns) self.workdir = workdir - if isinstance(config, str): - self.path = config - - self.config = common.load_config( - common.resolve_config(config, workdir)) + self.config = config stranded = self.config.get('stranded', None) self.stranded = None @@ -71,12 +66,9 @@ def __init__(self, config, patterns, workdir=None): # Read the config file and extract all sort of useful bits. This mostly # uses the `common` module to handle the details. 
- self.config['references_dir'] = common.get_references_dir(self.config) - self.samples, self.sampletable = common.get_sampletable(self.config) - self.refdict, self.conversion_kwargs = common.references_dict(self.config) - self.organism = self.config['organism'] + self.samples, self.sampletable = utils.get_sampletable(self.config) self.patterns = yaml.load(open(patterns), Loader=yaml.FullLoader) - self.is_paired = helpers.detect_layout(self.sampletable) == 'PE' + self.is_paired = utils.detect_layout(self.sampletable) == 'PE' if self.is_paired: self.n = [1, 2] else: @@ -86,7 +78,7 @@ def __init__(self, config, patterns, workdir=None): else: self.is_sra = False - helpers.preflight(self.config) + ##########################utils.preflight(self.config) class RNASeqConfig(SeqConfig): def __init__(self, config, patterns, workdir=None): @@ -112,7 +104,7 @@ def __init__(self, config, patterns, workdir=None): self.fill = dict(sample=self.samples, n=self.n) self.patterns_by_aggregation = self.patterns.pop('patterns_by_aggregate', None) - self.targets = helpers.fill_patterns(self.patterns, self.fill) + self.targets = utils.fill_patterns(self.patterns, self.fill) # If the sampletable is from an sra metadata table, then we need to set the value of # 'orig_filename' for each of the samples to where the fastq was downloaded @@ -126,14 +118,14 @@ def __init__(self, config, patterns, workdir=None): self.fill_by_aggregation = dict( merged_bigwig_label=self.config['merged_bigwigs'].keys(), ) - self.targets_by_aggregation = helpers.fill_patterns( + self.targets_by_aggregation = utils.fill_patterns( self.patterns_by_aggregation, self.fill_by_aggregation ) self.targets.update(self.targets_by_aggregation) self.patterns.update(self.patterns_by_aggregation) - helpers.rnaseq_preflight(self) + #########################utils.rnaseq_preflight(self) class ChIPSeqConfig(SeqConfig): @@ -179,7 +171,7 @@ def __init__(self, config, patterns, workdir=None): ip_label=self.sampletable.label[ 
self.sampletable.antibody != 'input'].values ) - self.targets_by_sample = helpers.fill_patterns( + self.targets_by_sample = utils.fill_patterns( self.patterns_by_sample, self.fill_by_sample) self.targets.update(self.targets_by_sample) @@ -191,7 +183,7 @@ def __init__(self, config, patterns, workdir=None): self.fill_by_aggregation = dict( merged_bigwig_label=self.config['merged_bigwigs'].keys(), ) - self.targets_by_aggregation = helpers.fill_patterns( + self.targets_by_aggregation = utils.fill_patterns( self.patterns_by_aggregation, self.fill_by_aggregation ) @@ -254,11 +246,11 @@ def __init__(self, config, patterns, workdir=None): # targets as they're built. update_recursive( self.targets_for_peaks, - helpers.fill_patterns(_peak_patterns, _fill) + utils.fill_patterns(_peak_patterns, _fill) ) self.targets.update(self.targets_for_peaks) self.patterns.update(self.patterns_by_peaks) - helpers.chipseq_preflight(self) + utils.chipseq_preflight(self) From e8d16df366a3d2549e35122dcdcc6b303aa12aea Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 3 Jan 2025 19:24:57 +0000 Subject: [PATCH 029/196] rnaseq workflow --- workflows/rnaseq/Snakefile | 163 +++++++++++++++---------------------- 1 file changed, 66 insertions(+), 97 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 47c2a324..634f1d91 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -8,26 +8,17 @@ import pandas as pd HERE = str(Path(workflow.snakefile).parent) sys.path.insert(0, HERE + "/../..") -from lib import common, utils, helpers, aligners +from lib import utils from lib.utils import autobump, gb, hours from lib.patterns_targets import RNASeqConfig -# ---------------------------------------------------------------------------- -# -# Search for the string "NOTE:" to look for points of configuration that might -# be helpful for your experiment. 
-# -# ---------------------------------------------------------------------------- - -if not workflow.overwrite_configfiles: - configfile: 'config/config.yaml' +configfile: 'config/config.yaml' -config = common.load_config(config) +include: '../../rules/references.smk' -include: '../references/Snakefile' # Verify configuration of config and sampletable files -helpers.preflight(config) +################utils.preflight(config) c = RNASeqConfig(config, config.get('patterns', 'config/rnaseq_patterns.yaml')) @@ -47,19 +38,18 @@ def wrapper_for(path): # See "patterns and targets" in the documentation for what's going on here. final_targets = utils.flatten(( - utils.flatten(c.targets['fastqc']), - [c.targets['fastq_screen']], - [c.targets['rrna_percentages_table']], - [c.targets['multiqc']], - utils.flatten(c.targets['featurecounts']), - utils.flatten(c.targets['markduplicates']), - utils.flatten(c.targets['salmon']), - utils.flatten(c.targets['kallisto']), - utils.flatten(c.targets['preseq']), - utils.flatten(c.targets['rseqc']), - utils.flatten(c.targets['collectrnaseqmetrics']), - utils.flatten(c.targets['bigwig']), - utils.flatten(c.targets['samtools']), + c.targets['fastqc'], + c.targets['rrna_percentages_table'], + c.targets['multiqc'], + c.targets['featurecounts'], + c.targets['markduplicates'], + c.targets['salmon'], + c.targets['kallisto'], + c.targets['preseq'], + c.targets['rseqc'], + c.targets['collectrnaseqmetrics'], + c.targets['bigwig'], + c.targets['samtools'], )) if config.get('merged_bigwigs', None): @@ -146,9 +136,9 @@ config.setdefault('strand_check_reads', 1e5) rule sample_strand_check: input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['fastq']), - index=[c.refdict[c.organism][config['aligner']['tag']]['bowtie2']], - bed12=c.refdict[c.organism][config['gtf']['tag']]['bed12'] + fastq=utils.fill_r1_r2(c.sampletable, c.patterns['fastq']), + index=rules.bowtie2_index.output, + bed12=rules.conversion_bed12.output, output: 
strandedness=c.patterns['strand_check']['tsv'], bam=temporary(c.patterns['strand_check']['bam']), @@ -276,14 +266,14 @@ rule fastqc: wrapper_for('fastqc/wrapper.py') -if config['aligner']['index'] == 'hisat2': +if config['aligner'] == 'hisat2': rule hisat2: """ Map reads with HISAT2 """ input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index=[c.refdict[c.organism][config['aligner']['tag']]['hisat2']] + fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + index=rules.hisat2_index.output, output: bam=temporary(c.patterns['bam']) log: @@ -293,9 +283,11 @@ if config['aligner']['index'] == 'hisat2': mem_mb=gb(32), runtime=autobump(hours=8) run: - prefix = aligners.prefix_from_bowtie2_index(input.index) + + prefix = os.path.commonprefix(input.index).rstrip(".") sam = output.bam.replace('.bam', '.sam') + if c.is_paired: assert len(input.fastq) == 2 fastqs = '-1 {0} -2 {1} '.format(*input.fastq) @@ -319,7 +311,7 @@ if config['aligner']['index'] == 'hisat2': "&& rm {sam}" ) -if config['aligner']['index'].startswith('star'): +if config['aligner'].startswith('star'): # STAR can be run in 1-pass or 2-pass modes. 
Since we may be running it # more than once in almost the same way, we pull out the shell command here @@ -348,16 +340,16 @@ if config['aligner']['index'].startswith('star'): ) logfile_extensions = ['Log.progress.out', 'Log.out', 'Log.final.out', 'Log.std.out'] -if config['aligner']['index'] == 'star': +if config['aligner'] == 'star': rule star: """ Align with STAR (1-pass mode) """ input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index=[c.refdict[c.organism][config['aligner']['tag']]['star']], - annotation=c.refdict[c.organism][config['gtf']['tag']]['annotation'], + fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + index=rules.star_index.output, + annotation="references/annotation.gtf" output: bam=temporary(c.patterns['bam']), sjout=temporary(c.patterns['bam'].replace('.bam', '.star.SJ.out.tab')), @@ -384,16 +376,16 @@ if config['aligner']['index'] == 'star': shell('mkdir -p {outdir}/star_logs ' '&& mv {logfiles} {outdir}/star_logs') -if config['aligner']['index'] == 'star-twopass': +if config['aligner'] == 'star-twopass': rule star_pass1: """ First pass of alignment with STAR to get the junctions """ input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index=[c.refdict[c.organism][config['aligner']['tag']]['star']], - annotation=c.refdict[c.organism][config['gtf']['tag']]['annotation'], + fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + index=rules.star_index.output, + annotation="references/annotation.gtf" output: sjout=temporary(c.patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab')), log: @@ -430,9 +422,9 @@ if config['aligner']['index'] == 'star-twopass': """ input: sjout=expand(c.patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab'), sample=SAMPLES), - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index=[c.refdict[c.organism][config['aligner']['tag']]['star']], - annotation=c.refdict[c.organism][config['gtf']['tag']]['annotation'], + 
fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + index=rules.star_index.output, + annotation="references/annotation.gtf" output: bam=temporary(c.patterns['bam']), sjout=temporary(c.patterns['bam'].replace('.bam', '.star-pass2.SJ.out.tab')), @@ -474,7 +466,16 @@ rule rRNA: """ input: fastq=render_r1_only(c.patterns['cutadapt']), - index=[c.refdict[c.organism][config['rrna']['tag']]['bowtie2']] + index=multiext( + "references/bowtie2/rrna", + ".1.bt2", + ".2.bt2", + ".3.bt2", + ".4.bt2", + ".rev.1.bt2", + ".rev.2.bt2", + ".fa", + ), output: bam=temporary(c.patterns['rrna']['bam']) log: @@ -484,7 +485,7 @@ rule rRNA: mem_mb=gb(2), runtime=autobump(hours=2) run: - prefix = aligners.prefix_from_bowtie2_index(input.index) + prefix = os.path.commonprefix(input.index).rstrip(".") sam = output.bam.replace('.bam', '.sam') shell( @@ -553,43 +554,12 @@ rule bam_index: 'samtools index {input} {output}' -def fastq_screen_references(): - """ - Returns the Bowtie2 indexes for the configured references from the - `fastq_screen:` section of the config - """ - refs = {} - for i in config['fastq_screen']: - refs[i['label']] = c.refdict[i['organism']][i['tag']]['bowtie2'] - return refs - - -rule fastq_screen: - """ - Run fastq_screen to look for contamination from other genomes - """ - input: - **fastq_screen_references(), - fastq=render_r1_only(rules.cutadapt.output.fastq), - output: - txt=c.patterns['fastq_screen'] - log: - c.patterns['fastq_screen'] + '.log' - threads: 6 - resources: - mem_mb=gb(4), - runtime=autobump(hours=2) - params: subset=100000 - script: - wrapper_for('fastq_screen/wrapper.py') - - rule featurecounts: """ Count reads in annotations with featureCounts from the subread package """ input: - annotation=c.refdict[c.organism][config['gtf']['tag']]['annotation'], + annotation=rules.gtf.output, bam=c.targets['markduplicates']['bam'] output: counts='{sample_dir}/rnaseq_aggregation/featurecounts.txt' @@ -600,7 +570,7 @@ rule featurecounts: mem_mb=gb(16), 
runtime=autobump(hours=2) params: - strand_arg = helpers.strand_arg_lookup( + strand_arg = utils.strand_arg_lookup( c, { 'unstranded': '-s0 ', 'fr-firststrand': '-s2 ', @@ -640,10 +610,10 @@ rule rrna_libsizes_table: runtime=autobump(hours=2) run: def rrna_sample(f): - return helpers.extract_wildcards(c.patterns['rrna']['libsize'], f)['sample'] + return utils.extract_wildcards(c.patterns['rrna']['libsize'], f)['sample'] def sample(f): - return helpers.extract_wildcards(c.patterns['libsizes']['cutadapt'], f)['sample'] + return utils.extract_wildcards(c.patterns['libsizes']['cutadapt'], f)['sample'] def million(f): return float(open(f).read()) / 1e6 @@ -694,7 +664,6 @@ rule multiqc: utils.flatten(c.targets['markduplicates']) + utils.flatten(c.targets['salmon']) + utils.flatten(c.targets['rseqc']) + - utils.flatten(c.targets['fastq_screen']) + utils.flatten(c.targets['preseq']) + utils.flatten(c.targets['collectrnaseqmetrics']) + utils.flatten(c.targets['samtools']) @@ -762,7 +731,7 @@ rule collectrnaseqmetrics: """ input: bam=c.patterns['markduplicates']['bam'], - refflat=c.refdict[c.organism][config['gtf']['tag']]['refflat'] + refflat=rules.conversion_refflat.output, output: metrics=c.patterns['collectrnaseqmetrics']['metrics'], params: @@ -771,7 +740,7 @@ rule collectrnaseqmetrics: # config. 
java_args='-Xmx20g', # java_args='-Xmx2g', # [TEST SETTINGS -1] - strand_arg = helpers.strand_arg_lookup( + strand_arg = utils.strand_arg_lookup( c, { 'unstranded': 'STRAND=NONE ', 'fr-firststrand': 'STRAND=SECOND_READ_TRANSCRIPTION_STRAND ', @@ -822,12 +791,12 @@ rule salmon: Quantify reads coming from transcripts with Salmon """ input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index=c.refdict[c.organism][config['salmon']['tag']]['salmon'], + fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + index='references/salmon/versionInfo.json' output: c.patterns['salmon'] params: - index_dir=os.path.dirname(c.refdict[c.organism][config['salmon']['tag']]['salmon']), + index_dir=os.path.dirname('references/salmon/versionInfo.json'), outdir=os.path.dirname(c.patterns['salmon']) log: c.patterns['salmon'] + '.log' @@ -864,14 +833,14 @@ rule kallisto: Quantify reads coming from transcripts with Kallisto """ input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index=c.refdict[c.organism][config['kallisto']['tag']]['kallisto'], + fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + index='references/kallisto/transcripts.idx', output: c.patterns['kallisto'] params: - index_dir=os.path.dirname(c.refdict[c.organism][config['kallisto']['tag']]['kallisto']), + index_dir=os.path.dirname('references/kallisto/transcripts.idx'), outdir=os.path.dirname(c.patterns['kallisto']), - strand_arg = helpers.strand_arg_lookup( + strand_arg = utils.strand_arg_lookup( c, { 'unstranded': '', 'fr-firststrand': '--rf-stranded', @@ -913,7 +882,7 @@ rule rseqc_infer_experiment: """ input: bam=c.patterns['markduplicates']['bam'], - bed12=c.refdict[c.organism][config['gtf']['tag']]['bed12'] + bed12=rules.conversion_bed12.output, output: txt=c.patterns['rseqc']['infer_experiment'] log: @@ -931,7 +900,7 @@ rule rseqc_read_distribution: """ input: bam=c.patterns['markduplicates']['bam'], - 
bed12=c.refdict[c.organism][config['gtf']['tag']]['bed12'], + bed12=rules.conversion_bed12.output, output: txt=c.patterns['rseqc']['read_distribution'] log: @@ -985,7 +954,7 @@ rule bigwig_neg: log: c.patterns['bigwig']['neg'] + '.log' params: - strand_arg = helpers.strand_arg_lookup( + strand_arg = utils.strand_arg_lookup( c, { 'unstranded': '', 'fr-firststrand': '--filterRNAstrand reverse ', @@ -1019,7 +988,7 @@ rule bigwig_pos: log: c.patterns['bigwig']['pos'] + '.log' params: - strand_arg = helpers.strand_arg_lookup( + strand_arg = utils.strand_arg_lookup( c, { 'unstranded': '', 'fr-firststrand': '--filterRNAstrand forward ', @@ -1059,7 +1028,7 @@ if 'merged_bigwigs' in config: """ input: bigwigs=bigwigs_to_merge, - chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'], + chromsizes='references/genome.chromsizes' output: c.patterns['merged_bigwig'] log: From 36fd2e0f167c8c8d4ddec9449ffd9a427e8b0ae2 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 3 Jan 2025 21:38:54 +0000 Subject: [PATCH 030/196] specify references dir from config --- rules/references.smk | 78 +++++++++++++++++++++----------------- workflows/rnaseq/Snakefile | 19 +++++----- 2 files changed, 53 insertions(+), 44 deletions(-) diff --git a/rules/references.smk b/rules/references.smk index f4b104ff..157eeed9 100644 --- a/rules/references.smk +++ b/rules/references.smk @@ -7,12 +7,16 @@ sys.path.insert(0, HERE + "/../..") from lib.utils import autobump, gb, hours from lib import utils +REFERENCES = config.get('reference_dir', '../../references') + def default_postprocess(origfn, newfn): shell("mv {origfn} {newfn}") rule fasta: output: - temporary('references/genome.fa.gz') + temporary(REFERENCES + '/genome.fa.gz') + log: + REFERENCES + "/logs/genome.fa.gz.log" run: utils.download_and_postprocess( urls=config['fasta']['url'], @@ -24,7 +28,9 @@ rule fasta: rule gtf: output: - temporary('references/annotation.gtf.gz') + temporary(REFERENCES + '/annotation.gtf.gz') + log: + 
REFERENCES + "/logs/annotation.gtf.gz.log" run: utils.download_and_postprocess( urls=config['gtf']['url'], @@ -36,7 +42,9 @@ rule gtf: rule rrna: output: - temporary('references/rrna.fa.gz') + temporary(REFERENCES + '/rrna.fa.gz') + log: + REFERENCES + "/logs/rrna.fa.gz.log" run: utils.download_and_postprocess( urls=config['rrna']['url'], @@ -48,18 +56,18 @@ rule rrna: rule unzip: input: - "references/{prefix}.gz" + REFERENCES + '/{prefix}.gz' output: - "references/{prefix}" + REFERENCES + '/{prefix}' shell: 'gunzip -c {input} > {output}' rule bowtie2_index: input: - "references/{label}.fa", + REFERENCES + '/{label}.fa', output: multiext( - "references/bowtie2/{label}", + REFERENCES + '/bowtie2/{label}', ".1.bt2", ".2.bt2", ".3.bt2", @@ -69,7 +77,7 @@ rule bowtie2_index: ".fa", ), log: - "references/logs/bowtie2_{label}.log" + REFERENCES + '/logs/bowtie2_{label}.log' resources: runtime=autobump(hours=8), mem_mb=autobump(gb=32), @@ -90,12 +98,12 @@ rule bowtie2_index: rule star_index: input: - fasta='references/genome.fa', - gtf='references/annotation.gtf', + fasta=REFERENCES + '/genome.fa', + gtf=REFERENCES + '/annotation.gtf', output: - protected('references/star/Genome') + REFERENCES + '/star/Genome' log: - 'references/logs/star.log' + REFERENCES + '/logs/star.log' threads: 8 resources: @@ -131,10 +139,10 @@ rule star_index: rule hisat2_index: input: - "references/genome.fa", + REFERENCES + '/genome.fa', output: multiext( - "references/hisat2/genome", + REFERENCES + '/hisat2/genome', ".1.ht2", ".2.ht2", ".3.ht2", @@ -146,7 +154,7 @@ rule hisat2_index: ".fa", ) log: - "references/logs/hisat2.log" + REFERENCES + '/logs/hisat2.log' resources: runtime=autobump(hours=8), mem_mb=autobump(gb=32), @@ -168,10 +176,10 @@ rule hisat2_index: rule transcriptome_fasta: input: - fasta='references/genome.fa', - gtf='references/annotation.gtf', + fasta=REFERENCES + '/genome.fa', + gtf=REFERENCES + '/annotation.gtf', output: - 'references/transcriptome.fa' + REFERENCES + 
'/transcriptome.fa' resources: runtime=hours(1) shell: @@ -180,13 +188,13 @@ rule transcriptome_fasta: rule salmon_index: input: - 'references/transcriptome.fa' + REFERENCES + '/transcriptome.fa' output: - 'references/salmon/versionInfo.json' + REFERENCES + '/salmon/versionInfo.json' log: - 'references/logs/salmon.log' + REFERENCES + '/logs/salmon.log' params: - outdir='references/salmon' + outdir=REFERENCES + '/salmon' resources: mem_mb=gb(32), runtime=hours(2) @@ -202,11 +210,11 @@ rule salmon_index: rule kallisto_index: output: - 'references/kallisto/transcripts.idx', + REFERENCES + '/kallisto/transcripts.idx', input: - 'references/genome.fa' + REFERENCES + '/genome.fa' log: - 'references/logs/kallisto.log' + REFERENCES + '/logs/kallisto.log' resources: runtime=hours(2), mem_mb=gb(32), @@ -219,11 +227,11 @@ rule kallisto_index: rule conversion_refflat: input: - 'references/annotation.gtf' + REFERENCES + '/annotation.gtf' output: - protected('references/annotation.refflat') + REFERENCES + '/annotation.refflat' log: - 'references/logs/annotation.refflat.log' + REFERENCES + '/logs/annotation.refflat.log' resources: runtime=hours(2), mem_mb=gb(2) @@ -235,9 +243,9 @@ rule conversion_refflat: rule conversion_bed12: input: - 'references/annotation.gtf' + REFERENCES + '/annotation.gtf' output: - protected('references/annotation.bed12') + REFERENCES + '/annotation.bed12' resources: runtime=hours(2), mem_mb=gb(2) @@ -249,11 +257,11 @@ rule conversion_bed12: rule chromsizes: input: - 'references/genome.fa' + REFERENCES + '/genome.fa' output: - protected('references/genome.chromsizes') + REFERENCES + '/genome.chromsizes' log: - 'references/logs/genome.chromsizes.log' + REFERENCES + '/logs/genome.chromsizes.log' params: # NOTE: Be careful with the memory here; make sure you have enough # and/or it matches the resources you're requesting @@ -280,9 +288,9 @@ rule mappings: Creates gzipped TSV mapping between attributes in the GTF. 
""" input: - gtf='references/annotation.gtf' + gtf=REFERENCES + '/annotation.gtf' output: - protected('references/annotation.mapping.tsv.gz') + REFERENCES + '/annotation.mapping.tsv.gz' params: include_featuretypes=lambda wildcards, output: conversion_kwargs[output[0]].get('include_featuretypes', []) resources: diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 634f1d91..374a1437 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -16,6 +16,7 @@ configfile: 'config/config.yaml' include: '../../rules/references.smk' +REFERENCES = config.get('reference_dir', '../../references') # Verify configuration of config and sampletable files ################utils.preflight(config) @@ -349,7 +350,7 @@ if config['aligner'] == 'star': input: fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), index=rules.star_index.output, - annotation="references/annotation.gtf" + annotation=REFERENCES + "/annotation.gtf" output: bam=temporary(c.patterns['bam']), sjout=temporary(c.patterns['bam'].replace('.bam', '.star.SJ.out.tab')), @@ -385,7 +386,7 @@ if config['aligner'] == 'star-twopass': input: fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), index=rules.star_index.output, - annotation="references/annotation.gtf" + annotation=REFERENCES + "/annotation.gtf" output: sjout=temporary(c.patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab')), log: @@ -424,7 +425,7 @@ if config['aligner'] == 'star-twopass': sjout=expand(c.patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab'), sample=SAMPLES), fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), index=rules.star_index.output, - annotation="references/annotation.gtf" + annotation=REFERENCES + "/annotation.gtf" output: bam=temporary(c.patterns['bam']), sjout=temporary(c.patterns['bam'].replace('.bam', '.star-pass2.SJ.out.tab')), @@ -467,7 +468,7 @@ rule rRNA: input: fastq=render_r1_only(c.patterns['cutadapt']), index=multiext( - "references/bowtie2/rrna", + 
REFERENCES + "/bowtie2/rrna", ".1.bt2", ".2.bt2", ".3.bt2", @@ -792,11 +793,11 @@ rule salmon: """ input: fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index='references/salmon/versionInfo.json' + index=REFERENCES + "/salmon/versionInfo.json" output: c.patterns['salmon'] params: - index_dir=os.path.dirname('references/salmon/versionInfo.json'), + index_dir=os.path.dirname(REFERENCES + "/salmon/versionInfo.json"), outdir=os.path.dirname(c.patterns['salmon']) log: c.patterns['salmon'] + '.log' @@ -834,11 +835,11 @@ rule kallisto: """ input: fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index='references/kallisto/transcripts.idx', + index=REFERENCES + "/kallisto/transcripts.idx", output: c.patterns['kallisto'] params: - index_dir=os.path.dirname('references/kallisto/transcripts.idx'), + index_dir=os.path.dirname(REFERENCES + "/kallisto/transcripts.idx"), outdir=os.path.dirname(c.patterns['kallisto']), strand_arg = utils.strand_arg_lookup( c, { @@ -1028,7 +1029,7 @@ if 'merged_bigwigs' in config: """ input: bigwigs=bigwigs_to_merge, - chromsizes='references/genome.chromsizes' + chromsizes=REFERENCES + "/genome.chromsizes" output: c.patterns['merged_bigwig'] log: From c321ed3015d6fcba91d88e452849a1adc04ca1ba Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 3 Jan 2025 21:39:40 +0000 Subject: [PATCH 031/196] round of cleanup --- workflows/rnaseq/Snakefile | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 374a1437..3c251eb3 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -1,7 +1,6 @@ import os import sys from pathlib import Path -from textwrap import dedent import yaml import tempfile import pandas as pd @@ -462,9 +461,6 @@ if config['aligner'] == 'star-twopass': rule rRNA: - """ - Map reads with bowtie2 to the rRNA reference - """ input: fastq=render_r1_only(c.patterns['cutadapt']), index=multiext( @@ -508,9 +504,6 @@ rule rRNA: 
rule fastq_count: - """ - Count reads in a FASTQ file - """ input: fastq='{sample_dir}/{sample}/{sample}{suffix}.fastq.gz' output: @@ -524,9 +517,6 @@ rule fastq_count: rule bam_count: - """ - Count reads in a BAM file - """ input: bam='{sample_dir}/{sample}/{suffix}.bam' output: @@ -540,9 +530,6 @@ rule bam_count: rule bam_index: - """ - Index a BAM - """ input: bam='{prefix}.bam' output: From 32f43bc036d5f27e556b4c2cc1d0ce82b6f78f38 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 3 Jan 2025 21:39:55 +0000 Subject: [PATCH 032/196] better use of params --- workflows/rnaseq/Snakefile | 53 ++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 3c251eb3..cc66821f 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -216,20 +216,21 @@ rule cutadapt: resources: mem_mb=gb(2), runtime=autobump(hours=2) + params: + extra=( + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + ) + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " if c.is_paired else "" run: - - # NOTE: Change cutadapt params here if c.is_paired: shell( "cutadapt " "-o {output[0]} " "-p {output[1]} " - "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " - "--nextseq-trim 20 " - "--overlap 6 " - '-j {threads} ' - '--minimum-length 25 ' + "-j {threads} " + "{params.extra} " "{input.fastq[0]} " "{input.fastq[1]} " "&> {log}" @@ -238,11 +239,8 @@ rule cutadapt: shell( "cutadapt " "-o {output[0]} " - "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - "--nextseq-trim 20 " - "--overlap 6 " - '-j {threads} ' - '--minimum-length 25 ' + "-j {threads} " + "{params.extra} " "{input.fastq[0]} " "&> {log}" ) @@ -282,6 +280,8 @@ if config['aligner'] == 'hisat2': resources: mem_mb=gb(32), runtime=autobump(hours=8) + params: + extra="" run: prefix = os.path.commonprefix(input.index).rstrip(".") @@ -323,7 +323,9 @@ if 
config['aligner'].startswith('star'): '--readFilesIn {input.fastq} ' '--readFilesCommand zcat ' '--outFileNamePrefix {prefix} ' - + '{params.extra} ' + ) + STAR_PARAMS = ( # NOTE: The STAR docs indicate that the following parameters are # standard options for ENCODE long-RNA-seq pipeline. Comments are from # the STAR docs. @@ -359,6 +361,9 @@ if config['aligner'] == 'star': resources: mem_mb=gb(64), runtime=autobump(hours=8) + params: + extra=STAR_PARAMS + run: genomedir = os.path.dirname(input.index[0]) outdir = os.path.dirname(output[0]) @@ -394,6 +399,8 @@ if config['aligner'] == 'star-twopass': resources: mem_mb=gb(64), runtime=autobump(hours=8) + params: + extra=STAR_PARAMS run: genomedir = os.path.dirname(input.index[0]) outdir = os.path.dirname(output[0]) @@ -434,6 +441,8 @@ if config['aligner'] == 'star-twopass': resources: mem_mb=gb(64), runtime=autobump(hours=8) + params: + extra=STAR_PARAMS run: genomedir = os.path.dirname(input.index[0]) outdir = os.path.dirname(output[0]) @@ -481,6 +490,12 @@ rule rRNA: resources: mem_mb=gb(2), runtime=autobump(hours=2) + params: + extra=( + '-k 1 ' # NOTE: we only care if >=1 mapped + '--no-unal ' # NOTE: suppress unaligned reads + ) + run: prefix = os.path.commonprefix(input.index).rstrip(".") sam = output.bam.replace('.bam', '.sam') @@ -489,10 +504,9 @@ rule rRNA: "bowtie2 " "-x {prefix} " "-U {input.fastq} " - '-k 1 ' # NOTE: we only care if >=1 mapped - '--no-unal ' # NOTE: suppress unaligned reads "--threads {threads} " "-S {sam} " + "{params.extra} " "> {log} 2>&1" ) @@ -558,13 +572,12 @@ rule featurecounts: mem_mb=gb(16), runtime=autobump(hours=2) params: - strand_arg = utils.strand_arg_lookup( - c, { + strand_arg={ 'unstranded': '-s0 ', 'fr-firststrand': '-s2 ', 'fr-secondstrand': '-s1 ', - } - ) + }[config["stranded"]], + extra="" run: # NOTE: By default, we use -p for paired-end p_arg = '' From a9216b91897ff5f4d8aeffd6b868984fedbf1301 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sat, 4 Jan 2025 03:57:11 
+0000 Subject: [PATCH 033/196] try moving utils.py to common.smk --- lib/utils.py => rules/common.smk | 9 +++++++++ 1 file changed, 9 insertions(+) rename lib/utils.py => rules/common.smk (99%) diff --git a/lib/utils.py b/rules/common.smk similarity index 99% rename from lib/utils.py rename to rules/common.smk index fd8c4dba..d0401625 100644 --- a/lib/utils.py +++ b/rules/common.smk @@ -1178,3 +1178,12 @@ def gff2gtf(gff, gtf): shell("gzip -d -S .gz.0.tmp {gff} -c | gffread - -T -o- | gzip -c > {gtf}") else: shell("gffread {gff} -T -o- | gzip -c > {gtf}") + + +def wrapper_for(path): + return 'file:' + os.path.join('../..','wrappers', 'wrappers', path) + +def detect_sra(sampletable): + return 'Run' in self.sampletable.columns and any(self.sampletable['Run'].str.startswith('SRR')) + +# vim: ft=python From 300e73d9d868492c92d286b4de8706109dc2549b Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sat, 4 Jan 2025 03:57:28 +0000 Subject: [PATCH 034/196] add strand_check and sra rules --- rules/sra.smk | 34 +++++++++++++++++++++ rules/strand_check.smk | 69 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 rules/sra.smk create mode 100644 rules/strand_check.smk diff --git a/rules/sra.smk b/rules/sra.smk new file mode 100644 index 00000000..861b5098 --- /dev/null +++ b/rules/sra.smk @@ -0,0 +1,34 @@ + +sampletable['orig_filename'] = expand( + 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=1) + +if is_paired: + sampletable['orig_filename_R2'] = expand( + 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=2) + +rule fastq_dump: + output: + fastq=expand('original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', n=n) + log: + 'original_data/sra_samples/{sample}/{sample}.fastq.gz.log' + params: + is_paired=is_paired, + sampletable=_st, + # extra="-X 100000", # [TEST SETTINGS] + resources: + mem_mb=gb(1), + disk_mb=autobump(gb=1), + runtime=autobump(hours=2) + run: + _st = 
sampletable.set_index(sampletable.columns[0]) + srr = _st.loc[wildcards.sample, "Run"] + extra = params.get("extra", "") + if is_paired: + shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") + shell("mv {srr}_1.fastq.gz {output[0]}") + shell("mv {srr}_2.fastq.gz {output[1]}") + else: + shell("fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp") + shell("mv {output[0]}.tmp {output[0]}") + +# vim: ft=snakemake diff --git a/rules/strand_check.smk b/rules/strand_check.smk new file mode 100644 index 00000000..625ba3e2 --- /dev/null +++ b/rules/strand_check.smk @@ -0,0 +1,69 @@ + +rule sample_strand_check: + input: + fastq=fill_r1_r2(c.sampletable, c.patterns['fastq']), + index=rules.bowtie2_index.output, + bed12=rules.conversion_bed12.output, + output: + strandedness='strand_check/{sample}/{sample}.strandedness', + bam=temporary('strand_check/{sample}/{sample}.strandedness.bam'), + bai=temporary('strand_check/{sample}/{sample}.strandedness.bam.bai'), + fastqs=temporary(expand('strand_check/{sample}/{sample}_R{n}.strandedness.fastq', sample=SAMPLES, n=n)), + log: + 'strand_check/{sample}/{sample}.strandedness.log' + threads: 6 + resources: + mem_mb=gb(8), + runtime=autobump(hours=2) + run: + prefix = aligners.prefix_from_bowtie2_index(input.index) + nreads = int(config['strand_check_reads']) * 4 + if c.is_paired: + assert len(input.fastq) == 2 + assert len(output.fastqs) == 2 + shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') + shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}') + fastqs = f'-1 {output.fastqs[0]} -2 {output.fastqs[1]} ' + else: + assert len(input.fastq) == 1 + assert len(output.fastqs) == 1 + shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') + fastqs = f'-U {output.fastqs[0]} ' + shell( + "bowtie2 " + "-x {prefix} " + "{fastqs} " + '--no-unal ' + "--threads {threads} 2> {log} " + "| samtools view -Sb - " + "| samtools 
sort - -o {output.bam} " + ) + shell("samtools index {output.bam}") + shell( + 'infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}' + ) + +rule strand_check: + input: + expand('strand_check/{sample}/{sample}.strandedness', sample=SAMPLES) + output: + html='strand_check/strandedness.html', + filelist=temporary('strand_check/filelist') + log: + 'strand_check/strandedness.log' + resources: + mem_mb=gb(1), + runtime=autobump(hours=2) + run: + with open(output.filelist, 'w') as fout: + for i in input: + fout.write(i + '\n') + shell( + 'multiqc ' + '--force ' + '--module rseqc ' + '--file-list {output.filelist} ' + '--filename {output.html} &> {log}' + ) + +# vim: ft=snakemake From 407332e443a0c3c44dc3ce4df16f2798119b8c43 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sat, 4 Jan 2025 03:57:47 +0000 Subject: [PATCH 035/196] mega refactor, still only partway done.... --- workflows/rnaseq/Snakefile | 358 ++++++++++--------------------------- 1 file changed, 91 insertions(+), 267 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index cc66821f..84b9bdd4 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -1,217 +1,78 @@ import os -import sys -from pathlib import Path import yaml import tempfile import pandas as pd -HERE = str(Path(workflow.snakefile).parent) -sys.path.insert(0, HERE + "/../..") -from lib import utils -from lib.utils import autobump, gb, hours -from lib.patterns_targets import RNASeqConfig - configfile: 'config/config.yaml' include: '../../rules/references.smk' +include: '../../rules/common.smk' REFERENCES = config.get('reference_dir', '../../references') -# Verify configuration of config and sampletable files -################utils.preflight(config) +sampletable = pd.read_table(config["sampletable"], sep="\t") +_st = c.sampletable.set_index(c.sampletable.columns[0]) +is_paired = detect_layout(sampletable) == "PE" +is_sra = detect_sra(sampletable) +n = ["1", "2"] if is_paired else 
["1"] +SAMPLES = sampletable.iloc[:, 0].values -c = RNASeqConfig(config, config.get('patterns', 'config/rnaseq_patterns.yaml')) +# TODO: moved utils.py over to common.smk; not sure if this means that postprocessing will fail or not... -SAMPLES = c.sampletable.iloc[:, 0].values wildcard_constraints: n = '[1,2]', sample = '|'.join(SAMPLES) +localrules: symlinks, symlink_targets -def wrapper_for(path): - return 'file:' + os.path.join('../..','wrappers', 'wrappers', path) - - -# ---------------------------------------------------------------------------- -# RULES -# ---------------------------------------------------------------------------- - -# See "patterns and targets" in the documentation for what's going on here. -final_targets = utils.flatten(( - c.targets['fastqc'], - c.targets['rrna_percentages_table'], - c.targets['multiqc'], - c.targets['featurecounts'], - c.targets['markduplicates'], - c.targets['salmon'], - c.targets['kallisto'], - c.targets['preseq'], - c.targets['rseqc'], - c.targets['collectrnaseqmetrics'], - c.targets['bigwig'], - c.targets['samtools'], -)) - -if config.get('merged_bigwigs', None): - final_targets.extend(utils.flatten(c.targets['merged_bigwig'])) - +rule all: + input: + 'data/rnaseq_aggregation/multiqc.html', -def render_r1_r2(pattern): - return expand(pattern, sample='{sample}', n=c.n) +if is_sra: + include: '../../rules/sra.smk' -def render_r1_only(pattern): - return expand(pattern, sample='{sample}', n=1) -rule targets: +def orig_for_sample(wc): """ - Final targets to create + Given a sample, returns either one or two original fastq files + depending on whether the library was single- or paired-end. """ - input: final_targets - -if c.is_sra: - - # Convert the sampletable to be indexed by the first column, for - # convenience in generating the input/output filenames. 
- _st = c.sampletable.set_index(c.sampletable.columns[0]) - - rule fastq_dump: - output: - fastq=render_r1_r2(c.patterns['sra_fastq']) - log: - render_r1_only(c.patterns['sra_fastq'])[0] + '.log' - params: - is_paired=c.is_paired, - sampletable=_st, - # limit = 100000, # [TEST SETTINGS] - resources: - mem_mb=gb(1), - disk_mb=autobump(gb=1), - runtime=autobump(hours=2) - conda: - '../../wrappers/wrappers/fastq-dump/environment.yaml' - script: - wrapper_for('fastq-dump/wrapper.py') - -if 'orig_filename' in c.sampletable.columns: - - localrules: symlinks, symlink_targets - - # Convert the sampletable to be indexed by the first column, for - # convenience in generating the input/output filenames. - _st = c.sampletable.set_index(c.sampletable.columns[0]) - - def orig_for_sample(wc): - """ - Given a sample, returns either one or two original fastq files - depending on whether the library was single- or paired-end. - """ - if c.is_paired: - return _st.loc[wc.sample, ['orig_filename', 'orig_filename_R2']] - return _st.loc[wc.sample, ['orig_filename']] - - - rule symlinks: - """ - Symlinks files over from original filename - """ - input: - orig_for_sample - output: - render_r1_r2(c.patterns['fastq']) - threads: 1 - resources: - mem_mb=100, - runtime=10, - run: - assert len(output) == len(input), (input, output) - for src, linkname in zip(input, output): - utils.make_relative_symlink(src, linkname) - + if is_paired: + return _st.loc[wc.sample, ['orig_filename', 'orig_filename_R2']] + return _st.loc[wc.sample, ['orig_filename']] - rule symlink_targets: - input: c.targets['fastq'] -# This can be set at the command line with --config strand_check_reads=1000 -config.setdefault('strand_check_reads', 1e5) - -rule sample_strand_check: +rule symlinks: input: - fastq=utils.fill_r1_r2(c.sampletable, c.patterns['fastq']), - index=rules.bowtie2_index.output, - bed12=rules.conversion_bed12.output, + lambda wc: _st.loc[wc.sample, ['orig_filename', 'orig_filename_R2']] if is_paired + else 
_st.loc[wc.sample, ['orig_filename']] output: - strandedness=c.patterns['strand_check']['tsv'], - bam=temporary(c.patterns['strand_check']['bam']), - idx=temporary(c.patterns['strand_check']['bam'] + '.bai'), - fastqs=temporary(render_r1_r2(c.patterns['strand_check']['fastq'])), - log: - c.patterns['strand_check']['tsv'] + '.log' - threads: 6 + expand('data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=n) + threads: 1 resources: - mem_mb=gb(8), - runtime=autobump(hours=2) + mem_mb=100, + runtime=10, run: - prefix = aligners.prefix_from_bowtie2_index(input.index) - nreads = int(config['strand_check_reads']) * 4 - if c.is_paired: - assert len(input.fastq) == 2 - assert len(output.fastqs) == 2 - shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') - shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}') - fastqs = f'-1 {output.fastqs[0]} -2 {output.fastqs[1]} ' - else: - assert len(input.fastq) == 1 - assert len(output.fastqs) == 1 - shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') - fastqs = f'-U {output.fastqs[0]} ' - shell( - "bowtie2 " - "-x {prefix} " - "{fastqs} " - '--no-unal ' - "--threads {threads} 2> {log} " - "| samtools view -Sb - " - "| samtools sort - -o {output.bam} " - ) - shell("samtools index {output.bam}") - shell( - 'infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}' - ) + assert len(output) == len(input), (input, output) + for src, linkname in zip(input, output): + make_relative_symlink(src, linkname) -rule strand_check: - input: - expand(c.patterns['strand_check']['tsv'], sample=SAMPLES) - output: - html='strand_check/strandedness.html', - filelist=temporary('strand_check/filelist') - log: - 'strand_check/strandedness.log' - resources: - mem_mb=gb(1), - runtime=autobump(hours=2) - run: - with open(output.filelist, 'w') as fout: - for i in input: - fout.write(i + '\n') - shell( - 'multiqc ' - 
'--force ' - '--module rseqc ' - '--file-list {output.filelist} ' - '--filename {output.html} &> {log}' - ) +rule symlink_targets: + input: c.targets['fastq'] + +# This can be set at the command line with --config strand_check_reads=1000 +config.setdefault('strand_check_reads', 1e5) +include: '../../rules/strand_check.smk' rule cutadapt: - """ - Run cutadapt - """ input: - fastq=render_r1_r2(c.patterns['fastq']) + fastq=expand('data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=n) output: - fastq=render_r1_r2(c.patterns['cutadapt']) + fastq=expand('data/rnaseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz', sample=SAMPLES, n=n) log: - render_r1_r2(c.patterns['cutadapt'])[0] + '.log' + 'data/rnaseq_samples/{sample}/{sample}_R1.fastq.gz' threads: 6 resources: mem_mb=gb(2), @@ -222,7 +83,7 @@ rule cutadapt: "--overlap 6 " "--minimum-length 25 " "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - ) + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " if c.is_paired else "" + ) + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " if is_paired else "" run: if c.is_paired: shell( @@ -245,11 +106,8 @@ rule cutadapt: "&> {log}" ) - +# TODO: rm wrapper rule fastqc: - """ - Run FastQC - """ input: '{sample_dir}/{sample}/{sample}{suffix}' threads: @@ -266,11 +124,14 @@ rule fastqc: if config['aligner'] == 'hisat2': rule hisat2: - """ - Map reads with HISAT2 - """ input: - fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + # TODO: make sure this works + fastq=( + 'data/rnaseq_samples/{sample}/{sample}_R1.fastq.gz', + 'data/rnaseq_samples/{sample}/{sample}_R2.fastq.gz', + ) if is_paired else ( + 'data/rnaseq_samples/{sample}/{sample}_R1.fastq.gz', + ), index=rules.hisat2_index.output, output: bam=temporary(c.patterns['bam']) @@ -349,7 +210,7 @@ if config['aligner'] == 'star': Align with STAR (1-pass mode) """ input: - fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=fill_r1_r2(c.sampletable, c.patterns['cutadapt']), index=rules.star_index.output, 
annotation=REFERENCES + "/annotation.gtf" output: @@ -388,7 +249,7 @@ if config['aligner'] == 'star-twopass': First pass of alignment with STAR to get the junctions """ input: - fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=fill_r1_r2(c.sampletable, c.patterns['cutadapt']), index=rules.star_index.output, annotation=REFERENCES + "/annotation.gtf" output: @@ -429,7 +290,7 @@ if config['aligner'] == 'star-twopass': """ input: sjout=expand(c.patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab'), sample=SAMPLES), - fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=fill_r1_r2(c.sampletable, c.patterns['cutadapt']), index=rules.star_index.output, annotation=REFERENCES + "/annotation.gtf" output: @@ -611,10 +472,10 @@ rule rrna_libsizes_table: runtime=autobump(hours=2) run: def rrna_sample(f): - return utils.extract_wildcards(c.patterns['rrna']['libsize'], f)['sample'] + return extract_wildcards(c.patterns['rrna']['libsize'], f)['sample'] def sample(f): - return utils.extract_wildcards(c.patterns['libsizes']['cutadapt'], f)['sample'] + return extract_wildcards(c.patterns['libsizes']['cutadapt'], f)['sample'] def million(f): return float(open(f).read()) / 1e6 @@ -651,27 +512,38 @@ rule rrna_libsizes_table: rule multiqc: - """ - Aggregate various QC stats and logs into a single HTML report with MultiQC - """ - # NOTE: if you add more rules and want MultiQC to pick up the output, then - # add outputs from those rules to the inputs here. 
input: files=( - utils.flatten(c.targets['fastqc']) + - utils.flatten(c.targets['rrna_percentages_yaml']) + - utils.flatten(c.targets['cutadapt']) + - utils.flatten(c.targets['featurecounts']) + - utils.flatten(c.targets['markduplicates']) + - utils.flatten(c.targets['salmon']) + - utils.flatten(c.targets['rseqc']) + - utils.flatten(c.targets['preseq']) + - utils.flatten(c.targets['collectrnaseqmetrics']) + - utils.flatten(c.targets['samtools']) + expand('data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=n), + expand( + 'data/rnaseq_samples/{sample}/fastqc/{sample}_R1{kind}.fastq.gz_fastqc.zip', + sample=SAMPLES, kind=["", ".cutadapt", ".cutadapt.bam"] + ), + expand( + 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups{ext}', + sample=SAMPLES, ext=['.bam', '.bam.bai'] + ), + expand( + 'data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf', + sample=SAMPLES + ), + expand('data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5', sample=SAMPLES), + expand('data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt', sample=SAMPLES), + expand('data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt', sample=SAMPLES), + expand('data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt', sample=SAMPLES), + expand('data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics', sample=SAMPLES), + expand('data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig', sample=SAMPLES, dir=["pos", "neg"]), + expand('data/rnaseq_samples/{sample}/idxstat_{sample}.txt', sample=SAMPLES), + expand('data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat', sample=SAMPLES), + expand('data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats', sample=SAMPLES), + 'data/rnaseq_aggregation/rrna_percentages_table.tsv', + 'data/rnaseq_aggregation/featurecounts.txt', ), config='config/multiqc_config.yaml' - output: c.targets['multiqc'] - log: c.targets['multiqc'][0] + '.log' + output: + 
'data/rnaseq_aggregation/multiqc.html' + log: + 'data/rnaseq_aggregation/multiqc.log' threads: 1 resources: mem_mb=gb(2), @@ -741,13 +613,11 @@ rule collectrnaseqmetrics: # config. java_args='-Xmx20g', # java_args='-Xmx2g', # [TEST SETTINGS -1] - strand_arg = utils.strand_arg_lookup( - c, { + strand_arg={ 'unstranded': 'STRAND=NONE ', 'fr-firststrand': 'STRAND=SECOND_READ_TRANSCRIPTION_STRAND ', 'fr-secondstrand': 'STRAND=FIRST_READ_TRANSCRIPTION_STRAND ', - } - ) + }[config["stranded"] log: c.patterns['collectrnaseqmetrics']['metrics'] + '.log' threads: 1 @@ -792,7 +662,7 @@ rule salmon: Quantify reads coming from transcripts with Salmon """ input: - fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=fill_r1_r2(c.sampletable, c.patterns['cutadapt']), index=REFERENCES + "/salmon/versionInfo.json" output: c.patterns['salmon'] @@ -834,20 +704,18 @@ rule kallisto: Quantify reads coming from transcripts with Kallisto """ input: - fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=fill_r1_r2(c.sampletable, c.patterns['cutadapt']), index=REFERENCES + "/kallisto/transcripts.idx", output: c.patterns['kallisto'] params: index_dir=os.path.dirname(REFERENCES + "/kallisto/transcripts.idx"), outdir=os.path.dirname(c.patterns['kallisto']), - strand_arg = utils.strand_arg_lookup( - c, { + strand_arg={ 'unstranded': '', 'fr-firststrand': '--rf-stranded', 'fr-secondstrand': '--fr-stranded', - } - ) + }[config["stranded"] log: c.patterns['kallisto'] + '.log' threads: @@ -955,13 +823,11 @@ rule bigwig_neg: log: c.patterns['bigwig']['neg'] + '.log' params: - strand_arg = utils.strand_arg_lookup( - c, { + strand_arg = { 'unstranded': '', 'fr-firststrand': '--filterRNAstrand reverse ', 'fr-secondstrand': '--filterRNAstrand forward ', - } - ) + }[config["stranded"] run: shell( 'bamCoverage ' @@ -989,13 +855,11 @@ rule bigwig_pos: log: c.patterns['bigwig']['pos'] + '.log' params: - strand_arg = utils.strand_arg_lookup( - c, { + strand_arg={ 
'unstranded': '', 'fr-firststrand': '--filterRNAstrand forward ', 'fr-secondstrand': '--filterRNAstrand reverse ', - } - ) + }[config["stranded"] run: shell( 'bamCoverage ' @@ -1021,46 +885,6 @@ def bigwigs_to_merge(wc): sample=neg_labels) return pos_bigwigs + neg_bigwigs -if 'merged_bigwigs' in config: - rule merge_bigwigs: - """ - Merge together bigWigs as specified in the config ("merged_bigwigs" - section). - """ - input: - bigwigs=bigwigs_to_merge, - chromsizes=REFERENCES + "/genome.chromsizes" - output: - c.patterns['merged_bigwig'] - log: - c.patterns['merged_bigwig'] + '.log' - resources: - mem_mb=gb(16), - runtime=autobump(hours=2) - script: - wrapper_for('average-bigwigs/wrapper.py') - - -rule rnaseq_rmarkdown: - """ - Run and render the RMarkdown file that performs differential expression - """ - input: - featurecounts=utils.flatten(c.targets['featurecounts']), - salmon=utils.flatten(c.targets['salmon']), - - # NOTE: the Rmd will likely need heavy editing depending on the project. 
- rmd='downstream/rnaseq.Rmd', - sampletable=config['sampletable'] - output: - 'downstream/rnaseq.html' - log: - 'downstream/rnaseq.log' - shell: - 'Rscript -e ' - '''"rmarkdown::render('{input.rmd}')" ''' - '> {log} 2>&1' - # [TEST_SETTINGS -1] rule flagstat: input: From e3308bf08e897df00120854e01691824b412627d Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 6 Jan 2025 19:28:27 -0500 Subject: [PATCH 036/196] back to utils.py --- rules/common.smk => lib/utils.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) rename rules/common.smk => lib/utils.py (98%) diff --git a/rules/common.smk b/lib/utils.py similarity index 98% rename from rules/common.smk rename to lib/utils.py index d0401625..f1a97c79 100644 --- a/rules/common.smk +++ b/lib/utils.py @@ -18,6 +18,12 @@ # Small helper functions +def render_r1_r2(pattern): + return expand(pattern, sample='{sample}', n=c.n) + +def render_r1_only(pattern): + return expand(pattern, sample='{sample}', n=1) + def resolve_name(name): """ @@ -30,14 +36,14 @@ def resolve_name(name): parts_copy = parts[:] while parts_copy: try: - module = __import__(".".join(parts_copy)) + module_ = __import__(".".join(parts_copy)) break except ImportError: del parts_copy[-1] if not parts_copy: raise parts = parts[1:] - obj = module + obj = module_ for part in parts: obj = getattr(obj, part) return obj @@ -559,10 +565,10 @@ def detect_layout(sampletable): p = sampletable.iloc[is_pe, 0].to_list() s = sampletable.iloc[[not i for i in is_pe], 0].to_list() if len(p) > len(s): - report = f"SE samples: {s}" + report_ = f"SE samples: {s}" else: - report = f"PE samples: {p}" - raise ValueError(f"Only a single layout (SE or PE) is supported. {report}") + report_ = f"PE samples: {p}" + raise ValueError(f"Only a single layout (SE or PE) is supported. 
{report_}") def fill_patterns(patterns, fill, combination=product): @@ -1184,6 +1190,6 @@ def wrapper_for(path): return 'file:' + os.path.join('../..','wrappers', 'wrappers', path) def detect_sra(sampletable): - return 'Run' in self.sampletable.columns and any(self.sampletable['Run'].str.startswith('SRR')) + return 'Run' in sampletable.columns and any(sampletable['Run'].str.startswith('SRR')) # vim: ft=python From 1b62efc9ab904e2106eeaca4990c57befb597bed Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 6 Jan 2025 19:28:44 -0500 Subject: [PATCH 037/196] rm libsizes table from multiqc --- workflows/rnaseq/config/multiqc_config.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/rnaseq/config/multiqc_config.yaml b/workflows/rnaseq/config/multiqc_config.yaml index 3e291495..0fe650a7 100644 --- a/workflows/rnaseq/config/multiqc_config.yaml +++ b/workflows/rnaseq/config/multiqc_config.yaml @@ -53,7 +53,6 @@ module_order: - '*.cutadapt.fastq.gz_fastqc.zip' path_filters: - '*.fastq.gz_fastqc.zip' - - libsizes_table - rrna_percentages_table - cutadapt - fastqc: From 16d8489d66e052c11930bacbfb741639a9858565 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 6 Jan 2025 19:30:07 -0500 Subject: [PATCH 038/196] use patterns --- workflows/rnaseq/Snakefile | 354 ++++++++++++++++--------------------- 1 file changed, 156 insertions(+), 198 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 84b9bdd4..1c041d22 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -1,23 +1,27 @@ +import sys import os import yaml import tempfile import pandas as pd +sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") +from lib import utils +from lib.utils import autobump, gb, hours + + configfile: 'config/config.yaml' include: '../../rules/references.smk' -include: '../../rules/common.smk' REFERENCES = config.get('reference_dir', '../../references') - sampletable = pd.read_table(config["sampletable"], sep="\t") 
-_st = c.sampletable.set_index(c.sampletable.columns[0]) -is_paired = detect_layout(sampletable) == "PE" -is_sra = detect_sra(sampletable) +sampletable = sampletable.set_index(sampletable.columns[0], drop=False) +is_paired = utils.detect_layout(sampletable) == "PE" +is_sra = utils.detect_sra(sampletable) n = ["1", "2"] if is_paired else ["1"] SAMPLES = sampletable.iloc[:, 0].values +patterns = yaml.safe_load(open('config/rnaseq_patterns.yaml')) -# TODO: moved utils.py over to common.smk; not sure if this means that postprocessing will fail or not... wildcard_constraints: n = '[1,2]', @@ -27,28 +31,18 @@ localrules: symlinks, symlink_targets rule all: input: - 'data/rnaseq_aggregation/multiqc.html', + patterns["multiqc"] if is_sra: include: '../../rules/sra.smk' -def orig_for_sample(wc): - """ - Given a sample, returns either one or two original fastq files - depending on whether the library was single- or paired-end. - """ - if is_paired: - return _st.loc[wc.sample, ['orig_filename', 'orig_filename_R2']] - return _st.loc[wc.sample, ['orig_filename']] - - rule symlinks: input: - lambda wc: _st.loc[wc.sample, ['orig_filename', 'orig_filename_R2']] if is_paired - else _st.loc[wc.sample, ['orig_filename']] + lambda wc: sampletable.loc[wc.sample, ['orig_filename', 'orig_filename_R2']] if is_paired + else sampletable.loc[wc.sample, ['orig_filename']] output: - expand('data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=n) + expand(patterns["fastq"], n=n, allow_missing=True) threads: 1 resources: mem_mb=100, @@ -56,23 +50,26 @@ rule symlinks: run: assert len(output) == len(input), (input, output) for src, linkname in zip(input, output): - make_relative_symlink(src, linkname) + utils.make_relative_symlink(src, linkname) rule symlink_targets: - input: c.targets['fastq'] + input: + expand('data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=n) # This can be set at the command line with --config strand_check_reads=1000 
config.setdefault('strand_check_reads', 1e5) -include: '../../rules/strand_check.smk' + +# TODO: re-enable +# include: '../../rules/strand_check.smk' rule cutadapt: input: - fastq=expand('data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=n) + fastq=expand(patterns["fastq"], n=n, allow_missing=True) output: - fastq=expand('data/rnaseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz', sample=SAMPLES, n=n) + fastq=expand(patterns["cutadapt"], n=n, allow_missing=True) log: - 'data/rnaseq_samples/{sample}/{sample}_R1.fastq.gz' + 'data/rnaseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log' threads: 6 resources: mem_mb=gb(2), @@ -85,7 +82,7 @@ rule cutadapt: "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " ) + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " if is_paired else "" run: - if c.is_paired: + if is_paired: shell( "cutadapt " "-o {output[0]} " @@ -119,24 +116,18 @@ rule fastqc: mem_mb=gb(8), runtime=autobump(hours=2) script: - wrapper_for('fastqc/wrapper.py') + utils.wrapper_for('fastqc/wrapper.py') if config['aligner'] == 'hisat2': rule hisat2: input: - # TODO: make sure this works - fastq=( - 'data/rnaseq_samples/{sample}/{sample}_R1.fastq.gz', - 'data/rnaseq_samples/{sample}/{sample}_R2.fastq.gz', - ) if is_paired else ( - 'data/rnaseq_samples/{sample}/{sample}_R1.fastq.gz', - ), + fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=rules.hisat2_index.output, output: - bam=temporary(c.patterns['bam']) + bam=temporary(patterns['bam']) log: - c.patterns['bam'] + '.log' + patterns['bam'] + '.log' threads: 6 resources: mem_mb=gb(32), @@ -144,12 +135,10 @@ if config['aligner'] == 'hisat2': params: extra="" run: - prefix = os.path.commonprefix(input.index).rstrip(".") sam = output.bam.replace('.bam', '.sam') - - if c.is_paired: + if is_paired: assert len(input.fastq) == 2 fastqs = '-1 {0} -2 {1} '.format(*input.fastq) else: @@ -172,6 +161,8 @@ if config['aligner'] == 'hisat2': "&& rm {sam}" ) +# TODO: star has lots of rules. 
Better to be in rules/aligner.smk? + if config['aligner'].startswith('star'): # STAR can be run in 1-pass or 2-pass modes. Since we may be running it @@ -210,14 +201,14 @@ if config['aligner'] == 'star': Align with STAR (1-pass mode) """ input: - fastq=fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=rules.star_index.output, - annotation=REFERENCES + "/annotation.gtf" + annotation=f"{REFERENCES}/annotation.gtf" output: - bam=temporary(c.patterns['bam']), - sjout=temporary(c.patterns['bam'].replace('.bam', '.star.SJ.out.tab')), + bam=temporary(patterns['bam']), + sjout=temporary(patterns['bam'].replace('.bam', '.star.SJ.out.tab')), log: - c.patterns['bam'].replace('.bam', '.star.bam.log') + patterns['bam'].replace('.bam', '.star.bam.log') threads: 16 resources: mem_mb=gb(64), @@ -249,13 +240,13 @@ if config['aligner'] == 'star-twopass': First pass of alignment with STAR to get the junctions """ input: - fastq=fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=rules.star_index.output, - annotation=REFERENCES + "/annotation.gtf" + annotation=f"{REFERENCES}/annotation.gtf" output: - sjout=temporary(c.patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab')), + sjout=temporary(patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab')), log: - c.patterns['bam'].replace('.bam', '.star-pass1.bam.log') + patterns['bam'].replace('.bam', '.star-pass1.bam.log') threads: 16 resources: mem_mb=gb(64), @@ -289,15 +280,15 @@ if config['aligner'] == 'star-twopass': samples to get the final BAM """ input: - sjout=expand(c.patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab'), sample=SAMPLES), - fastq=fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=rules.star_index.output, - annotation=REFERENCES + "/annotation.gtf" + annotation=f"{REFERENCES}/annotation.gtf", + 
sjout=expand(patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab'), sample=SAMPLES), output: - bam=temporary(c.patterns['bam']), - sjout=temporary(c.patterns['bam'].replace('.bam', '.star-pass2.SJ.out.tab')), + bam=temporary(patterns['bam']), + sjout=temporary(patterns['bam'].replace('.bam', '.star-pass2.SJ.out.tab')), log: - c.patterns['bam'].replace('.bam', '.star-pass2.bam.log') + patterns['bam'].replace('.bam', '.star-pass2.bam.log') threads: 16 resources: mem_mb=gb(64), @@ -332,9 +323,9 @@ if config['aligner'] == 'star-twopass': rule rRNA: input: - fastq=render_r1_only(c.patterns['cutadapt']), + fastq=expand(patterns["cutadapt"], n=1, allow_missing=True), # currently only R1 index=multiext( - REFERENCES + "/bowtie2/rrna", + f"{REFERENCES}/bowtie2/rrna", ".1.bt2", ".2.bt2", ".3.bt2", @@ -344,9 +335,9 @@ rule rRNA: ".fa", ), output: - bam=temporary(c.patterns['rrna']['bam']) + bam=temporary(patterns['rrna']['bam']) log: - c.patterns['rrna']['bam'] + '.log' + patterns['rrna']['bam'] + '.log' threads: 6 resources: mem_mb=gb(2), @@ -356,7 +347,6 @@ rule rRNA: '-k 1 ' # NOTE: we only care if >=1 mapped '--no-unal ' # NOTE: suppress unaligned reads ) - run: prefix = os.path.commonprefix(input.index).rstrip(".") sam = output.bam.replace('.bam', '.sam') @@ -366,8 +356,8 @@ rule rRNA: "-x {prefix} " "-U {input.fastq} " "--threads {threads} " - "-S {sam} " "{params.extra} " + "-S {sam} " "> {log} 2>&1" ) @@ -417,13 +407,14 @@ rule bam_index: 'samtools index {input} {output}' +# TODO: split into multiple featurecounts runs, since PE needs to be sorted each time. 
rule featurecounts: """ Count reads in annotations with featureCounts from the subread package """ input: annotation=rules.gtf.output, - bam=c.targets['markduplicates']['bam'] + bam=expand(patterns['markduplicates']['bam'], sample=SAMPLES), output: counts='{sample_dir}/rnaseq_aggregation/featurecounts.txt' log: @@ -442,7 +433,7 @@ rule featurecounts: run: # NOTE: By default, we use -p for paired-end p_arg = '' - if c.is_paired: + if is_paired: p_arg = '-p --countReadPairs ' shell( 'featureCounts ' @@ -461,21 +452,21 @@ rule rrna_libsizes_table: Aggregate rRNA counts into a table """ input: - rrna=c.targets['rrna']['libsize'], - fastq=c.targets['libsizes']['cutadapt'] + rrna=expand(patterns['rrna']['libsize'], sample=SAMPLES), + fastq=expand(patterns['libsizes']['cutadapt'], sample=SAMPLES), output: - json=c.patterns['rrna_percentages_yaml'], - tsv=c.patterns['rrna_percentages_table'] + json=patterns['rrna_percentages_yaml'], + tsv=patterns['rrna_percentages_table'] threads: 1 resources: mem_mb=gb(2), runtime=autobump(hours=2) run: def rrna_sample(f): - return extract_wildcards(c.patterns['rrna']['libsize'], f)['sample'] + return utils.extract_wildcards(patterns['rrna']['libsize'], f)['sample'] def sample(f): - return extract_wildcards(c.patterns['libsizes']['cutadapt'], f)['sample'] + return utils.extract_wildcards(patterns['libsizes']['cutadapt'], f)['sample'] def million(f): return float(open(f).read()) / 1e6 @@ -514,30 +505,21 @@ rule rrna_libsizes_table: rule multiqc: input: files=( - expand('data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=n), - expand( - 'data/rnaseq_samples/{sample}/fastqc/{sample}_R1{kind}.fastq.gz_fastqc.zip', - sample=SAMPLES, kind=["", ".cutadapt", ".cutadapt.bam"] - ), - expand( - 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups{ext}', - sample=SAMPLES, ext=['.bam', '.bam.bai'] - ), - expand( - 'data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf', - sample=SAMPLES - ), - 
expand('data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5', sample=SAMPLES), - expand('data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt', sample=SAMPLES), - expand('data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt', sample=SAMPLES), - expand('data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt', sample=SAMPLES), - expand('data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics', sample=SAMPLES), - expand('data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig', sample=SAMPLES, dir=["pos", "neg"]), - expand('data/rnaseq_samples/{sample}/idxstat_{sample}.txt', sample=SAMPLES), - expand('data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat', sample=SAMPLES), - expand('data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats', sample=SAMPLES), - 'data/rnaseq_aggregation/rrna_percentages_table.tsv', - 'data/rnaseq_aggregation/featurecounts.txt', + expand(patterns["fastqc"]["raw"], sample=SAMPLES), + expand(patterns["fastqc"]["cutadapt"], sample=SAMPLES), + expand(patterns["fastqc"]["bam"], sample=SAMPLES), + expand(patterns["markduplicates"]["bam"], sample=SAMPLES), + expand(patterns["salmon"], sample=SAMPLES), + expand(patterns["kallisto"], sample=SAMPLES), + expand(patterns["preseq"], sample=SAMPLES), + expand(patterns["rseqc"]["infer_experiment"], sample=SAMPLES), + expand(patterns["rseqc"]["read_distribution"], sample=SAMPLES), + expand(patterns["collectrnaseqmetrics"]["metrics"], sample=SAMPLES), + expand(patterns["samtools"]["idxstats"], sample=SAMPLES), + expand(patterns["samtools"]["flagstat"], sample=SAMPLES), + expand(patterns["samtools"]["stats"], sample=SAMPLES), + patterns["rrna_percentages_table"], + patterns["featurecounts"], ), config='config/multiqc_config.yaml' output: @@ -550,8 +532,8 @@ rule multiqc: runtime=autobump(hours=2) run: analysis_directory = set([os.path.dirname(i) for i in input]) - outdir = os.path.dirname(c.targets['multiqc'][0]) - basename = 
os.path.basename(c.targets['multiqc'][0]) + outdir = os.path.dirname(output[0]) + basename = os.path.basename(output[0]) shell( 'LC_ALL=en_US.utf8 LC_LANG=en_US.utf8 ' 'multiqc ' @@ -570,16 +552,13 @@ rule markduplicates: Mark or remove PCR duplicates with Picard MarkDuplicates """ input: - bam=c.patterns['bam'] + bam=patterns['bam'] output: - bam=c.patterns['markduplicates']['bam'], - metrics=c.patterns['markduplicates']['metrics'] + bam=patterns['markduplicates']['bam'], + metrics=patterns['markduplicates']['metrics'], log: - c.patterns['markduplicates']['bam'] + '.log' + patterns['markduplicates']['bam'] + '.log' params: - # NOTE: Be careful with the memory here; make sure you have enough - # and/or it matches the resources you're requesting in the cluster - # config. java_args='-Xmx20g' # java_args='-Xmx2g' # [TEST SETTINGS -1] threads: 1 @@ -603,23 +582,20 @@ rule collectrnaseqmetrics: Calculate various RNA-seq QC metrics with Picarc CollectRnaSeqMetrics """ input: - bam=c.patterns['markduplicates']['bam'], + bam=patterns['markduplicates']['bam'], refflat=rules.conversion_refflat.output, output: - metrics=c.patterns['collectrnaseqmetrics']['metrics'], + metrics=patterns['collectrnaseqmetrics']['metrics'], params: - # NOTE: Be careful with the memory here; make sure you have enough - # and/or it matches the resources you're requesting in the cluster - # config. 
java_args='-Xmx20g', # java_args='-Xmx2g', # [TEST SETTINGS -1] strand_arg={ 'unstranded': 'STRAND=NONE ', 'fr-firststrand': 'STRAND=SECOND_READ_TRANSCRIPTION_STRAND ', 'fr-secondstrand': 'STRAND=FIRST_READ_TRANSCRIPTION_STRAND ', - }[config["stranded"] + }[config["stranded"]] log: - c.patterns['collectrnaseqmetrics']['metrics'] + '.log' + patterns['collectrnaseqmetrics']['metrics'] + '.log' threads: 1 resources: mem_mb=gb(32), @@ -643,9 +619,9 @@ rule preseq: Compute a library complexity curve with preseq """ input: - bam=c.patterns['bam'] + bam=patterns['bam'] output: - c.patterns['preseq'] + patterns['preseq'] threads: 1 resources: mem_mb=gb(1), @@ -662,38 +638,36 @@ rule salmon: Quantify reads coming from transcripts with Salmon """ input: - fastq=fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=REFERENCES + "/salmon/versionInfo.json" output: - c.patterns['salmon'] - params: - index_dir=os.path.dirname(REFERENCES + "/salmon/versionInfo.json"), - outdir=os.path.dirname(c.patterns['salmon']) + patterns['salmon'] log: - c.patterns['salmon'] + '.log' + patterns['salmon'] + '.log' + params: + extra=( + "--libType=A " + "--gcBias " + "--seqBias " + "--validateMappings " + ) threads: 6 resources: mem_mb=gb(32), runtime=autobump(hours=2) run: - if c.is_paired: + outdir = os.path.dirname(output[0]) + index_dir = os.path.dirname(input.index) + if is_paired: fastq_arg = f'-1 {input.fastq[0]} -2 {input.fastq[1]} ' else: fastq_arg = f'-r {input.fastq} ' shell( 'salmon quant ' - '--index {params.index_dir} ' - '--output {params.outdir} ' + '--index {index_dir} ' + '--output {outdir} ' '--threads {threads} ' - - # NOTE: --libType=A auto-detects library type. Change if needed. 
- '--libType=A ' - - # NOTE: Docs suggest using --gcBias, --validateMappings, and - # --seqBias is a good idea - '--gcBias ' - '--seqBias ' - '--validateMappings ' + '{params.extra} ' '{fastq_arg} ' '&> {log}' ) @@ -704,43 +678,38 @@ rule kallisto: Quantify reads coming from transcripts with Kallisto """ input: - fastq=fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=REFERENCES + "/kallisto/transcripts.idx", output: - c.patterns['kallisto'] + patterns['kallisto'] params: - index_dir=os.path.dirname(REFERENCES + "/kallisto/transcripts.idx"), - outdir=os.path.dirname(c.patterns['kallisto']), strand_arg={ 'unstranded': '', 'fr-firststrand': '--rf-stranded', 'fr-secondstrand': '--fr-stranded', - }[config["stranded"] + }[config["stranded"]], + extra=( + "--bootstrap-samples 100" if is_paired else + "--single --fragment-length 300 --sd 20 --bootstrap-samples 100" + ), log: - c.patterns['kallisto'] + '.log' + patterns['kallisto'] + '.log' threads: 8 resources: mem_mb=gb(32), runtime=autobump(hours=2), run: - if c.is_paired: - se_args = '' - assert len(input.fastq) == 2 - else: - # For single-end, add the experimentally-determined fragment length - # and standard deviation here - se_args = '--single --fragment-length 300 --sd 20 ' - assert len(input.fastq) == 1 + outdir = os.path.dirname(output[0]) shell( 'kallisto quant ' '--index {input.index} ' - '--output-dir {params.outdir} ' + '--output-dir {outdir} ' '--threads {threads} ' '--bootstrap-samples 100 ' '--threads {threads} ' - '{se_args} ' '{params.strand_arg} ' + '{params.extra} ' '{input.fastq} ' '&> {log}' ) @@ -750,30 +719,30 @@ rule rseqc_infer_experiment: Infer strandedness of experiment """ input: - bam=c.patterns['markduplicates']['bam'], + bam=patterns['markduplicates']['bam'], bed12=rules.conversion_bed12.output, output: - txt=c.patterns['rseqc']['infer_experiment'] + txt=patterns['rseqc']['infer_experiment'] log: - 
c.patterns['rseqc']['infer_experiment'] + '.log' + patterns['rseqc']['infer_experiment'] + '.log' resources: mem_mb=gb(2), runtime=autobump(hours=2) - shell: 'infer_experiment.py -r {input.bed12} -i {input.bam} > {output} &> {log}' + rule rseqc_read_distribution: """ read distribution plots """ input: - bam=c.patterns['markduplicates']['bam'], + bam=patterns['markduplicates']['bam'], bed12=rules.conversion_bed12.output, output: - txt=c.patterns['rseqc']['read_distribution'] + txt=patterns['rseqc']['read_distribution'] log: - c.patterns['rseqc']['read_distribution'] + '.log' + patterns['rseqc']['read_distribution'] + '.log' resources: mem_mb=gb(2), runtime=autobump(hours=2) @@ -786,12 +755,12 @@ rule idxstats: Run samtools idxstats on sample bams """ input: - bam=c.patterns['markduplicates']['bam'], - bai=c.patterns['markduplicates']['bam'] + '.bai' + bam=patterns['markduplicates']['bam'], + bai=patterns['markduplicates']['bam'] + '.bai' output: - txt=c.patterns['samtools']['idxstats'] + txt=patterns['samtools']['idxstats'] log: - c.patterns['samtools']['idxstats'] + '.log' + patterns['samtools']['idxstats'] + '.log' resources: mem_mb=gb(16), runtime=autobump(hours=2) @@ -801,40 +770,39 @@ rule idxstats: ) -# Common arguments used for bamCoverage rules below -BAMCOVERAGE_ARGS = ( - '--minMappingQuality 20 ' # excludes multimappers - '--smoothLength 10 ' # smooth signal with specified window - # '--normalizeUsing BPM ' # equivalent to TPM # [TEST SETTINGS] -) - rule bigwig_neg: """ Create a bigwig for negative-strand reads """ input: - bam=c.patterns['markduplicates']['bam'], - bai=c.patterns['markduplicates']['bam'] + '.bai', - output: c.patterns['bigwig']['neg'] + bam=patterns['markduplicates']['bam'], + bai=patterns['markduplicates']['bam'] + '.bai', + output: + patterns['bigwig']['neg'] threads: 8 resources: mem_mb=gb(16), runtime=autobump(hours=2) log: - c.patterns['bigwig']['neg'] + '.log' + patterns['bigwig']['neg'] + '.log' params: strand_arg = { 
'unstranded': '', 'fr-firststrand': '--filterRNAstrand reverse ', 'fr-secondstrand': '--filterRNAstrand forward ', - }[config["stranded"] + }[config["stranded"]], + extra=( + '--minMappingQuality 20 ' + '--smoothLength 10 ' + '--normalizeUsing BPM ' # equivalent to TPM # [TEST SETTINGS] + ), run: shell( 'bamCoverage ' '--bam {input.bam} ' '-o {output} ' '-p {threads} ' - '{BAMCOVERAGE_ARGS} ' + '{params.extra} ' '{params.strand_arg} ' '&> {log}' ) @@ -845,21 +813,27 @@ rule bigwig_pos: Create a bigwig for postive-strand reads. """ input: - bam=c.patterns['markduplicates']['bam'], - bai=c.patterns['markduplicates']['bam'] + '.bai', - output: c.patterns['bigwig']['pos'] + bam=patterns['markduplicates']['bam'], + bai=patterns['markduplicates']['bam'] + '.bai', + output: + patterns['bigwig']['pos'] threads: 8 resources: mem_mb=gb(16), runtime=autobump(hours=2) log: - c.patterns['bigwig']['pos'] + '.log' + patterns['bigwig']['pos'] + '.log' params: strand_arg={ 'unstranded': '', 'fr-firststrand': '--filterRNAstrand forward ', 'fr-secondstrand': '--filterRNAstrand reverse ', - }[config["stranded"] + }[config["stranded"]], + extra=( + '--minMappingQuality 20 ' + '--smoothLength 10 ' + '--normalizeUsing BPM ' # equivalent to TPM # [TEST SETTINGS] + ), run: shell( 'bamCoverage ' @@ -872,43 +846,27 @@ rule bigwig_pos: ) -def bigwigs_to_merge(wc): - chunk = config['merged_bigwigs'][wc.merged_bigwig_label] - neg_labels = chunk.get('neg', []) - pos_labels = chunk.get('pos', []) - pos_bigwigs = expand( - c.patterns['bigwig']['pos'], - sample=pos_labels - ) - neg_bigwigs = expand( - c.patterns['bigwig']['neg'], - sample=neg_labels) - return pos_bigwigs + neg_bigwigs - - rule flagstat: input: - bam=c.patterns['markduplicates']['bam'], - bai=c.patterns['markduplicates']['bam'] + '.bai' + bam=patterns['markduplicates']['bam'], + bai=patterns['markduplicates']['bam'] + '.bai' output: - c.patterns['samtools']['flagstat'] + patterns['samtools']['flagstat'] log: - 
c.patterns['samtools']['flagstat'] + '.log' + patterns['samtools']['flagstat'] + '.log' shell: 'samtools flagstat {input.bam} > {output}' rule samtools_stats: input: - bam=c.patterns['markduplicates']['bam'], - bai=c.patterns['markduplicates']['bam'] + '.bai' + bam=patterns['markduplicates']['bam'], + bai=patterns['markduplicates']['bam'] + '.bai' output: - c.patterns['samtools']['stats'] + patterns['samtools']['stats'] log: - c.patterns['samtools']['stats'] + '.log' + patterns['samtools']['stats'] + '.log' shell: 'samtools stats {input.bam} > {output}' - - # vim: ft=python From d6b512aa4c70d262567ca9729a0b0c76f6ca1a96 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 6 Jan 2025 22:06:57 -0500 Subject: [PATCH 039/196] mv back to workflows/references/Snakefile --- rules/references.smk => workflows/references/Snakefile | 0 workflows/rnaseq/Snakefile | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename rules/references.smk => workflows/references/Snakefile (100%) diff --git a/rules/references.smk b/workflows/references/Snakefile similarity index 100% rename from rules/references.smk rename to workflows/references/Snakefile diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 1c041d22..f5408301 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -11,7 +11,7 @@ from lib.utils import autobump, gb, hours configfile: 'config/config.yaml' -include: '../../rules/references.smk' +include: '../references/Snakefile' REFERENCES = config.get('reference_dir', '../../references') sampletable = pd.read_table(config["sampletable"], sep="\t") From 99f3f6b63dead9a8e4eae73968ca8843dc4a5295 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 6 Jan 2025 22:07:33 -0500 Subject: [PATCH 040/196] fix params for bigwig --- workflows/rnaseq/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index f5408301..62d2788b 100644 --- a/workflows/rnaseq/Snakefile +++ 
b/workflows/rnaseq/Snakefile @@ -840,7 +840,7 @@ rule bigwig_pos: '--bam {input.bam} ' '-o {output} ' '-p {threads} ' - '{BAMCOVERAGE_ARGS} ' + '{params.extra} ' '{params.strand_arg} ' '&> {log}' ) From bf276c3a7af8ef1510fb5f2f9efdc8f190fbd9ad Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 6 Jan 2025 22:07:53 -0500 Subject: [PATCH 041/196] use slightly cleaner syntax --- workflows/references/Snakefile | 79 +++++++++++++++++----------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile index 157eeed9..bf4cf212 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -2,8 +2,7 @@ import os import sys import pandas -HERE = str(Path(workflow.snakefile).parent) -sys.path.insert(0, HERE + "/../..") +sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") from lib.utils import autobump, gb, hours from lib import utils @@ -14,9 +13,9 @@ def default_postprocess(origfn, newfn): rule fasta: output: - temporary(REFERENCES + '/genome.fa.gz') + temporary(f'{REFERENCES}/genome.fa.gz') log: - REFERENCES + "/logs/genome.fa.gz.log" + f"{REFERENCES}/logs/genome.fa.gz.log" run: utils.download_and_postprocess( urls=config['fasta']['url'], @@ -28,9 +27,9 @@ rule fasta: rule gtf: output: - temporary(REFERENCES + '/annotation.gtf.gz') + temporary(f"{REFERENCES}/annotation.gtf.gz") log: - REFERENCES + "/logs/annotation.gtf.gz.log" + f"{REFERENCES}/logs/annotation.gtf.gz.log" run: utils.download_and_postprocess( urls=config['gtf']['url'], @@ -42,9 +41,9 @@ rule gtf: rule rrna: output: - temporary(REFERENCES + '/rrna.fa.gz') + temporary(f"{REFERENCES}/rrna.fa.gz") log: - REFERENCES + "/logs/rrna.fa.gz.log" + f"{REFERENCES}/logs/rrna.fa.gz.log" run: utils.download_and_postprocess( urls=config['rrna']['url'], @@ -56,18 +55,18 @@ rule rrna: rule unzip: input: - REFERENCES + '/{prefix}.gz' + f"{REFERENCES}/{prefix}.gz" output: - REFERENCES + '/{prefix}' + 
f"{REFERENCES}/{prefix}" shell: 'gunzip -c {input} > {output}' rule bowtie2_index: input: - REFERENCES + '/{label}.fa', + f"{REFERENCES}/{label}.fa", output: multiext( - REFERENCES + '/bowtie2/{label}', + f"{REFERENCES}/bowtie2/{label}", ".1.bt2", ".2.bt2", ".3.bt2", @@ -77,7 +76,7 @@ rule bowtie2_index: ".fa", ), log: - REFERENCES + '/logs/bowtie2_{label}.log' + f"{REFERENCES}/logs/bowtie2_{label}.log' resources: runtime=autobump(hours=8), mem_mb=autobump(gb=32), @@ -98,12 +97,12 @@ rule bowtie2_index: rule star_index: input: - fasta=REFERENCES + '/genome.fa', - gtf=REFERENCES + '/annotation.gtf', + fasta=f"{REFERENCES}/genome.fa", + gtf=f"{REFERENCES}/annotation.gtf", output: - REFERENCES + '/star/Genome' + f"{REFERENCES}/star/Genome" log: - REFERENCES + '/logs/star.log' + f"{REFERENCES}/logs/star.log" threads: 8 resources: @@ -139,10 +138,10 @@ rule star_index: rule hisat2_index: input: - REFERENCES + '/genome.fa', + f"{REFERENCES}/genome.fa", output: multiext( - REFERENCES + '/hisat2/genome', + f"{REFERENCES}/hisat2/genome", ".1.ht2", ".2.ht2", ".3.ht2", @@ -154,7 +153,7 @@ rule hisat2_index: ".fa", ) log: - REFERENCES + '/logs/hisat2.log' + f"{REFERENCES}/logs/hisat2.log" resources: runtime=autobump(hours=8), mem_mb=autobump(gb=32), @@ -176,10 +175,10 @@ rule hisat2_index: rule transcriptome_fasta: input: - fasta=REFERENCES + '/genome.fa', - gtf=REFERENCES + '/annotation.gtf', + fasta=f"{REFERENCES}/genome.fa", + gtf=f"{REFERENCES}/annotation.gtf", output: - REFERENCES + '/transcriptome.fa' + f"{REFERENCES}/transcriptome.fa" resources: runtime=hours(1) shell: @@ -188,13 +187,13 @@ rule transcriptome_fasta: rule salmon_index: input: - REFERENCES + '/transcriptome.fa' + f"{REFERENCES}/transcriptome.fa" output: - REFERENCES + '/salmon/versionInfo.json' + f"{REFERENCES}/salmon/versionInfo.json" log: - REFERENCES + '/logs/salmon.log' + f"{REFERENCES}/logs/salmon.log" params: - outdir=REFERENCES + '/salmon' + outdir=f"{REFERENCES}/salmon" resources: mem_mb=gb(32), 
runtime=hours(2) @@ -210,11 +209,11 @@ rule salmon_index: rule kallisto_index: output: - REFERENCES + '/kallisto/transcripts.idx', + f"{REFERENCES}/kallisto/transcripts.idx", input: - REFERENCES + '/genome.fa' + f"{REFERENCES}/genome.fa" log: - REFERENCES + '/logs/kallisto.log' + f"{REFERENCES}/logs/kallisto.log" resources: runtime=hours(2), mem_mb=gb(32), @@ -227,11 +226,11 @@ rule kallisto_index: rule conversion_refflat: input: - REFERENCES + '/annotation.gtf' + f"{REFERENCES}/annotation.gtf" output: - REFERENCES + '/annotation.refflat' + f"{REFERENCES}/annotation.refflat" log: - REFERENCES + '/logs/annotation.refflat.log' + f"{REFERENCES}/logs/annotation.refflat.log" resources: runtime=hours(2), mem_mb=gb(2) @@ -243,9 +242,9 @@ rule conversion_refflat: rule conversion_bed12: input: - REFERENCES + '/annotation.gtf' + f"{REFERENCES}/annotation.gtf" output: - REFERENCES + '/annotation.bed12' + f"{REFERENCES}/annotation.bed12" resources: runtime=hours(2), mem_mb=gb(2) @@ -257,11 +256,11 @@ rule conversion_bed12: rule chromsizes: input: - REFERENCES + '/genome.fa' + f"{REFERENCES}/genome.fa" output: - REFERENCES + '/genome.chromsizes' + f"{REFERENCES}/genome.chromsizes" log: - REFERENCES + '/logs/genome.chromsizes.log' + f"{REFERENCES}/logs/genome.chromsizes.log" params: # NOTE: Be careful with the memory here; make sure you have enough # and/or it matches the resources you're requesting @@ -288,9 +287,9 @@ rule mappings: Creates gzipped TSV mapping between attributes in the GTF. 
""" input: - gtf=REFERENCES + '/annotation.gtf' + gtf=f"{REFERENCES}/annotation.gtf" output: - REFERENCES + '/annotation.mapping.tsv.gz' + f"{REFERENCES}/annotation.mapping.tsv.gz" params: include_featuretypes=lambda wildcards, output: conversion_kwargs[output[0]].get('include_featuretypes', []) resources: From 4328264119571baefb5ce17518670629fe5c6b8a Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 6 Jan 2025 22:12:33 -0500 Subject: [PATCH 042/196] always put params directly before run/shell --- workflows/rnaseq/Snakefile | 44 ++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 62d2788b..5e37daf1 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -446,7 +446,7 @@ rule featurecounts: '&> {log}' ) - +# TODO: port some of this over to utils, or maybe script. rule rrna_libsizes_table: """ Aggregate rRNA counts into a table @@ -558,14 +558,14 @@ rule markduplicates: metrics=patterns['markduplicates']['metrics'], log: patterns['markduplicates']['bam'] + '.log' - params: - java_args='-Xmx20g' - # java_args='-Xmx2g' # [TEST SETTINGS -1] threads: 1 resources: mem_mb=gb(32), runtime=autobump(hours=2), disk_mb=autobump(gb=100), + params: + java_args='-Xmx20g' + # java_args='-Xmx2g' # [TEST SETTINGS -1] shell: 'picard ' '{params.java_args} ' @@ -586,6 +586,12 @@ rule collectrnaseqmetrics: refflat=rules.conversion_refflat.output, output: metrics=patterns['collectrnaseqmetrics']['metrics'], + log: + patterns['collectrnaseqmetrics']['metrics'] + '.log' + threads: 1 + resources: + mem_mb=gb(32), + runtime=autobump(hours=2) params: java_args='-Xmx20g', # java_args='-Xmx2g', # [TEST SETTINGS -1] @@ -594,12 +600,6 @@ rule collectrnaseqmetrics: 'fr-firststrand': 'STRAND=SECOND_READ_TRANSCRIPTION_STRAND ', 'fr-secondstrand': 'STRAND=FIRST_READ_TRANSCRIPTION_STRAND ', }[config["stranded"]] - log: - patterns['collectrnaseqmetrics']['metrics'] + 
'.log' - threads: 1 - resources: - mem_mb=gb(32), - runtime=autobump(hours=2) run: shell( 'picard ' @@ -644,6 +644,10 @@ rule salmon: patterns['salmon'] log: patterns['salmon'] + '.log' + threads: 6 + resources: + mem_mb=gb(32), + runtime=autobump(hours=2) params: extra=( "--libType=A " @@ -651,10 +655,6 @@ rule salmon: "--seqBias " "--validateMappings " ) - threads: 6 - resources: - mem_mb=gb(32), - runtime=autobump(hours=2) run: outdir = os.path.dirname(output[0]) index_dir = os.path.dirname(input.index) @@ -682,6 +682,13 @@ rule kallisto: index=REFERENCES + "/kallisto/transcripts.idx", output: patterns['kallisto'] + log: + patterns['kallisto'] + '.log' + threads: + 8 + resources: + mem_mb=gb(32), + runtime=autobump(hours=2), params: strand_arg={ 'unstranded': '', @@ -692,13 +699,6 @@ rule kallisto: "--bootstrap-samples 100" if is_paired else "--single --fragment-length 300 --sd 20 --bootstrap-samples 100" ), - log: - patterns['kallisto'] + '.log' - threads: - 8 - resources: - mem_mb=gb(32), - runtime=autobump(hours=2), run: outdir = os.path.dirname(output[0]) shell( @@ -868,5 +868,3 @@ rule samtools_stats: patterns['samtools']['stats'] + '.log' shell: 'samtools stats {input.bam} > {output}' - -# vim: ft=python From 5134c9e415660a4c77b550a60b378c12f09217c3 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 6 Jan 2025 22:24:27 -0500 Subject: [PATCH 043/196] run snakefmt on references --- workflows/references/Snakefile | 214 +++++++++++++++------------------ 1 file changed, 98 insertions(+), 116 deletions(-) diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile index bf4cf212..d6dcf759 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -6,67 +6,70 @@ sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") from lib.utils import autobump, gb, hours from lib import utils -REFERENCES = config.get('reference_dir', '../../references') +REFERENCES = config.get("reference_dir", "../../references") + def 
default_postprocess(origfn, newfn): shell("mv {origfn} {newfn}") + rule fasta: output: - temporary(f'{REFERENCES}/genome.fa.gz') + temporary(f"{REFERENCES}/genome.fa.gz"), log: - f"{REFERENCES}/logs/genome.fa.gz.log" + f"{REFERENCES}/logs/genome.fa.gz.log", run: utils.download_and_postprocess( - urls=config['fasta']['url'], - postprocess=config['fasta'].get('postprocess', None), + urls=config["fasta"]["url"], + postprocess=config["fasta"].get("postprocess", None), outfile=output[0], - log=log + log=log, ) rule gtf: output: - temporary(f"{REFERENCES}/annotation.gtf.gz") + temporary(f"{REFERENCES}/annotation.gtf.gz"), log: - f"{REFERENCES}/logs/annotation.gtf.gz.log" + f"{REFERENCES}/logs/annotation.gtf.gz.log", run: utils.download_and_postprocess( - urls=config['gtf']['url'], - postprocess=config['gtf'].get('postprocess', None), + urls=config["gtf"]["url"], + postprocess=config["gtf"].get("postprocess", None), outfile=output[0], - log=log + log=log, ) rule rrna: output: - temporary(f"{REFERENCES}/rrna.fa.gz") + temporary(f"{REFERENCES}/rrna.fa.gz"), log: - f"{REFERENCES}/logs/rrna.fa.gz.log" + f"{REFERENCES}/logs/rrna.fa.gz.log", run: utils.download_and_postprocess( - urls=config['rrna']['url'], - postprocess=config['rrna'].get('postprocess', None), + urls=config["rrna"]["url"], + postprocess=config["rrna"].get("postprocess", None), outfile=output[0], - log=log + log=log, ) rule unzip: input: - f"{REFERENCES}/{prefix}.gz" + f"{REFERENCES}/{{prefix}}.gz", output: - f"{REFERENCES}/{prefix}" - shell: 'gunzip -c {input} > {output}' + f"{REFERENCES}/{{prefix}}", + shell: + "gunzip -c {input} > {output}" rule bowtie2_index: input: - f"{REFERENCES}/{label}.fa", + f"{REFERENCES}/{{label}}.fa", output: multiext( - f"{REFERENCES}/bowtie2/{label}", + f"{REFERENCES}/bowtie2/{{label}}", ".1.bt2", ".2.bt2", ".3.bt2", @@ -76,22 +79,15 @@ rule bowtie2_index: ".fa", ), log: - f"{REFERENCES}/logs/bowtie2_{label}.log' + f"{REFERENCES}/logs/bowtie2_{{label}}.log", resources: 
runtime=autobump(hours=8), mem_mb=autobump(gb=32), - disk_mb=autobump(gb=50) - threads: - 8 + disk_mb=autobump(gb=50), + threads: 8 run: index = os.path.commonprefix(output).rstrip(".") - shell( - "bowtie2-build" - " --threads {threads}" - " {input}" - " {index}" - " &> {log}" - ) + shell("bowtie2-build" " --threads {threads}" " {input}" " {index}" " &> {log}") utils.make_relative_symlink(input[0], output[-1]) @@ -100,42 +96,39 @@ rule star_index: fasta=f"{REFERENCES}/genome.fa", gtf=f"{REFERENCES}/annotation.gtf", output: - f"{REFERENCES}/star/Genome" + f"{REFERENCES}/star/Genome", log: - f"{REFERENCES}/logs/star.log" - threads: - 8 + f"{REFERENCES}/logs/star.log", + threads: 8 resources: runtime=autobump(hours=8), - mem_mb=gb(64) + mem_mb=gb(64), run: genomedir = os.path.dirname(output[0]) - shell('rm -r {genomedir}') - shell('mkdir -p {genomedir}') + shell("rm -r {genomedir}") + shell("mkdir -p {genomedir}") shell( - 'STAR ' - '--runMode genomeGenerate ' - '--runThreadN {threads} ' - '--genomeDir {genomedir} ' - '--genomeFastaFiles {input.fasta} ' - + "STAR " + "--runMode genomeGenerate " + "--runThreadN {threads} " + "--genomeDir {genomedir} " + "--genomeFastaFiles {input.fasta} " # NOTE: GTF is optional - '--sjdbGTFfile {input.gtf} ' - + "--sjdbGTFfile {input.gtf} " # NOTE: STAR docs say that 100 should work well. - '--sjdbOverhang 100 ' - + "--sjdbOverhang 100 " # NOTE: for small genomes, may need to scale this down to # min(14, log2(GenomeLength) / 2 - 1) # --genomeSAindexNbases 14 - '&> {log}' + "&> {log}" ) # STAR writes a hard-coded Log.out file to the current working # directory. So put that on the end of the log file for the rule and # then clean up. 
- shell('cat {genomedir}/Log.out >> {log} && rm {genomedir}/Log.out') + shell("cat {genomedir}/Log.out >> {log} && rm {genomedir}/Log.out") shell("ln -s {input.fasta} {genomedir}") + rule hisat2_index: input: f"{REFERENCES}/genome.fa", @@ -151,135 +144,122 @@ rule hisat2_index: ".7.ht2", ".8.ht2", ".fa", - ) + ), log: - f"{REFERENCES}/logs/hisat2.log" + f"{REFERENCES}/logs/hisat2.log", resources: runtime=autobump(hours=8), mem_mb=autobump(gb=32), - disk_mb=autobump(gb=50) - threads: - 8 + disk_mb=autobump(gb=50), + threads: 8 run: index = os.path.commonprefix(output).rstrip(".") - shell( - "hisat2-build" - " --threads {threads}" - " {input}" - " {index}" - " &> {log}" - ) + shell("hisat2-build" " --threads {threads}" " {input}" " {index}" " &> {log}") shell("ln -s {input} {output[-1]}") - rule transcriptome_fasta: input: fasta=f"{REFERENCES}/genome.fa", gtf=f"{REFERENCES}/annotation.gtf", output: - f"{REFERENCES}/transcriptome.fa" + f"{REFERENCES}/transcriptome.fa", resources: - runtime=hours(1) + runtime=hours(1), shell: - 'gffread {input.gtf} -w {output} -g {input.fasta}' + "gffread {input.gtf} -w {output} -g {input.fasta}" rule salmon_index: input: - f"{REFERENCES}/transcriptome.fa" + f"{REFERENCES}/transcriptome.fa", output: - f"{REFERENCES}/salmon/versionInfo.json" + f"{REFERENCES}/salmon/versionInfo.json", log: - f"{REFERENCES}/logs/salmon.log" + f"{REFERENCES}/logs/salmon.log", params: - outdir=f"{REFERENCES}/salmon" + outdir=f"{REFERENCES}/salmon", resources: mem_mb=gb(32), - runtime=hours(2) + runtime=hours(2), run: outdir = os.path.dirname(output[0]) - shell( - 'salmon index ' - '--transcripts {input} ' - '--index {outdir} ' - '&> {log}' - ) + shell("salmon index " "--transcripts {input} " "--index {outdir} " "&> {log}") rule kallisto_index: output: f"{REFERENCES}/kallisto/transcripts.idx", input: - f"{REFERENCES}/genome.fa" + f"{REFERENCES}/genome.fa", log: - f"{REFERENCES}/logs/kallisto.log" + f"{REFERENCES}/logs/kallisto.log", resources: 
runtime=hours(2), mem_mb=gb(32), shell: - 'kallisto index ' - '--index {output} ' - '{input} ' - '&> {log}' + "kallisto index " + "--index {output} " + "{input} " + "&> {log}" rule conversion_refflat: input: - f"{REFERENCES}/annotation.gtf" + f"{REFERENCES}/annotation.gtf", output: - f"{REFERENCES}/annotation.refflat" + f"{REFERENCES}/annotation.refflat", log: - f"{REFERENCES}/logs/annotation.refflat.log" + f"{REFERENCES}/logs/annotation.refflat.log", resources: runtime=hours(2), - mem_mb=gb(2) + mem_mb=gb(2), shell: - 'gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp ' - '''&& awk '{{print $1"\t"$0}}' {output}.tmp > {output} ''' - '&& rm {output}.tmp ' + "gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp " + """&& awk '{{print $1"\t"$0}}' {output}.tmp > {output} """ + "&& rm {output}.tmp " rule conversion_bed12: input: - f"{REFERENCES}/annotation.gtf" + f"{REFERENCES}/annotation.gtf", output: - f"{REFERENCES}/annotation.bed12" + f"{REFERENCES}/annotation.bed12", resources: runtime=hours(2), - mem_mb=gb(2) + mem_mb=gb(2), shell: - 'gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp ' - '&& genePredToBed {output}.tmp {output} ' - '&& rm {output}.tmp' + "gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp " + "&& genePredToBed {output}.tmp {output} " + "&& rm {output}.tmp" rule chromsizes: input: - f"{REFERENCES}/genome.fa" + f"{REFERENCES}/genome.fa", output: - f"{REFERENCES}/genome.chromsizes" + f"{REFERENCES}/genome.chromsizes", log: - f"{REFERENCES}/logs/genome.chromsizes.log" + f"{REFERENCES}/logs/genome.chromsizes.log", params: # NOTE: Be careful with the memory here; make sure you have enough # and/or it matches the resources you're requesting - java_args='-Xmx20g' + java_args="-Xmx20g", # java_args='-Xmx2g' # [TEST SETTINGS -1] resources: mem_mb=gb(24), - runtime=hours(2) + runtime=hours(2), shell: - 'export LC_COLLATE=C; ' - 'rm -f {output}.tmp ' - '&& picard ' - '{params.java_args} ' - 'CreateSequenceDictionary 
R={input} O={output}.tmp &> {log} ' + "export LC_COLLATE=C; " + "rm -f {output}.tmp " + "&& picard " + "{params.java_args} " + "CreateSequenceDictionary R={input} O={output}.tmp &> {log} " '&& grep "^@SQ" {output}.tmp ' - '''| awk '{{print $2, $3}}' ''' + """| awk '{{print $2, $3}}' """ '| sed "s/SN://g;s/ LN:/\\t/g" ' - '| sort -k1,1 > {output} ' - '&& rm -f {output}.tmp ' + "| sort -k1,1 > {output} " + "&& rm -f {output}.tmp " rule mappings: @@ -287,14 +267,16 @@ rule mappings: Creates gzipped TSV mapping between attributes in the GTF. """ input: - gtf=f"{REFERENCES}/annotation.gtf" + gtf=f"{REFERENCES}/annotation.gtf", output: - f"{REFERENCES}/annotation.mapping.tsv.gz" + f"{REFERENCES}/annotation.mapping.tsv.gz", params: - include_featuretypes=lambda wildcards, output: conversion_kwargs[output[0]].get('include_featuretypes', []) + include_featuretypes=lambda wildcards, output: conversion_kwargs[ + output[0] + ].get("include_featuretypes", []), resources: runtime=hours(2), - mem_mb=gb(2) + mem_mb=gb(2), run: import gffutils @@ -314,7 +296,7 @@ rule mappings: continue d = dict(f.attributes) - d['__featuretype__'] = ft + d["__featuretype__"] = ft res.append(d) df = pandas.DataFrame(res) @@ -323,7 +305,7 @@ rule mappings: # include_featuretypes settings, this may take a while. 
df = df.drop_duplicates() - df.to_csv(output[0], sep='\t', index=False, compression='gzip') + df.to_csv(output[0], sep="\t", index=False, compression="gzip") # Restore original setting gffutils.constants.always_return_list = orig_setting From 0b9beec823996cd89c4357b089e34b0b312f6717 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 6 Jan 2025 22:44:39 -0500 Subject: [PATCH 044/196] run snakefmt on rnaseq (and then re-add some comments that caused failure) --- workflows/rnaseq/Snakefile | 690 ++++++++++++++++++++----------------- 1 file changed, 365 insertions(+), 325 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 5e37daf1..3b384cd3 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -9,40 +9,51 @@ from lib import utils from lib.utils import autobump, gb, hours -configfile: 'config/config.yaml' +configfile: "config/config.yaml" -include: '../references/Snakefile' -REFERENCES = config.get('reference_dir', '../../references') +include: "../references/Snakefile" + + +REFERENCES = config.get("reference_dir", "../../references") sampletable = pd.read_table(config["sampletable"], sep="\t") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" is_sra = utils.detect_sra(sampletable) n = ["1", "2"] if is_paired else ["1"] SAMPLES = sampletable.iloc[:, 0].values -patterns = yaml.safe_load(open('config/rnaseq_patterns.yaml')) +patterns = yaml.safe_load(open("config/rnaseq_patterns.yaml")) wildcard_constraints: - n = '[1,2]', - sample = '|'.join(SAMPLES) + n="[1,2]", + sample="|".join(SAMPLES), + + +localrules: + symlinks, + symlink_targets, -localrules: symlinks, symlink_targets rule all: input: - patterns["multiqc"] + patterns["multiqc"], + if is_sra: - include: '../../rules/sra.smk' + + include: "../../rules/sra.smk" rule symlinks: input: - lambda wc: sampletable.loc[wc.sample, ['orig_filename', 'orig_filename_R2']] if is_paired - else 
sampletable.loc[wc.sample, ['orig_filename']] + lambda wc: ( + sampletable.loc[wc.sample, ["orig_filename", "orig_filename_R2"]] + if is_paired + else sampletable.loc[wc.sample, ["orig_filename"]] + ), output: - expand(patterns["fastq"], n=n, allow_missing=True) + expand(patterns["fastq"], n=n, allow_missing=True), threads: 1 resources: mem_mb=100, @@ -54,33 +65,42 @@ rule symlinks: rule symlink_targets: - input: - expand('data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=n) + input: + expand( + "data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=n + ), + # This can be set at the command line with --config strand_check_reads=1000 -config.setdefault('strand_check_reads', 1e5) +config.setdefault("strand_check_reads", 1e5) # TODO: re-enable # include: '../../rules/strand_check.smk' + rule cutadapt: input: - fastq=expand(patterns["fastq"], n=n, allow_missing=True) + fastq=expand(patterns["fastq"], n=n, allow_missing=True), output: - fastq=expand(patterns["cutadapt"], n=n, allow_missing=True) + fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), log: - 'data/rnaseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log' + "data/rnaseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", threads: 6 resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), params: extra=( - "--nextseq-trim 20 " - "--overlap 6 " - "--minimum-length 25 " - "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - ) + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " if is_paired else "" + ( + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + ) + + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " + if is_paired + else "" + ), run: if is_paired: shell( @@ -103,53 +123,54 @@ rule cutadapt: "&> {log}" ) + # TODO: rm wrapper rule fastqc: input: - '{sample_dir}/{sample}/{sample}{suffix}' - threads: - 6 + "{sample_dir}/{sample}/{sample}{suffix}", + threads: 6 output: - 
html='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html', - zip='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip', + html="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html", + zip="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip", resources: mem_mb=gb(8), - runtime=autobump(hours=2) + runtime=autobump(hours=2), script: - utils.wrapper_for('fastqc/wrapper.py') + utils.wrapper_for("fastqc/wrapper.py") -if config['aligner'] == 'hisat2': +if config["aligner"] == "hisat2": + rule hisat2: input: fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=rules.hisat2_index.output, output: - bam=temporary(patterns['bam']) + bam=temporary(patterns["bam"]), log: - patterns['bam'] + '.log' + patterns["bam"] + ".log", threads: 6 resources: mem_mb=gb(32), - runtime=autobump(hours=8) + runtime=autobump(hours=8), params: - extra="" + extra="", run: prefix = os.path.commonprefix(input.index).rstrip(".") - sam = output.bam.replace('.bam', '.sam') + sam = output.bam.replace(".bam", ".sam") if is_paired: assert len(input.fastq) == 2 - fastqs = '-1 {0} -2 {1} '.format(*input.fastq) + fastqs = "-1 {0} -2 {1} ".format(*input.fastq) else: assert len(input.fastq) == 1 - fastqs = '-U {0} '.format(input.fastq) + fastqs = "-U {0} ".format(input.fastq) shell( "hisat2 " "-x {prefix} " "{fastqs} " - '--no-unal ' + "--no-unal " "--threads {threads} " "-S {sam} " "> {log} 2>&1" @@ -161,40 +182,42 @@ if config['aligner'] == 'hisat2': "&& rm {sam}" ) + + # TODO: star has lots of rules. Better to be in rules/aligner.smk? -if config['aligner'].startswith('star'): +if config["aligner"].startswith("star"): # STAR can be run in 1-pass or 2-pass modes. Since we may be running it # more than once in almost the same way, we pull out the shell command here # and use it below. 
STAR_CMD = ( - 'STAR ' - '--runThreadN {threads} ' - '--genomeDir {genomedir} ' - '--readFilesIn {input.fastq} ' - '--readFilesCommand zcat ' - '--outFileNamePrefix {prefix} ' - '{params.extra} ' + "STAR " + "--runThreadN {threads} " + "--genomeDir {genomedir} " + "--readFilesIn {input.fastq} " + "--readFilesCommand zcat " + "--outFileNamePrefix {prefix} " + "{params.extra} " ) STAR_PARAMS = ( # NOTE: The STAR docs indicate that the following parameters are # standard options for ENCODE long-RNA-seq pipeline. Comments are from # the STAR docs. - '--outFilterType BySJout ' # reduces number of spurious junctions - '--outFilterMultimapNmax 20 ' # if more than this many multimappers, consider unmapped - '--alignSJoverhangMin 8 ' # min overhang for unannotated junctions - '--alignSJDBoverhangMin 1 ' # min overhang for annotated junctions - '--outFilterMismatchNmax 999 ' # max mismatches per pair - '--outFilterMismatchNoverReadLmax 0.04 ' # max mismatches per pair relative to read length - '--alignIntronMin 20 ' # min intron length - '--alignIntronMax 1000000 ' # max intron length - '--alignMatesGapMax 1000000 ' # max distance between mates - '--outSAMunmapped None ' # do not report aligned reads in output + "--outFilterType BySJout " # reduces number of spurious junctions + "--outFilterMultimapNmax 20 " # if more than this many multimappers, consider unmapped + "--alignSJoverhangMin 8 " # min overhang for unannotated junctions + "--alignSJDBoverhangMin 1 " # min overhang for annotated junctions + "--outFilterMismatchNmax 999 " # max mismatches per pair + "--outFilterMismatchNoverReadLmax 0.04 " # max mismatches per pair relative to read length + "--alignIntronMin 20 " # min intron length + "--alignIntronMax 1000000 " # max intron length + "--alignMatesGapMax 1000000 " # max distance between mates + "--outSAMunmapped None " # do not report aligned reads in output ) - logfile_extensions = ['Log.progress.out', 'Log.out', 'Log.final.out', 'Log.std.out'] + logfile_extensions 
= ["Log.progress.out", "Log.out", "Log.final.out", "Log.std.out"] -if config['aligner'] == 'star': +if config["aligner"] == "star": rule star: """ @@ -203,37 +226,39 @@ if config['aligner'] == 'star': input: fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=rules.star_index.output, - annotation=f"{REFERENCES}/annotation.gtf" + annotation=f"{REFERENCES}/annotation.gtf", output: - bam=temporary(patterns['bam']), - sjout=temporary(patterns['bam'].replace('.bam', '.star.SJ.out.tab')), + bam=temporary(patterns["bam"]), + sjout=temporary(patterns["bam"].replace(".bam", ".star.SJ.out.tab")), log: - patterns['bam'].replace('.bam', '.star.bam.log') + patterns["bam"].replace(".bam", ".star.bam.log"), threads: 16 resources: mem_mb=gb(64), - runtime=autobump(hours=8) + runtime=autobump(hours=8), params: - extra=STAR_PARAMS - + extra=STAR_PARAMS, run: genomedir = os.path.dirname(input.index[0]) outdir = os.path.dirname(output[0]) - prefix = output.bam.replace('.bam', '.star.') + prefix = output.bam.replace(".bam", ".star.") shell( - STAR_CMD + ( - '--outSAMtype BAM SortedByCoordinate ' - '--outStd BAM_SortedByCoordinate > {output.bam} ' - '2> {log} ' + STAR_CMD + + ( + "--outSAMtype BAM SortedByCoordinate " + "--outStd BAM_SortedByCoordinate > {output.bam} " + "2> {log} " ) ) # move various hard-coded log files to log directory - logfiles = expand(prefix + '{ext}', ext=logfile_extensions) - shell('mkdir -p {outdir}/star_logs ' - '&& mv {logfiles} {outdir}/star_logs') + logfiles = expand(prefix + "{ext}", ext=logfile_extensions) + shell( + "mkdir -p {outdir}/star_logs " "&& mv {logfiles} {outdir}/star_logs" + ) -if config['aligner'] == 'star-twopass': + +if config["aligner"] == "star-twopass": rule star_pass1: """ @@ -242,37 +267,38 @@ if config['aligner'] == 'star-twopass': input: fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=rules.star_index.output, - annotation=f"{REFERENCES}/annotation.gtf" + annotation=f"{REFERENCES}/annotation.gtf", 
output: - sjout=temporary(patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab')), + sjout=temporary(patterns["bam"].replace(".bam", ".star-pass1.SJ.out.tab")), log: - patterns['bam'].replace('.bam', '.star-pass1.bam.log') + patterns["bam"].replace(".bam", ".star-pass1.bam.log"), threads: 16 resources: mem_mb=gb(64), - runtime=autobump(hours=8) + runtime=autobump(hours=8), params: - extra=STAR_PARAMS + extra=STAR_PARAMS, run: genomedir = os.path.dirname(input.index[0]) outdir = os.path.dirname(output[0]) - prefix = output.sjout.replace('SJ.out.tab', '') + prefix = output.sjout.replace("SJ.out.tab", "") shell( - STAR_CMD + - ( + STAR_CMD + + ( # In this first pass, we don't actually care about the # alignment -- just the detected junctions. So we output # the SAM to /dev/null. - '--outStd SAM > /dev/null ' - '2> {log} ' + "--outStd SAM > /dev/null " + "2> {log} " ) ) # move various hard-coded log files to log directory - logfiles = expand(prefix + '{ext}', ext=logfile_extensions) - shell('mkdir -p {outdir}/star-pass1_logs ' - '&& mv {logfiles} {outdir}/star-pass1_logs') - + logfiles = expand(prefix + "{ext}", ext=logfile_extensions) + shell( + "mkdir -p {outdir}/star-pass1_logs " + "&& mv {logfiles} {outdir}/star-pass1_logs" + ) rule star_pass2: """ @@ -283,47 +309,52 @@ if config['aligner'] == 'star-twopass': fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=rules.star_index.output, annotation=f"{REFERENCES}/annotation.gtf", - sjout=expand(patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab'), sample=SAMPLES), + sjout=expand( + patterns["bam"].replace(".bam", ".star-pass1.SJ.out.tab"), + sample=SAMPLES, + ), output: - bam=temporary(patterns['bam']), - sjout=temporary(patterns['bam'].replace('.bam', '.star-pass2.SJ.out.tab')), + bam=temporary(patterns["bam"]), + sjout=temporary(patterns["bam"].replace(".bam", ".star-pass2.SJ.out.tab")), log: - patterns['bam'].replace('.bam', '.star-pass2.bam.log') + patterns["bam"].replace(".bam", 
".star-pass2.bam.log"), threads: 16 resources: mem_mb=gb(64), - runtime=autobump(hours=8) + runtime=autobump(hours=8), params: - extra=STAR_PARAMS + extra=STAR_PARAMS, run: genomedir = os.path.dirname(input.index[0]) outdir = os.path.dirname(output[0]) - prefix = output.bam.replace('.bam', '.star-pass2.') + prefix = output.bam.replace(".bam", ".star-pass2.") shell( - STAR_CMD + ( + STAR_CMD + + ( # In contrast to pass 1, we will be keeping these BAMs -- # so sort them - '--outSAMtype BAM SortedByCoordinate ' - + "--outSAMtype BAM SortedByCoordinate " # Splice junction databases from all samples in the first # pass. - '--sjdbFileChrStartEnd {input.sjout} ' - '--outStd BAM_SortedByCoordinate > {output.bam} ' - '2> {log} ' + "--sjdbFileChrStartEnd {input.sjout} " + "--outStd BAM_SortedByCoordinate > {output.bam} " + "2> {log} " ) ) # move various hard-coded log files to log directory - logfiles = expand(prefix + '{ext}', ext=logfile_extensions) - shell('mkdir -p {outdir}/star-pass2_logs ' - '&& mv {logfiles} {outdir}/star-pass2_logs') + logfiles = expand(prefix + "{ext}", ext=logfile_extensions) + shell( + "mkdir -p {outdir}/star-pass2_logs " + "&& mv {logfiles} {outdir}/star-pass2_logs" + ) - shell('rm -r {prefix}_STARgenome') + shell("rm -r {prefix}_STARgenome") rule rRNA: input: - fastq=expand(patterns["cutadapt"], n=1, allow_missing=True), # currently only R1 + fastq=expand(patterns["cutadapt"], n=1, allow_missing=True), index=multiext( f"{REFERENCES}/bowtie2/rrna", ".1.bt2", @@ -335,21 +366,21 @@ rule rRNA: ".fa", ), output: - bam=temporary(patterns['rrna']['bam']) + bam=temporary(patterns["rrna"]["bam"]), log: - patterns['rrna']['bam'] + '.log' + patterns["rrna"]["bam"] + ".log", threads: 6 resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), params: extra=( - '-k 1 ' # NOTE: we only care if >=1 mapped - '--no-unal ' # NOTE: suppress unaligned reads - ) + "-k 1 " + "--no-unal " + ), run: prefix = 
os.path.commonprefix(input.index).rstrip(".") - sam = output.bam.replace('.bam', '.sam') + sam = output.bam.replace(".bam", ".sam") shell( "bowtie2 " @@ -370,41 +401,41 @@ rule rRNA: rule fastq_count: input: - fastq='{sample_dir}/{sample}/{sample}{suffix}.fastq.gz' + fastq="{sample_dir}/{sample}/{sample}{suffix}.fastq.gz", output: - '{sample_dir}/{sample}/{sample}{suffix}.fastq.gz.libsize' + "{sample_dir}/{sample}/{sample}{suffix}.fastq.gz.libsize", threads: 1 resources: mem_mb=gb(1), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'zcat {input} | echo $((`wc -l`/4)) > {output}' + "zcat {input} | echo $((`wc -l`/4)) > {output}" rule bam_count: input: - bam='{sample_dir}/{sample}/{suffix}.bam' + bam="{sample_dir}/{sample}/{suffix}.bam", output: - '{sample_dir}/{sample}/{suffix}.bam.libsize' + "{sample_dir}/{sample}/{suffix}.bam.libsize", threads: 1 resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'samtools view -c {input} > {output}' + "samtools view -c {input} > {output}" rule bam_index: input: - bam='{prefix}.bam' + bam="{prefix}.bam", output: - bai='{prefix}.bam.bai' + bai="{prefix}.bam.bai", threads: 1 resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'samtools index {input} {output}' + "samtools index {input} {output}" # TODO: split into multiple featurecounts runs, since PE needs to be sorted each time. 
@@ -414,91 +445,101 @@ rule featurecounts: """ input: annotation=rules.gtf.output, - bam=expand(patterns['markduplicates']['bam'], sample=SAMPLES), + bam=expand(patterns["markduplicates"]["bam"], sample=SAMPLES), output: - counts='{sample_dir}/rnaseq_aggregation/featurecounts.txt' + counts="{sample_dir}/rnaseq_aggregation/featurecounts.txt", log: - '{sample_dir}/rnaseq_aggregation/featurecounts.txt.log' + "{sample_dir}/rnaseq_aggregation/featurecounts.txt.log", threads: 8 resources: mem_mb=gb(16), - runtime=autobump(hours=2) + runtime=autobump(hours=2), params: strand_arg={ - 'unstranded': '-s0 ', - 'fr-firststrand': '-s2 ', - 'fr-secondstrand': '-s1 ', - }[config["stranded"]], - extra="" + "unstranded": "-s0 ", + "fr-firststrand": "-s2 ", + "fr-secondstrand": "-s1 ", + }[config["stranded"]], + extra="", run: # NOTE: By default, we use -p for paired-end - p_arg = '' + p_arg = "" if is_paired: - p_arg = '-p --countReadPairs ' + p_arg = "-p --countReadPairs " shell( - 'featureCounts ' - '{params.strand_arg} ' - '{p_arg} ' - '-T {threads} ' - '-a {input.annotation} ' - '-o {output.counts} ' - '{input.bam} ' - '&> {log}' + "featureCounts " + "{params.strand_arg} " + "{p_arg} " + "-T {threads} " + "-a {input.annotation} " + "-o {output.counts} " + "{input.bam} " + "&> {log}" ) -# TODO: port some of this over to utils, or maybe script. + +# # TODO: port some of this over to utils, or maybe script. 
rule rrna_libsizes_table: """ Aggregate rRNA counts into a table """ input: - rrna=expand(patterns['rrna']['libsize'], sample=SAMPLES), - fastq=expand(patterns['libsizes']['cutadapt'], sample=SAMPLES), + rrna=expand(patterns["rrna"]["libsize"], sample=SAMPLES), + fastq=expand(patterns["libsizes"]["cutadapt"], sample=SAMPLES), output: - json=patterns['rrna_percentages_yaml'], - tsv=patterns['rrna_percentages_table'] + json=patterns["rrna_percentages_yaml"], + tsv=patterns["rrna_percentages_table"], threads: 1 resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), run: def rrna_sample(f): - return utils.extract_wildcards(patterns['rrna']['libsize'], f)['sample'] + return utils.extract_wildcards(patterns["rrna"]["libsize"], f)["sample"] + def sample(f): - return utils.extract_wildcards(patterns['libsizes']['cutadapt'], f)['sample'] + return utils.extract_wildcards(patterns["libsizes"]["cutadapt"], f)[ + "sample" + ] + def million(f): return float(open(f).read()) / 1e6 + rrna = sorted(input.rrna, key=rrna_sample) fastq = sorted(input.fastq, key=sample) samples = list(map(rrna_sample, rrna)) rrna_m = list(map(million, rrna)) fastq_m = list(map(million, fastq)) - df = pd.DataFrame(dict( - sample=samples, - million_reads_rRNA=rrna_m, - million_reads_fastq=fastq_m, - )) - df = df.set_index('sample') - df['rRNA_percentage'] = df.million_reads_rRNA / df.million_reads_fastq * 100 + df = pd.DataFrame( + dict( + sample=samples, + million_reads_rRNA=rrna_m, + million_reads_fastq=fastq_m, + ) + ) + df = df.set_index("sample") + df["rRNA_percentage"] = df.million_reads_rRNA / df.million_reads_fastq * 100 - df[['million_reads_fastq', 'million_reads_rRNA', 'rRNA_percentage']].to_csv(output.tsv, sep='\t') + df[["million_reads_fastq", "million_reads_rRNA", "rRNA_percentage"]].to_csv( + output.tsv, sep="\t" + ) y = { - 'id': 'rrna_percentages_table', - 'section_name': 'rRNA content', - 'description': 'Amount of reads mapping to rRNA sequence', - 'plot_type': 
'table', - 'pconfig': { - 'id': 'rrna_percentages_table_table', - 'title': 'rRNA content table', - 'min': 0 + "id": "rrna_percentages_table", + "section_name": "rRNA content", + "description": "Amount of reads mapping to rRNA sequence", + "plot_type": "table", + "pconfig": { + "id": "rrna_percentages_table_table", + "title": "rRNA content table", + "min": 0, }, - 'data': yaml.load(df.transpose().to_json(), Loader=yaml.FullLoader), + "data": yaml.load(df.transpose().to_json(), Loader=yaml.FullLoader), } - with open(output.json, 'w') as fout: + with open(output.json, "w") as fout: yaml.dump(y, fout, default_flow_style=False) @@ -521,29 +562,29 @@ rule multiqc: patterns["rrna_percentages_table"], patterns["featurecounts"], ), - config='config/multiqc_config.yaml' + config="config/multiqc_config.yaml", output: - 'data/rnaseq_aggregation/multiqc.html' + "data/rnaseq_aggregation/multiqc.html", log: - 'data/rnaseq_aggregation/multiqc.log' + "data/rnaseq_aggregation/multiqc.log", threads: 1 resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), run: analysis_directory = set([os.path.dirname(i) for i in input]) outdir = os.path.dirname(output[0]) basename = os.path.basename(output[0]) shell( - 'LC_ALL=en_US.utf8 LC_LANG=en_US.utf8 ' - 'multiqc ' - '--quiet ' - '--outdir {outdir} ' - '--force ' - '--filename {basename} ' - '--config {input.config} ' - '{analysis_directory} ' - '&> {log} ' + "LC_ALL=en_US.utf8 LC_LANG=en_US.utf8 " + "multiqc " + "--quiet " + "--outdir {outdir} " + "--force " + "--filename {basename} " + "--config {input.config} " + "{analysis_directory} " + "&> {log} " ) @@ -552,29 +593,29 @@ rule markduplicates: Mark or remove PCR duplicates with Picard MarkDuplicates """ input: - bam=patterns['bam'] + bam=patterns["bam"], output: - bam=patterns['markduplicates']['bam'], - metrics=patterns['markduplicates']['metrics'], + bam=patterns["markduplicates"]["bam"], + metrics=patterns["markduplicates"]["metrics"], log: - 
patterns['markduplicates']['bam'] + '.log' + patterns["markduplicates"]["bam"] + ".log", threads: 1 resources: mem_mb=gb(32), runtime=autobump(hours=2), disk_mb=autobump(gb=100), params: - java_args='-Xmx20g' + java_args="-Xmx20g", # java_args='-Xmx2g' # [TEST SETTINGS -1] shell: - 'picard ' - '{params.java_args} ' - 'MarkDuplicates ' - 'INPUT={input.bam} ' - 'OUTPUT={output.bam} ' - 'METRICS_FILE={output.metrics} ' - 'VALIDATION_STRINGENCY=LENIENT ' - '&> {log}' + "picard " + "{params.java_args} " + "MarkDuplicates " + "INPUT={input.bam} " + "OUTPUT={output.bam} " + "METRICS_FILE={output.metrics} " + "VALIDATION_STRINGENCY=LENIENT " + "&> {log}" rule collectrnaseqmetrics: @@ -582,35 +623,35 @@ rule collectrnaseqmetrics: Calculate various RNA-seq QC metrics with Picarc CollectRnaSeqMetrics """ input: - bam=patterns['markduplicates']['bam'], + bam=patterns["markduplicates"]["bam"], refflat=rules.conversion_refflat.output, output: - metrics=patterns['collectrnaseqmetrics']['metrics'], + metrics=patterns["collectrnaseqmetrics"]["metrics"], log: - patterns['collectrnaseqmetrics']['metrics'] + '.log' + patterns["collectrnaseqmetrics"]["metrics"] + ".log", threads: 1 resources: mem_mb=gb(32), - runtime=autobump(hours=2) + runtime=autobump(hours=2), params: - java_args='-Xmx20g', + java_args="-Xmx20g", # java_args='-Xmx2g', # [TEST SETTINGS -1] strand_arg={ - 'unstranded': 'STRAND=NONE ', - 'fr-firststrand': 'STRAND=SECOND_READ_TRANSCRIPTION_STRAND ', - 'fr-secondstrand': 'STRAND=FIRST_READ_TRANSCRIPTION_STRAND ', - }[config["stranded"]] + "unstranded": "STRAND=NONE ", + "fr-firststrand": "STRAND=SECOND_READ_TRANSCRIPTION_STRAND ", + "fr-secondstrand": "STRAND=FIRST_READ_TRANSCRIPTION_STRAND ", + }[config["stranded"]], run: shell( - 'picard ' - '{params.java_args} ' - 'CollectRnaSeqMetrics ' - '{params.strand_arg} ' - 'VALIDATION_STRINGENCY=LENIENT ' - 'REF_FLAT={input.refflat} ' - 'INPUT={input.bam} ' - 'OUTPUT={output.metrics} ' - '&> {log}' + "picard " + 
"{params.java_args} " + "CollectRnaSeqMetrics " + "{params.strand_arg} " + "VALIDATION_STRINGENCY=LENIENT " + "REF_FLAT={input.refflat} " + "INPUT={input.bam} " + "OUTPUT={output.metrics} " + "&> {log}" ) @@ -619,18 +660,18 @@ rule preseq: Compute a library complexity curve with preseq """ input: - bam=patterns['bam'] + bam=patterns["bam"], output: - patterns['preseq'] + patterns["preseq"], threads: 1 resources: mem_mb=gb(1), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'preseq ' - 'c_curve ' - '-B {input} ' - '-o {output} ' + "preseq " + "c_curve " + "-B {input} " + "-o {output} " rule salmon: @@ -639,37 +680,37 @@ rule salmon: """ input: fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), - index=REFERENCES + "/salmon/versionInfo.json" + index=REFERENCES + "/salmon/versionInfo.json", output: - patterns['salmon'] + patterns["salmon"], log: - patterns['salmon'] + '.log' + patterns["salmon"] + ".log", threads: 6 resources: mem_mb=gb(32), - runtime=autobump(hours=2) + runtime=autobump(hours=2), params: extra=( "--libType=A " "--gcBias " "--seqBias " "--validateMappings " - ) + ), run: outdir = os.path.dirname(output[0]) index_dir = os.path.dirname(input.index) if is_paired: - fastq_arg = f'-1 {input.fastq[0]} -2 {input.fastq[1]} ' + fastq_arg = f"-1 {input.fastq[0]} -2 {input.fastq[1]} " else: - fastq_arg = f'-r {input.fastq} ' + fastq_arg = f"-r {input.fastq} " shell( - 'salmon quant ' - '--index {index_dir} ' - '--output {outdir} ' - '--threads {threads} ' - '{params.extra} ' - '{fastq_arg} ' - '&> {log}' + "salmon quant " + "--index {index_dir} " + "--output {outdir} " + "--threads {threads} " + "{params.extra} " + "{fastq_arg} " + "&> {log}" ) @@ -681,55 +722,56 @@ rule kallisto: fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=REFERENCES + "/kallisto/transcripts.idx", output: - patterns['kallisto'] + patterns["kallisto"], log: - patterns['kallisto'] + '.log' - threads: - 8 + patterns["kallisto"] + ".log", + 
threads: 8 resources: mem_mb=gb(32), runtime=autobump(hours=2), params: strand_arg={ - 'unstranded': '', - 'fr-firststrand': '--rf-stranded', - 'fr-secondstrand': '--fr-stranded', - }[config["stranded"]], + "unstranded": "", + "fr-firststrand": "--rf-stranded", + "fr-secondstrand": "--fr-stranded", + }[config["stranded"]], extra=( - "--bootstrap-samples 100" if is_paired else - "--single --fragment-length 300 --sd 20 --bootstrap-samples 100" + "--bootstrap-samples 100" + if is_paired + else "--single --fragment-length 300 --sd 20 --bootstrap-samples 100" ), run: outdir = os.path.dirname(output[0]) shell( - 'kallisto quant ' - '--index {input.index} ' - '--output-dir {outdir} ' - '--threads {threads} ' - '--bootstrap-samples 100 ' - '--threads {threads} ' - '{params.strand_arg} ' - '{params.extra} ' - '{input.fastq} ' - '&> {log}' + "kallisto quant " + "--index {input.index} " + "--output-dir {outdir} " + "--threads {threads} " + "--bootstrap-samples 100 " + "--threads {threads} " + "{params.strand_arg} " + "{params.extra} " + "{input.fastq} " + "&> {log}" ) + rule rseqc_infer_experiment: """ Infer strandedness of experiment """ input: - bam=patterns['markduplicates']['bam'], + bam=patterns["markduplicates"]["bam"], bed12=rules.conversion_bed12.output, output: - txt=patterns['rseqc']['infer_experiment'] + txt=patterns["rseqc"]["infer_experiment"], log: - patterns['rseqc']['infer_experiment'] + '.log' + patterns["rseqc"]["infer_experiment"] + ".log", resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'infer_experiment.py -r {input.bed12} -i {input.bam} > {output} &> {log}' + "infer_experiment.py -r {input.bed12} -i {input.bam} > {output} &> {log}" rule rseqc_read_distribution: @@ -737,17 +779,17 @@ rule rseqc_read_distribution: read distribution plots """ input: - bam=patterns['markduplicates']['bam'], + bam=patterns["markduplicates"]["bam"], bed12=rules.conversion_bed12.output, output: - 
txt=patterns['rseqc']['read_distribution'] + txt=patterns["rseqc"]["read_distribution"], log: - patterns['rseqc']['read_distribution'] + '.log' + patterns["rseqc"]["read_distribution"] + ".log", resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'read_distribution.py -i {input.bam} -r {input.bed12} > {output} &> {log}' + "read_distribution.py -i {input.bam} -r {input.bed12} > {output} &> {log}" rule idxstats: @@ -755,19 +797,17 @@ rule idxstats: Run samtools idxstats on sample bams """ input: - bam=patterns['markduplicates']['bam'], - bai=patterns['markduplicates']['bam'] + '.bai' + bam=patterns["markduplicates"]["bam"], + bai=patterns["markduplicates"]["bam"] + ".bai", output: - txt=patterns['samtools']['idxstats'] - log: - patterns['samtools']['idxstats'] + '.log' + txt=patterns["samtools"]["idxstats"], + log: + patterns["samtools"]["idxstats"] + ".log", resources: mem_mb=gb(16), - runtime=autobump(hours=2) + runtime=autobump(hours=2), run: - shell( - 'samtools idxstats {input.bam} 2> {log} 1> {output.txt}' - ) + shell("samtools idxstats {input.bam} 2> {log} 1> {output.txt}") rule bigwig_neg: @@ -775,36 +815,36 @@ rule bigwig_neg: Create a bigwig for negative-strand reads """ input: - bam=patterns['markduplicates']['bam'], - bai=patterns['markduplicates']['bam'] + '.bai', + bam=patterns["markduplicates"]["bam"], + bai=patterns["markduplicates"]["bam"] + ".bai", output: - patterns['bigwig']['neg'] + patterns["bigwig"]["neg"], threads: 8 resources: mem_mb=gb(16), - runtime=autobump(hours=2) + runtime=autobump(hours=2), log: - patterns['bigwig']['neg'] + '.log' + patterns["bigwig"]["neg"] + ".log", params: - strand_arg = { - 'unstranded': '', - 'fr-firststrand': '--filterRNAstrand reverse ', - 'fr-secondstrand': '--filterRNAstrand forward ', - }[config["stranded"]], + strand_arg={ + "unstranded": "", + "fr-firststrand": "--filterRNAstrand reverse ", + "fr-secondstrand": "--filterRNAstrand forward ", + }[config["stranded"]], 
extra=( - '--minMappingQuality 20 ' - '--smoothLength 10 ' - '--normalizeUsing BPM ' # equivalent to TPM # [TEST SETTINGS] + "--minMappingQuality 20 " + "--smoothLength 10 " + "--normalizeUsing BPM " # [TEST SETTINGS] ), run: shell( - 'bamCoverage ' - '--bam {input.bam} ' - '-o {output} ' - '-p {threads} ' - '{params.extra} ' - '{params.strand_arg} ' - '&> {log}' + "bamCoverage " + "--bam {input.bam} " + "-o {output} " + "-p {threads} " + "{params.extra} " + "{params.strand_arg} " + "&> {log}" ) @@ -813,58 +853,58 @@ rule bigwig_pos: Create a bigwig for postive-strand reads. """ input: - bam=patterns['markduplicates']['bam'], - bai=patterns['markduplicates']['bam'] + '.bai', + bam=patterns["markduplicates"]["bam"], + bai=patterns["markduplicates"]["bam"] + ".bai", output: - patterns['bigwig']['pos'] + patterns["bigwig"]["pos"], threads: 8 resources: mem_mb=gb(16), - runtime=autobump(hours=2) + runtime=autobump(hours=2), log: - patterns['bigwig']['pos'] + '.log' + patterns["bigwig"]["pos"] + ".log", params: strand_arg={ - 'unstranded': '', - 'fr-firststrand': '--filterRNAstrand forward ', - 'fr-secondstrand': '--filterRNAstrand reverse ', - }[config["stranded"]], + "unstranded": "", + "fr-firststrand": "--filterRNAstrand forward ", + "fr-secondstrand": "--filterRNAstrand reverse ", + }[config["stranded"]], extra=( - '--minMappingQuality 20 ' - '--smoothLength 10 ' - '--normalizeUsing BPM ' # equivalent to TPM # [TEST SETTINGS] + "--minMappingQuality 20 " + "--smoothLength 10 " + "--normalizeUsing BPM " # [TEST SETTINGS] ), run: shell( - 'bamCoverage ' - '--bam {input.bam} ' - '-o {output} ' - '-p {threads} ' - '{params.extra} ' - '{params.strand_arg} ' - '&> {log}' + "bamCoverage " + "--bam {input.bam} " + "-o {output} " + "-p {threads} " + "{params.extra} " + "{params.strand_arg} " + "&> {log}" ) rule flagstat: input: - bam=patterns['markduplicates']['bam'], - bai=patterns['markduplicates']['bam'] + '.bai' + bam=patterns["markduplicates"]["bam"], + 
bai=patterns["markduplicates"]["bam"] + ".bai", output: - patterns['samtools']['flagstat'] + patterns["samtools"]["flagstat"], log: - patterns['samtools']['flagstat'] + '.log' + patterns["samtools"]["flagstat"] + ".log", shell: - 'samtools flagstat {input.bam} > {output}' + "samtools flagstat {input.bam} > {output}" rule samtools_stats: input: - bam=patterns['markduplicates']['bam'], - bai=patterns['markduplicates']['bam'] + '.bai' + bam=patterns["markduplicates"]["bam"], + bai=patterns["markduplicates"]["bam"] + ".bai", output: - patterns['samtools']['stats'] + patterns["samtools"]["stats"], log: - patterns['samtools']['stats'] + '.log' + patterns["samtools"]["stats"] + ".log", shell: - 'samtools stats {input.bam} > {output}' + "samtools stats {input.bam} > {output}" From 6968e4fd3a3d4deb7e3d41bd3e9508ffe1416db7 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 10 Jan 2025 23:08:58 -0500 Subject: [PATCH 045/196] move wrappers to scripts --- wrappers/wrappers/epic2/wrapper.py => scripts/epic2.py | 0 .../macs2/callpeak/wrapper.py => scripts/macs2_callpeak.py | 0 .../merge_and_dedup/wrapper.py => scripts/merge_and_dedup.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename wrappers/wrappers/epic2/wrapper.py => scripts/epic2.py (100%) rename wrappers/wrappers/macs2/callpeak/wrapper.py => scripts/macs2_callpeak.py (100%) rename wrappers/wrappers/combos/merge_and_dedup/wrapper.py => scripts/merge_and_dedup.py (100%) diff --git a/wrappers/wrappers/epic2/wrapper.py b/scripts/epic2.py similarity index 100% rename from wrappers/wrappers/epic2/wrapper.py rename to scripts/epic2.py diff --git a/wrappers/wrappers/macs2/callpeak/wrapper.py b/scripts/macs2_callpeak.py similarity index 100% rename from wrappers/wrappers/macs2/callpeak/wrapper.py rename to scripts/macs2_callpeak.py diff --git a/wrappers/wrappers/combos/merge_and_dedup/wrapper.py b/scripts/merge_and_dedup.py similarity index 100% rename from wrappers/wrappers/combos/merge_and_dedup/wrapper.py rename to 
scripts/merge_and_dedup.py From 8013417fa746715ebb2d9b14e517bbb5ac771995 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 10 Jan 2025 23:09:30 -0500 Subject: [PATCH 046/196] overhaul and simplify preprocessor --- ci/preprocessor.py | 120 +++++++++++++-------------------------------- 1 file changed, 35 insertions(+), 85 deletions(-) diff --git a/ci/preprocessor.py b/ci/preprocessor.py index 042bee33..6bf05361 100644 --- a/ci/preprocessor.py +++ b/ci/preprocessor.py @@ -7,54 +7,16 @@ in production. Rather than require users edit files to remove those test-specific patterns, here we keep the test settings commented out and only un-comment when running tests. - -First, we look for any line that matches "# [test settings]" (case insensitive, -with optional surrounding spacing) and an optional signed integer. Any of these -would work: - - >>> assert matches('# [test settings]') - >>> assert matches('#[test settings]') - >>> assert matches('# [ test settings ]') - >>> assert matches('# [ test settings -1]') - >>> assert matches('# [ test settings +2]') - >>> assert matches('# [ TEST SETTINGS +2]') - >>> assert matches('# [ TeSt SeTTiNgS +2 ]') - -If a lines does not match, output it as-is. - -If a line matches, then uncomment it. Specifically, remove the first "#" in the -line; if it was followed by exactly one space, then remove that too. - -If a line matches and a signed integer was provided, then consider it -a relative location, and then comment-out the referred-to line. Example: - - >>> preprocess(''' - ... use this for production - ... # use this for tests # [test settings -1] - ... '''.splitlines(True)) - - # use this for production - use this for tests # [test settings -1] - - -If the matched special string creates the first "#" in the line, then do -nothing to that line but still respect the relative locations. Useful for just -commenting out nearby lines for tests: - - >>> preprocess(''' - ... # [TEST SETTINGS +1] - ... 
comment out for testing'''.splitlines(True)) - - # [TEST SETTINGS +1] - # comment out for testing """ + import re -regexp = re.compile(r'#\s?\[\s?test settings\s?(?P[-+]*\d)?\s*\]') +regexp = re.compile(r"#\s?\[\s?(enable|disable) for test\s?\]") -def matches(line): - return regexp.search(line.lower()) is not None + +def is_commented(line): + return line.strip().startswith("#") def comment_line(line): @@ -66,87 +28,75 @@ def comment_line(line): """ x = [] for i, character in enumerate(line): - if character == ' ': + if character == " ": x.append(character) else: break - x.append('# ') + x.append("# ") x.extend(line[i:]) - return ''.join(x) + return "".join(x) def uncomment_line(line): """ Removes the first instance of "#" from a line; if it was followed by - exactly one space then remove that too. + exactly one space then remove that too . . . UNLESS the *only* comment is the + special character that triggers this behavior, in which case we do nothing. >>> assert uncomment_line('# asdf') == 'asdf' >>> assert uncomment_line('#asdf') == 'asdf' >>> assert uncomment_line('# asdf # but this should be kept') == 'asdf # but this should be kept' >>> assert uncomment_line('# asdf') == ' asdf' >>> assert uncomment_line(' # asdf') == ' asdf' + >>> assert uncomment_line('do nothing') == 'do nothing' + >>> assert uncomment_line('do nothing # [disable for test]') == 'do nothing # [disable for test]') + >>> assert uncomment_line('#uncomment # [disable for test]') == 'uncomment # [disable for test]') """ - first = line.find('#') + first = line.find("#") - # If the first comment is the one that flag the line, then do nothing. + # If the first comment is the one that flagged the line, then do nothing. 
m = regexp.search(line.lower()) if m: if m.start() == first: return line - if line[first + 1] == ' ' and line[first + 2] != ' ': - pattern = '# ' + if line[first + 1] == " " and line[first + 2] != " ": + pattern = "# " else: - pattern = '#' - return line.replace(pattern, '', 1) + pattern = "#" + return line.replace(pattern, "", 1) def preprocess(lines): + result = [] if isinstance(lines, str): lines = [lines] - # These lists will keep track of whether a line should be changed. We need to - # create them ahead of time so that we can use relative indexing from line N to - # modify the state of lines N-1 or N+1 - uncomment = [False for i in range(len(lines))] - comment = [False for i in range(len(lines))] - - for i, line in enumerate(lines): + for line in lines: m = regexp.search(line.lower()) - if m: - # There as at least a "[ test settings ]", so remove comment - uncomment[i] = True - - # Figure out if there was also a relative location to uncomment, - # and keep track of it in the `comment` list. - rel = m.group('rel') - if rel is not None: - rel = int(rel) - comment[i + rel] = True + if not m: + result.append(line) + continue - result = [] - for (c, u, line) in zip(comment, uncomment, lines): - # E.g., in this situation, unclear what should happen: - # - # # [test settings] - # # [test settings -1] - # - if c and u: - raise ValueError("Line {0} is trying to be both commented and uncommented".format(line)) - if c: - result.append(comment_line(line)) - elif u: + action = m.group(1) + if action == "enable" and is_commented(line): result.append(uncomment_line(line)) + elif action == "disable" and not is_commented(line): + result.append(comment_line(line)) else: - result.append(line) - print(''.join(result)) + raise ValueError(f"Inconsistent commenting and action:\n{line}") + + print("".join(result)) if __name__ == "__main__": import argparse + ap = argparse.ArgumentParser(usage=__doc__) - ap.add_argument('infile', help='Input file to modify. 
Modified file printed to stdout.') + ap.add_argument( + "infile", help="Input file to modify. Modified file printed to stdout." + ) args = ap.parse_args() lines = open(args.infile).readlines() preprocess(lines) From 595eddf83c6cd784ce632b84ff71b2352607a473 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 10 Jan 2025 23:09:52 -0500 Subject: [PATCH 047/196] add bed_to_bigbed as script --- scripts/bed_to_bigbed.py | 56 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 scripts/bed_to_bigbed.py diff --git a/scripts/bed_to_bigbed.py b/scripts/bed_to_bigbed.py new file mode 100644 index 00000000..13ab5444 --- /dev/null +++ b/scripts/bed_to_bigbed.py @@ -0,0 +1,56 @@ +import sys +import os +import numpy as np +import pandas as pd +from snakemake.shell import shell + +sys.path.insert(0, os.path.dirname(__file__) + "/..") +from lib import chipseq + +# Based on the filename, identify the algorithm; +# Based on the contents, identify the format. +algorithm = os.path.basename(os.path.dirname(snakemake.input.bed)) +kind = chipseq.detect_peak_format(snakemake.input.bed) + +# bedToBigBed doesn't handle zero-size files +if os.stat(snakemake.input.bed).st_size == 0: + shell("touch {output}") + +# Note that autoSql filenames are relative to the workdir of the snakefile +# calling this script. 
+elif kind == 'narrowPeak': + _as = '../../include/autosql/bigNarrowPeak.as' + _type = 'bed6+4' + names=[ + 'chrom', 'chromStart', 'chromEnd', 'name', 'score', + 'strand', 'signalValue', 'pValue', 'qValue', 'peak'] +elif kind == 'broadPeak': + _as = '../../include/autosql/bigBroadPeak.as' + _type = 'bed6+3' + names=[ + 'chrom', 'chromStart', 'chromEnd', 'name', 'score', + 'strand', 'signalValue', 'pValue', 'qValue'] +elif kind == 'epic2Input': + _as = f'../../include/autosql/{kind}Peak.as' + _type = 'bed6+4' + names=[ + 'chrom', 'chromStart', 'chromEnd', 'pValue', 'score', + 'strand', 'ChIPCount', 'InputCount', 'FDR', 'log2FoldChange'] +elif kind == 'epic2NoInput': + _as = f'../../include/autosql/{kind}Peak.as' + _type = 'bed6' + names=[ + 'chrom', 'chromStart', 'chromEnd', 'ChIPCount', 'score', + 'strand'] +else: + raise ValueError("Unhandled format for {0}".format(input.bed)) + +df = pd.read_table(snakemake.input.bed, index_col=False, names=names) +df['score'] = df['score'] - df['score'].min() +df['score'] = (df['score'] / df['score'].max()) * 1000 +df['score'] = df['score'].replace([np.inf, -np.inf], np.nan).fillna(0) +df['score'] = df['score'].astype(int) +df.to_csv(snakemake.output[0] + '.tmp', sep='\t', index=False, header=False) + +shell('bedToBigBed -as={_as} -type={_type} {snakemake.output}.tmp {snakemake.input.chromsizes} {snakemake.output} &> {snakemake.log}') +shell('rm {snakemake.output}.tmp') From 95cefeacf068d69f6799e5d56a86f9f6ac68d04b Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 10 Jan 2025 23:10:08 -0500 Subject: [PATCH 048/196] add peakcallers to requirements.txt --- include/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/requirements.txt b/include/requirements.txt index fd8df8be..a2b21ee3 100644 --- a/include/requirements.txt +++ b/include/requirements.txt @@ -4,6 +4,7 @@ bowtie bowtie2 cutadapt>=3.0 deeptools +epic2 fastq-screen fastqc font-ttf-dejavu-sans-mono @@ -13,6 +14,7 @@ hisat2 intervalstats ipython 
kallisto +macs2 multiqc pandas pandoc From c35776379d1c933e013c28b5ec2bbb37e5cd1950 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 10 Jan 2025 23:10:44 -0500 Subject: [PATCH 049/196] clean up log handling for epic2 --- scripts/epic2.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/scripts/epic2.py b/scripts/epic2.py index ee66e766..6ac30bdb 100644 --- a/scripts/epic2.py +++ b/scripts/epic2.py @@ -2,8 +2,6 @@ import glob from snakemake import shell -log = snakemake.log_fmt_shell() -logfile = None extra = snakemake.params.get('extra', '') outdir, basebed = os.path.split(snakemake.output.bed) @@ -11,21 +9,17 @@ extra = snakemake.params.block.get('extra', '') # `-c` has to be skipped if no control is provided -# if os.path.isfile(snakemake.input.control): if len(snakemake.input.control) > 0: arguments = '-c {snakemake.input.control} ' else: arguments = '' -# Add `--guess-bampe` if input dataset is paired-end -if snakemake.params.is_paired: - arguments += '--guess-bampe ' shell( 'epic2 ' + arguments + extra + '-t {snakemake.input.ip} ' - '--chromsizes {snakemake.input.chromsizes} | ' - 'sort -k1,1 -k2,2n > {label}.tmp.bed ' + '--chromsizes {snakemake.input.chromsizes} 2> {snakemake.log} | ' + 'sort -k1,1 -k2,2n > {label}.tmp.bed' ) # Fix the output file so that it doesn't have negative numbers and so it fits From 52ac28aa49d6f310e66517ece8e7143c39f46ee8 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 10 Jan 2025 23:11:21 -0500 Subject: [PATCH 050/196] test settings overhaul --- workflows/rnaseq/Snakefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 3b384cd3..84682e9d 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -605,8 +605,8 @@ rule markduplicates: runtime=autobump(hours=2), disk_mb=autobump(gb=100), params: - java_args="-Xmx20g", - # java_args='-Xmx2g' # [TEST SETTINGS -1] + java_args="-Xmx20g", # 
[disable for test] + # java_args='-Xmx2g' # [enable for test] shell: "picard " "{params.java_args} " @@ -634,8 +634,8 @@ rule collectrnaseqmetrics: mem_mb=gb(32), runtime=autobump(hours=2), params: - java_args="-Xmx20g", - # java_args='-Xmx2g', # [TEST SETTINGS -1] + java_args="-Xmx20g", # [disable for test] + # java_args='-Xmx2g', # [enable for test] strand_arg={ "unstranded": "STRAND=NONE ", "fr-firststrand": "STRAND=SECOND_READ_TRANSCRIPTION_STRAND ", @@ -834,7 +834,7 @@ rule bigwig_neg: extra=( "--minMappingQuality 20 " "--smoothLength 10 " - "--normalizeUsing BPM " # [TEST SETTINGS] + "--normalizeUsing BPM " # [disable for test] ), run: shell( @@ -872,7 +872,7 @@ rule bigwig_pos: extra=( "--minMappingQuality 20 " "--smoothLength 10 " - "--normalizeUsing BPM " # [TEST SETTINGS] + "--normalizeUsing BPM " # [disable for test] ), run: shell( From 9024aa6d5c8983916fd4601a2ba4144e937e4fc4 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 10 Jan 2025 23:11:29 -0500 Subject: [PATCH 051/196] comment sampletable --- workflows/rnaseq/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 84682e9d..a94f4639 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -16,7 +16,7 @@ include: "../references/Snakefile" REFERENCES = config.get("reference_dir", "../../references") -sampletable = pd.read_table(config["sampletable"], sep="\t") +sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" is_sra = utils.detect_sra(sampletable) From 2483b98ab569ea278a9cd5dc835332bf47f582db Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 10 Jan 2025 23:11:42 -0500 Subject: [PATCH 052/196] various rnaseq fixes --- workflows/rnaseq/Snakefile | 40 ++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git 
a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index a94f4639..a328d887 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -1,7 +1,6 @@ import sys import os import yaml -import tempfile import pandas as pd sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") @@ -38,6 +37,7 @@ localrules: rule all: input: patterns["multiqc"], + patterns["bigwig"], if is_sra: @@ -124,19 +124,38 @@ rule cutadapt: ) -# TODO: rm wrapper rule fastqc: input: - "{sample_dir}/{sample}/{sample}{suffix}", - threads: 6 + '{sample_dir}/{sample}/{sample}{suffix}' + threads: + 1 output: - html="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html", - zip="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip", + html='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html', + zip='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip', resources: mem_mb=gb(8), - runtime=autobump(hours=2), - script: - utils.wrapper_for("fastqc/wrapper.py") + runtime=autobump(hours=2) + log: + '{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.log', + run: + outdir = os.path.dirname(output.html) or "." 
+ shell( + 'fastqc ' + '--noextract ' + '--quiet ' + '--outdir {outdir} ' + '{input} ' + '{log} ' + ) + outfile = os.path.basename(input[0]) + for s in ['.fastq', '.fq', '.gz', '.bam']: + outfile = outfile.replace(s, '') + out_zip = os.path.join(outdir, outfile + '_fastqc.zip') + if not os.path.abspath(out_zip) == os.path.abspath(output.zip): + shell('mv {out_zip} {output.zip}') + out_html = os.path.join(outdir, outfile + '_fastqc.html') + if not os.path.abspath(out_html) == os.path.abspath(output.html): + shell('mv {out_html} {output.html}') if config["aligner"] == "hisat2": @@ -149,7 +168,7 @@ if config["aligner"] == "hisat2": bam=temporary(patterns["bam"]), log: patterns["bam"] + ".log", - threads: 6 + threads: 16 resources: mem_mb=gb(32), runtime=autobump(hours=8), @@ -173,6 +192,7 @@ if config["aligner"] == "hisat2": "--no-unal " "--threads {threads} " "-S {sam} " + "{params.extra} " "> {log} 2>&1" ) From 227646c8433d0eecf91b4f0b1237be64677117c0 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 10 Jan 2025 23:12:11 -0500 Subject: [PATCH 053/196] chipseq overhaul and simplification --- lib/chipseq.py | 36 +- workflows/chipseq/Snakefile | 971 +++++++----------- .../chipseq/config/chipseq_patterns.yaml | 10 +- workflows/chipseq/config/config.yaml | 51 - 4 files changed, 413 insertions(+), 655 deletions(-) diff --git a/lib/chipseq.py b/lib/chipseq.py index 887bb9f9..62608ed8 100644 --- a/lib/chipseq.py +++ b/lib/chipseq.py @@ -1,9 +1,11 @@ +from snakemake.io import expand + """ Helpers for ChIP-seq. 
""" # Example config for reference -# __example_config__ = { +# { # 'peak_calling': { # [ # { @@ -24,7 +26,32 @@ # ] # } # } +# +# This needs to be expanded out to the following patterns: +# +# [ +# 'data/chipseq_peaks/macs2/rep1/peaks.bigbed', +# 'data/chipseq_peaks/macs2/rep2/peaks.bigbed', +# ] +# +# Which in turn needs these bams: +# +# [ +# expand(patterns['merged_techreps'], label=['input_1', 'ip_1']), +# expand(patterns['merged_techreps'], label=['input_2', 'ip_2']), +# +# +def add_bams_to_peak_calling(config): + d = peak_calling_dict(config) + for key, block in d.items(): + peak_calling_run, algorithm = key + block['ip_bams'] = expand('data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam', label=block['ip']) + block['control_bams'] = expand('data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam', label=block['control']) + block['bed'] = f"data/chipseq_peaks/{algorithm}/{peak_calling_run}/peaks.bed" + block['bigbed'] = f"data/chipseq_peaks/{algorithm}/{peak_calling_run}/peaks.bigbed" + d[key] = block + return d def peak_calling_dict(config, algorithm=None): """ @@ -60,11 +87,6 @@ def peak_calling_dict(config, algorithm=None): if key in d: raise ValueError("peak calling run '{0}' already defined".format(key)) - # If metadata key has been provided, then use that to populate the - # block as default values. - metadata = config['references'][config['organism']][config['aligner']['tag']].get('metadata', {}) - block.update(metadata) - d[key] = block return d @@ -139,7 +161,7 @@ def merged_input_for_ip(sampletable, merged_ip): ... input1 input s2cell-1 s2cell-input-1 ... input3 input s2cell-2 s2cell-input-3 ... input9 input s2cell-1 s2cell-input-1'''), - ... sep='\s+') + ... 
sep='\\s+') >>> merged_input_for_ip(df, 's2cell-gaf-1') diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 2b5fc485..24b09dec 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -1,181 +1,110 @@ import sys import os -from textwrap import dedent import yaml -import tempfile import pandas as pd -import numpy as np -import pybedtools -HERE = str(Path(workflow.snakefile).parent) -sys.path.insert(0, HERE + "/../..") -from lib import common, utils, helpers, aligners, chipseq -from lib.patterns_targets import ChIPSeqConfig +sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") +from lib import utils +from lib import chipseq from lib.utils import autobump, gb, hours -# ---------------------------------------------------------------------------- -# -# Search for the string "NOTE:" to look for points of configuration that might -# be helpful for your experiment. -# -# ---------------------------------------------------------------------------- -if not workflow.overwrite_configfiles: - configfile: 'config/config.yaml' +configfile: "config/config.yaml" -config = common.load_config(config) -include: '../references/Snakefile' +include: "../references/Snakefile" -# Verify configuration of config and sampletable files -helpers.preflight(config) -c = ChIPSeqConfig( - config, - config.get('patterns', 'config/chipseq_patterns.yaml') -) +REFERENCES = config.get("reference_dir", "../../references") +sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") +sampletable = sampletable.set_index(sampletable.columns[0], drop=False) +is_paired = utils.detect_layout(sampletable) == "PE" +is_sra = utils.detect_sra(sampletable) +n = ["1", "2"] if is_paired else ["1"] +SAMPLES = sampletable.iloc[:, 0].values +patterns = yaml.safe_load(open("config/chipseq_patterns.yaml"))["patterns_by_sample"] +peaks = chipseq.add_bams_to_peak_calling(config) -SAMPLES = c.sampletable.iloc[:, 0].values wildcard_constraints: - n = 
'[1,2]', - sample = '|'.join(SAMPLES) + n="[1,2]", + sample="|".join(SAMPLES), +localrules: + symlinks, + symlink_targets, -def wrapper_for(path): - return 'file:' + os.path.join('../..','wrappers', 'wrappers', path) - - -# ---------------------------------------------------------------------------- -# RULES -# ---------------------------------------------------------------------------- - - -# See "patterns and targets" in the documentation for what's going on here. -final_targets = utils.flatten(( - c.targets['bam'], - utils.flatten(c.targets['fastqc']), - [c.targets['fastq_screen']], - [c.targets['multiqc']], - utils.flatten(c.targets['markduplicates']), - utils.flatten(c.targets['bigwig']), - utils.flatten(c.targets['peaks']), - utils.flatten(c.targets['merged_techreps']), - utils.flatten(c.targets['fingerprint']), - utils.flatten(c.targets['bigbed']), - utils.flatten(c.targets['multibigwigsummary']), - utils.flatten(c.targets['plotcorrelation']), -)) - -if config.get('merged_bigwigs', None): - final_targets.extend(utils.flatten(c.targets['merged_bigwig'])) - - -def render_r1_r2(pattern): - return expand(pattern, sample='{sample}', n=c.n) - -def render_r1_only(pattern): - return expand(pattern, sample='{sample}', n=1) rule targets: - """ - Final targets to create - """ - input: final_targets - - -if 'orig_filename' in c.sampletable.columns: - - localrules: symlinks - - # Convert the sampletable to be indexed by the first column, for - # convenience in generating the input/output filenames. - _st = c.sampletable.set_index(c.sampletable.columns[0]) - - def orig_for_sample(wc): - """ - Given a sample, returns either one or two original fastq files - depending on whether the library was single- or paired-end. 
- """ - if c.is_paired: - return _st.loc[wc.sample, ['orig_filename', 'orig_filename_R2']] - return _st.loc[wc.sample, ['orig_filename']] + input: + patterns["multiqc"], + expand(patterns["bigwig"], label=sampletable.label), + [v["bed"] for k, v in peaks.items()], - rule symlinks: - """ - Symlinks files over from original filename - """ - input: - orig_for_sample - output: - render_r1_r2(c.patterns['fastq']) - threads: 1 - resources: - mem_mb=gb(1), - runtime=10, - run: - assert len(output) == len(input), (input, output) - for src, linkname in zip(input, output): - utils.make_relative_symlink(src, linkname) +if is_sra: + include: "../../rules/sra.smk" - rule symlink_targets: - input: c.targets['fastq'] +rule symlinks: + input: + lambda wc: ( + sampletable.loc[wc.sample, ["orig_filename", "orig_filename_R2"]] + if is_paired + else sampletable.loc[wc.sample, ["orig_filename"]] + ), + output: + expand(patterns["fastq"], n=n, allow_missing=True), + threads: 1 + resources: + mem_mb=100, + runtime=10, + run: + assert len(output) == len(input), (input, output) + for src, linkname in zip(input, output): + utils.make_relative_symlink(src, linkname) -if 'Run' in c.sampletable.columns and sum(c.sampletable['Run'].str.startswith('SRR')) > 0: - # Convert the sampletable to be indexed by the first column, for - # convenience in generating the input/output filenames. 
- _st = c.sampletable.set_index(c.sampletable.columns[0]) +rule symlink_targets: + input: + expand( + "data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=n + ), - rule fastq_dump: - output: - fastq=render_r1_r2(c.patterns['fastq']) - log: - render_r1_only(c.patterns['fastq'])[0] + '.log' - params: - is_paired=c.is_paired, - sampletable=_st, - # limit = 100000, # [TEST SETTINGS] - resources: - mem_mb=autobump(gb=8), - runtime=autobump(hours=2) - conda: - '../../wrappers/wrappers/fastq-dump/environment.yaml' - script: - wrapper_for('fastq-dump/wrapper.py') rule cutadapt: - """ - Run cutadapt - """ input: - fastq=render_r1_r2(c.patterns['fastq']) + fastq=expand(patterns["fastq"], n=n, allow_missing=True), output: - fastq=render_r1_r2(c.patterns['cutadapt']) - resources: - mem_mb=gb(2), - runtime=autobump(hours=2) + fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), log: - render_r1_r2(c.patterns['cutadapt'])[0] + '.log' + "data/chipseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", threads: 6 + resources: + mem_mb=gb(2), + runtime=autobump(hours=2), + params: + extra=( + ( + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + ) + + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " + if is_paired + else "" + ), run: - - # NOTE: Change cutadapt params here - if c.is_paired: + if is_paired: shell( "cutadapt " "-o {output[0]} " "-p {output[1]} " - "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " - '--nextseq-trim 20 ' - "--overlap 6 " - '-j {threads} ' - '--minimum-length 25 ' + "-j {threads} " + "{params.extra} " "{input.fastq[0]} " "{input.fastq[1]} " "&> {log}" @@ -184,67 +113,85 @@ rule cutadapt: shell( "cutadapt " "-o {output[0]} " - "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - '--nextseq-trim 20 ' - "--overlap 6 " - '-j {threads} ' - '--minimum-length 25 ' + "-j {threads} " + "{params.extra} " "{input.fastq[0]} " "&> {log}" ) rule fastqc: - """ - Run FastQC - 
""" input: - '{sample_dir}/{sample}/{sample}{suffix}' - threads: - 6 + "{sample_dir}/{sample}/{sample}{suffix}", + threads: 1 output: - html='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html', - zip='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip', + html="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html", + zip="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip", resources: - mem_mb=gb(2), - runtime=autobump(hours=2) - script: - wrapper_for('fastqc/wrapper.py') + mem_mb=gb(8), + runtime=autobump(hours=2), + log: + "{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.log", + run: + outdir = os.path.dirname(output.html) or "." + shell( + "fastqc " + "--noextract " + "--quiet " + "--outdir {outdir} " + "{input} " + "&> {log} " + ) + outfile = os.path.basename(input[0]) + for s in [".fastq", ".fq", ".gz", ".bam"]: + outfile = outfile.replace(s, "") + out_zip = os.path.join(outdir, outfile + "_fastqc.zip") + if not os.path.abspath(out_zip) == os.path.abspath(output.zip): + shell("mv {out_zip} {output.zip}") + out_html = os.path.join(outdir, outfile + "_fastqc.html") + if not os.path.abspath(out_html) == os.path.abspath(output.html): + shell("mv {out_html} {output.html}") rule bowtie2: - """ - Map reads with Bowtie2 - """ input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index=[c.refdict[c.organism][config['aligner']['tag']]['bowtie2']] + fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), + index=multiext( + f"{REFERENCES}/bowtie2/genome", + ".1.bt2", + ".2.bt2", + ".3.bt2", + ".4.bt2", + ".rev.1.bt2", + ".rev.2.bt2", + ".fa", + ), output: - bam=c.patterns['bam'] + bam=temporary(patterns["bam"]), log: - c.patterns['bam'] + '.log' + patterns["bam"] + ".log", threads: 16 resources: mem_mb=gb(32), - runtime=autobump(hours=2) + runtime=autobump(hours=2), + params: + extra="", run: - prefix = aligners.prefix_from_bowtie2_index(input.index) - sam = output.bam.replace('.bam', '.sam') - - if c.is_paired: - 
assert len(input.fastq) == 2 - fastqs = '-1 {0} -2 {1} '.format(*input.fastq) - else: - assert len(input.fastq) == 1 - fastqs = '-U {0} '.format(input.fastq) - + prefix = os.path.commonprefix(input.index).rstrip(".") + sam = output.bam.replace(".bam", ".sam") + fastqs = ( + f"-1 {input.fastq[0]} -2 {input.fastq[1]}" + if is_paired + else f"-U {input.fastq}" + ) shell( "bowtie2 " "-x {prefix} " "{fastqs} " - '--no-unal ' # NOTE: suppress unaligned reads + "--no-unal " "--threads {threads} " "-S {sam} " + "{params.extra} " "> {log} 2>&1" ) @@ -256,271 +203,164 @@ rule bowtie2: rule unique: - """ - Remove multimappers - """ input: - c.patterns['bam'] + patterns["bam"], output: - c.patterns['unique'] + patterns["unique"], threads: 1 resources: mem_mb=gb(1), - runtime=autobump(hours=2) - shell: + runtime=autobump(hours=2), + params: # NOTE: the quality score chosen here should reflect the scores output # by the aligner used. For example, STAR uses 255 as max mapping # quality. - 'samtools view -b -q 20 {input} > {output}' + extra="-q 20", + shell: + "samtools view -b {params.extra} {input} > {output}" rule fastq_count: - """ - Count reads in a FASTQ file - """ input: - fastq='{sample_dir}/{sample}/{sample}{suffix}.fastq.gz' + fastq="{sample_dir}/{sample}/{sample}{suffix}.fastq.gz", output: - '{sample_dir}/{sample}/{sample}{suffix}.fastq.gz.libsize' + "{sample_dir}/{sample}/{sample}{suffix}.fastq.gz.libsize", threads: 1 resources: mem_mb=gb(1), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'zcat {input} | echo $((`wc -l`/4)) > {output}' + "zcat {input} | echo $((`wc -l`/4)) > {output}" rule bam_count: - """ - Count reads in a BAM file - """ input: - bam='{sample_dir}/{sample}/{suffix}.bam' + bam="{sample_dir}/{sample}/{suffix}.bam", output: - '{sample_dir}/{sample}/{suffix}.bam.libsize' + "{sample_dir}/{sample}/{suffix}.bam.libsize", threads: 1 resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'samtools view -c 
{input} > {output}' + "samtools view -c {input} > {output}" rule bam_index: - """ - Index a BAM - """ input: - bam='{prefix}.bam' + bam="{prefix}.bam", output: - bai='{prefix}.bam.bai' + bai="{prefix}.bam.bai", threads: 1 resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'samtools index {input} {output}' - - -def fastq_screen_references(): - """ - Returns the Bowtie2 indexes for the configured references from the - `fastq_screen:` section of the config - """ - refs = {} - for i in config['fastq_screen']: - refs[i['label']] = c.refdict[i['organism']][i['tag']]['bowtie2'] - return refs - - -rule fastq_screen: - """ - Run fastq_screen to look for contamination from other genomes - """ - input: - **fastq_screen_references(), - fastq=render_r1_only(rules.cutadapt.output.fastq), - output: - txt=c.patterns['fastq_screen'] - log: - c.patterns['fastq_screen'] + '.log' - threads: 6 - resources: - mem_mb=autobump(gb=4), - runtime=autobump(hours=2) - params: subset=100000 - script: - wrapper_for('fastq_screen/wrapper.py') - - -multiqc_inputs = [ - utils.flatten(c.targets['fastqc']) + - utils.flatten(c.targets['cutadapt']) + - utils.flatten(c.targets['bam']) + - utils.flatten(c.targets['markduplicates']) + - utils.flatten(c.targets['fingerprint']) + - utils.flatten(c.targets['peaks']) + - utils.flatten(c.targets['fastq_screen']) + - utils.flatten(c.targets['plotcorrelation']) -] - -if c.is_paired: - multiqc_inputs.extend(utils.flatten(c.targets['collectinsertsizemetrics']['metrics'])) - -rule multiqc: - """ - Aggregate various QC stats and logs into a single HTML report with MultiQC - """ - # NOTE: if you add more rules and want MultiQC to pick up the output, best - # to add outputs from those rules to the inputs here. 
- input: - files=multiqc_inputs, - config='config/multiqc_config.yaml' - output: - c.targets['multiqc'] - log: - c.targets['multiqc'][0] + '.log' - threads: 1 - resources: - mem_mb=gb(2), - runtime=autobump(hours=2) - run: - analysis_directory = set([os.path.dirname(i) for i in input]) - outdir = os.path.dirname(c.targets['multiqc'][0]) - basename = os.path.basename(c.targets['multiqc'][0]) - shell( - 'LC_ALL=en_US.utf8 LC_LANG=en_US.utf8 ' - 'multiqc ' - '--quiet ' - '--outdir {outdir} ' - '--force ' - '--filename {basename} ' - '--config {input.config} ' - '{analysis_directory} ' - '&> {log} ' - ) + "samtools index {input} {output}" rule markduplicates: - """ - Mark or remove PCR duplicates with Picard MarkDuplicates - """ input: - bam=c.patterns['unique'] + bam=patterns["unique"], output: - bam=c.patterns['markduplicates']['bam'], - metrics=c.patterns['markduplicates']['metrics'] + bam=patterns["markduplicates"]["bam"], + metrics=patterns["markduplicates"]["metrics"], log: - c.patterns['markduplicates']['bam'] + '.log' + patterns["markduplicates"]["bam"] + ".log", threads: 1 resources: mem_mb=gb(32), runtime=autobump(hours=2), - disk_mb=gb(100) + disk_mb=gb(100), params: - # NOTE: Be careful with the memory here; make sure you have enough - # and/or it matches the resources you're requesting in the cluster - # config. 
- java_args='-Xmx20g' - # java_args='-Xmx2g' # [TEST SETTINGS -1] + java_args="-Xmx20g", # [disable for test] + # java_args='-Xmx2g' # [enable for test] shell: - 'picard ' - '{params.java_args} ' - 'MarkDuplicates ' - 'INPUT={input.bam} ' - 'OUTPUT={output.bam} ' - 'REMOVE_DUPLICATES=true ' - 'METRICS_FILE={output.metrics} ' - 'VALIDATION_STRINGENCY=LENIENT ' - '&> {log}' + "picard " + "{params.java_args} " + "MarkDuplicates " + "INPUT={input.bam} " + "OUTPUT={output.bam} " + "REMOVE_DUPLICATES=true " + "METRICS_FILE={output.metrics} " + "VALIDATION_STRINGENCY=LENIENT " + "&> {log}" rule merge_techreps: - """ - Technical replicates are merged and then re-deduped. - - If there's only one technical replicate, its unique, nodups bam is simply - symlinked. - """ input: lambda wc: expand( - c.patterns['markduplicates']['bam'], - sample=common.get_techreps(c.sampletable, wc.label), - ) + patterns["markduplicates"]["bam"], + sample=utils.get_techreps(sampletable, wc.label), + ), output: - bam=c.patterns['merged_techreps'], - metrics=c.patterns['merged_techreps'] + '.metrics' + bam=patterns["merged_techreps"], + metrics=patterns["merged_techreps"] + ".metrics", log: - c.patterns['merged_techreps'] + '.log' + patterns["merged_techreps"] + ".log", threads: 1 resources: mem_mb=gb(32), runtime=autobump(hours=2), disk_mb=gb(100), params: - # NOTE: Be careful with the memory here; make sure you have enough - # and/or it matches the resources you're requesting in the cluster - # config. 
- java_args='-Xmx32g' - # java_args='-Xmx2g' # [TEST SETTINGS -1] + java_args="-Xmx32g", # [disable for test] + # java_args='-Xmx2g' # [enable for test] script: - wrapper_for('combos/merge_and_dedup/wrapper.py') + "../../scripts/merge_and_dedup.py" + + +if is_paired: -if c.is_paired: rule collectinsertsizemetrics: input: - bam=c.patterns['markduplicates']['bam'], + bam=patterns["markduplicates"]["bam"], output: - pdf=c.patterns['collectinsertsizemetrics']['pdf'], - metrics=c.patterns['collectinsertsizemetrics']['metrics'] + pdf=patterns["collectinsertsizemetrics"]["pdf"], + metrics=patterns["collectinsertsizemetrics"]["metrics"], log: - c.patterns['collectinsertsizemetrics']['metrics'] + '.log' + patterns["collectinsertsizemetrics"]["metrics"] + ".log", threads: 1 resources: mem_mb=gb(32), - runtime=autobump(hours=2) + runtime=autobump(hours=2), params: - java_args='-Xmx20g' - # java_args='-Xmx2g' # [TEST SETTINGS -1] + java_args="-Xmx20g", # [disable for test] + # java_args='-Xmx2g' # [enable for test] shell: - 'picard ' - '{params.java_args} ' - 'CollectInsertSizeMetrics ' - 'I={input.bam} ' - 'O={output.metrics} ' - 'H={output.pdf} ' - '&> {log} ' + "picard " + "{params.java_args} " + "CollectInsertSizeMetrics " + "I={input.bam} " + "O={output.metrics} " + "H={output.pdf} " + "&> {log} " -rule bigwig: - """ - Create a bigwig. - See note below about normalizing! 
- """ +rule bigwig: input: - bam=c.patterns['merged_techreps'], - bai=c.patterns['merged_techreps'] + '.bai', + bam=patterns["merged_techreps"], + bai=patterns["merged_techreps"] + ".bai", output: - c.patterns['bigwig'] + patterns["bigwig"], log: - c.patterns['bigwig'] + '.log' + patterns["bigwig"] + ".log", threads: 1 resources: mem_mb=gb(16), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'bamCoverage ' - '--bam {input.bam} ' - '-o {output} ' - '-p {threads} ' - '--minMappingQuality 20 ' - '--ignoreDuplicates ' + "bamCoverage " + "--bam {input.bam} " + "-o {output} " + "-p {threads} " + "--minMappingQuality 20 " + "--ignoreDuplicates " # Can't use the CPM normalization for testing due to <1000 reads total # in example data; keep uncommented when running in production - # [TEST SETTINGS +1] - '--normalizeUsing CPM ' - '--extendReads 300 ' - '&> {log}' + "--normalizeUsing CPM " # [disable for test] + "--extendReads 300 " + "&> {log}" rule fingerprint: @@ -531,175 +371,114 @@ rule fingerprint: Note: uses the merged techreps. 
""" input: - bams=lambda wc: expand(c.patterns['merged_techreps'], label=wc.ip_label), - control=lambda wc: expand(c.patterns['merged_techreps'], label=chipseq.merged_input_for_ip(c.sampletable, wc.ip_label)), - bais=lambda wc: expand(c.patterns['merged_techreps'] + '.bai', label=wc.ip_label), - control_bais=lambda wc: expand(c.patterns['merged_techreps'] + '.bai', label=chipseq.merged_input_for_ip(c.sampletable, wc.ip_label)), + bams=lambda wc: expand(patterns["merged_techreps"], label=wc.ip_label), + control=lambda wc: expand( + patterns["merged_techreps"], + label=chipseq.merged_input_for_ip(sampletable, wc.ip_label), + ), + bais=lambda wc: expand(patterns["merged_techreps"] + ".bai", label=wc.ip_label), + control_bais=lambda wc: expand( + patterns["merged_techreps"] + ".bai", + label=chipseq.merged_input_for_ip(sampletable, wc.ip_label), + ), output: - plot=c.patterns['fingerprint']['plot'], - raw_counts=c.patterns['fingerprint']['raw_counts'], - metrics=c.patterns['fingerprint']['metrics'] + plot=patterns["fingerprint"]["plot"], + raw_counts=patterns["fingerprint"]["raw_counts"], + metrics=patterns["fingerprint"]["metrics"], threads: 8 - log: c.patterns['fingerprint']['metrics'] + '.log' + log: + patterns["fingerprint"]["metrics"] + ".log", threads: 1 resources: mem_mb=gb(32), - runtime=autobump(hours=2) + runtime=autobump(hours=2), run: if len(input.control) == 0: jsdsample_arg = "" else: - jsdsample_arg = '--JSDsample ' + str(input.control) + jsdsample_arg = "--JSDsample " + str(input.control) shell( - 'plotFingerprint ' '--bamfiles {input.bams} ' - '-p {threads} ' + "plotFingerprint " + "--bamfiles {input.bams} " + "-p {threads} " # The JSDsample argument is disabled for testing as it dramatically # increases the run time. 
- # [TEST SETTINGS +1] - '{jsdsample_arg} ' - '--smartLabels ' - '--extendReads=300 ' - '--skipZeros ' - '--outQualityMetrics {output.metrics} ' - '--outRawCounts {output.raw_counts} ' - '--plotFile {output.plot} ' + "{jsdsample_arg} " # [disable for test] + "--smartLabels " + "--extendReads=300 " + "--skipZeros " + "--outQualityMetrics {output.metrics} " + "--outRawCounts {output.raw_counts} " + "--plotFile {output.plot} " # Default is 500k; use fewer to speed up testing: - # '--numberOfSamples 50 ' # [TEST SETTINGS ] - '&> {log} ' + # '--numberOfSamples 50 ' # [enable for test] + "&> {log} " '&& sed -i "s/NA/0.0/g" {output.metrics} ' ) -rule sicer: - """ - Run the SICER peak caller - """ - input: - ip=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.sicer_run, 'sicer', 'ip'), - ), - control=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.sicer_run, 'sicer', 'control'), - ), - chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'], - output: - bed=c.patterns['peaks']['sicer'] - log: - c.patterns['peaks']['sicer'] + '.log' - resources: - mem_mb=gb(16), - runtime=autobump(hours=2) - params: - block=lambda wc: chipseq.block_for_run(config, wc.sicer_run, 'sicer') - wrapper: - wrapper_for('sicer') rule macs2: """ Run the macs2 peak caller """ input: - ip=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.macs2_run, 'macs2', 'ip'), - ), - control=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.macs2_run, 'macs2', 'control'), - ), - chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'], + ip=lambda wc: expand( + patterns["merged_techreps"], + label=chipseq.samples_for_run(config, wc.macs2_run, "macs2", "ip"), + ), + control=lambda wc: expand( + patterns["merged_techreps"], + label=chipseq.samples_for_run(config, wc.macs2_run, "macs2", "control"), + 
), + chromsizes=rules.chromsizes.output, output: - bed=c.patterns['peaks']['macs2'] + bed=patterns["peaks"]["macs2"], resources: mem_mb=gb(16), - runtime=autobump(hours=2) + runtime=autobump(hours=2), log: - c.patterns['peaks']['macs2'] + '.log' + patterns["peaks"]["macs2"] + ".log", params: - block=lambda wc: chipseq.block_for_run(config, wc.macs2_run, 'macs2') - wrapper: - wrapper_for('macs2/callpeak') + block=lambda wc: chipseq.block_for_run(config, wc.macs2_run, "macs2"), + script: + "../../scripts/macs2_callpeak.py" + -# Epic2 peak caller -# See https://github.com/biocore-ntnu/epic2 rule epic2: """ Run the epic2 peak caller """ input: - ip=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.epic2_run, 'epic2', 'ip'), - ), - control=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.epic2_run, 'epic2', 'control'), - ), - bai=lambda wc: # epic2 requires both .bam and .bam.bai (bam index) files (.bam.bai is not explicitly) - expand( - c.patterns['merged_techreps'] + '.bai', - label=chipseq.samples_for_run(config, wc.epic2_run, 'epic2', 'ip'), - ), - chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'] + ip=lambda wc: expand( + patterns["merged_techreps"], + label=chipseq.samples_for_run(config, wc.epic2_run, "epic2", "ip"), + ), + control=lambda wc: expand( + patterns["merged_techreps"], + label=chipseq.samples_for_run(config, wc.epic2_run, "epic2", "control"), + ), + bai=lambda wc: expand( + patterns["merged_techreps"] + ".bai", + label=chipseq.samples_for_run(config, wc.epic2_run, "epic2", "ip"), + ) + + expand( + patterns["merged_techreps"] + ".bai", + label=chipseq.samples_for_run(config, wc.epic2_run, "epic2", "control"), + ), + chromsizes=rules.chromsizes.output, output: - bed=c.patterns['peaks']['epic2'] + bed=patterns["peaks"]["epic2"], resources: mem_mb=gb(16), - runtime=autobump(hours=2) - log: - c.patterns['peaks']['epic2'] + '.log' - params: - 
block=lambda wc: chipseq.block_for_run(config, wc.epic2_run, 'epic2'), - is_paired=c.is_paired - wrapper: - wrapper_for('epic2') - - -rule spp: - """ - Run the SPP peak caller - """ - input: - ip=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.spp_run, 'spp', 'ip'), - ), - control=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.spp_run, 'spp', 'control'), - ), - chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'], - output: - bed=c.patterns['peaks']['spp'], - enrichment_estimates=c.patterns['peaks']['spp'] + '.est.wig', - smoothed_enrichment_mle=c.patterns['peaks']['spp'] + '.mle.wig', - rdata=c.patterns['peaks']['spp'] + '.RData' + runtime=autobump(hours=2), log: - c.patterns['peaks']['spp'] + '.log' - resources: - mem_mb=gb(16), - runtime=autobump(hours=2) + patterns["peaks"]["epic2"] + ".log", params: - block=lambda wc: chipseq.block_for_run(config, wc.spp_run, 'spp'), - keep_tempfiles=False, - # NOTE: Be careful with the memory here; make sure you have enough - # and/or it matches the resources you're requesting in the cluster - # config. - java_args='-Xmx24g', - # java_args='-Xmx2g', # [TEST SETTINGS -1] - threads: 2 - wrapper: - wrapper_for('spp') + block=lambda wc: chipseq.block_for_run(config, wc.epic2_run, "epic2"), + is_paired=is_paired, + script: + "../../scripts/epic2.py" rule bed_to_bigbed: @@ -707,59 +486,17 @@ rule bed_to_bigbed: Convert BED to bigBed """ input: - bed='{prefix}.bed', - chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'] - output: '{prefix}.bigbed' + bed="{prefix}.bed", + chromsizes=rules.chromsizes.output, + output: + "{prefix}.bigbed", resources: mem_mb=gb(2), - runtime=autobump(hours=2) - log: '{prefix}.bigbed.log' - run: - # Based on the filename, identify the algorithm. Based on the contents, - # identify the format. 
- algorithm = os.path.basename(os.path.dirname(input.bed)) - kind = chipseq.detect_peak_format(input.bed) - - # bedToBigBed doesn't handle zero-size files - # bigbed is not created from epic2-generated peaks - if os.stat(input.bed).st_size == 0: - shell("touch {output}") - elif kind == 'narrowPeak': - _as = '../../include/autosql/bigNarrowPeak.as' - _type = 'bed6+4' - names=[ - 'chrom', 'chromStart', 'chromEnd', 'name', 'score', - 'strand', 'signalValue', 'pValue', 'qValue', 'peak'] - elif kind == 'broadPeak': - _as = '../../include/autosql/bigBroadPeak.as' - _type = 'bed6+3' - names=[ - 'chrom', 'chromStart', 'chromEnd', 'name', 'score', - 'strand', 'signalValue', 'pValue', 'qValue'] - elif kind == 'epic2Input': - _as = f'../../include/autosql/{kind}Peak.as' - _type = 'bed6+4' - names=[ - 'chrom', 'chromStart', 'chromEnd', 'pValue', 'score', - 'strand', 'ChIPCount', 'InputCount', 'FDR', 'log2FoldChange'] - elif kind == 'epic2NoInput': - _as = f'../../include/autosql/{kind}Peak.as' - _type = 'bed6' - names=[ - 'chrom', 'chromStart', 'chromEnd', 'ChIPCount', 'score', - 'strand'] - else: - raise ValueError("Unhandled format for {0}".format(input.bed)) - - df = pd.read_table(input.bed, index_col=False, names=names) - df['score'] = df['score'] - df['score'].min() - df['score'] = (df['score'] / df['score'].max()) * 1000 - df['score'] = df['score'].replace([np.inf, -np.inf], np.nan).fillna(0) - df['score'] = df['score'].astype(int) - df.to_csv(output[0] + '.tmp', sep='\t', index=False, header=False) - - shell('bedToBigBed -as={_as} -type={_type} {output}.tmp {input.chromsizes} {output} &> {log}') - shell('rm {output}.tmp') + runtime=autobump(hours=2), + log: + "{prefix}.bigbed.log", + script: + "../../scripts/bed_to_bigbed.py" rule multibigwigsummary: @@ -767,25 +504,25 @@ rule multibigwigsummary: Summarize the bigWigs across genomic bins """ input: - c.targets['bigwig'] + expand(patterns["bigwig"], label=sampletable.label), output: - 
npz=c.targets['multibigwigsummary']['npz'], - tab=c.targets['multibigwigsummary']['tab'] + npz=patterns["multibigwigsummary"]["npz"], + tab=patterns["multibigwigsummary"]["tab"], threads: 16 resources: mem_mb=gb(16), - runtime=autobump(hours=2) + runtime=autobump(hours=2), run: # from the input files, figure out the sample name. - labels = ' '.join([i.split('/')[-2] for i in input]) + labels = " ".join([i.split("/")[-2] for i in input]) shell( - 'multiBigwigSummary ' - 'bins ' - '-b {input} ' - '--labels {labels} ' - '--numberOfProcessors {threads} ' - '-out {output.npz} ' - '--outRawCounts {output.tab}' + "multiBigwigSummary " + "bins " + "-b {input} " + "--labels {labels} " + "--numberOfProcessors {threads} " + "-out {output.npz} " + "--outRawCounts {output.tab}" ) @@ -794,22 +531,21 @@ rule plotcorrelation: Plot a heatmap of correlations across all samples """ input: - c.targets['multibigwigsummary']['npz'] + patterns["multibigwigsummary"]["npz"], output: - heatmap=c.targets['plotcorrelation']['heatmap'], - tab=c.targets['plotcorrelation']['tab'] + heatmap=patterns["plotcorrelation"]["heatmap"], + tab=patterns["plotcorrelation"]["tab"], resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'plotCorrelation ' - '--corData {input} ' - '--corMethod spearman ' - '--whatToPlot heatmap ' - '--plotFile {output.heatmap} ' - '--colorMap Reds ' - '--outFileCorMatrix {output.tab}' - + "plotCorrelation " + "--corData {input} " + "--corMethod spearman " + "--whatToPlot heatmap " + "--plotFile {output.heatmap} " + "--colorMap Reds " + "--outFileCorMatrix {output.tab}" # NOTE: if you're expecting negative correlation, try a divergent # colormap and setting the min/max to ensure that the colomap is # centered on zero: @@ -817,45 +553,88 @@ rule plotcorrelation: # '--zMin -1 ' # '--zMax 1 ' -if 'merged_bigwigs' in config: - rule merge_bigwigs: - """ - Merge together bigWigs as specified in the config ("merged_bigwigs" - section). 
- """ - input: - bigwigs=lambda wc: expand( - c.patterns['bigwig'], - label=config['merged_bigwigs'][wc.merged_bigwig_label], - ), - chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'], - output: - c.patterns['merged_bigwig'] - resources: - mem_mb=gb(16), - runtime=autobump(hours=2) - log: - c.patterns['merged_bigwig'] + '.log' - script: - wrapper_for('average-bigwigs/wrapper.py') rule idxstats: - """ - Run samtools idxstats on sample bams - """ input: - bam=c.patterns['markduplicates']['bam'], - bai=c.patterns['markduplicates']['bam'] + '.bai' + bam=patterns["markduplicates"]["bam"], + bai=patterns["markduplicates"]["bam"] + ".bai", output: - txt=c.patterns['samtools']['idxstats'] + txt=patterns["samtools"]["idxstats"], resources: mem_mb=gb(16), - runtime=autobump(hours=2) - log: - c.patterns['samtools']['idxstats'] + '.log' + runtime=autobump(hours=2), + log: + patterns["samtools"]["idxstats"] + ".log", + shell: + "samtools idxstats {input.bam} 2> {log} 1> {output.txt}" + + +rule flagstat: + input: + bam=patterns["markduplicates"]["bam"], + bai=patterns["markduplicates"]["bam"] + ".bai", + output: + patterns["samtools"]["flagstat"], + log: + patterns["samtools"]["flagstat"] + ".log", + shell: + "samtools flagstat {input.bam} > {output}" + + +rule samtools_stats: + input: + bam=patterns["markduplicates"]["bam"], + bai=patterns["markduplicates"]["bam"] + ".bai", + output: + patterns["samtools"]["stats"], + log: + patterns["samtools"]["stats"] + ".log", + shell: + "samtools stats {input.bam} > {output}" + + +rule multiqc: + input: + expand(patterns["bam"], sample=SAMPLES), + expand(patterns["fastqc"]["raw"], sample=SAMPLES), + expand(patterns["fastqc"]["cutadapt"], sample=SAMPLES), + expand(patterns["fastqc"]["bam"], sample=SAMPLES), + expand(patterns["bigwig"], label=sampletable.label), + expand(patterns["samtools"]["idxstats"], sample=SAMPLES), + expand(patterns["samtools"]["flagstat"], sample=SAMPLES), + expand(patterns["samtools"]["stats"], 
sample=SAMPLES), + expand(patterns["merged_techreps"], label=sampletable.label), + expand( + patterns["fingerprint"]["metrics"], + ip_label=sampletable.loc[sampletable.antibody != "input", "label"], + ), + expand(patterns["collectinsertsizemetrics"], sample=SAMPLES) + if is_paired + else [], + [v["bigbed"] for v in peaks.values()], + patterns["multibigwigsummary"]["tab"], + patterns["plotcorrelation"]["tab"], + config="config/multiqc_config.yaml", + output: + patterns["multiqc"], + log: + patterns["multiqc"] + ".log", + threads: 1 + resources: + mem_mb=gb(2), + runtime=autobump(hours=2), run: + analysis_directory = "data" + outdir = os.path.dirname(output[0]) + basename = os.path.basename(output[0]) shell( - 'samtools idxstats {input.bam} 2> {log} 1> {output.txt}' + "LC_ALL=en_US.utf8 LC_LANG=en_US.utf8 " + "multiqc " + "--quiet " + "--outdir {outdir} " + "--force " + "--filename {basename} " + "--config {input.config} " + "{analysis_directory} " + "&> {log} " ) - -# vim: ft=python diff --git a/workflows/chipseq/config/chipseq_patterns.yaml b/workflows/chipseq/config/chipseq_patterns.yaml index 3e44107a..90b511c9 100644 --- a/workflows/chipseq/config/chipseq_patterns.yaml +++ b/workflows/chipseq/config/chipseq_patterns.yaml @@ -49,7 +49,15 @@ patterns_by_sample: metrics: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics' samtools: - idxstats: 'data/rnaseq_samples/{sample}/idxstat_{sample}.txt' + idxstats: 'data/rnaseq_samples/{sample}/samtools_idxstats_{sample}.txt' + flagstat: 'data/rnaseq_samples/{sample}/samtools_flagstat_{sample}.txt' + stats: 'data/rnaseq_samples/{sample}/samtools_stats_{sample}.txt' + + peaks: + macs2: 'data/chipseq_peaks/macs2/{macs2_run}/peaks.bed' + spp: 'data/chipseq_peaks/spp/{spp_run}/peaks.bed' + sicer: 'data/chipseq_peaks/sicer/{sicer_run}/peaks.bed' + epic2: 'data/chipseq_peaks/epic2/{epic2_run}/peaks.bed' patterns_by_peaks: peaks: diff --git a/workflows/chipseq/config/config.yaml 
b/workflows/chipseq/config/config.yaml index 591fe13b..a8d10142 100644 --- a/workflows/chipseq/config/config.yaml +++ b/workflows/chipseq/config/config.yaml @@ -39,33 +39,10 @@ chipseq: # merging step of the workflow merges and de-dupes appropriately so that the # peak callers only see BAMs with all duplicates removed. # - # The "extra" block is used to pass extra information to the peak-caller in - # a run-specific manner. Check the wrapper README for details on this. For - # example, the macs2 wrapper passes `extra` verbatim to the command line, but - # the spp wrapper handles things differently. - # # Each wrapper is built to accept either single or multiple BAMs and output # at least a BED file of peaks. # peak_calling: - - label: gaf-embryo-sicer - algorithm: sicer - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - redundancy_threshold: 1 - window_size: 200 - fragment_size: 150 - # optional user-specified override mappable genome proportion if - # specified here, SICER will use this value instead of the value specific - # to the genome build if NOT specified here, SICER will use the - # mappability value for your genome build - effective_genome_fraction: 0.75 - genome_build: dm6 - gap_size: 600 - fdr: 0.01 - - label: gaf-embryo-1 algorithm: macs2 @@ -80,23 +57,6 @@ chipseq: effective_genome_count: 7e7 extra: '--nomodel --extsize 147' - - label: gaf-embryo-1 - algorithm: spp - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - extra: - fdr: 0.3 - zthr: 4 - - - label: gaf-embryo-1-defaults - algorithm: spp - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - - label: gaf-wingdisc-pooled algorithm: macs2 ip: @@ -107,17 +67,6 @@ chipseq: - input-wingdisc-2 extra: '--nomodel --extsize 147' - - label: gaf-wingdisc-pooled - algorithm: spp - ip: - - gaf-wingdisc-1 - - gaf-wingdisc-2 - control: - - input-wingdisc-1 - # - input-wingdisc-2 - extra: - fdr: 0.5 - zthr: 4 - label: gaf-wingdisc-pooled-1 algorithm: epic2 From 413620746f8687399a12b800be4fca3076ca5406 
Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:29:59 -0500 Subject: [PATCH 054/196] clean up some tests --- lib/patterns_targets.py | 256 ------------------ .../{star_1pass.tsv => hisat2.tsv} | 0 test/test_configs/override.yaml | 14 - 3 files changed, 270 deletions(-) delete mode 100644 lib/patterns_targets.py rename test/test_configs/{star_1pass.tsv => hisat2.tsv} (100%) delete mode 100644 test/test_configs/override.yaml diff --git a/lib/patterns_targets.py b/lib/patterns_targets.py deleted file mode 100644 index 08fedb26..00000000 --- a/lib/patterns_targets.py +++ /dev/null @@ -1,256 +0,0 @@ -""" -This module handles the reading and filling-in of patterns. It can be used from -within Snakefiles or in downstream (figure-making) scripts. -""" - -import os -import collections -import yaml -from . import utils -from . import chipseq -from snakemake.io import expand - -HERE = os.path.abspath(os.path.dirname(__file__)) - -# Note: when adding support for new peak callers, add them here. -PEAK_CALLERS = ['macs2', 'spp', 'sicer', 'epic2'] - - -def update_recursive(d, u): - """ - Update dictionary `d` with items in dictionary `u`, recursively - """ - for k, v in u.items(): - if isinstance(v, collections.abc.Mapping): - d[k] = update_recursive(d.get(k, {}), v) - else: - d[k] = v - return d - - -class SeqConfig(object): - def __init__(self, config, patterns, workdir=None): - """ - This class takes care of common tasks related to config and patterns - files (reading the sampletable, etc) but is intended to be subclassed. - - Parameters - ---------- - config : str or dict - - patterns : str - Path to patterns YAML file - - workdir : str - Config, patterns, and all paths in `config` should be interpreted - as relative to `workdir` - """ - self.path = None - self.workdir = '.' 
- if workdir is not None: - config = os.path.join(workdir, config) - patterns = os.path.join(workdir, patterns) - self.workdir = workdir - - self.config = config - - stranded = self.config.get('stranded', None) - self.stranded = None - if stranded: - if stranded in ('unstranded'): - self.stranded = 'unstranded' - elif stranded in ('fr-firststrand', 'ISR', 'SR', 'reverse'): - self.stranded = 'fr-firststrand' - elif stranded in ('fr-secondstrand', 'ISF', 'SF', 'forward'): - self.stranded = 'fr-secondstrand' - - # Read the config file and extract all sort of useful bits. This mostly - # uses the `common` module to handle the details. - self.samples, self.sampletable = utils.get_sampletable(self.config) - self.patterns = yaml.load(open(patterns), Loader=yaml.FullLoader) - self.is_paired = utils.detect_layout(self.sampletable) == 'PE' - if self.is_paired: - self.n = [1, 2] - else: - self.n = [1] - if 'Run' in self.sampletable.columns and sum(self.sampletable['Run'].str.startswith('SRR')) > 0: - self.is_sra = True - else: - self.is_sra = False - - ##########################utils.preflight(self.config) - -class RNASeqConfig(SeqConfig): - def __init__(self, config, patterns, workdir=None): - """ - Config object specific to RNA-seq workflows. - - Fills in patterns to create targets by handling the by-sample and - by-aggregate sections separately. 
- - Parameters - ---------- - - config : dict - - patterns : str - Path to patterns YAML file - - workdir : str - Config, patterns, and all paths in `config` should be interpreted - as relative to `workdir` - """ - SeqConfig.__init__(self, config, patterns, workdir) - - self.fill = dict(sample=self.samples, n=self.n) - self.patterns_by_aggregation = self.patterns.pop('patterns_by_aggregate', None) - self.targets = utils.fill_patterns(self.patterns, self.fill) - - # If the sampletable is from an sra metadata table, then we need to set the value of - # 'orig_filename' for each of the samples to where the fastq was downloaded - if self.is_sra: - self.sampletable['orig_filename'] = expand(self.patterns["sra_fastq"], sample=self.samples, n=1) - if self.is_paired: - self.sampletable['orig_filename_R2'] = expand(self.patterns["sra_fastq"], sample=self.samples, n=2) - - # Then the aggregation - if self.patterns_by_aggregation is not None and 'merged_bigwigs' in self.config: - self.fill_by_aggregation = dict( - merged_bigwig_label=self.config['merged_bigwigs'].keys(), - ) - self.targets_by_aggregation = utils.fill_patterns( - self.patterns_by_aggregation, - self.fill_by_aggregation - ) - self.targets.update(self.targets_by_aggregation) - self.patterns.update(self.patterns_by_aggregation) - - #########################utils.rnaseq_preflight(self) - - -class ChIPSeqConfig(SeqConfig): - def __init__(self, config, patterns, workdir=None): - """ - Config object specific to ChIP-seq workflows. - - Fills in patterns to create targets by handling the by-sample, by-peak, - and by-aggregate sections separately. 
- - Parameters - ---------- - - config : dict - - patterns : str - Path to patterns YAML file - - workdir : str - Config, patterns, and all paths in `config` should be interpreted - as relative to `workdir` - """ - SeqConfig.__init__(self, config, patterns, workdir) - - self.targets = {} - - # For ChIP-seq, the structure of the patterns is quite different for - # samples than it is for peaks. For example, the peaks do not have any - # sample info in the filenames but aggregate possibly many different samples - # - # So construct them separately, and then later update self.patterns and - # self.targets. - # - # The averaged bigwigs are also aggregated, but in a different way. - # They will be handled separately. - # - # First, the samples... - self.patterns_by_sample = self.patterns['patterns_by_sample'] - self.fill_by_sample = dict( - n=self.n, - sample=self.samples.values, - label=self.sampletable.label.values, - ip_label=self.sampletable.label[ - self.sampletable.antibody != 'input'].values - ) - self.targets_by_sample = utils.fill_patterns( - self.patterns_by_sample, self.fill_by_sample) - - self.targets.update(self.targets_by_sample) - self.patterns.update(self.patterns_by_sample) - - # Then the aggregation... - self.patterns_by_aggregation = self.patterns.pop('patterns_by_aggregate', None) - if self.patterns_by_aggregation is not None and 'merged_bigwigs' in self.config: - self.fill_by_aggregation = dict( - merged_bigwig_label=self.config['merged_bigwigs'].keys(), - ) - self.targets_by_aggregation = utils.fill_patterns( - self.patterns_by_aggregation, - self.fill_by_aggregation - ) - self.targets.update(self.targets_by_aggregation) - self.patterns.update(self.patterns_by_aggregation) - - # Then the peaks... - # - - self.patterns_by_peaks = self.patterns['patterns_by_peaks'] - self.targets_for_peaks = {} - - # We need to fill in just those peak-calling runs that are specified - # for each peak-caller. 
For reference, here's an example - # `patterns_by_peaks` from the YAML: - # - # peaks: - # macs2: '{peak_calling}/macs2/{macs2_run}/peaks.bed' - # spp: '{peak_calling}/spp/{spp_run}/peaks.bed' - # bigbed: - # macs2: '{peak_calling}/macs2/{macs2_run}/peaks.bigbed' - # spp: '{peak_calling}/spp/{spp_run}/peaks.bigbed' - - - # Also note that the snakefile's all rule uses - # utils.flatten(c.targets['peaks']), but in the case where no - # peak-calling runs are specified these should be initialized, - # otherwise we'll get a KeyError. - self.targets['peaks'] = [] - self.targets['bigbed'] = [] - - for pc in PEAK_CALLERS: - # Extract out just the subset of `patterns_by_peaks` for this - # peak-caller e.g., from the example above, if pc='macs2' this - # would only be: - # - # peaks: - # macs2: '{peak_calling}/macs2/{macs2_run}/peaks.bed' - # bigbed: - # macs2: '{peak_calling}/macs2/{macs2_run}/peaks.bigbed' - # - _peak_patterns = { - k: {pc: v[pc]} for k, v in self.patterns_by_peaks.items() - } - - - # Fix for issue #166, which was caused by commit 8a211122: - # - # If no runs for the peak-caller are configured, this will be - # empty and we should continue on. - peaks_to_fill = list(chipseq.peak_calling_dict(self.config, algorithm=pc).keys()) - - if not peaks_to_fill: - continue - - _fill = {pc + '_run': peaks_to_fill} - - # The trick here is the recursive updating of targets_for_peaks. - # We're adding the filled-in runs of each peak caller to the - # targets as they're built. 
- update_recursive( - self.targets_for_peaks, - utils.fill_patterns(_peak_patterns, _fill) - ) - - - self.targets.update(self.targets_for_peaks) - self.patterns.update(self.patterns_by_peaks) - - utils.chipseq_preflight(self) diff --git a/test/test_configs/star_1pass.tsv b/test/test_configs/hisat2.tsv similarity index 100% rename from test/test_configs/star_1pass.tsv rename to test/test_configs/hisat2.tsv diff --git a/test/test_configs/override.yaml b/test/test_configs/override.yaml deleted file mode 100644 index bd05a925..00000000 --- a/test/test_configs/override.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Due to the way Snakemake recursively merges config items, we need to -# recursively reset this dictonary to override the default one in order to -# allow arbitrary other sample names. -# -# Use it like this -# -# snakemake --configfile ../../test/override.yaml --config sampletable=/path/to/tsv -# -merged_bigwigs: - control_pos: - pos: [] - treatment_all: - pos: [] - neg: [] From d7bb4924773b29cd987ee6fb45c0e44831387aae Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:30:18 -0500 Subject: [PATCH 055/196] convert rrna table to script --- scripts/rrna_libsizes_table.py | 58 ++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 scripts/rrna_libsizes_table.py diff --git a/scripts/rrna_libsizes_table.py b/scripts/rrna_libsizes_table.py new file mode 100644 index 00000000..f71d48bc --- /dev/null +++ b/scripts/rrna_libsizes_table.py @@ -0,0 +1,58 @@ +""" +Prepares a TSV and JSON file for multiqc to pick up and display as a sortable +table +""" +import sys +import os +import pandas as pd +import yaml + +sys.path.insert(0, os.path.dirname(__file__) + "/..") +from lib import utils + + +def rrna_sample(f): + return utils.extract_wildcards(snakemake.config["patterns"]["rrna"]["libsize"], f)["sample"] + + +def sample(f): + return utils.extract_wildcards(snakemake.config["patterns"]["libsizes"]["cutadapt"], f)["sample"] + + +def 
million(f): + return float(open(f).read()) / 1e6 + + +rrna = sorted(snakemake.input.rrna, key=rrna_sample) +fastq = sorted(snakemake.input.fastq, key=sample) +samples = list(map(rrna_sample, rrna)) +rrna_m = list(map(million, rrna)) +fastq_m = list(map(million, fastq)) + +df = pd.DataFrame( + dict( + sample=samples, + million_reads_rRNA=rrna_m, + million_reads_fastq=fastq_m, + ) +) +df = df.set_index("sample") +df["rRNA_percentage"] = df.million_reads_rRNA / df.million_reads_fastq * 100 + +df[["million_reads_fastq", "million_reads_rRNA", "rRNA_percentage"]].to_csv( + snakemake.output.tsv, sep="\t" +) +y = { + "id": "rrna_percentages_table", + "section_name": "rRNA content", + "description": "Amount of reads mapping to rRNA sequence", + "plot_type": "table", + "pconfig": { + "id": "rrna_percentages_table_table", + "title": "rRNA content table", + "min": 0, + }, + "data": yaml.load(df.transpose().to_json(), Loader=yaml.FullLoader), +} +with open(snakemake.output.json, "w") as fout: + yaml.dump(y, fout, default_flow_style=False) From 66f5a11b4d9028a099d74e5efa8eb23c4aa41842 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:30:36 -0500 Subject: [PATCH 056/196] fix test on preprocessor --- ci/preprocessor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/preprocessor.py b/ci/preprocessor.py index 6bf05361..1cd7e5da 100644 --- a/ci/preprocessor.py +++ b/ci/preprocessor.py @@ -49,8 +49,8 @@ def uncomment_line(line): >>> assert uncomment_line('# asdf') == ' asdf' >>> assert uncomment_line(' # asdf') == ' asdf' >>> assert uncomment_line('do nothing') == 'do nothing' - >>> assert uncomment_line('do nothing # [disable for test]') == 'do nothing # [disable for test]') - >>> assert uncomment_line('#uncomment # [disable for test]') == 'uncomment # [disable for test]') + >>> assert uncomment_line('do nothing # [disable for test]') == 'do nothing # [disable for test]' + >>> assert uncomment_line('#uncomment # [disable for test]') == 
'uncomment # [disable for test]' """ first = line.find("#") From bfdbf5e874b8b6a19d4ac6083db776d446a0c3db Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:30:49 -0500 Subject: [PATCH 057/196] updated env yaml --- env.yml | 62 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/env.yml b/env.yml index 02f0f695..9bbc8a71 100644 --- a/env.yml +++ b/env.yml @@ -34,7 +34,7 @@ dependencies: - cairo=1.18.2 - certifi=2024.12.14 - cffi=1.17.1 - - charset-normalizer=3.4.0 + - charset-normalizer=3.4.1 - click=8.1.8 - coin-or-cbc=2.10.12 - coin-or-cgl=0.60.9 @@ -60,6 +60,7 @@ dependencies: - docutils=0.21.2 - dpath=2.2.0 - eido=0.2.4 + - epic2=0.0.52 - et_xmlfile=2.0.0 - exceptiongroup=1.2.2 - execnet=2.1.1 @@ -82,8 +83,8 @@ dependencies: - gffutils=0.13 - gfortran_impl_linux-64=14.2.0 - giflib=5.2.2 - - gitdb=4.0.11 - - gitpython=3.1.43 + - gitdb=4.0.12 + - gitpython=3.1.44 - graphite2=1.3.13 - gsl=1.16 - gxx_impl_linux-64=14.2.0 @@ -102,7 +103,7 @@ dependencies: - imagesize=1.4.1 - immutables=0.21 - importlib-metadata=8.5.0 - - importlib_resources=6.4.5 + - importlib_resources=6.5.2 - iniconfig=2.0.0 - intervalstats=1.01 - ipython=8.31.0 @@ -133,7 +134,7 @@ dependencies: - libcups=2.3.3 - libcurl=8.11.1 - libdeflate=1.23 - - libedit=3.1.20191231 + - libedit=3.1.20240808 - libev=4.33 - libexpat=2.6.4 - libffi=3.4.2 @@ -158,7 +159,7 @@ dependencies: - libnsl=2.0.1 - libopenblas=0.3.28 - libopenssl-static=3.4.0 - - libpng=1.6.44 + - libpng=1.6.45 - libsanitizer=14.2.0 - libsqlite=3.47.2 - libssh2=1.11.1 @@ -174,6 +175,7 @@ dependencies: - libzlib=1.3.1 - logmuse=0.2.8 - logomaker=0.8 + - macs2=2.2.9.1 - make=4.4.1 - markdown=3.6 - markdown-it-py=3.0.0 @@ -187,6 +189,7 @@ dependencies: - mysql-connector-c=6.1.11 - natsort=8.4.0 - nbformat=5.10.4 + - ncbi-vdb=3.1.1 - ncurses=6.5 - networkx=3.4.2 - nspr=4.36 @@ -197,6 +200,7 @@ dependencies: - openjpeg=2.5.3 - openpyxl=3.1.5 - openssl=3.4.0 + - 
ossuuid=1.6.2 - packaging=24.2 - pandas=2.2.3 - pandoc=3.6.1 @@ -208,14 +212,44 @@ dependencies: - pephubclient=0.4.4 - peppy=0.40.7 - perl=5.32.1 + - perl-alien-build=2.84 + - perl-alien-libxml2=0.17 + - perl-business-isbn=3.007 + - perl-business-isbn-data=20210112.006 + - perl-capture-tiny=0.48 + - perl-carp=1.50 + - perl-constant=1.33 + - perl-exporter=5.74 + - perl-extutils-makemaker=7.70 + - perl-ffi-checklib=0.28 + - perl-file-chdir=0.1011 + - perl-file-path=2.18 + - perl-file-temp=0.2304 + - perl-file-which=1.24 - perl-gd=2.56 - perl-gdgraph=1.54 - perl-gdtextutil=0.86 + - perl-importer=0.026 + - perl-parent=0.243 + - perl-path-tiny=0.124 + - perl-pathtools=3.75 + - perl-scope-guard=0.21 + - perl-sub-info=0.002 + - perl-term-table=0.024 + - perl-test-fatal=0.016 + - perl-test-warnings=0.031 + - perl-test2-suite=0.000163 + - perl-try-tiny=0.31 + - perl-uri=5.17 + - perl-xml-libxml=2.0210 + - perl-xml-namespacesupport=1.12 + - perl-xml-sax=1.02 + - perl-xml-sax-base=1.09 - pexpect=4.9.0 - picard=2.27.5 - pickleshare=0.7.5 - pigz=2.8 - - pillow=11.0.0 + - pillow=11.1.0 - pip=24.3.1 - pixman=0.44.2 - pkgutil-resolve-name=1.3.10 @@ -238,20 +272,20 @@ dependencies: - pydantic=2.10.4 - pydantic-core=2.27.2 - pyfaidx=0.8.1.3 - - pygments=2.18.0 + - pygments=2.19.1 - pyparsing=3.2.1 - pysam=0.22.1 - pysocks=1.7.1 - pytest=8.3.4 - pytest-xdist=3.6.1 - - python=3.12.8 + - python=3.11.11 - python-dateutil=2.9.0.post0 - python-fastjsonschema=2.21.1 - python-isal=1.7.1 - python-kaleido=0.2.1 - python-tzdata=2024.2 - python-zlib-ng=0.5.1 - - python_abi=3.12 + - python_abi=3.11 - pytz=2024.1 - pyvcf3=1.0.3 - pyyaml=6.0.2 @@ -267,7 +301,7 @@ dependencies: - rseqc=5.0.4 - salmon=1.10.3 - samtools=1.21 - - scipy=1.14.1 + - scipy=1.15.0 - seaborn=0.13.2 - seaborn-base=0.13.2 - sed=4.8 @@ -279,12 +313,12 @@ dependencies: - slack_sdk=3.34.0 - smart_open=7.1.0 - smmap=5.0.0 - - snakemake=8.26.0 + - snakemake=8.27.0 - snakemake-interface-common=1.17.4 - 
snakemake-interface-executor-plugins=9.3.3 - snakemake-interface-report-plugins=1.1.0 - snakemake-interface-storage-plugins=3.3.0 - - snakemake-minimal=8.26.0 + - snakemake-minimal=8.27.0 - snowballstemmer=2.2.0 - soupsieve=2.5 - spectra=0.0.11 @@ -296,7 +330,7 @@ dependencies: - sphinxcontrib-qthelp=2.0.0 - sphinxcontrib-serializinghtml=1.1.10 - sqlite=3.47.2 - - sra-tools=2.9.6 + - sra-tools=3.1.1 - stack_data=0.6.3 - star=2.7.11b - statsmodels=0.14.4 From a466da0471854632f07d7bde57c5f5b92210cbd2 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:30:59 -0500 Subject: [PATCH 058/196] fix import --- lib/postprocess/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/postprocess/utils.py b/lib/postprocess/utils.py index 16010e14..f8fc64a6 100644 --- a/lib/postprocess/utils.py +++ b/lib/postprocess/utils.py @@ -9,7 +9,7 @@ here = os.path.dirname(os.path.abspath(__file__)) sys.path.insert(0, os.path.join(here, "../../lib")) -from common import openfile +from utils import openfile From eb6892529023c1a5df7478971f4074c55baff84c Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:31:17 -0500 Subject: [PATCH 059/196] fix strand check --- rules/strand_check.smk | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/rules/strand_check.smk b/rules/strand_check.smk index 625ba3e2..4c936a77 100644 --- a/rules/strand_check.smk +++ b/rules/strand_check.smk @@ -1,14 +1,13 @@ - rule sample_strand_check: input: - fastq=fill_r1_r2(c.sampletable, c.patterns['fastq']), - index=rules.bowtie2_index.output, + fastq=expand(patterns["fastq"], n=n, allow_missing=True), + index=expand(rules.bowtie2_index.output, label="genome"), bed12=rules.conversion_bed12.output, output: strandedness='strand_check/{sample}/{sample}.strandedness', bam=temporary('strand_check/{sample}/{sample}.strandedness.bam'), bai=temporary('strand_check/{sample}/{sample}.strandedness.bam.bai'), - 
fastqs=temporary(expand('strand_check/{sample}/{sample}_R{n}.strandedness.fastq', sample=SAMPLES, n=n)), + fastqs=temporary(expand('strand_check/{sample}/{sample}_R{n}.strandedness.fastq', n=n, allow_missing=True)), log: 'strand_check/{sample}/{sample}.strandedness.log' threads: 6 @@ -16,17 +15,13 @@ rule sample_strand_check: mem_mb=gb(8), runtime=autobump(hours=2) run: - prefix = aligners.prefix_from_bowtie2_index(input.index) - nreads = int(config['strand_check_reads']) * 4 - if c.is_paired: - assert len(input.fastq) == 2 - assert len(output.fastqs) == 2 + prefix = os.path.commonprefix(input.index).rstrip(".") + nreads = int(1e5 * 4) + if is_paired: shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}') fastqs = f'-1 {output.fastqs[0]} -2 {output.fastqs[1]} ' else: - assert len(input.fastq) == 1 - assert len(output.fastqs) == 1 shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') fastqs = f'-U {output.fastqs[0]} ' shell( From 8f33026592306e59b91f5812e4a86e9d1a86a732 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:33:52 -0500 Subject: [PATCH 060/196] split featurecounts --- workflows/rnaseq/Snakefile | 182 +++++++------------ workflows/rnaseq/config/rnaseq_patterns.yaml | 4 +- 2 files changed, 71 insertions(+), 115 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index a328d887..29732d47 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -18,10 +18,10 @@ REFERENCES = config.get("reference_dir", "../../references") sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" -is_sra = utils.detect_sra(sampletable) n = ["1", "2"] if is_paired else ["1"] -SAMPLES = sampletable.iloc[:, 0].values +SAMPLES 
= sampletable.index patterns = yaml.safe_load(open("config/rnaseq_patterns.yaml")) +config["patterns"] = patterns wildcard_constraints: @@ -37,12 +37,43 @@ localrules: rule all: input: patterns["multiqc"], - patterns["bigwig"], + expand(patterns["bigwig"]["pos"], sample=SAMPLES), + expand(patterns["bigwig"]["neg"], sample=SAMPLES), +# Optionally run ``snakemake strand_check`` to do a preliminary run evaluating strandedness. +include: '../../rules/strand_check.smk' -if is_sra: +if utils.detect_sra(sampletable): + sampletable['orig_filename'] = expand( + 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=1) + + if is_paired: + sampletable['orig_filename_R2'] = expand( + 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=2) + + rule fastq_dump: + output: + fastq=expand('original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', n=n, allow_missing=True) + log: + 'original_data/sra_samples/{sample}/{sample}.fastq.gz.log' + params: + is_paired=is_paired, + # extra="-X 100000", # [enable for test] + resources: + mem_mb=gb(1), + disk_mb=autobump(gb=1), + runtime=autobump(hours=2) + run: + srr = sampletable.loc[wildcards.sample, "Run"] + extra = params.get("extra", "") + if is_paired: + shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") + shell("mv {srr}_1.fastq.gz {output[0]}") + shell("mv {srr}_2.fastq.gz {output[1]}") + else: + shell("fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp") + shell("mv {output[0]}.tmp {output[0]}") - include: "../../rules/sra.smk" rule symlinks: @@ -71,13 +102,6 @@ rule symlink_targets: ), -# This can be set at the command line with --config strand_check_reads=1000 -config.setdefault("strand_check_reads", 1e5) - -# TODO: re-enable -# include: '../../rules/strand_check.smk' - - rule cutadapt: input: fastq=expand(patterns["fastq"], n=n, allow_missing=True), @@ -145,7 +169,7 @@ rule fastqc: '--quiet ' '--outdir {outdir} ' '{input} ' - '{log} ' + '2> {log} ' ) 
outfile = os.path.basename(input[0]) for s in ['.fastq', '.fq', '.gz', '.bam']: @@ -203,9 +227,6 @@ if config["aligner"] == "hisat2": ) - -# TODO: star has lots of rules. Better to be in rules/aligner.smk? - if config["aligner"].startswith("star"): # STAR can be run in 1-pass or 2-pass modes. Since we may be running it @@ -240,9 +261,7 @@ if config["aligner"].startswith("star"): if config["aligner"] == "star": rule star: - """ - Align with STAR (1-pass mode) - """ + "Align with STAR (1-pass mode)" input: fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=rules.star_index.output, @@ -281,9 +300,7 @@ if config["aligner"] == "star": if config["aligner"] == "star-twopass": rule star_pass1: - """ - First pass of alignment with STAR to get the junctions - """ + "First pass of alignment with STAR to get the junctions" input: fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=rules.star_index.output, @@ -458,18 +475,14 @@ rule bam_index: "samtools index {input} {output}" -# TODO: split into multiple featurecounts runs, since PE needs to be sorted each time. 
rule featurecounts: - """ - Count reads in annotations with featureCounts from the subread package - """ input: annotation=rules.gtf.output, - bam=expand(patterns["markduplicates"]["bam"], sample=SAMPLES), + bam=patterns["markduplicates"]["bam"], output: - counts="{sample_dir}/rnaseq_aggregation/featurecounts.txt", + patterns["featurecounts"]["per_sample"] log: - "{sample_dir}/rnaseq_aggregation/featurecounts.txt.log", + patterns["featurecounts"]["per_sample"] + ".log" threads: 8 resources: mem_mb=gb(16), @@ -482,7 +495,6 @@ rule featurecounts: }[config["stranded"]], extra="", run: - # NOTE: By default, we use -p for paired-end p_arg = "" if is_paired: p_arg = "-p --countReadPairs " @@ -492,17 +504,34 @@ rule featurecounts: "{p_arg} " "-T {threads} " "-a {input.annotation} " - "-o {output.counts} " + "-o {output} " "{input.bam} " "&> {log}" ) +rule aggregate_featurecounts: + input: + expand(patterns["featurecounts"]["per_sample"], sample=SAMPLES) + output: + patterns["featurecounts"]["aggregated"] + log: + patterns["featurecounts"]["aggregated"] + ".log" + threads: + 1 + resources: + mem_mb=gb(8), + runtime=autobump(hours=1) + run: + for i, file in enumerate(input): + df = pd.read_csv(file, sep="\t", comment="#") + df = df.set_index('Geneid', drop=False) + if i == 0: + final = df + continue + final[df.columns[-1]] = df[df.columns[-1]] + final.to_csv(output[0], sep="\t", index=False) -# # TODO: port some of this over to utils, or maybe script. 
rule rrna_libsizes_table: - """ - Aggregate rRNA counts into a table - """ input: rrna=expand(patterns["rrna"]["libsize"], sample=SAMPLES), fastq=expand(patterns["libsizes"]["cutadapt"], sample=SAMPLES), @@ -513,54 +542,8 @@ rule rrna_libsizes_table: resources: mem_mb=gb(2), runtime=autobump(hours=2), - run: - def rrna_sample(f): - return utils.extract_wildcards(patterns["rrna"]["libsize"], f)["sample"] - - - def sample(f): - return utils.extract_wildcards(patterns["libsizes"]["cutadapt"], f)[ - "sample" - ] - - - def million(f): - return float(open(f).read()) / 1e6 - - - rrna = sorted(input.rrna, key=rrna_sample) - fastq = sorted(input.fastq, key=sample) - samples = list(map(rrna_sample, rrna)) - rrna_m = list(map(million, rrna)) - fastq_m = list(map(million, fastq)) - - df = pd.DataFrame( - dict( - sample=samples, - million_reads_rRNA=rrna_m, - million_reads_fastq=fastq_m, - ) - ) - df = df.set_index("sample") - df["rRNA_percentage"] = df.million_reads_rRNA / df.million_reads_fastq * 100 - - df[["million_reads_fastq", "million_reads_rRNA", "rRNA_percentage"]].to_csv( - output.tsv, sep="\t" - ) - y = { - "id": "rrna_percentages_table", - "section_name": "rRNA content", - "description": "Amount of reads mapping to rRNA sequence", - "plot_type": "table", - "pconfig": { - "id": "rrna_percentages_table_table", - "title": "rRNA content table", - "min": 0, - }, - "data": yaml.load(df.transpose().to_json(), Loader=yaml.FullLoader), - } - with open(output.json, "w") as fout: - yaml.dump(y, fout, default_flow_style=False) + script: + "../../scripts/rrna_libsizes_table.py" rule multiqc: @@ -579,8 +562,9 @@ rule multiqc: expand(patterns["samtools"]["idxstats"], sample=SAMPLES), expand(patterns["samtools"]["flagstat"], sample=SAMPLES), expand(patterns["samtools"]["stats"], sample=SAMPLES), + patterns["rrna_percentages_table"], - patterns["featurecounts"], + patterns["featurecounts"]["aggregated"], ), config="config/multiqc_config.yaml", output: @@ -609,9 +593,6 @@ rule 
multiqc: rule markduplicates: - """ - Mark or remove PCR duplicates with Picard MarkDuplicates - """ input: bam=patterns["bam"], output: @@ -639,9 +620,6 @@ rule markduplicates: rule collectrnaseqmetrics: - """ - Calculate various RNA-seq QC metrics with Picarc CollectRnaSeqMetrics - """ input: bam=patterns["markduplicates"]["bam"], refflat=rules.conversion_refflat.output, @@ -676,9 +654,6 @@ rule collectrnaseqmetrics: rule preseq: - """ - Compute a library complexity curve with preseq - """ input: bam=patterns["bam"], output: @@ -695,9 +670,6 @@ rule preseq: rule salmon: - """ - Quantify reads coming from transcripts with Salmon - """ input: fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=REFERENCES + "/salmon/versionInfo.json", @@ -735,9 +707,6 @@ rule salmon: rule kallisto: - """ - Quantify reads coming from transcripts with Kallisto - """ input: fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=REFERENCES + "/kallisto/transcripts.idx", @@ -777,9 +746,6 @@ rule kallisto: rule rseqc_infer_experiment: - """ - Infer strandedness of experiment - """ input: bam=patterns["markduplicates"]["bam"], bed12=rules.conversion_bed12.output, @@ -795,9 +761,6 @@ rule rseqc_infer_experiment: rule rseqc_read_distribution: - """ - read distribution plots - """ input: bam=patterns["markduplicates"]["bam"], bed12=rules.conversion_bed12.output, @@ -813,9 +776,6 @@ rule rseqc_read_distribution: rule idxstats: - """ - Run samtools idxstats on sample bams - """ input: bam=patterns["markduplicates"]["bam"], bai=patterns["markduplicates"]["bam"] + ".bai", @@ -831,9 +791,6 @@ rule idxstats: rule bigwig_neg: - """ - Create a bigwig for negative-strand reads - """ input: bam=patterns["markduplicates"]["bam"], bai=patterns["markduplicates"]["bam"] + ".bai", @@ -869,9 +826,6 @@ rule bigwig_neg: rule bigwig_pos: - """ - Create a bigwig for postive-strand reads. 
- """ input: bam=patterns["markduplicates"]["bam"], bai=patterns["markduplicates"]["bam"] + ".bai", diff --git a/workflows/rnaseq/config/rnaseq_patterns.yaml b/workflows/rnaseq/config/rnaseq_patterns.yaml index 92b2a534..35681125 100644 --- a/workflows/rnaseq/config/rnaseq_patterns.yaml +++ b/workflows/rnaseq/config/rnaseq_patterns.yaml @@ -15,7 +15,9 @@ libsizes: cutadapt: 'data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize' bam: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.libsize' fastq_screen: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.screen.txt' -featurecounts: 'data/rnaseq_aggregation/featurecounts.txt' +featurecounts: + per_sample: 'data/rnaseq_samples/{sample}/{sample}_featurecounts.txt' + aggregated: 'data/rnaseq_aggregation/featurecounts.txt' libsizes_table: 'data/rnaseq_aggregation/libsizes_table.tsv' libsizes_yaml: 'data/rnaseq_aggregation/libsizes_table_mqc.yaml' rrna_percentages_table: 'data/rnaseq_aggregation/rrna_percentages_table.tsv' From 39209ce270e5d5259a323c75c88cb4411b313819 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:35:14 -0500 Subject: [PATCH 061/196] all sorts of fixes and cleanup --- .circleci/config.yml | 34 +++++--------- lib/utils.py | 2 +- rules/sra.smk | 2 +- test/lcdb-wf-test | 7 ++- test/test_configs/hisat2.tsv | 4 +- test/test_configs/star_override_1pass.yaml | 10 ---- test/test_configs/star_override_2pass.yaml | 10 ---- test/test_configs/test_rnaseq_config.yaml | 54 ++++++++-------------- test/workflow_test_params.yaml | 18 ++++---- 9 files changed, 48 insertions(+), 93 deletions(-) delete mode 100644 test/test_configs/star_override_1pass.yaml delete mode 100644 test/test_configs/star_override_2pass.yaml diff --git a/.circleci/config.yml b/.circleci/config.yml index 02b27915..16e5b5f0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -220,16 +220,6 @@ variables: --until bed_to_bigbed fi - # -------------------------------------------------------------------------- 
- # Standard references workflow. - references-step: &references-step - run: - name: references workflow - command: | - source /opt/miniforge/etc/profile.d/conda.sh - conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=config/config.yaml -j2 -p -k --orig $ORIG - # -------------------------------------------------------------------------- # Standard RNA-seq workflow rnaseq-step: &rnaseq-step @@ -408,14 +398,14 @@ jobs: # - *get-data # - *colocalization-step - references: - <<: *defaults - steps: - - checkout - - *restore_cache - - *set-path - - *get-data - - *references-step + # references: + # <<: *defaults + # steps: + # - checkout + # - *restore_cache + # - *set-path + # - *get-data + # - *references-step build-docs: <<: *defaults @@ -489,10 +479,10 @@ workflows: requires: - initial-setup - pytest - - references: - requires: - - initial-setup - - pytest + # - references: + # requires: + # - initial-setup + # - pytest # - colocalization: # requires: # - initial-setup diff --git a/lib/utils.py b/lib/utils.py index f1a97c79..0e5cc9e2 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -1137,7 +1137,7 @@ def check_urls(config, verbose=False): too-many-connection issues """ failures = [] - urls = list(set(utils.flatten(pluck(config, "url")))) + urls = list(set(flatten(pluck(config, "url")))) for url in urls: if url.startswith("file://"): continue diff --git a/rules/sra.smk b/rules/sra.smk index 861b5098..2992f503 100644 --- a/rules/sra.smk +++ b/rules/sra.smk @@ -14,7 +14,7 @@ rule fastq_dump: params: is_paired=is_paired, sampletable=_st, - # extra="-X 100000", # [TEST SETTINGS] + # extra="-X 100000", # [enable for test] resources: mem_mb=gb(1), disk_mb=autobump(gb=1), diff --git a/test/lcdb-wf-test b/test/lcdb-wf-test index df59b24c..21f6978c 100755 --- a/test/lcdb-wf-test +++ b/test/lcdb-wf-test @@ -142,9 +142,12 @@ class Runner(object): %(prog)s rnaseq --run-workflow --strandedness-pe %(prog)s rnaseq --run-workflow 
--strandedness-se %(prog)s rnaseq --run-workflow --star-2pass - %(prog)s rnaseq --run-workflow --star-1pass + %(prog)s rnaseq --run-workflow --hisat2 %(prog)s rnaseq --run-workflow --pe + # Since there are a lot of parameters here, see + # "workflow_test_params.yaml" for how they are managed. + """, formatter_class=argparse.RawDescriptionHelpFormatter ) @@ -328,7 +331,7 @@ class Runner(object): if args.url_check: print_header("url check") sys.path.insert(0, str(TOPLEVEL)) - from lib.common import check_all_urls_found + from lib.utils import check_all_urls_found check_all_urls_found() diff --git a/test/test_configs/hisat2.tsv b/test/test_configs/hisat2.tsv index 3c73275e..df6746ce 100644 --- a/test/test_configs/hisat2.tsv +++ b/test/test_configs/hisat2.tsv @@ -1,3 +1,3 @@ samplename group layout orig_filename -sample1-star-1pass control SE data/example_data/rnaseq_sample1PE_1.fq.gz -sample2-star-1pass control SE data/example_data/rnaseq_sample2.fq.gz +sample1-hisat2 control SE data/example_data/rnaseq_sample1PE_1.fq.gz +sample2-hisat2 control SE data/example_data/rnaseq_sample2.fq.gz diff --git a/test/test_configs/star_override_1pass.yaml b/test/test_configs/star_override_1pass.yaml deleted file mode 100644 index cba6ff76..00000000 --- a/test/test_configs/star_override_1pass.yaml +++ /dev/null @@ -1,10 +0,0 @@ -aligner: - index: star - tag: test - -merged_bigwigs: - control_pos: - pos: [] - treatment_all: - pos: [] - neg: [] diff --git a/test/test_configs/star_override_2pass.yaml b/test/test_configs/star_override_2pass.yaml deleted file mode 100644 index b091eba3..00000000 --- a/test/test_configs/star_override_2pass.yaml +++ /dev/null @@ -1,10 +0,0 @@ -aligner: - index: 'star-twopass' - tag: test - -merged_bigwigs: - control_pos: - pos: [] - treatment_all: - pos: [] - neg: [] diff --git a/test/test_configs/test_rnaseq_config.yaml b/test/test_configs/test_rnaseq_config.yaml index 6c674345..2cbd3d66 100644 --- a/test/test_configs/test_rnaseq_config.yaml +++ 
b/test/test_configs/test_rnaseq_config.yaml @@ -1,43 +1,27 @@ -sampletable: 'config/sampletable.tsv' - -patterns: 'config/rnaseq_patterns.yaml' - -# Which key in the `references` dict below to use -organism: 'dmel' - -# If not specified here, use the environment variable REFERENCES_DIR. -references_dir: 'references_data' - -aligner: - index: 'hisat2' - tag: 'test' +fasta: + url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" + postprocess: 'lib.utils.gzipped' -stranded: 'fr-firststrand' +gtf: + url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/annotation/dm6.small.gtf" + postprocess: 'lib.utils.gzipped' rrna: - index: 'bowtie2' - tag: 'rRNA' - -gtf: - tag: "test" + url: + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' + postprocess: + function: 'lib.utils.filter_fastas' + args: 'Drosophila melanogaster' -salmon: - tag: "test" -kallisto: - tag: "test" +sampletable: 'config/sampletable.tsv' -fastq_screen: - - label: rRNA - organism: dmel - tag: test - - label: Fly - organism: dmel - tag: test +patterns: 'config/rnaseq_patterns.yaml' -# See the reference config files in the top level of the repo, -# include/reference_configs, for inspiration for more species. +# See https://rnabio.org/module-09-appendix/0009/12/01/StrandSettings for more info. 
+stranded: 'fr-firststrand' # for dUTP libraries +# 'fr-secondstrand' # for ligation libraries +# 'unstranded' # for libraries without strand specificity -include_references: - - '../../include/reference_configs/test.yaml' - - '../../include/reference_configs/Drosophila_melanogaster.yaml' +aligner: 'star' diff --git a/test/workflow_test_params.yaml b/test/workflow_test_params.yaml index 70e57da6..5d74fac9 100644 --- a/test/workflow_test_params.yaml +++ b/test/workflow_test_params.yaml @@ -45,19 +45,17 @@ rnaseq: desc: Tests running STAR in 2-pass mode. Only runs until the star_pass2 rule. args: | --until star_pass2 - --configfile - __ORIG__/test/test_configs/test_rnaseq_config.yaml - __ORIG__/test/test_configs/star_override_2pass.yaml + --configfile __ORIG__/test/test_configs/test_rnaseq_config.yaml --config sampletable=__ORIG__/test/test_configs/star_2pass.tsv + --config aligner="star-twopass" - star-1pass: - desc: Tests running STAR in 1-pass (default) mode. Only runs until the star rule. 
+ hisat2: + desc: Tests running HISAT2 args: | - --until star - --configfile - __ORIG__/test/test_configs/test_rnaseq_config.yaml - __ORIG__/test/test_configs/star_override_1pass.yaml - --config sampletable=__ORIG__/test/test_configs/star_1pass.tsv + --until hisat2 + --configfile __ORIG__/test/test_configs/test_rnaseq_config.yaml + --config sampletable=__ORIG__/test/test_configs/hisat2.tsv + --config aligner=hisat2 pe: desc: Tests paired-end data From 155307a6d77db198296deaa46e76877b344f5bcd Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:36:56 -0500 Subject: [PATCH 062/196] sra for chipseq --- workflows/chipseq/Snakefile | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 24b09dec..9c8a2f37 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -19,7 +19,6 @@ REFERENCES = config.get("reference_dir", "../../references") sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" -is_sra = utils.detect_sra(sampletable) n = ["1", "2"] if is_paired else ["1"] SAMPLES = sampletable.iloc[:, 0].values patterns = yaml.safe_load(open("config/chipseq_patterns.yaml"))["patterns_by_sample"] @@ -43,9 +42,36 @@ rule targets: [v["bed"] for k, v in peaks.items()], -if is_sra: +if utils.detect_sra(sampletable): + sampletable['orig_filename'] = expand( + 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=1) - include: "../../rules/sra.smk" + if is_paired: + sampletable['orig_filename_R2'] = expand( + 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=2) + + rule fastq_dump: + output: + fastq=expand('original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', n=n, allow_missing=True) + log: + 
'original_data/sra_samples/{sample}/{sample}.fastq.gz.log' + params: + is_paired=is_paired, + # extra="-X 100000", # [enable for test] + resources: + mem_mb=gb(1), + disk_mb=autobump(gb=1), + runtime=autobump(hours=2) + run: + srr = sampletable.loc[wildcards.sample, "Run"] + extra = params.get("extra", "") + if is_paired: + shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") + shell("mv {srr}_1.fastq.gz {output[0]}") + shell("mv {srr}_2.fastq.gz {output[1]}") + else: + shell("fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp") + shell("mv {output[0]}.tmp {output[0]}") rule symlinks: From fd1c1c3df76284f7080e9e706d68fa61ae54356d Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:37:13 -0500 Subject: [PATCH 063/196] clean out test suite --- lib/test_suite.py | 89 +---------------------------------------------- 1 file changed, 1 insertion(+), 88 deletions(-) diff --git a/lib/test_suite.py b/lib/test_suite.py index 21b9c052..eb018c3f 100644 --- a/lib/test_suite.py +++ b/lib/test_suite.py @@ -1,88 +1 @@ -import os -import pprint -from textwrap import dedent -from . 
import common - - -def test_config_loading(tmpdir): - f0 = tmpdir.mkdir('subdir').join('file0.yaml') - dir_to_include = tmpdir.join('subdir') - f0.write(dedent(''' - references: - species_to_keep: - tag_from_directory: - fasta: - url: "https://from_directory" - - # Will get overwritten by a specific file - tag_from_file: - fasta: - url: "https://from_directory" - - # Will get overwritten by specific file, and then that will get - # overwritten by the config - tag_from_config: - fasta: - url: "https://from_directory" - ''')) - f1 = tmpdir.join('subdir', 'file1.yaml') - f1.write(dedent(''' - references: - species2: - tag_only_in_directory: - fasta: - url: "" - indexes: - - bowtie2 - ''')) - - f2 = tmpdir.join('file1.yaml') - f2.write(dedent(''' - references: - species_to_keep: - tag_from_file: - fasta: - url: "https://from_file" - tag_from_config: - fasta: - url: "https://from_file" - - ''')) - - f3 = tmpdir.join('file3.yaml') - f3.write(dedent(''' - references_dir: "/data" - references: - species_to_keep: - tag_from_config: - fasta: - url: "https://from_config" - - include_references: - - {dir_to_include} - - {f2} - '''.format(dir_to_include=dir_to_include, f2=f2))) - - config = common.load_config(str(f3)) - - assert config == { - 'references_dir': '/data', - 'include_references': [ - '{0}/subdir'.format(str(tmpdir)), - '{0}/file1.yaml'.format(str(tmpdir)), - ], - 'references': { - 'species_to_keep': { - 'tag_from_config': { - 'fasta': {'url': 'https://from_config'}}, - 'tag_from_directory': { - 'fasta': {'url': 'https://from_directory'}}, - 'tag_from_file': { - 'fasta': {'url': 'https://from_file'}} - }, - 'species2': { - 'tag_only_in_directory': { - 'fasta': {'indexes': ['bowtie2'], 'url': ''}}}, - }, - } - +from . 
import utils From d322e333949a4d25f37f4b83cd1127cdb7ad2ae0 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:39:27 -0500 Subject: [PATCH 064/196] add strandcheck back to snakefile --- rules/sra.smk | 34 ------------------- rules/strand_check.smk | 64 ----------------------------------- workflows/rnaseq/Snakefile | 68 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 66 insertions(+), 100 deletions(-) delete mode 100644 rules/sra.smk delete mode 100644 rules/strand_check.smk diff --git a/rules/sra.smk b/rules/sra.smk deleted file mode 100644 index 2992f503..00000000 --- a/rules/sra.smk +++ /dev/null @@ -1,34 +0,0 @@ - -sampletable['orig_filename'] = expand( - 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=1) - -if is_paired: - sampletable['orig_filename_R2'] = expand( - 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=2) - -rule fastq_dump: - output: - fastq=expand('original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', n=n) - log: - 'original_data/sra_samples/{sample}/{sample}.fastq.gz.log' - params: - is_paired=is_paired, - sampletable=_st, - # extra="-X 100000", # [enable for test] - resources: - mem_mb=gb(1), - disk_mb=autobump(gb=1), - runtime=autobump(hours=2) - run: - _st = sampletable.set_index(sampletable.columns[0]) - srr = _st.loc[wildcards.sample, "Run"] - extra = params.get("extra", "") - if is_paired: - shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") - shell("mv {srr}_1.fastq.gz {output[0]}") - shell("mv {srr}_2.fastq.gz {output[1]}") - else: - shell("fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp") - shell("mv {output[0]}.tmp {output[0]}") - -# vim: ft=snakemake diff --git a/rules/strand_check.smk b/rules/strand_check.smk deleted file mode 100644 index 4c936a77..00000000 --- a/rules/strand_check.smk +++ /dev/null @@ -1,64 +0,0 @@ -rule sample_strand_check: - input: - fastq=expand(patterns["fastq"], n=n, allow_missing=True), - 
index=expand(rules.bowtie2_index.output, label="genome"), - bed12=rules.conversion_bed12.output, - output: - strandedness='strand_check/{sample}/{sample}.strandedness', - bam=temporary('strand_check/{sample}/{sample}.strandedness.bam'), - bai=temporary('strand_check/{sample}/{sample}.strandedness.bam.bai'), - fastqs=temporary(expand('strand_check/{sample}/{sample}_R{n}.strandedness.fastq', n=n, allow_missing=True)), - log: - 'strand_check/{sample}/{sample}.strandedness.log' - threads: 6 - resources: - mem_mb=gb(8), - runtime=autobump(hours=2) - run: - prefix = os.path.commonprefix(input.index).rstrip(".") - nreads = int(1e5 * 4) - if is_paired: - shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') - shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}') - fastqs = f'-1 {output.fastqs[0]} -2 {output.fastqs[1]} ' - else: - shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') - fastqs = f'-U {output.fastqs[0]} ' - shell( - "bowtie2 " - "-x {prefix} " - "{fastqs} " - '--no-unal ' - "--threads {threads} 2> {log} " - "| samtools view -Sb - " - "| samtools sort - -o {output.bam} " - ) - shell("samtools index {output.bam}") - shell( - 'infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}' - ) - -rule strand_check: - input: - expand('strand_check/{sample}/{sample}.strandedness', sample=SAMPLES) - output: - html='strand_check/strandedness.html', - filelist=temporary('strand_check/filelist') - log: - 'strand_check/strandedness.log' - resources: - mem_mb=gb(1), - runtime=autobump(hours=2) - run: - with open(output.filelist, 'w') as fout: - for i in input: - fout.write(i + '\n') - shell( - 'multiqc ' - '--force ' - '--module rseqc ' - '--file-list {output.filelist} ' - '--filename {output.html} &> {log}' - ) - -# vim: ft=snakemake diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 29732d47..0dff1d54 100644 --- 
a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -40,8 +40,6 @@ rule all: expand(patterns["bigwig"]["pos"], sample=SAMPLES), expand(patterns["bigwig"]["neg"], sample=SAMPLES), -# Optionally run ``snakemake strand_check`` to do a preliminary run evaluating strandedness. -include: '../../rules/strand_check.smk' if utils.detect_sra(sampletable): sampletable['orig_filename'] = expand( @@ -102,6 +100,72 @@ rule symlink_targets: ), +# Optionally run ``snakemake strand_check`` to do a preliminary run on +# automatically-subset data to evaluate strandedness. +rule sample_strand_check: + input: + fastq=expand(patterns["fastq"], n=n, allow_missing=True), + index=expand(rules.bowtie2_index.output, label="genome"), + bed12=rules.conversion_bed12.output, + output: + strandedness='strand_check/{sample}/{sample}.strandedness', + bam=temporary('strand_check/{sample}/{sample}.strandedness.bam'), + bai=temporary('strand_check/{sample}/{sample}.strandedness.bam.bai'), + fastqs=temporary(expand('strand_check/{sample}/{sample}_R{n}.strandedness.fastq', n=n, allow_missing=True)), + log: + 'strand_check/{sample}/{sample}.strandedness.log' + threads: 6 + resources: + mem_mb=gb(8), + runtime=autobump(hours=2) + run: + prefix = os.path.commonprefix(input.index).rstrip(".") + nreads = int(1e5 * 4) + if is_paired: + shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') + shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}') + fastqs = f'-1 {output.fastqs[0]} -2 {output.fastqs[1]} ' + else: + shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') + fastqs = f'-U {output.fastqs[0]} ' + shell( + "bowtie2 " + "-x {prefix} " + "{fastqs} " + '--no-unal ' + "--threads {threads} 2> {log} " + "| samtools view -Sb - " + "| samtools sort - -o {output.bam} " + ) + shell("samtools index {output.bam}") + shell( + 'infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}' + ) 
+ + +rule strand_check: + input: + expand('strand_check/{sample}/{sample}.strandedness', sample=SAMPLES) + output: + html='strand_check/strandedness.html', + filelist=temporary('strand_check/filelist') + log: + 'strand_check/strandedness.log' + resources: + mem_mb=gb(1), + runtime=autobump(hours=2) + run: + with open(output.filelist, 'w') as fout: + for i in input: + fout.write(i + '\n') + shell( + 'multiqc ' + '--force ' + '--module rseqc ' + '--file-list {output.filelist} ' + '--filename {output.html} &> {log}' + ) + rule cutadapt: input: fastq=expand(patterns["fastq"], n=n, allow_missing=True), From 8b6b52a01e52cc44b500beb9b71962503f9618f6 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Tue, 14 Jan 2025 11:13:03 -0500 Subject: [PATCH 065/196] don't use patterns any more --- workflows/rnaseq/Snakefile | 338 +++++++++++++++++++------------------ 1 file changed, 172 insertions(+), 166 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 0dff1d54..5b9923b8 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -11,7 +11,6 @@ from lib.utils import autobump, gb, hours configfile: "config/config.yaml" -include: "../references/Snakefile" REFERENCES = config.get("reference_dir", "../../references") @@ -20,8 +19,6 @@ sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" n = ["1", "2"] if is_paired else ["1"] SAMPLES = sampletable.index -patterns = yaml.safe_load(open("config/rnaseq_patterns.yaml")) -config["patterns"] = patterns wildcard_constraints: @@ -36,10 +33,9 @@ localrules: rule all: input: - patterns["multiqc"], - expand(patterns["bigwig"]["pos"], sample=SAMPLES), - expand(patterns["bigwig"]["neg"], sample=SAMPLES), + "data/rnaseq_aggregation/multiqc.html", +include: "../references/Snakefile" if utils.detect_sra(sampletable): sampletable['orig_filename'] = expand( @@ -82,7 +78,7 @@ rule symlinks: else sampletable.loc[wc.sample, 
["orig_filename"]] ), output: - expand(patterns["fastq"], n=n, allow_missing=True), + expand('data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz', n=n) threads: 1 resources: mem_mb=100, @@ -104,7 +100,7 @@ rule symlink_targets: # automatically-subset data to evaluate strandedness. rule sample_strand_check: input: - fastq=expand(patterns["fastq"], n=n, allow_missing=True), + fastq=expand('data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz', n=n), index=expand(rules.bowtie2_index.output, label="genome"), bed12=rules.conversion_bed12.output, output: @@ -168,9 +164,9 @@ rule strand_check: rule cutadapt: input: - fastq=expand(patterns["fastq"], n=n, allow_missing=True), + fastq=expand('data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz', n=n) output: - fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), + fastq=expand('data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.cutadapt.fastq.gz', n=n) log: "data/rnaseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", threads: 6 @@ -214,17 +210,17 @@ rule cutadapt: rule fastqc: input: - '{sample_dir}/{sample}/{sample}{suffix}' + 'data/rnaseq_samples/{sample}/{sample}{suffix}' threads: 1 output: - html='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html', - zip='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip', + html='data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.html', + zip='data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.zip', resources: mem_mb=gb(8), runtime=autobump(hours=2) log: - '{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.log', + 'data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.log', run: outdir = os.path.dirname(output.html) or "." 
shell( @@ -250,12 +246,12 @@ if config["aligner"] == "hisat2": rule hisat2: input: - fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), + fastq=rules.cutadapt.output, index=rules.hisat2_index.output, output: - bam=temporary(patterns["bam"]), + bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam") log: - patterns["bam"] + ".log", + "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log" threads: 16 resources: mem_mb=gb(32), @@ -292,7 +288,10 @@ if config["aligner"] == "hisat2": if config["aligner"].startswith("star"): - + if os.getenv("TMPDIR"): + tmpdir_arg = "--outTmpDir $TMPDIR/star " + else: + tmpdir_arg = "" # STAR can be run in 1-pass or 2-pass modes. Since we may be running it # more than once in almost the same way, we pull out the shell command here # and use it below. @@ -303,6 +302,7 @@ if config["aligner"].startswith("star"): "--readFilesIn {input.fastq} " "--readFilesCommand zcat " "--outFileNamePrefix {prefix} " + "{tmpdir_arg} " "{params.extra} " ) STAR_PARAMS = ( @@ -327,18 +327,19 @@ if config["aligner"] == "star": rule star: "Align with STAR (1-pass mode)" input: - fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), + fastq=rules.cutadapt.output, index=rules.star_index.output, annotation=f"{REFERENCES}/annotation.gtf", output: - bam=temporary(patterns["bam"]), - sjout=temporary(patterns["bam"].replace(".bam", ".star.SJ.out.tab")), + bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), + sjout=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.star.SJ.out.tab") log: - patterns["bam"].replace(".bam", ".star.bam.log"), + "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log" threads: 16 resources: mem_mb=gb(64), runtime=autobump(hours=8), + disk_mb=gb(80), params: extra=STAR_PARAMS, run: @@ -366,17 +367,18 @@ if config["aligner"] == "star-twopass": rule star_pass1: "First pass of alignment with STAR to get the junctions" input: - fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), + 
fastq=rules.cutadapt.output, index=rules.star_index.output, annotation=f"{REFERENCES}/annotation.gtf", output: - sjout=temporary(patterns["bam"].replace(".bam", ".star-pass1.SJ.out.tab")), + sjout=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.star.SJ.out.tab") log: - patterns["bam"].replace(".bam", ".star-pass1.bam.log"), + "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass1.log" threads: 16 resources: mem_mb=gb(64), runtime=autobump(hours=8), + disk_mb=gb(80), params: extra=STAR_PARAMS, run: @@ -407,22 +409,20 @@ if config["aligner"] == "star-twopass": samples to get the final BAM """ input: - fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), + fastq=rules.cutadapt.output, index=rules.star_index.output, annotation=f"{REFERENCES}/annotation.gtf", - sjout=expand( - patterns["bam"].replace(".bam", ".star-pass1.SJ.out.tab"), - sample=SAMPLES, - ), + sjout=expand(rules.star_pass1.output, sample=SAMPLES) output: - bam=temporary(patterns["bam"]), - sjout=temporary(patterns["bam"].replace(".bam", ".star-pass2.SJ.out.tab")), + bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), + sjout=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass2.SJ.out.tab") log: - patterns["bam"].replace(".bam", ".star-pass2.bam.log"), + "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass2.log" threads: 16 resources: mem_mb=gb(64), runtime=autobump(hours=8), + disk_mb=gb(80), params: extra=STAR_PARAMS, run: @@ -455,7 +455,7 @@ if config["aligner"] == "star-twopass": rule rRNA: input: - fastq=expand(patterns["cutadapt"], n=1, allow_missing=True), + fastq='data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz', index=multiext( f"{REFERENCES}/bowtie2/rrna", ".1.bt2", @@ -467,9 +467,9 @@ rule rRNA: ".fa", ), output: - bam=temporary(patterns["rrna"]["bam"]), + bam='data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam', log: - patterns["rrna"]["bam"] + ".log", + 
'data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.log' threads: 6 resources: mem_mb=gb(2), @@ -539,14 +539,40 @@ rule bam_index: "samtools index {input} {output}" +rule markduplicates: + input: + bam='data/rnaseq_samples/{sample}/{sample}.cutadapt.bam' + output: + bam='data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam', + metrics='data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.metrics' + log: + 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.log' + threads: 1 + resources: + mem_mb=gb(32), + runtime=autobump(hours=2), + disk_mb=autobump(gb=100), + params: + java_args="-Xmx20g", # [disable for test] + # java_args='-Xmx2g' # [enable for test] + shell: + "picard " + "{params.java_args} " + "MarkDuplicates " + "INPUT={input.bam} " + "OUTPUT={output.bam} " + "METRICS_FILE={output.metrics} " + "VALIDATION_STRINGENCY=LENIENT " + "&> {log}" + rule featurecounts: input: annotation=rules.gtf.output, - bam=patterns["markduplicates"]["bam"], + bam=rules.markduplicates.output.bam output: - patterns["featurecounts"]["per_sample"] + 'data/rnaseq_samples/{sample}/{sample}_featurecounts.txt' log: - patterns["featurecounts"]["per_sample"] + ".log" + 'data/rnaseq_samples/{sample}/{sample}_featurecounts.txt.log' threads: 8 resources: mem_mb=gb(16), @@ -575,11 +601,11 @@ rule featurecounts: rule aggregate_featurecounts: input: - expand(patterns["featurecounts"]["per_sample"], sample=SAMPLES) + expand('data/rnaseq_samples/{sample}/{sample}_featurecounts.txt', sample=SAMPLES) output: - patterns["featurecounts"]["aggregated"] + 'data/rnaseq_aggregation/featurecounts.txt' log: - patterns["featurecounts"]["aggregated"] + ".log" + 'data/rnaseq_aggregation/featurecounts.txt.log' threads: 1 resources: @@ -595,13 +621,14 @@ rule aggregate_featurecounts: final[df.columns[-1]] = df[df.columns[-1]] final.to_csv(output[0], sep="\t", index=False) + rule rrna_libsizes_table: input: - rrna=expand(patterns["rrna"]["libsize"], sample=SAMPLES), - 
fastq=expand(patterns["libsizes"]["cutadapt"], sample=SAMPLES), + rrna=expand('data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.libsize', sample=SAMPLES), + fastq=expand('data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize', sample=SAMPLES), output: - json=patterns["rrna_percentages_yaml"], - tsv=patterns["rrna_percentages_table"], + tsv='data/rnaseq_aggregation/rrna_percentages_table.tsv', + json='data/rnaseq_aggregation/rrna_percentages_table_mqc.yaml', threads: 1 resources: mem_mb=gb(2), @@ -610,87 +637,18 @@ rule rrna_libsizes_table: "../../scripts/rrna_libsizes_table.py" -rule multiqc: - input: - files=( - expand(patterns["fastqc"]["raw"], sample=SAMPLES), - expand(patterns["fastqc"]["cutadapt"], sample=SAMPLES), - expand(patterns["fastqc"]["bam"], sample=SAMPLES), - expand(patterns["markduplicates"]["bam"], sample=SAMPLES), - expand(patterns["salmon"], sample=SAMPLES), - expand(patterns["kallisto"], sample=SAMPLES), - expand(patterns["preseq"], sample=SAMPLES), - expand(patterns["rseqc"]["infer_experiment"], sample=SAMPLES), - expand(patterns["rseqc"]["read_distribution"], sample=SAMPLES), - expand(patterns["collectrnaseqmetrics"]["metrics"], sample=SAMPLES), - expand(patterns["samtools"]["idxstats"], sample=SAMPLES), - expand(patterns["samtools"]["flagstat"], sample=SAMPLES), - expand(patterns["samtools"]["stats"], sample=SAMPLES), - - patterns["rrna_percentages_table"], - patterns["featurecounts"]["aggregated"], - ), - config="config/multiqc_config.yaml", - output: - "data/rnaseq_aggregation/multiqc.html", - log: - "data/rnaseq_aggregation/multiqc.log", - threads: 1 - resources: - mem_mb=gb(2), - runtime=autobump(hours=2), - run: - analysis_directory = set([os.path.dirname(i) for i in input]) - outdir = os.path.dirname(output[0]) - basename = os.path.basename(output[0]) - shell( - "LC_ALL=en_US.utf8 LC_LANG=en_US.utf8 " - "multiqc " - "--quiet " - "--outdir {outdir} " - "--force " - "--filename {basename} " - "--config 
{input.config} " - "{analysis_directory} " - "&> {log} " - ) -rule markduplicates: - input: - bam=patterns["bam"], - output: - bam=patterns["markduplicates"]["bam"], - metrics=patterns["markduplicates"]["metrics"], - log: - patterns["markduplicates"]["bam"] + ".log", - threads: 1 - resources: - mem_mb=gb(32), - runtime=autobump(hours=2), - disk_mb=autobump(gb=100), - params: - java_args="-Xmx20g", # [disable for test] - # java_args='-Xmx2g' # [enable for test] - shell: - "picard " - "{params.java_args} " - "MarkDuplicates " - "INPUT={input.bam} " - "OUTPUT={output.bam} " - "METRICS_FILE={output.metrics} " - "VALIDATION_STRINGENCY=LENIENT " - "&> {log}" rule collectrnaseqmetrics: input: - bam=patterns["markduplicates"]["bam"], + bam=rules.markduplicates.output.bam, refflat=rules.conversion_refflat.output, output: - metrics=patterns["collectrnaseqmetrics"]["metrics"], + metrics='data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics' log: - patterns["collectrnaseqmetrics"]["metrics"] + ".log", + 'data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics.log' threads: 1 resources: mem_mb=gb(32), @@ -719,9 +677,11 @@ rule collectrnaseqmetrics: rule preseq: input: - bam=patterns["bam"], + bam='data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam', output: - patterns["preseq"], + 'data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt' + log: + 'data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt.log' threads: 1 resources: mem_mb=gb(1), @@ -731,16 +691,17 @@ rule preseq: "c_curve " "-B {input} " "-o {output} " + "&> {log}" rule salmon: input: - fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), + fastq=rules.cutadapt.output, index=REFERENCES + "/salmon/versionInfo.json", output: - patterns["salmon"], + 'data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf' log: - patterns["salmon"] + ".log", + 'data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf.log' threads: 6 resources: mem_mb=gb(32), @@ -772,12 +733,12 @@ rule salmon: 
rule kallisto: input: - fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), + fastq=rules.cutadapt.output, index=REFERENCES + "/kallisto/transcripts.idx", output: - patterns["kallisto"], + 'data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5' log: - patterns["kallisto"] + ".log", + 'data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5.log' threads: 8 resources: mem_mb=gb(32), @@ -811,12 +772,12 @@ rule kallisto: rule rseqc_infer_experiment: input: - bam=patterns["markduplicates"]["bam"], + bam=rules.markduplicates.output, bed12=rules.conversion_bed12.output, output: - txt=patterns["rseqc"]["infer_experiment"], + 'data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt', log: - patterns["rseqc"]["infer_experiment"] + ".log", + 'data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt.log' resources: mem_mb=gb(2), runtime=autobump(hours=2), @@ -826,12 +787,12 @@ rule rseqc_infer_experiment: rule rseqc_read_distribution: input: - bam=patterns["markduplicates"]["bam"], + bam=rules.markduplicates.output, bed12=rules.conversion_bed12.output, output: - txt=patterns["rseqc"]["read_distribution"], + 'data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt', log: - patterns["rseqc"]["read_distribution"] + ".log", + 'data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt.log' resources: mem_mb=gb(2), runtime=autobump(hours=2), @@ -839,33 +800,57 @@ rule rseqc_read_distribution: "read_distribution.py -i {input.bam} -r {input.bed12} > {output} &> {log}" -rule idxstats: +rule samtools_idxstats: input: - bam=patterns["markduplicates"]["bam"], - bai=patterns["markduplicates"]["bam"] + ".bai", + bam=rules.markduplicates.output.bam, + bai=rules.markduplicates.output.bam + ".bai", output: - txt=patterns["samtools"]["idxstats"], + 'data/rnaseq_samples/{sample}/idxstat_{sample}.txt' log: - patterns["samtools"]["idxstats"] + ".log", + 'data/rnaseq_samples/{sample}/idxstat_{sample}.txt.log' resources: mem_mb=gb(16), 
runtime=autobump(hours=2), - run: - shell("samtools idxstats {input.bam} 2> {log} 1> {output.txt}") + shell: + "samtools idxstats {input.bam} 2> {log} 1> {output}" + + +rule samtools_flagstat: + input: + bam=rules.markduplicates.output.bam, + bai=rules.markduplicates.output.bam + ".bai", + output: + 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat' + log: + 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat.log' + shell: + "samtools flagstat {input.bam} > {output}" + + +rule samtools_stats: + input: + bam=rules.markduplicates.output.bam, + bai=rules.markduplicates.output.bam + ".bai", + output: + 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats' + log: + 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats.log' + shell: + "samtools stats {input.bam} > {output}" rule bigwig_neg: input: - bam=patterns["markduplicates"]["bam"], - bai=patterns["markduplicates"]["bam"] + ".bai", + bam=rules.markduplicates.output.bam, + bai=rules.markduplicates.output.bam + ".bai", output: - patterns["bigwig"]["neg"], + 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig' threads: 8 resources: mem_mb=gb(16), runtime=autobump(hours=2), log: - patterns["bigwig"]["neg"] + ".log", + 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig.log' params: strand_arg={ "unstranded": "", @@ -891,16 +876,16 @@ rule bigwig_neg: rule bigwig_pos: input: - bam=patterns["markduplicates"]["bam"], - bai=patterns["markduplicates"]["bam"] + ".bai", + bam=rules.markduplicates.output.bam, + bai=rules.markduplicates.output.bam + ".bai", output: - patterns["bigwig"]["pos"], + 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig' threads: 8 resources: mem_mb=gb(16), runtime=autobump(hours=2), log: - patterns["bigwig"]["pos"] + ".log", + 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig.log' params: strand_arg={ "unstranded": "", @@ -923,26 +908,47 @@ rule bigwig_pos: "&> {log}" ) - -rule flagstat: 
+rule multiqc: input: - bam=patterns["markduplicates"]["bam"], - bai=patterns["markduplicates"]["bam"] + ".bai", + files=( + expand(rules.fastqc.output.zip, sample=SAMPLES, suffix=["_R1.fastq.gz", "_R1.cutadapt.fastq.gz", ".cutadapt.bam"]), + expand(rules.markduplicates.output, sample=SAMPLES), + expand(rules.salmon.output, sample=SAMPLES), + expand(rules.kallisto.output, sample=SAMPLES), + expand(rules.preseq.output, sample=SAMPLES), + expand(rules.collectrnaseqmetrics.output, sample=SAMPLES), + expand(rules.samtools_stats.output, sample=SAMPLES), + expand(rules.samtools_flagstat.output, sample=SAMPLES), + expand(rules.samtools_idxstats.output, sample=SAMPLES), + expand(rules.rseqc_infer_experiment.output, sample=SAMPLES), + expand(rules.rseqc_read_distribution.output, sample=SAMPLES), + expand(rules.bigwig_pos.output, sample=SAMPLES), + expand(rules.bigwig_neg.output, sample=SAMPLES), + rules.rrna_libsizes_table.output, + ), + config="config/multiqc_config.yaml", output: - patterns["samtools"]["flagstat"], + "data/rnaseq_aggregation/multiqc.html", log: - patterns["samtools"]["flagstat"] + ".log", - shell: - "samtools flagstat {input.bam} > {output}" - + "data/rnaseq_aggregation/multiqc.log", + threads: 1 + resources: + mem_mb=gb(2), + runtime=autobump(hours=2), + disk_mb=gb(10) + run: + analysis_directory = set([os.path.dirname(i) for i in input]) + outdir = os.path.dirname(output[0]) + basename = os.path.basename(output[0]) + shell( + "LC_ALL=en_US.utf8 LC_LANG=en_US.utf8 " + "multiqc " + "--quiet " + "--outdir {outdir} " + "--force " + "--filename {basename} " + "--config {input.config} " + "{analysis_directory} " + "&> {log} " + ) -rule samtools_stats: - input: - bam=patterns["markduplicates"]["bam"], - bai=patterns["markduplicates"]["bam"] + ".bai", - output: - patterns["samtools"]["stats"], - log: - patterns["samtools"]["stats"] + ".log", - shell: - "samtools stats {input.bam} > {output}" From d5799fa619a3acb3961a16f6522ed5d440fba104 Mon Sep 17 00:00:00 2001 
From: Ryan Dale Date: Tue, 14 Jan 2025 18:24:08 -0500 Subject: [PATCH 066/196] snakefmt cleanup --- workflows/rnaseq/Snakefile | 282 +++++++++++++++++++++---------------- 1 file changed, 161 insertions(+), 121 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 5b9923b8..bf87a234 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -11,8 +11,6 @@ from lib.utils import autobump, gb, hours configfile: "config/config.yaml" - - REFERENCES = config.get("reference_dir", "../../references") sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) @@ -35,28 +33,38 @@ rule all: input: "data/rnaseq_aggregation/multiqc.html", + include: "../references/Snakefile" + if utils.detect_sra(sampletable): - sampletable['orig_filename'] = expand( - 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=1) + sampletable["orig_filename"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=1 + ) if is_paired: - sampletable['orig_filename_R2'] = expand( - 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=2) + sampletable["orig_filename_R2"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + sample=SAMPLES, + n=2, + ) rule fastq_dump: output: - fastq=expand('original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', n=n, allow_missing=True) + fastq=expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + n=n, + allow_missing=True, + ), log: - 'original_data/sra_samples/{sample}/{sample}.fastq.gz.log' + "original_data/sra_samples/{sample}/{sample}.fastq.gz.log", params: is_paired=is_paired, # extra="-X 100000", # [enable for test] resources: mem_mb=gb(1), disk_mb=autobump(gb=1), - runtime=autobump(hours=2) + runtime=autobump(hours=2), run: srr = sampletable.loc[wildcards.sample, "Run"] extra = params.get("extra", 
"") @@ -65,7 +73,9 @@ if utils.detect_sra(sampletable): shell("mv {srr}_1.fastq.gz {output[0]}") shell("mv {srr}_2.fastq.gz {output[1]}") else: - shell("fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp") + shell( + "fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp" + ) shell("mv {output[0]}.tmp {output[0]}") @@ -78,7 +88,7 @@ rule symlinks: else sampletable.loc[wc.sample, ["orig_filename"]] ), output: - expand('data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz', n=n) + expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), threads: 1 resources: mem_mb=100, @@ -100,73 +110,88 @@ rule symlink_targets: # automatically-subset data to evaluate strandedness. rule sample_strand_check: input: - fastq=expand('data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz', n=n), + fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), index=expand(rules.bowtie2_index.output, label="genome"), bed12=rules.conversion_bed12.output, output: - strandedness='strand_check/{sample}/{sample}.strandedness', - bam=temporary('strand_check/{sample}/{sample}.strandedness.bam'), - bai=temporary('strand_check/{sample}/{sample}.strandedness.bam.bai'), - fastqs=temporary(expand('strand_check/{sample}/{sample}_R{n}.strandedness.fastq', n=n, allow_missing=True)), + strandedness="strand_check/{sample}/{sample}.strandedness", + bam=temporary("strand_check/{sample}/{sample}.strandedness.bam"), + bai=temporary("strand_check/{sample}/{sample}.strandedness.bam.bai"), + fastqs=temporary( + expand( + "strand_check/{sample}/{sample}_R{n}.strandedness.fastq", + n=n, + allow_missing=True, + ) + ), log: - 'strand_check/{sample}/{sample}.strandedness.log' + "strand_check/{sample}/{sample}.strandedness.log", threads: 6 resources: mem_mb=gb(8), - runtime=autobump(hours=2) + runtime=autobump(hours=2), run: prefix = os.path.commonprefix(input.index).rstrip(".") nreads = int(1e5 * 4) if is_paired: - shell('set +o pipefail; zcat 
{input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') - shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}') - fastqs = f'-1 {output.fastqs[0]} -2 {output.fastqs[1]} ' + shell( + "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" + ) + shell( + "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}" + ) + fastqs = f"-1 {output.fastqs[0]} -2 {output.fastqs[1]} " else: - shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') - fastqs = f'-U {output.fastqs[0]} ' + shell( + "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" + ) + fastqs = f"-U {output.fastqs[0]} " shell( "bowtie2 " "-x {prefix} " "{fastqs} " - '--no-unal ' + "--no-unal " "--threads {threads} 2> {log} " "| samtools view -Sb - " "| samtools sort - -o {output.bam} " ) shell("samtools index {output.bam}") shell( - 'infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}' + "infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}" ) rule strand_check: input: - expand('strand_check/{sample}/{sample}.strandedness', sample=SAMPLES) + expand("strand_check/{sample}/{sample}.strandedness", sample=SAMPLES), output: - html='strand_check/strandedness.html', - filelist=temporary('strand_check/filelist') + html="strand_check/strandedness.html", + filelist=temporary("strand_check/filelist"), log: - 'strand_check/strandedness.log' + "strand_check/strandedness.log", resources: mem_mb=gb(1), - runtime=autobump(hours=2) + runtime=autobump(hours=2), run: - with open(output.filelist, 'w') as fout: - for i in input: - fout.write(i + '\n') + with open(output.filelist, "w") as fout: + for i in input: + fout.write(i + "\n") shell( - 'multiqc ' - '--force ' - '--module rseqc ' - '--file-list {output.filelist} ' - '--filename {output.html} &> {log}' + "multiqc " + "--force " + "--module rseqc " + "--file-list {output.filelist} " + "--filename 
{output.html} &> {log}" ) + rule cutadapt: input: - fastq=expand('data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz', n=n) + fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), output: - fastq=expand('data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.cutadapt.fastq.gz', n=n) + fastq=expand( + "data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.cutadapt.fastq.gz", n=n + ), log: "data/rnaseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", threads: 6 @@ -210,36 +235,35 @@ rule cutadapt: rule fastqc: input: - 'data/rnaseq_samples/{sample}/{sample}{suffix}' - threads: - 1 + "data/rnaseq_samples/{sample}/{sample}{suffix}", + threads: 1 output: - html='data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.html', - zip='data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.zip', + html="data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.html", + zip="data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.zip", resources: mem_mb=gb(8), - runtime=autobump(hours=2) + runtime=autobump(hours=2), log: - 'data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.log', + "data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.log", run: outdir = os.path.dirname(output.html) or "." 
shell( - 'fastqc ' - '--noextract ' - '--quiet ' - '--outdir {outdir} ' - '{input} ' - '2> {log} ' + "fastqc " + "--noextract " + "--quiet " + "--outdir {outdir} " + "{input} " + "2> {log} " ) outfile = os.path.basename(input[0]) - for s in ['.fastq', '.fq', '.gz', '.bam']: - outfile = outfile.replace(s, '') - out_zip = os.path.join(outdir, outfile + '_fastqc.zip') + for s in [".fastq", ".fq", ".gz", ".bam"]: + outfile = outfile.replace(s, "") + out_zip = os.path.join(outdir, outfile + "_fastqc.zip") if not os.path.abspath(out_zip) == os.path.abspath(output.zip): - shell('mv {out_zip} {output.zip}') - out_html = os.path.join(outdir, outfile + '_fastqc.html') + shell("mv {out_zip} {output.zip}") + out_html = os.path.join(outdir, outfile + "_fastqc.html") if not os.path.abspath(out_html) == os.path.abspath(output.html): - shell('mv {out_html} {output.html}') + shell("mv {out_html} {output.html}") if config["aligner"] == "hisat2": @@ -249,9 +273,9 @@ if config["aligner"] == "hisat2": fastq=rules.cutadapt.output, index=rules.hisat2_index.output, output: - bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam") + bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), log: - "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log" + "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log", threads: 16 resources: mem_mb=gb(32), @@ -287,6 +311,7 @@ if config["aligner"] == "hisat2": ) + if config["aligner"].startswith("star"): if os.getenv("TMPDIR"): tmpdir_arg = "--outTmpDir $TMPDIR/star " @@ -332,9 +357,11 @@ if config["aligner"] == "star": annotation=f"{REFERENCES}/annotation.gtf", output: bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), - sjout=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.star.SJ.out.tab") + sjout=temporary( + "data/rnaseq_samples/{sample}/{sample}.cutadapt.star.SJ.out.tab" + ), log: - "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log" + 
"data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log", threads: 16 resources: mem_mb=gb(64), @@ -371,9 +398,11 @@ if config["aligner"] == "star-twopass": index=rules.star_index.output, annotation=f"{REFERENCES}/annotation.gtf", output: - sjout=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.star.SJ.out.tab") + sjout=temporary( + "data/rnaseq_samples/{sample}/{sample}.cutadapt.star.SJ.out.tab" + ), log: - "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass1.log" + "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass1.log", threads: 16 resources: mem_mb=gb(64), @@ -412,12 +441,14 @@ if config["aligner"] == "star-twopass": fastq=rules.cutadapt.output, index=rules.star_index.output, annotation=f"{REFERENCES}/annotation.gtf", - sjout=expand(rules.star_pass1.output, sample=SAMPLES) + sjout=expand(rules.star_pass1.output, sample=SAMPLES), output: bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), - sjout=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass2.SJ.out.tab") + sjout=temporary( + "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass2.SJ.out.tab" + ), log: - "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass2.log" + "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass2.log", threads: 16 resources: mem_mb=gb(64), @@ -455,7 +486,7 @@ if config["aligner"] == "star-twopass": rule rRNA: input: - fastq='data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz', + fastq="data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz", index=multiext( f"{REFERENCES}/bowtie2/rrna", ".1.bt2", @@ -467,9 +498,9 @@ rule rRNA: ".fa", ), output: - bam='data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam', + bam="data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam", log: - 'data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.log' + "data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.log", threads: 6 resources: mem_mb=gb(2), @@ -541,12 +572,12 @@ rule bam_index: rule 
markduplicates: input: - bam='data/rnaseq_samples/{sample}/{sample}.cutadapt.bam' + bam="data/rnaseq_samples/{sample}/{sample}.cutadapt.bam", output: - bam='data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam', - metrics='data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.metrics' + bam="data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam", + metrics="data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.metrics", log: - 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.log' + "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.log", threads: 1 resources: mem_mb=gb(32), @@ -565,14 +596,15 @@ rule markduplicates: "VALIDATION_STRINGENCY=LENIENT " "&> {log}" + rule featurecounts: input: annotation=rules.gtf.output, - bam=rules.markduplicates.output.bam + bam=rules.markduplicates.output.bam, output: - 'data/rnaseq_samples/{sample}/{sample}_featurecounts.txt' + "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt", log: - 'data/rnaseq_samples/{sample}/{sample}_featurecounts.txt.log' + "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt.log", threads: 8 resources: mem_mb=gb(16), @@ -599,22 +631,24 @@ rule featurecounts: "&> {log}" ) + rule aggregate_featurecounts: input: - expand('data/rnaseq_samples/{sample}/{sample}_featurecounts.txt', sample=SAMPLES) + expand( + "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt", sample=SAMPLES + ), output: - 'data/rnaseq_aggregation/featurecounts.txt' - log: - 'data/rnaseq_aggregation/featurecounts.txt.log' - threads: - 1 + "data/rnaseq_aggregation/featurecounts.txt", + log: + "data/rnaseq_aggregation/featurecounts.txt.log", + threads: 1 resources: mem_mb=gb(8), - runtime=autobump(hours=1) + runtime=autobump(hours=1), run: for i, file in enumerate(input): df = pd.read_csv(file, sep="\t", comment="#") - df = df.set_index('Geneid', drop=False) + df = df.set_index("Geneid", drop=False) if i == 0: final = df continue @@ -624,11 +658,17 @@ rule aggregate_featurecounts: 
rule rrna_libsizes_table: input: - rrna=expand('data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.libsize', sample=SAMPLES), - fastq=expand('data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize', sample=SAMPLES), + rrna=expand( + "data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.libsize", + sample=SAMPLES, + ), + fastq=expand( + "data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize", + sample=SAMPLES, + ), output: - tsv='data/rnaseq_aggregation/rrna_percentages_table.tsv', - json='data/rnaseq_aggregation/rrna_percentages_table_mqc.yaml', + tsv="data/rnaseq_aggregation/rrna_percentages_table.tsv", + json="data/rnaseq_aggregation/rrna_percentages_table_mqc.yaml", threads: 1 resources: mem_mb=gb(2), @@ -637,24 +677,20 @@ rule rrna_libsizes_table: "../../scripts/rrna_libsizes_table.py" - - - - rule collectrnaseqmetrics: input: bam=rules.markduplicates.output.bam, refflat=rules.conversion_refflat.output, output: - metrics='data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics' + metrics="data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics", log: - 'data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics.log' + "data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics.log", threads: 1 resources: mem_mb=gb(32), runtime=autobump(hours=2), params: - java_args="-Xmx20g", # [disable for test] + java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g', # [enable for test] strand_arg={ "unstranded": "STRAND=NONE ", @@ -677,11 +713,11 @@ rule collectrnaseqmetrics: rule preseq: input: - bam='data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam', + bam="data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam", output: - 'data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt' + "data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt", log: - 'data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt.log' + 
"data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt.log", threads: 1 resources: mem_mb=gb(1), @@ -699,9 +735,9 @@ rule salmon: fastq=rules.cutadapt.output, index=REFERENCES + "/salmon/versionInfo.json", output: - 'data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf' + "data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf", log: - 'data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf.log' + "data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf.log", threads: 6 resources: mem_mb=gb(32), @@ -736,9 +772,9 @@ rule kallisto: fastq=rules.cutadapt.output, index=REFERENCES + "/kallisto/transcripts.idx", output: - 'data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5' + "data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5", log: - 'data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5.log' + "data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5.log", threads: 8 resources: mem_mb=gb(32), @@ -775,9 +811,9 @@ rule rseqc_infer_experiment: bam=rules.markduplicates.output, bed12=rules.conversion_bed12.output, output: - 'data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt', + "data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt", log: - 'data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt.log' + "data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt.log", resources: mem_mb=gb(2), runtime=autobump(hours=2), @@ -790,9 +826,9 @@ rule rseqc_read_distribution: bam=rules.markduplicates.output, bed12=rules.conversion_bed12.output, output: - 'data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt', + "data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt", log: - 'data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt.log' + "data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt.log", resources: mem_mb=gb(2), runtime=autobump(hours=2), @@ -805,9 +841,9 @@ rule samtools_idxstats: bam=rules.markduplicates.output.bam, bai=rules.markduplicates.output.bam + 
".bai", output: - 'data/rnaseq_samples/{sample}/idxstat_{sample}.txt' + "data/rnaseq_samples/{sample}/idxstat_{sample}.txt", log: - 'data/rnaseq_samples/{sample}/idxstat_{sample}.txt.log' + "data/rnaseq_samples/{sample}/idxstat_{sample}.txt.log", resources: mem_mb=gb(16), runtime=autobump(hours=2), @@ -820,9 +856,9 @@ rule samtools_flagstat: bam=rules.markduplicates.output.bam, bai=rules.markduplicates.output.bam + ".bai", output: - 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat' + "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat", log: - 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat.log' + "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat.log", shell: "samtools flagstat {input.bam} > {output}" @@ -832,9 +868,9 @@ rule samtools_stats: bam=rules.markduplicates.output.bam, bai=rules.markduplicates.output.bam + ".bai", output: - 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats' + "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats", log: - 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats.log' + "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats.log", shell: "samtools stats {input.bam} > {output}" @@ -844,13 +880,13 @@ rule bigwig_neg: bam=rules.markduplicates.output.bam, bai=rules.markduplicates.output.bam + ".bai", output: - 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig' + "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig", threads: 8 resources: mem_mb=gb(16), runtime=autobump(hours=2), log: - 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig.log' + "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig.log", params: strand_arg={ "unstranded": "", @@ -879,13 +915,13 @@ rule bigwig_pos: bam=rules.markduplicates.output.bam, bai=rules.markduplicates.output.bam + ".bai", output: - 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig' + 
"data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig", threads: 8 resources: mem_mb=gb(16), runtime=autobump(hours=2), log: - 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig.log' + "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig.log", params: strand_arg={ "unstranded": "", @@ -908,10 +944,15 @@ rule bigwig_pos: "&> {log}" ) + rule multiqc: input: files=( - expand(rules.fastqc.output.zip, sample=SAMPLES, suffix=["_R1.fastq.gz", "_R1.cutadapt.fastq.gz", ".cutadapt.bam"]), + expand( + rules.fastqc.output.zip, + sample=SAMPLES, + suffix=["_R1.fastq.gz", "_R1.cutadapt.fastq.gz", ".cutadapt.bam"], + ), expand(rules.markduplicates.output, sample=SAMPLES), expand(rules.salmon.output, sample=SAMPLES), expand(rules.kallisto.output, sample=SAMPLES), @@ -935,7 +976,7 @@ rule multiqc: resources: mem_mb=gb(2), runtime=autobump(hours=2), - disk_mb=gb(10) + disk_mb=gb(10), run: analysis_directory = set([os.path.dirname(i) for i in input]) outdir = os.path.dirname(output[0]) @@ -951,4 +992,3 @@ rule multiqc: "{analysis_directory} " "&> {log} " ) - From da2fc328078b22877a3c5f24e087c05a6d66b584 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 19 Jan 2025 13:03:50 -0500 Subject: [PATCH 067/196] rrna_libsizes_table script avoids utils --- scripts/rrna_libsizes_table.py | 22 ++++++++++++++++------ workflows/rnaseq/Snakefile | 3 +++ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/scripts/rrna_libsizes_table.py b/scripts/rrna_libsizes_table.py index f71d48bc..ea2b6820 100644 --- a/scripts/rrna_libsizes_table.py +++ b/scripts/rrna_libsizes_table.py @@ -2,21 +2,31 @@ Prepares a TSV and JSON file for multiqc to pick up and display as a sortable table """ -import sys import os +import re import pandas as pd import yaml - -sys.path.insert(0, os.path.dirname(__file__) + "/..") -from lib import utils +from snakemake.io import regex_from_filepattern def rrna_sample(f): - return 
utils.extract_wildcards(snakemake.config["patterns"]["rrna"]["libsize"], f)["sample"] + m = re.compile( + regex_from_filepattern( + snakemake.params.rrna_pattern, + ) + ).match(f) + if m: + return m.groupdict()["sample"] def sample(f): - return utils.extract_wildcards(snakemake.config["patterns"]["libsizes"]["cutadapt"], f)["sample"] + m = re.compile( + regex_from_filepattern( + snakemake.params.fastq_pattern, + ) + ).match(f) + if m: + return m.groupdict()["sample"] def million(f): diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index bf87a234..3f2e5b90 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -670,6 +670,9 @@ rule rrna_libsizes_table: tsv="data/rnaseq_aggregation/rrna_percentages_table.tsv", json="data/rnaseq_aggregation/rrna_percentages_table_mqc.yaml", threads: 1 + params: + rrna_pattern=lambda wc: "data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.libsize", + fastq_pattern=lambda wc: "data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize", resources: mem_mb=gb(2), runtime=autobump(hours=2), From b049ef6e8371cf1961f5ad73ecccf5b01d97c074 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 19 Jan 2025 13:08:20 -0500 Subject: [PATCH 068/196] use mem and disk rather than mem_mb and disk_mb --- workflows/rnaseq/Snakefile | 131 +++++++++++++++++++------------------ 1 file changed, 68 insertions(+), 63 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 3f2e5b90..7247bbc2 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -5,7 +5,6 @@ import pandas as pd sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") from lib import utils -from lib.utils import autobump, gb, hours configfile: "config/config.yaml" @@ -62,9 +61,9 @@ if utils.detect_sra(sampletable): is_paired=is_paired, # extra="-X 100000", # [enable for test] resources: - mem_mb=gb(1), - disk_mb=autobump(gb=1), - runtime=autobump(hours=2), + mem="1g", + 
disk="1g", + runtime="2h", run: srr = sampletable.loc[wildcards.sample, "Run"] extra = params.get("extra", "") @@ -91,8 +90,8 @@ rule symlinks: expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), threads: 1 resources: - mem_mb=100, - runtime=10, + mem="1g", + runtime="10m", run: assert len(output) == len(input), (input, output) for src, linkname in zip(input, output): @@ -128,8 +127,8 @@ rule sample_strand_check: "strand_check/{sample}/{sample}.strandedness.log", threads: 6 resources: - mem_mb=gb(8), - runtime=autobump(hours=2), + mem="8g", + runtime="2h", run: prefix = os.path.commonprefix(input.index).rstrip(".") nreads = int(1e5 * 4) @@ -170,8 +169,8 @@ rule strand_check: log: "strand_check/strandedness.log", resources: - mem_mb=gb(1), - runtime=autobump(hours=2), + mem="1g", + runtime="2h", run: with open(output.filelist, "w") as fout: for i in input: @@ -196,8 +195,8 @@ rule cutadapt: "data/rnaseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", threads: 6 resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", params: extra=( ( @@ -241,8 +240,8 @@ rule fastqc: html="data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.html", zip="data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.zip", resources: - mem_mb=gb(8), - runtime=autobump(hours=2), + mem="8g", + runtime="2h", log: "data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.log", run: @@ -278,8 +277,8 @@ if config["aligner"] == "hisat2": "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log", threads: 16 resources: - mem_mb=gb(32), - runtime=autobump(hours=8), + mem="32g", + runtime="8h", params: extra="", run: @@ -364,9 +363,9 @@ if config["aligner"] == "star": "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log", threads: 16 resources: - mem_mb=gb(64), - runtime=autobump(hours=8), - disk_mb=gb(80), + mem="64g", + runtime="8h", + disk="80g", params: extra=STAR_PARAMS, run: @@ -405,9 +404,9 @@ if config["aligner"] == 
"star-twopass": "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass1.log", threads: 16 resources: - mem_mb=gb(64), - runtime=autobump(hours=8), - disk_mb=gb(80), + mem="64g", + runtime="8h", + disk="80g", params: extra=STAR_PARAMS, run: @@ -451,9 +450,9 @@ if config["aligner"] == "star-twopass": "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass2.log", threads: 16 resources: - mem_mb=gb(64), - runtime=autobump(hours=8), - disk_mb=gb(80), + mem="64g", + runtime="8h", + disk="80g", params: extra=STAR_PARAMS, run: @@ -503,8 +502,8 @@ rule rRNA: "data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.log", threads: 6 resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", params: extra=( "-k 1 " @@ -538,8 +537,8 @@ rule fastq_count: "{sample_dir}/{sample}/{sample}{suffix}.fastq.gz.libsize", threads: 1 resources: - mem_mb=gb(1), - runtime=autobump(hours=2), + mem="1g", + runtime="2h", shell: "zcat {input} | echo $((`wc -l`/4)) > {output}" @@ -551,8 +550,8 @@ rule bam_count: "{sample_dir}/{sample}/{suffix}.bam.libsize", threads: 1 resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", shell: "samtools view -c {input} > {output}" @@ -564,8 +563,8 @@ rule bam_index: bai="{prefix}.bam.bai", threads: 1 resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", shell: "samtools index {input} {output}" @@ -580,9 +579,9 @@ rule markduplicates: "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.log", threads: 1 resources: - mem_mb=gb(32), - runtime=autobump(hours=2), - disk_mb=autobump(gb=100), + mem="32g", + runtime="2h", + disk="100g", params: java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g' # [enable for test] @@ -607,8 +606,8 @@ rule featurecounts: "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt.log", threads: 8 resources: - mem_mb=gb(16), - runtime=autobump(hours=2), + mem="16g", + runtime="2h", params: strand_arg={ "unstranded": "-s0 ", @@ 
-643,8 +642,8 @@ rule aggregate_featurecounts: "data/rnaseq_aggregation/featurecounts.txt.log", threads: 1 resources: - mem_mb=gb(8), - runtime=autobump(hours=1), + mem="8g", + runtime="1h" run: for i, file in enumerate(input): df = pd.read_csv(file, sep="\t", comment="#") @@ -674,8 +673,8 @@ rule rrna_libsizes_table: rrna_pattern=lambda wc: "data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.libsize", fastq_pattern=lambda wc: "data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize", resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", script: "../../scripts/rrna_libsizes_table.py" @@ -690,8 +689,8 @@ rule collectrnaseqmetrics: "data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics.log", threads: 1 resources: - mem_mb=gb(32), - runtime=autobump(hours=2), + mem="32g", + runtime="2h", params: java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g', # [enable for test] @@ -723,8 +722,8 @@ rule preseq: "data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt.log", threads: 1 resources: - mem_mb=gb(1), - runtime=autobump(hours=2), + mem="1g", + runtime="2h", shell: "preseq " "c_curve " @@ -743,8 +742,8 @@ rule salmon: "data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf.log", threads: 6 resources: - mem_mb=gb(32), - runtime=autobump(hours=2), + mem="32g", + runtime="2h", params: extra=( "--libType=A " @@ -780,8 +779,8 @@ rule kallisto: "data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5.log", threads: 8 resources: - mem_mb=gb(32), - runtime=autobump(hours=2), + mem="32g", + runtime="2h", params: strand_arg={ "unstranded": "", @@ -818,8 +817,8 @@ rule rseqc_infer_experiment: log: "data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt.log", resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", shell: "infer_experiment.py -r {input.bed12} -i {input.bam} > {output} &> {log}" @@ -833,8 +832,8 @@ rule rseqc_read_distribution: log: 
"data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt.log", resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", shell: "read_distribution.py -i {input.bam} -r {input.bed12} > {output} &> {log}" @@ -848,8 +847,8 @@ rule samtools_idxstats: log: "data/rnaseq_samples/{sample}/idxstat_{sample}.txt.log", resources: - mem_mb=gb(16), - runtime=autobump(hours=2), + mem="16g", + runtime="2h", shell: "samtools idxstats {input.bam} 2> {log} 1> {output}" @@ -862,6 +861,9 @@ rule samtools_flagstat: "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat", log: "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat.log", + resources: + mem="8g", + runtime="2h", shell: "samtools flagstat {input.bam} > {output}" @@ -874,6 +876,9 @@ rule samtools_stats: "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats", log: "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats.log", + resources: + mem="8g", + runtime="2h", shell: "samtools stats {input.bam} > {output}" @@ -886,8 +891,8 @@ rule bigwig_neg: "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig", threads: 8 resources: - mem_mb=gb(16), - runtime=autobump(hours=2), + mem="16g", + runtime="2h", log: "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig.log", params: @@ -921,8 +926,8 @@ rule bigwig_pos: "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig", threads: 8 resources: - mem_mb=gb(16), - runtime=autobump(hours=2), + mem="16g", + runtime="2h", log: "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig.log", params: @@ -977,9 +982,9 @@ rule multiqc: "data/rnaseq_aggregation/multiqc.log", threads: 1 resources: - mem_mb=gb(2), - runtime=autobump(hours=2), - disk_mb=gb(10), + mem="2g", + runtime="2h", + disk="10g", run: analysis_directory = set([os.path.dirname(i) for i in input]) outdir = os.path.dirname(output[0]) From 650e60ff39eb94042aee8a3779954c260f67a292 Mon Sep 17 00:00:00 2001 From: 
Ryan Dale Date: Sun, 19 Jan 2025 13:12:59 -0500 Subject: [PATCH 069/196] convert to mem and disk in references --- workflows/references/Snakefile | 56 +++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile index d6dcf759..682f1bfe 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -3,7 +3,6 @@ import sys import pandas sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") -from lib.utils import autobump, gb, hours from lib import utils REFERENCES = config.get("reference_dir", "../../references") @@ -18,6 +17,9 @@ rule fasta: temporary(f"{REFERENCES}/genome.fa.gz"), log: f"{REFERENCES}/logs/genome.fa.gz.log", + resources: + mem_mb="4g", + runtime="2h", run: utils.download_and_postprocess( urls=config["fasta"]["url"], @@ -32,6 +34,9 @@ rule gtf: temporary(f"{REFERENCES}/annotation.gtf.gz"), log: f"{REFERENCES}/logs/annotation.gtf.gz.log", + resources: + mem="4g", + runtime="2h", run: utils.download_and_postprocess( urls=config["gtf"]["url"], @@ -46,6 +51,9 @@ rule rrna: temporary(f"{REFERENCES}/rrna.fa.gz"), log: f"{REFERENCES}/logs/rrna.fa.gz.log", + resources: + mem="4g", + runtime="2h", run: utils.download_and_postprocess( urls=config["rrna"]["url"], @@ -60,6 +68,9 @@ rule unzip: f"{REFERENCES}/{{prefix}}.gz", output: f"{REFERENCES}/{{prefix}}", + resources: + mem="4g", + runtime="2h", shell: "gunzip -c {input} > {output}" @@ -81,9 +92,9 @@ rule bowtie2_index: log: f"{REFERENCES}/logs/bowtie2_{{label}}.log", resources: - runtime=autobump(hours=8), - mem_mb=autobump(gb=32), - disk_mb=autobump(gb=50), + mem="32g", + disk="50g", + runtime="8h", threads: 8 run: index = os.path.commonprefix(output).rstrip(".") @@ -101,8 +112,8 @@ rule star_index: f"{REFERENCES}/logs/star.log", threads: 8 resources: - runtime=autobump(hours=8), - mem_mb=gb(64), + mem="64g", + runtime="8h", run: genomedir = os.path.dirname(output[0]) 
shell("rm -r {genomedir}") @@ -148,9 +159,9 @@ rule hisat2_index: log: f"{REFERENCES}/logs/hisat2.log", resources: - runtime=autobump(hours=8), - mem_mb=autobump(gb=32), - disk_mb=autobump(gb=50), + mem="32g", + disk="50g", + runtime="8h", threads: 8 run: index = os.path.commonprefix(output).rstrip(".") @@ -165,7 +176,8 @@ rule transcriptome_fasta: output: f"{REFERENCES}/transcriptome.fa", resources: - runtime=hours(1), + mem="4g", + runtime="2h", shell: "gffread {input.gtf} -w {output} -g {input.fasta}" @@ -180,8 +192,8 @@ rule salmon_index: params: outdir=f"{REFERENCES}/salmon", resources: - mem_mb=gb(32), - runtime=hours(2), + mem="32g", + runtime="2h", run: outdir = os.path.dirname(output[0]) shell("salmon index " "--transcripts {input} " "--index {outdir} " "&> {log}") @@ -195,8 +207,8 @@ rule kallisto_index: log: f"{REFERENCES}/logs/kallisto.log", resources: - runtime=hours(2), - mem_mb=gb(32), + mem="32g", + runtime="2h", shell: "kallisto index " "--index {output} " @@ -212,8 +224,8 @@ rule conversion_refflat: log: f"{REFERENCES}/logs/annotation.refflat.log", resources: - runtime=hours(2), - mem_mb=gb(2), + mem="2g", + runtime="2h", shell: "gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp " """&& awk '{{print $1"\t"$0}}' {output}.tmp > {output} """ @@ -226,8 +238,8 @@ rule conversion_bed12: output: f"{REFERENCES}/annotation.bed12", resources: - runtime=hours(2), - mem_mb=gb(2), + mem="2g", + runtime="2h", shell: "gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp " "&& genePredToBed {output}.tmp {output} " @@ -247,8 +259,8 @@ rule chromsizes: java_args="-Xmx20g", # java_args='-Xmx2g' # [TEST SETTINGS -1] resources: - mem_mb=gb(24), - runtime=hours(2), + mem="24g", + runtime="2h", shell: "export LC_COLLATE=C; " "rm -f {output}.tmp " @@ -275,8 +287,8 @@ rule mappings: output[0] ].get("include_featuretypes", []), resources: - runtime=hours(2), - mem_mb=gb(2), + mem="2g", + runtime="2h", run: import gffutils From 
d5db4a56c864242d7d2cb698a479e0edea4becf7 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 19 Jan 2025 13:35:52 -0500 Subject: [PATCH 070/196] spell out params fully in wrapper --- include/WRAPPER_SLURM | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/WRAPPER_SLURM b/include/WRAPPER_SLURM index b2a2ffd4..2c92f7ae 100755 --- a/include/WRAPPER_SLURM +++ b/include/WRAPPER_SLURM @@ -26,13 +26,11 @@ fi # Run snakemake ( time snakemake \ - -p \ + --printshellcmds \ --directory $PWD \ - -k \ - --restart-times 3 \ + --keep-going \ --rerun-incomplete \ --jobname "s.{rulename}.{jobid}.sh" \ - -j 999 \ --use-conda \ --configfile config/config.yaml \ $PROFILE_CMD \ From b3a7d94f76b38beca89f6ee4f724d434b5ed1937 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 19 Jan 2025 13:36:05 -0500 Subject: [PATCH 071/196] timestamped log file for slurm wrapper --- include/WRAPPER_SLURM | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/WRAPPER_SLURM b/include/WRAPPER_SLURM index 2c92f7ae..9f7f1344 100755 --- a/include/WRAPPER_SLURM +++ b/include/WRAPPER_SLURM @@ -19,11 +19,13 @@ if [ -z "$LCDBWF_SNAKEMAKE_PROFILE" ]; then PROFILE_CMD="--profile $SNAKEMAKE_PROFILE" fi else -# LCDBWF_SNAKEMAKE_PROFILE found, this takes priority if both profile variables are set +# LCDBWF_SNAKEMAKE_PROFILE takes priority if both profile variables are set PROFILE_CMD="--profile $LCDBWF_SNAKEMAKE_PROFILE" fi -# Run snakemake +# Timestamped log file +LOGFILE="Snakefile_$(date +"%Y-%m-%d_%H%M").log" + ( time snakemake \ --printshellcmds \ @@ -35,7 +37,7 @@ fi --configfile config/config.yaml \ $PROFILE_CMD \ "$@" - ) > "Snakefile.log" 2>&1 + ) > "$LOGFILE" 2>&1 SNAKE_PID=$! 
From aa437be9adcd8344a2e5911ea1cbcf274f98be96 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 20 Jan 2025 10:15:29 -0500 Subject: [PATCH 072/196] rm wrappers --- wrappers/.gitignore | 5 - wrappers/LICENSE | 21 -- wrappers/README.md | 1 - wrappers/test/conftest.py | 10 - wrappers/test/raw_data_fixtures.py | 180 ------------ wrappers/test/test_atropos.py | 156 ----------- wrappers/test/test_bowtie2.py | 95 ------- wrappers/test/test_cutadapt.py | 151 ----------- wrappers/test/test_deeptools.py | 37 --- wrappers/test/test_demo.py | 159 ----------- wrappers/test/test_dupradar.py | 49 ---- wrappers/test/test_fastq_screen.py | 36 --- wrappers/test/test_fastqc.py | 70 ----- wrappers/test/test_featurecounts.py | 59 ---- wrappers/test/test_hisat2.py | 120 -------- wrappers/test/test_kallisto.py | 69 ----- wrappers/test/test_multiqc.py | 48 ---- wrappers/test/test_picard.py | 116 -------- wrappers/test/test_rseqc.py | 151 ----------- wrappers/test/test_salmon.py | 83 ------ wrappers/test/test_samtools.py | 12 - wrappers/test/utils.py | 152 ----------- wrappers/test_toy.py | 100 ------- wrappers/wrappers/atropos/README.md | 167 ------------ wrappers/wrappers/atropos/environment.yaml | 4 - wrappers/wrappers/atropos/wrapper.py | 80 ------ wrappers/wrappers/average-bigwigs/README.md | 75 ----- .../wrappers/average-bigwigs/environment.yaml | 5 - wrappers/wrappers/average-bigwigs/wrapper.py | 32 --- .../wrappers/combos/merge_and_dedup/README.md | 66 ----- .../combos/merge_and_dedup/environment.yaml | 7 - wrappers/wrappers/demo/README.md | 69 ----- wrappers/wrappers/demo/environment.yaml | 4 - wrappers/wrappers/demo/wrapper.py | 27 -- wrappers/wrappers/dupradar/README.md | 83 ------ wrappers/wrappers/dupradar/environment.yaml | 10 - wrappers/wrappers/dupradar/wrapper.py | 94 ------- wrappers/wrappers/epic2/environment.yaml | 8 - wrappers/wrappers/fastq-dump/environment.yaml | 5 - wrappers/wrappers/fastq-dump/wrapper.py | 41 --- wrappers/wrappers/fastq_screen/README.md | 61 
----- .../wrappers/fastq_screen/environment.yaml | 7 - wrappers/wrappers/fastq_screen/wrapper.py | 72 ----- wrappers/wrappers/fastqc/README.md | 32 --- wrappers/wrappers/fastqc/environment.yaml | 9 - wrappers/wrappers/fastqc/wrapper.py | 48 ---- wrappers/wrappers/macs2/callpeak/README.md | 61 ----- .../wrappers/macs2/callpeak/environment.yaml | 8 - wrappers/wrappers/sicer/README.md | 59 ---- wrappers/wrappers/sicer/environment.yaml | 10 - wrappers/wrappers/sicer/wrapper.py | 147 ---------- wrappers/wrappers/spp/README.md | 175 ------------ wrappers/wrappers/spp/environment.yaml | 11 - wrappers/wrappers/spp/wrapper.py | 256 ------------------ 54 files changed, 3613 deletions(-) delete mode 100644 wrappers/.gitignore delete mode 100644 wrappers/LICENSE delete mode 100644 wrappers/README.md delete mode 100644 wrappers/test/conftest.py delete mode 100644 wrappers/test/raw_data_fixtures.py delete mode 100644 wrappers/test/test_atropos.py delete mode 100644 wrappers/test/test_bowtie2.py delete mode 100644 wrappers/test/test_cutadapt.py delete mode 100644 wrappers/test/test_deeptools.py delete mode 100644 wrappers/test/test_demo.py delete mode 100644 wrappers/test/test_dupradar.py delete mode 100644 wrappers/test/test_fastq_screen.py delete mode 100644 wrappers/test/test_fastqc.py delete mode 100644 wrappers/test/test_featurecounts.py delete mode 100644 wrappers/test/test_hisat2.py delete mode 100644 wrappers/test/test_kallisto.py delete mode 100644 wrappers/test/test_multiqc.py delete mode 100644 wrappers/test/test_picard.py delete mode 100644 wrappers/test/test_rseqc.py delete mode 100644 wrappers/test/test_salmon.py delete mode 100644 wrappers/test/test_samtools.py delete mode 100644 wrappers/test/utils.py delete mode 100644 wrappers/test_toy.py delete mode 100644 wrappers/wrappers/atropos/README.md delete mode 100644 wrappers/wrappers/atropos/environment.yaml delete mode 100644 wrappers/wrappers/atropos/wrapper.py delete mode 100644 
wrappers/wrappers/average-bigwigs/README.md delete mode 100644 wrappers/wrappers/average-bigwigs/environment.yaml delete mode 100644 wrappers/wrappers/average-bigwigs/wrapper.py delete mode 100644 wrappers/wrappers/combos/merge_and_dedup/README.md delete mode 100644 wrappers/wrappers/combos/merge_and_dedup/environment.yaml delete mode 100644 wrappers/wrappers/demo/README.md delete mode 100644 wrappers/wrappers/demo/environment.yaml delete mode 100644 wrappers/wrappers/demo/wrapper.py delete mode 100644 wrappers/wrappers/dupradar/README.md delete mode 100644 wrappers/wrappers/dupradar/environment.yaml delete mode 100644 wrappers/wrappers/dupradar/wrapper.py delete mode 100644 wrappers/wrappers/epic2/environment.yaml delete mode 100644 wrappers/wrappers/fastq-dump/environment.yaml delete mode 100644 wrappers/wrappers/fastq-dump/wrapper.py delete mode 100644 wrappers/wrappers/fastq_screen/README.md delete mode 100644 wrappers/wrappers/fastq_screen/environment.yaml delete mode 100644 wrappers/wrappers/fastq_screen/wrapper.py delete mode 100644 wrappers/wrappers/fastqc/README.md delete mode 100644 wrappers/wrappers/fastqc/environment.yaml delete mode 100644 wrappers/wrappers/fastqc/wrapper.py delete mode 100644 wrappers/wrappers/macs2/callpeak/README.md delete mode 100644 wrappers/wrappers/macs2/callpeak/environment.yaml delete mode 100644 wrappers/wrappers/sicer/README.md delete mode 100644 wrappers/wrappers/sicer/environment.yaml delete mode 100644 wrappers/wrappers/sicer/wrapper.py delete mode 100644 wrappers/wrappers/spp/README.md delete mode 100644 wrappers/wrappers/spp/environment.yaml delete mode 100644 wrappers/wrappers/spp/wrapper.py diff --git a/wrappers/.gitignore b/wrappers/.gitignore deleted file mode 100644 index ede3cdda..00000000 --- a/wrappers/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -.test* -__pycache__ -.snakemake -.cache -**.snakemake* diff --git a/wrappers/LICENSE b/wrappers/LICENSE deleted file mode 100644 index 17b3ab77..00000000 --- 
a/wrappers/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2016 lcdb - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
diff --git a/wrappers/README.md b/wrappers/README.md deleted file mode 100644 index 79d134e9..00000000 --- a/wrappers/README.md +++ /dev/null @@ -1 +0,0 @@ -See documentation at http://lcdb-wf.readthedocs.io/en/latest/wrappers.html diff --git a/wrappers/test/conftest.py b/wrappers/test/conftest.py deleted file mode 100644 index d346905e..00000000 --- a/wrappers/test/conftest.py +++ /dev/null @@ -1,10 +0,0 @@ -import os -import pytest -import tempfile -import shutil -import inspect -from snakemake.shell import shell -from snakemake.utils import makedirs -from lcdblib.snakemake import aligners - -from raw_data_fixtures import * diff --git a/wrappers/test/raw_data_fixtures.py b/wrappers/test/raw_data_fixtures.py deleted file mode 100644 index c19f8601..00000000 --- a/wrappers/test/raw_data_fixtures.py +++ /dev/null @@ -1,180 +0,0 @@ -""" -Fixtures used for downloading data from the test data repo -""" - -import os -import pytest -from utils import tmpdir_for_func, _download_file, symlink_in_tempdir, run, dpath - -# ---------------------------------------------------------------------------- -# FASTQ files -@pytest.fixture(scope='session') -def sample1_se_fq(tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - fn = 'rnaseq_samples/sample1/sample1.small_R1.fastq.gz' - return _download_file(fn, d) - -@pytest.fixture(scope='session') -def sample1_se_tiny_fq(tmpdir_factory): - """ - Single-end FASTQ file with 1010 reads - """ - d = tmpdir_for_func(tmpdir_factory) - fn = 'rnaseq_samples/sample1/sample1.tiny_R1.fastq.gz' - return _download_file(fn, d) - -@pytest.fixture(scope='session') -def sample1_pe_fq(tmpdir_factory): - pair = [] - d = tmpdir_for_func(tmpdir_factory) - for fn in [ - 'rnaseq_samples/sample1/sample1.small_R1.fastq.gz', - 'rnaseq_samples/sample1/sample1.small_R2.fastq.gz' - ]: - pair.append(_download_file(fn, d)) - return pair - -@pytest.fixture(scope='session') -def sample1_pe_tiny_fq(tmpdir_factory): - pair = [] - d = tmpdir_for_func(tmpdir_factory) - 
for fn in [ - 'rnaseq_samples/sample1/sample1.tiny_R1.fastq.gz', - 'rnaseq_samples/sample1/sample1.tiny_R2.fastq.gz' - ]: - pair.append(_download_file(fn, d)) - return pair - -# ---------------------------------------------------------------------------- -# BAM files - -@pytest.fixture(scope='session') -def sample1_se_bam(tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - fn = 'rnaseq_samples/sample1/sample1.small.single.sorted.bam' - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def sample1_pe_bam(tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - fn = 'rnaseq_samples/sample1/sample1.small.paired.sorted.bam' - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def sample1_se_tiny_bam(tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - fn = 'rnaseq_samples/sample1/sample1.tiny.single.sorted.bam' - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def sample1_pe_tiny_bam(tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - fn = 'rnaseq_samples/sample1/sample1.tiny.paired.sorted.bam' - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def sample1_se_bam_bai(sample1_se_bam, tmpdir_factory): - """ - Returns both the bam and the bam.bai - """ - snakefile = ''' - rule index: - input: bam='sample1.sorted.bam' - output: bai='sample1.sorted.bam.bai' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_bam: 'sample1.sorted.bam' - - } - ) - tmpdir = str(tmpdir_factory.mktemp('sample1_se_bam_bai')) - run(dpath('../wrappers/samtools/index'), snakefile, None, input_data_func, tmpdir) - return { - 'bam': os.path.join(tmpdir, 'sample1.sorted.bam'), - 'bai': os.path.join(tmpdir, 'sample1.sorted.bam.bai'), - } - - -@pytest.fixture(scope='session') -def sample1_se_tiny_bam_bai(sample1_se_tiny_bam, tmpdir_factory): - """ - Returns both the bam and the bam.bai - """ - snakefile = ''' - rule index: - input: bam='sample1.sorted.bam' - output: 
bai='sample1.sorted.bam.bai' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1.sorted.bam' - - } - ) - tmpdir = str(tmpdir_factory.mktemp('sample1_se_tiny_bam_bai')) - run(dpath('../wrappers/samtools/index'), snakefile, None, input_data_func, tmpdir) - return { - 'bam': os.path.join(tmpdir, 'sample1.sorted.bam'), - 'bai': os.path.join(tmpdir, 'sample1.sorted.bam.bai'), - } - -# ---------------------------------------------------------------------------- -# Annotations - -@pytest.fixture(scope='session') -def transcriptome(tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - fn = 'seq/dm6.small.transcriptome.fa' - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def dm6_fa(tmpdir_factory): - fn = 'seq/dm6.small.fa' - d = tmpdir_for_func(tmpdir_factory) - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def annotation(tmpdir_factory): - fn = 'annotation/dm6.small.gtf' - d = tmpdir_for_func(tmpdir_factory) - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def annotation_refflat(tmpdir_factory): - fn = 'annotation/dm6.small.refflat' - d = tmpdir_for_func(tmpdir_factory) - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def annotation_db(annotation): - import gffutils - gffutils.create_db( - data=annotation, dbfn=annotation + '.db', - merge_strategy='merge', - id_spec={'transcript': ['transcript_id', 'transcript_symbol'], - 'gene': ['gene_id', 'gene_symbol']}, - gtf_transcript_key='transcript_id', - gtf_gene_key='gene_id') - return annotation + '.db' - - -@pytest.fixture(scope='session') -def annotation_bed12(annotation_db): - import gffutils - db = gffutils.FeatureDB(annotation_db) - bed12 = '.'.join(annotation_db.strip().split('.')[:-2]) + '.bed12' - with open(bed12, 'w') as handle: - for t in db.features_of_type('transcript'): - handle.write(db.bed12(t, name_field='transcript_id') + '\n') - return bed12 diff --git 
a/wrappers/test/test_atropos.py b/wrappers/test/test_atropos.py deleted file mode 100644 index f695202e..00000000 --- a/wrappers/test/test_atropos.py +++ /dev/null @@ -1,156 +0,0 @@ -import pytest -import os -import gzip -from utils import run, dpath, symlink_in_tempdir - - -def test_atropos_simple(sample1_se_tiny_fq, tmpdir): - snakefile = ''' - rule atropos: - input: - fastq='sample1_R1.fastq.gz' - output: - fastq='sample1_R1.trim.fastq.gz' - params: extra='-a AAA' - threads: 2 - wrapper: "file:wrapper" - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/atropos'), snakefile, check, input_data_func, tmpdir, cores=2) - - -def test_atropos_simple_with_log(sample1_se_tiny_fq, tmpdir): - snakefile = ''' - rule atropos: - input: - fastq='sample1_R1.fastq.gz' - output: - fastq='sample1_R1.trim.fastq.gz' - params: extra='-a AAA' - threads: 2 - log: 'sample1.atropos.log' - wrapper: "file:wrapper" - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert 'This is Atropos' in open('sample1.atropos.log').readline() - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/atropos'), snakefile, check, input_data_func, tmpdir, cores=2) - - -def test_atropos_se_with_list(sample1_se_tiny_fq, tmpdir): - snakefile = ''' - rule 
atropos: - input: 'sample1_R1.fastq.gz' - output: 'sample1_R1.trim.fastq.gz' - params: extra='-a AAA' - threads: 2 - wrapper: "file:wrapper" - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/atropos'), snakefile, check, input_data_func, tmpdir, cores=2) - - -def test_atropos_pe(sample1_pe_tiny_fq, tmpdir): - snakefile = ''' - rule atropos: - input: - R1='sample1_R1.fastq.gz', - R2='sample1_R2.fastq.gz', - output: - R1='sample1_R1.trim.fastq.gz', - R2='sample2_R1.trim.fastq.gz', - params: extra='-a AAA' - threads: 2 - log: 'sample1.atropos.log' - wrapper: "file:wrapper" - ''' - input_data_func = symlink_in_tempdir( - { - sample1_pe_tiny_fq[0]: 'sample1_R1.fastq.gz', - sample1_pe_tiny_fq[1]: 'sample1_R2.fastq.gz', - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert 'This is Atropos' in open('sample1.atropos.log').readline() - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/atropos'), snakefile, check, input_data_func, tmpdir, cores=2) - - -def test_atropos_pe_with_list(sample1_pe_tiny_fq, tmpdir): - - snakefile = ''' - rule atropos: - input: 'sample1_R1.fastq.gz', 'sample1_R2.fastq.gz', - output: 'sample1_R1.trim.fastq.gz', 'sample2_R1.trim.fastq.gz', - params: extra='-a AAA' - threads: 2 - log: 'sample1.atropos.log' - wrapper: "file:wrapper" - ''' - input_data_func = symlink_in_tempdir( - { - 
sample1_pe_tiny_fq[0]: 'sample1_R1.fastq.gz', - sample1_pe_tiny_fq[1]: 'sample1_R2.fastq.gz', - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert 'This is Atropos' in open('sample1.atropos.log').readline() - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/atropos'), snakefile, check, input_data_func, tmpdir, cores=2) diff --git a/wrappers/test/test_bowtie2.py b/wrappers/test/test_bowtie2.py deleted file mode 100644 index 6ee9b76f..00000000 --- a/wrappers/test/test_bowtie2.py +++ /dev/null @@ -1,95 +0,0 @@ -import os -import pytest -from snakemake.shell import shell -from lcdblib.snakemake import aligners -from utils import run, dpath, symlink_in_tempdir, tmpdir_for_func - - -@pytest.fixture(scope='session') -def bowtie2_indexes(dm6_fa, tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - snakefile = ''' - rule bowtie2: - input: fasta='dm6.fa' - output: index=['dm6.1.bt2', 'dm6.2.bt2'] - log: 'bowtie2.log' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - dm6_fa: 'dm6.fa' - } - ) - - def check(): - assert 'Total time for backward call to driver' in open('bowtie2.log').readlines()[-1] - assert list(shell('bowtie2-inspect dm6 -n', iterable=True)) == ['2L', '2R'] - - run( - dpath('../wrappers/bowtie2/build'), - snakefile, check, input_data_func, d) - return aligners.bowtie2_index_from_prefix(os.path.join(d, 'dm6')) - - -def _dict_of_bowtie2_indexes(bowtie2_indexes, prefix): - d = {} - indexes = aligners.bowtie2_index_from_prefix(prefix) - bowtie2_indexes = sorted(bowtie2_indexes) - indexes = sorted(indexes) - for k, v in zip(bowtie2_indexes, indexes): - d[k] = v - return d - - -def test_bowtie2_align_se(bowtie2_indexes, sample1_se_tiny_fq, tmpdir): - d = 
_dict_of_bowtie2_indexes(bowtie2_indexes, 'dm6') - indexes = list(d.values()) - snakefile = ''' - rule bowtie2_align: - input: - fastq='sample1_R1.fastq.gz', - index={indexes} - output: - bam='sample1.bam' - log: "bowtie2.log" - wrapper: "file:wrapper" - '''.format(indexes=indexes) - d[sample1_se_tiny_fq] = 'sample1_R1.fastq.gz' - input_data_func = symlink_in_tempdir(d) - - def check(): - assert "overall alignment rate" in open('bowtie2.log').read() - - # should have at least some mapped and unmapped - assert int(list(shell('samtools view -c -f 0x04 sample1.bam', iterable=True))[0]) > 0 - assert int(list(shell('samtools view -c -F 0x04 sample1.bam', iterable=True))[0]) > 0 - - run(dpath('../wrappers/bowtie2/align'), snakefile, check, input_data_func, tmpdir) - - -def test_bowtie2_align_se_rm_unmapped(bowtie2_indexes, sample1_se_tiny_fq, tmpdir): - d = _dict_of_bowtie2_indexes(bowtie2_indexes, 'dm6') - indexes = list(d.values()) - snakefile = ''' - rule bowtie2_align: - input: - fastq='sample1_R1.fastq.gz', - index={indexes} - output: - bam='sample1.bam' - params: - samtools_view_extra='-F 0x04' - log: "bowtie2.log" - wrapper: "file:wrapper" - '''.format(indexes=indexes) - d[sample1_se_tiny_fq] = 'sample1_R1.fastq.gz' - input_data_func = symlink_in_tempdir(d) - - def check(): - assert "overall alignment rate" in open('bowtie2.log').read() - - # should have at least some mapped and unmapped - assert int(list(shell('samtools view -c -f 0x04 sample1.bam', iterable=True))[0]) == 0 - assert int(list(shell('samtools view -c -F 0x04 sample1.bam', iterable=True))[0]) > 0 - - run(dpath('../wrappers/bowtie2/align'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_cutadapt.py b/wrappers/test/test_cutadapt.py deleted file mode 100644 index 97f5c7f3..00000000 --- a/wrappers/test/test_cutadapt.py +++ /dev/null @@ -1,151 +0,0 @@ -import os -import gzip -from utils import run, dpath, rm, symlink_in_tempdir - -def test_cutadapt_simple(sample1_se_tiny_fq, 
tmpdir): - snakefile = ''' - rule cutadapt: - input: - fastq='sample1_R1.fastq.gz' - output: - fastq='sample1_R1.trim.fastq.gz' - params: extra='-a AAA' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/cutadapt'), snakefile, check, input_data_func, tmpdir) - - -def test_cutadapt_simple_with_log(sample1_se_tiny_fq, tmpdir): - snakefile = ''' - rule cutadapt: - input: - fastq='sample1_R1.fastq.gz' - output: - fastq='sample1_R1.trim.fastq.gz' - params: extra='-a AAA' - log: 'sample1.cutadapt.log' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert 'This is cutadapt' in open('sample1.cutadapt.log').readline() - - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/cutadapt'), snakefile, check, input_data_func, tmpdir) - - -def test_cutadapt_se_with_list(sample1_se_tiny_fq, tmpdir): - snakefile = ''' - rule cutadapt: - input: 'sample1_R1.fastq.gz' - output: 'sample1_R1.trim.fastq.gz' - params: extra='-a AAA' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in 
gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/cutadapt'), snakefile, check, input_data_func, tmpdir) - -def test_cutadapt_pe(sample1_pe_tiny_fq, tmpdir): - snakefile = ''' - rule cutadapt: - input: - R1='sample1_R1.fastq.gz', - R2='sample1_R2.fastq.gz', - output: - R1='sample1_R1.trim.fastq.gz', - R2='sample2_R1.trim.fastq.gz', - params: extra='-a AAA' - log: 'sample1.cutadapt.log' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_pe_tiny_fq[0]: 'sample1_R1.fastq.gz', - sample1_pe_tiny_fq[1]: 'sample1_R2.fastq.gz', - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert 'This is cutadapt' in open('sample1.cutadapt.log').readline() - - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/cutadapt'), snakefile, check, input_data_func, tmpdir) - -def test_cutadapt_pe_with_list(sample1_pe_tiny_fq, tmpdir): - snakefile = ''' - rule cutadapt: - input: 'sample1_R1.fastq.gz', 'sample1_R2.fastq.gz', - output: 'sample1_R1.trim.fastq.gz', 'sample2_R1.trim.fastq.gz', - params: extra='-a AAA' - log: 'sample1.cutadapt.log' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_pe_tiny_fq[0]: 'sample1_R1.fastq.gz', - sample1_pe_tiny_fq[1]: 'sample1_R2.fastq.gz', - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert 'This is cutadapt' in 
open('sample1.cutadapt.log').readline() - - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/cutadapt'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_deeptools.py b/wrappers/test/test_deeptools.py deleted file mode 100644 index cbf87690..00000000 --- a/wrappers/test/test_deeptools.py +++ /dev/null @@ -1,37 +0,0 @@ -import os -import gzip -from utils import run, dpath, rm, symlink_in_tempdir -import pyBigWig - -def test_deeptools_bamCoverage(sample1_se_tiny_bam, sample1_se_tiny_bam_bai, tmpdir): - snakefile = ''' - rule deeptools: - input: - bam='sample1.bam', - bai='sample1.bam.bai' - output: 'sample1.bw', - log: 'deeptools.log' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1.bam', - sample1_se_tiny_bam_bai['bai']: 'sample1.bam.bai', - } - ) - - def check(): - bw = pyBigWig.open('sample1.bw') - header_keys = list(bw.header().keys()) - for k in ['maxVal', 'minVal', 'nBasesCovered', 'nLevels', 'sumData', - 'sumSquared', 'version']: - assert k in header_keys - - # bigWig version should be independent of BAM input, so we can check - # the value - assert bw.header()['version'] == 4 - - first_chrom = list(bw.chroms().keys())[0] - assert isinstance(bw.stats(first_chrom)[0], float) - - run(dpath('../wrappers/deeptools/bamCoverage'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_demo.py b/wrappers/test/test_demo.py deleted file mode 100644 index dd7be5ee..00000000 --- a/wrappers/test/test_demo.py +++ /dev/null @@ -1,159 +0,0 @@ -# This file demonstrates tests for the `demo` wrapper. It is heavily commented, -# and is included as part of the test suite to ensure that it's correct. - -# The `run` function does most of the work. It creates a tempdir, copies over -# input data, Snakefile, and wrapper, runs the Snakefile, and runs -# a user-provided test function against the output. 
-from utils import run - - -# The `dpath` function figures out the path the wrapper even when in a tempdir -from utils import dpath - -# `symlink_in_tempdir` is a decorator function that lets us easily map fixtures -# to input files expected by our Snakefile. The examples below will demonstrate -# how it works. -from utils import symlink_in_tempdir - - -# A note on fixtures -# ------------------ -# -# py.test implicitly does a `from conftest import *`, so we will have the -# fixtures from that package available here. -# -# Currently we have the fixtures from raw_data_fixtures.py imported into -# conftest.py, which in turn makes them available in this file. -# -# py.test also includes a built-in `tmpdir` fixture which we use here to have -# a nicely-named tmpdir for running the test. -# -# See http://doc.pytest.org/en/latest/fixture.html for more info. - - -# Our first test. The test function names must start with `test_` in order for -# py.test to find them. -def test_demo(sample1_se_tiny_fq, tmpdir): - - # A note on these arguments - # ------------------------- - # - # Test function arguments are expected to be fixtures. The fixture - # `sample1_se_tiny_fq` will be the path to the downloaded example data. See - # conftest.sample1_se_tiny_fq(). - # - # The fixture `tmpdir` (which comes built-in with py.test) will be - # a py.path.local object pointing to a tempdir created just for this test. - # It will match the glob /tmp/pytest-*, and only the last 3 tempdirs are - # retained. - - # Write the snakefile - # ------------------- - # First we write the Snakefile to use in testing. Inputs need to come from - # fixutres. Write whatever filename you'd like; we'll connect the fixture - # to the written filename below. - # - # `snakefile` is typically a triple-quoted string; it will be automatically - # run through textwrap.dedent later so you don't have to worry about - # indentation. 
- # - # The wrapper will be copied to a subdirectory of the temp dir called, - # appropriately enough, "wrapper". So your snakefile will generally end - # with the line `wrapper: "file:wrapper"`. - snakefile = ''' - rule demo: - input: 'a.fastq.gz' - output: 'b.fastq.gz' - wrapper: "file:wrapper" - ''' - - # Map fixtures to input files - # --------------------------- - # Next we map the fixture sample1_se_tiny_fq (a temp file which has downloaded - # data from the test data repo into a temp dir) to the input file that our - # Snakefile expects. - # - # Keys are paths to downloaded example data (typically downloaded just once - # per py.test session), which is provided by the fixture. The values of the - # dict are paths relative to the Snakefile and must match what is expected - # by the snakefile. - # - # Technically, `symlink_in_tempdir` returns a function that takes a path as - # its argument and symlinks keys over to values within that path. While - # this seems a little convoluted, doing it this way means that we don't - # have to keep track -- or even care -- what the fixture's provided - # filename is, avoiding the need to keep looking back at the fixtures - # module to remember what the filenames are. It keeps the input file setup - # logic tightly coupled to the Snakefile, since they're both defined in the - # same function. - # - # So: since the above snakefile expects a.fastq.gz as input, we need to - # make that happen, like this: - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_fq: 'a.fastq.gz' - } - ) - - # Write a test function - # --------------------- - # This is our test function. It will be called after the Snakefile has been - # run and it will be called in the same temp directory in which the - # Snakefile is run, so paths should be relative to the Snakefile. - # - # This function should not accept any arguments. - # - # In this case, the demo wrapper simply copies input to output, so here we - # assert the files are identical. 
- def check(): - assert open('a.fastq.gz', 'rb').read() == open('b.fastq.gz', 'rb').read() - - # Call `run()` - # ------------ - # Now that we have defined everything, the `run` function does all of the - # work. Note we pass the `tmpdir` fixture here. - # - # (that's because py.test manages tmpdirs for tests, which are in this - # current module, but run() lives in the utils module which won't get - # nicely managed. But run() needs to know where to build the test case, - # hence the need to pass it here) - run(dpath('../wrappers/demo'), snakefile, check, input_data_func, tmpdir) - - - -# This test function shows how to use downloaded paired-end data from -# a different fixture. -def test_demo_pe(sample1_pe_fq, tmpdir): - - # In contrast to the sample1_se_tiny_fq fixture used in the previous function, - # here the paired-end fixture `sample1_pe_fq` is a tuple of path names (see - # conftest.sample1_pe_fq()) - - - # The snakefile reflects what the wrapper expects for PE (see - # wrappers/demo/README.md). - snakefile = ''' - rule demo: - input: - R1='a1.fastq.gz', - R2='a2.fastq.gz' - output: - R1='b1.fastq.gz', - R2='b2.fastq.gz' - wrapper: "file:wrapper" - ''' - - # Map fixture to input files. Again, since this is paired-end we need to - # make sure both files are provided the right filename for testing. 
- input_data_func=symlink_in_tempdir( - { - sample1_pe_fq[0]: 'a1.fastq.gz', - sample1_pe_fq[1]: 'a2.fastq.gz', - } - ) - - def check(): - assert open('a1.fastq.gz', 'rb').read() == open('b1.fastq.gz', 'rb').read() - assert open('a2.fastq.gz', 'rb').read() == open('b2.fastq.gz', 'rb').read() - - run(dpath('../wrappers/demo'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_dupradar.py b/wrappers/test/test_dupradar.py deleted file mode 100644 index 6122bd5c..00000000 --- a/wrappers/test/test_dupradar.py +++ /dev/null @@ -1,49 +0,0 @@ -import os -import pytest -from test_picard import sample1_se_bam_markdups -from utils import symlink_in_tempdir, run, dpath - - -@pytest.fixture(scope='session') -def sample1_se_dupradar(sample1_se_bam_markdups, annotation, tmpdir_factory): - snakefile = ''' - rule dupradar: - input: - bam='sample1.bam', - annotation='dm6.gtf' - output: - density_scatter='sample1.density_scatter.png', - expression_histogram='sample1.expression_histogram.png', - expression_barplot='sample1.expression_barplot.png', - expression_boxplot='sample1.expression_boxplot.png', - multimapping_histogram='sample1.multimapping_histogram.png', - dataframe='sample1.dupradar.tsv', - model='sample1.model.txt', - curve='sample1.curve.txt' - wrapper: - 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_bam_markdups['bam']: 'sample1.bam', - annotation: 'dm6.gtf', - } - ) - tmpdir = str(tmpdir_factory.mktemp('dupradar_fixture')) - run(dpath('../wrappers/dupradar'), snakefile, None, input_data_func, tmpdir, use_conda=False) - mapping = dict( - density_scatter='sample1.density_scatter.png', - expression_histogram='sample1.expression_histogram.png', - expression_barplot='sample1.expression_barplot.png', - expression_boxplot='sample1.expression_boxplot.png', - multimapping_histogram='sample1.multimapping_histogram.png', - dataframe='sample1.dupradar.tsv', - ) - for k, v in mapping.items(): - mapping[k] = os.path.join(tmpdir, 
v) - return mapping - - -#@pytest.mark.xfail -def test_dupradar(sample1_se_dupradar): - assert open(sample1_se_dupradar['dataframe']).readline().startswith('"ID"\t"geneLength"') diff --git a/wrappers/test/test_fastq_screen.py b/wrappers/test/test_fastq_screen.py deleted file mode 100644 index 5cae9832..00000000 --- a/wrappers/test/test_fastq_screen.py +++ /dev/null @@ -1,36 +0,0 @@ -import os -import zipfile -from utils import run, dpath, rm, symlink_in_tempdir -from test_bowtie2 import bowtie2_indexes - -def test_fastq_screen(sample1_se_tiny_fq, bowtie2_indexes, tmpdir): - snakefile = ''' - rule fastq_screen: - input: - fastq='sample1_R1.fastq.gz', - dm6={indexes} - output: - txt='sample1_R1_screen.txt' - params: - subset=100000, - aligner='bowtie2' - wrapper: - "file:wrapper" - '''.format(indexes=bowtie2_indexes) - - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - with open('sample1_R1_screen.txt') as fh: - res = fh.readlines() - r1 = res[0].strip().split() - r3 = res[2].strip().split() - assert r1[-1] == '100000' - assert r3[0] == 'dm6' - - - run(dpath('../wrappers/fastq_screen'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_fastqc.py b/wrappers/test/test_fastqc.py deleted file mode 100644 index 5df5eda9..00000000 --- a/wrappers/test/test_fastqc.py +++ /dev/null @@ -1,70 +0,0 @@ -import os -import zipfile -from utils import run, dpath, rm, symlink_in_tempdir - -import pytest -from utils import tmpdir_for_func, _download_file - -@pytest.fixture(scope='session') -def fastqc(sample1_se_tiny_fq, tmpdir_factory): - snakefile = ''' - rule fastqc: - input: - fastq='sample1_R1.fastq.gz' - output: - html='sample1_R1_fastqc.html', - zip='sample1_R1_fastqc.zip' - wrapper: "file:wrapper"''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - tmpdir = str(tmpdir_factory.mktemp('fastqc_fixture')) - run(dpath('../wrappers/fastqc'), 
snakefile, None, input_data_func, tmpdir) - return os.path.join(tmpdir, 'sample1_R1_fastqc.zip') - - -def test_fastqc(sample1_se_tiny_fq, tmpdir): - snakefile = ''' - rule fastqc: - input: - fastq='sample1_R1.fastq.gz' - output: - html='results/sample1_R1.html', - zip='sample1_R1.zip' - wrapper: "file:wrapper"''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - assert '' in open('results/sample1_R1.html').readline() - contents = [ - 'sample1_R1_fastqc/', - 'sample1_R1_fastqc/Icons/', - 'sample1_R1_fastqc/Images/', - 'sample1_R1_fastqc/Icons/fastqc_icon.png', - 'sample1_R1_fastqc/Icons/warning.png', - 'sample1_R1_fastqc/Icons/error.png', - 'sample1_R1_fastqc/Icons/tick.png', - 'sample1_R1_fastqc/summary.txt', - 'sample1_R1_fastqc/Images/per_base_quality.png', - 'sample1_R1_fastqc/Images/per_tile_quality.png', - 'sample1_R1_fastqc/Images/per_sequence_quality.png', - 'sample1_R1_fastqc/Images/per_base_sequence_content.png', - 'sample1_R1_fastqc/Images/per_sequence_gc_content.png', - 'sample1_R1_fastqc/Images/per_base_n_content.png', - 'sample1_R1_fastqc/Images/sequence_length_distribution.png', - 'sample1_R1_fastqc/Images/duplication_levels.png', - 'sample1_R1_fastqc/Images/adapter_content.png', - 'sample1_R1_fastqc/fastqc_report.html', - 'sample1_R1_fastqc/fastqc_data.txt', - 'sample1_R1_fastqc/fastqc.fo' - ] - for i in zipfile.ZipFile('sample1_R1.zip').namelist(): - assert i in contents - - run(dpath('../wrappers/fastqc'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_featurecounts.py b/wrappers/test/test_featurecounts.py deleted file mode 100644 index cb3760f3..00000000 --- a/wrappers/test/test_featurecounts.py +++ /dev/null @@ -1,59 +0,0 @@ -import os -import gzip -from utils import run, dpath, rm, symlink_in_tempdir - -def test_featurecounts_se(sample1_se_tiny_bam, annotation, tmpdir): - snakefile = ''' - rule featurecounts: - input: - annotation='dm6.gtf', - 
bam='sample1.bam' - output: - counts='sample1.counts', - log: 'featurecounts.log' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1.bam', - annotation: 'dm6.gtf', - } - ) - - def check(): - assert '//===================' in open('featurecounts.log').read() - assert '# Program:featureCounts' in open('sample1.counts').readline() - assert open('sample1.counts.summary').readline().startswith('Status') - assert sum(1 for _ in open('sample1.counts')) == 169 - - run(dpath('../wrappers/featurecounts'), snakefile, check, input_data_func, tmpdir) - -def test_featurecounts_pe(sample1_pe_tiny_bam, annotation, tmpdir): - snakefile = ''' - rule featurecounts: - input: - annotation='dm6.gtf', - bam='sample1.bam' - output: - counts='sample1.counts', - log: 'featurecounts.log' - params: extra='-p -P -s 1 -B --splitOnly' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_pe_tiny_bam: 'sample1.bam', - annotation: 'dm6.gtf', - } - ) - - def check(): - assert '//===================' in open('featurecounts.log').read() - assert '# Program:featureCounts' in open('sample1.counts').readline() - assert open('sample1.counts.summary').readline().startswith('Status') - assert sum(1 for _ in open('sample1.counts')) == 169 - - # TODO: maybe assert that below a certain level are counted when all - # those extra arguments are used? 
- - run(dpath('../wrappers/featurecounts'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_hisat2.py b/wrappers/test/test_hisat2.py deleted file mode 100644 index add7abb0..00000000 --- a/wrappers/test/test_hisat2.py +++ /dev/null @@ -1,120 +0,0 @@ -import os -import pytest -from snakemake.shell import shell -from lcdblib.snakemake import aligners -from utils import run, dpath, symlink_in_tempdir, tmpdir_for_func - - -@pytest.fixture(scope='session') -def hisat2_indexes(dm6_fa, tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - snakefile = ''' - rule hisat2: - input: fasta='2L.fa' - output: index=['2L.1.ht2', '2L.2.ht2'] - log: 'hisat.log' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - dm6_fa: '2L.fa' - } - ) - - def check(): - assert 'Total time for call to driver' in open('hisat.log').readlines()[-1] - assert list(shell('hisat2-inspect 2L -n', iterable=True)) == ['2L', '2R'] - - run( - dpath('../wrappers/hisat2/build'), - snakefile, check, input_data_func, d) - return aligners.hisat2_index_from_prefix(os.path.join(d, '2L')) - - -def _dict_of_hisat2_indexes(hisat2_indexes, prefix): - d = {} - indexes = aligners.hisat2_index_from_prefix(prefix) - hisat2_indexes = sorted(hisat2_indexes) - indexes = sorted(indexes) - for k, v in zip(hisat2_indexes, indexes): - d[k] = v - return d - - -def test_hisat2_align_se(hisat2_indexes, sample1_se_tiny_fq, tmpdir): - d = _dict_of_hisat2_indexes(hisat2_indexes, '2L') - indexes = list(d.values()) - snakefile = ''' - rule hisat2_align: - input: - fastq='sample1_R1.fastq.gz', - index={indexes} - output: - bam='sample1.bam' - log: "hisat2.log" - wrapper: "file:wrapper" - '''.format(indexes=indexes) - d[sample1_se_tiny_fq] = 'sample1_R1.fastq.gz' - input_data_func = symlink_in_tempdir(d) - - def check(): - assert "overall alignment rate" in open('hisat2.log').read() - - # should have at least some mapped and unmapped - assert int(list(shell('samtools view -c -f 0x04 
sample1.bam', iterable=True))[0]) > 0 - assert int(list(shell('samtools view -c -F 0x04 sample1.bam', iterable=True))[0]) > 0 - - run(dpath('../wrappers/hisat2/align'), snakefile, check, input_data_func, tmpdir) - - -def test_hisat2_align_se_SRA(hisat2_indexes, tmpdir): - d = _dict_of_hisat2_indexes(hisat2_indexes, '2L') - indexes = list(d.values()) - snakefile = ''' - rule hisat2_align: - input: - index={indexes} - output: - bam='sample1.bam' - params: hisat2_extra='--sra-acc SRR1990338' - log: "hisat2.log" - wrapper: "file:wrapper" - '''.format(indexes=indexes) - input_data_func = symlink_in_tempdir(d) - - def check(): - assert "overall alignment rate" in open('hisat2.log').read() - - # should have at least some mapped and unmapped - assert int(list(shell('samtools view -c -f 0x04 sample1.bam', iterable=True))[0]) > 0 - assert int(list(shell('samtools view -c -F 0x04 sample1.bam', iterable=True))[0]) > 0 - - run(dpath('../wrappers/hisat2/align'), snakefile, check, input_data_func, tmpdir) - - -def test_hisat2_align_se_rm_unmapped(hisat2_indexes, sample1_se_tiny_fq, tmpdir): - d = _dict_of_hisat2_indexes(hisat2_indexes, '2L') - indexes = list(d.values()) - snakefile = ''' - rule hisat2_align: - input: - fastq='sample1_R1.fastq.gz', - index={indexes} - output: - bam='sample1.bam' - params: - samtools_view_extra='-F 0x04' - log: "hisat2.log" - wrapper: "file:wrapper" - '''.format(indexes=indexes) - d[sample1_se_tiny_fq] = 'sample1_R1.fastq.gz' - input_data_func = symlink_in_tempdir(d) - - def check(): - assert "overall alignment rate" in open('hisat2.log').read() - - # should have at least some mapped and unmapped - assert int(list(shell('samtools view -c -f 0x04 sample1.bam', iterable=True))[0]) == 0 - assert int(list(shell('samtools view -c -F 0x04 sample1.bam', iterable=True))[0]) > 0 - - run(dpath('../wrappers/hisat2/align'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_kallisto.py b/wrappers/test/test_kallisto.py deleted file mode 
100644 index 32e32e1b..00000000 --- a/wrappers/test/test_kallisto.py +++ /dev/null @@ -1,69 +0,0 @@ -import os -import json -import pytest -import pysam -from snakemake.shell import shell -from lcdblib.snakemake import aligners -from utils import run, dpath, rm, symlink_in_tempdir, tmpdir_for_func - - -@pytest.fixture(scope='session') -def kallisto_index(tmpdir_factory, transcriptome): - d = tmpdir_for_func(tmpdir_factory) - snakefile = ''' - rule kallisto: - input: fasta='transcriptome.fa' - output: index='transcriptome.idx' - log: 'log' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - transcriptome: 'transcriptome.fa', - } - ) - - def check(): - log = open('log').read() - assert '[build] target deBruijn graph' - - run( - dpath('../wrappers/kallisto/index'), - snakefile, check, input_data_func, d) - return os.path.join(d, 'transcriptome.idx') - - -def test_kallisto_quant(tmpdir, sample1_se_tiny_fq, kallisto_index): - snakefile = ''' - rule kallisto_quant: - input: - fastq='sample1.fq.gz', - index='out/transcriptome.idx' - - params: extra='--single --fragment-length=200 --sd=20' - output: - h5='quant/abundance.h5', - tsv='quant/abundance.tsv', - json='quant/run_info.json', - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1.fq.gz', - kallisto_index: 'out/transcriptome.idx', - } - ) - - def check(): - assert sum(1 for _ in open('quant/abundance.tsv')) == 310 - assert open('quant/abundance.tsv').readline() == ( - 'target_id\tlength\teff_length\test_counts\ttpm\n') - keys = ['call', 'index_version', 'n_bootstraps', 'n_processed', 'n_targets', 'start_time'] - d = json.load(open('quant/run_info.json')) - for k in keys: - assert k in d - - - run( - dpath('../wrappers/kallisto/quant'), - snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_multiqc.py b/wrappers/test/test_multiqc.py deleted file mode 100644 index 8f361807..00000000 --- a/wrappers/test/test_multiqc.py 
+++ /dev/null @@ -1,48 +0,0 @@ -import pytest -import os -import gzip -from utils import run, dpath, rm, symlink_in_tempdir -from test_fastqc import fastqc - - -def test_multiqc(fastqc, tmpdir): - snakefile = ''' - rule multiqc: - input: 'results/sample1_R1_fastqc.zip' - output: 'multiqc.html' - log: 'log' - params: - analysis_directory='results' - wrapper: 'file:wrapper' - ''' - input_data_func=symlink_in_tempdir( - { - fastqc: 'results/sample1_R1_fastqc.zip', - } - ) - - def check(): - assert '' in open('multiqc.html').readline() - - run(dpath('../wrappers/multiqc'), snakefile, check, input_data_func, tmpdir) - -def test_multiqc_other_dir(fastqc, tmpdir): - snakefile = ''' - rule multiqc: - input: 'results/sample1_R1_fastqc.zip' - output: 'reports/multiqc.html' - log: 'log' - params: - analysis_directory='results' - wrapper: 'file:wrapper' - ''' - input_data_func=symlink_in_tempdir( - { - fastqc: 'results/sample1_R1_fastqc.zip', - } - ) - - def check(): - assert '' in open('reports/multiqc.html').readline() - - run(dpath('../wrappers/multiqc'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_picard.py b/wrappers/test/test_picard.py deleted file mode 100644 index 659d116b..00000000 --- a/wrappers/test/test_picard.py +++ /dev/null @@ -1,116 +0,0 @@ -import pytest -import os -import gzip -from utils import run, dpath, rm, symlink_in_tempdir - - -@pytest.fixture(scope='session') -def sample1_se_bam_markdups(sample1_se_bam, tmpdir_factory): - snakefile = ''' - rule markduplicates: - input: - bam='sample1.bam' - output: - bam='sample1.dupsmarked.bam', - metrics='sample1.dupmetrics.txt' - log: 'log' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_bam: 'sample1.bam', - } - ) - tmpdir = str(tmpdir_factory.mktemp('markduplicates_fixture')) - run(dpath('../wrappers/picard/markduplicates'), snakefile, None, input_data_func, tmpdir, use_conda=True) - return { - 'bam': os.path.join(tmpdir, 
'sample1.dupsmarked.bam'), - 'metrics': os.path.join(tmpdir, 'sample1.dupmetrics.txt') - } - - -def test_markduplicates_se(sample1_se_bam_markdups, tmpdir): - assert open(sample1_se_bam_markdups['metrics']).readline().startswith('##') - - -def test_picard_collectrnaseqmetrics_se(sample1_se_tiny_bam, annotation_refflat, tmpdir): - snakefile = ''' - rule collectrnaseqmetrics: - input: - bam='sample1.bam', - refflat='dm6.refflat', - output: - metrics='sample1.metrics' - log: 'log' - params: - extra="STRAND=NONE", - java_args='-Xmx512m' - wrapper: 'file:wrapper' - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1.bam', - annotation_refflat: 'dm6.refflat', - } - ) - - def check(): - assert '## METRICS CLASS' in open('sample1.metrics').read() - - run(dpath('../wrappers/picard/collectrnaseqmetrics'), snakefile, check, input_data_func, tmpdir, use_conda=True) - - -def test_picard_collectrnaseqmetrics_se_plot(sample1_se_tiny_bam, annotation_refflat, tmpdir): - snakefile = ''' - rule collectrnaseqmetrics: - input: - bam='sample1.bam', - refflat='dm6.refflat', - output: - metrics='sample1.metrics', - plot='sample1.pdf' - log: 'log' - params: extra="STRAND=NONE CHART=sample1.pdf" - wrapper: 'file:wrapper' - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1.bam', - annotation_refflat: 'dm6.refflat', - } - ) - - def check(): - assert '## METRICS CLASS' in open('sample1.metrics').read() - - run(dpath('../wrappers/picard/collectrnaseqmetrics'), snakefile, check, input_data_func, tmpdir, use_conda=True) - - -@pytest.mark.xfail -def test_picard_collectrnaseqmetrics_too_small_heap(sample1_se_tiny_bam, annotation_refflat, tmpdir): - # set the java vm heap size to 128 bytes which should fail. This tests to - # make sure the java args are making it through to the wrapper. 
- snakefile = ''' - rule collectrnaseqmetrics: - input: - bam='sample1.bam', - refflat='dm6.refflat', - output: - metrics='sample1.metrics' - log: 'log' - params: - extra="STRAND=NONE", - java_args='-Xmx128' - wrapper: 'file:wrapper' - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1.bam', - annotation_refflat: 'dm6.refflat', - } - ) - - def check(): - assert '## METRICS CLASS' in open('sample1.metrics').read() - - run(dpath('../wrappers/picard/collectrnaseqmetrics'), snakefile, check, input_data_func, tmpdir, use_conda=True) diff --git a/wrappers/test/test_rseqc.py b/wrappers/test/test_rseqc.py deleted file mode 100644 index d97ae919..00000000 --- a/wrappers/test/test_rseqc.py +++ /dev/null @@ -1,151 +0,0 @@ -import pytest -import os -import gzip -from utils import run, dpath, rm, symlink_in_tempdir -from textwrap import dedent - -def test_infer_experiment(sample1_se_tiny_bam, annotation_bed12, tmpdir): - snakefile = ''' - rule infer_experiment: - input: - bam='sample1_R1.bam', - bed='dm6.bed12' - output: - txt = 'sample1_R1.infer_experiment.txt' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1_R1.bam', - annotation_bed12: 'dm6.bed12' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - expected = dedent("""\ - This is SingleEnd Data - Fraction of reads failed to determine: - Fraction of reads explained by "++,--": - Fraction of reads explained by "+-,-+":""").splitlines(False) - - with open('sample1_R1.infer_experiment.txt', 'r') as handle: - results = handle.read().strip() - for ex in expected: - assert ex in results - - run(dpath('../wrappers/rseqc/infer_experiment'), snakefile, check, input_data_func, tmpdir, use_conda=True) - - -def test_gB_cov(sample1_se_tiny_bam, sample1_se_tiny_bam_bai, annotation_bed12, tmpdir): - snakefile = ''' - rule geneBody_coverage: - input: - bam='sample1_R1.sort.bam', - 
bai='sample1_R1.sort.bam.bai', - bed='dm6.bed12' - output: txt='sample1_R1.geneBodyCoverage.txt', - r='sample1_R1.geneBodyCoverage.r', - img='sample1_R1.geneBodyCoverage.pdf', - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1_R1.sort.bam', - sample1_se_tiny_bam_bai['bai']: 'sample1_R1.sort.bam.bai', - annotation_bed12: 'dm6.bed12' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - - # R code - with open('sample1_R1.geneBodyCoverage.r', 'r') as handle: - result = handle.readline().split(' ')[0] - - assert result == 'sample1_R1.sort' - - # text - with open('sample1_R1.geneBodyCoverage.txt', 'r') as handle: - result = handle.readlines()[1].split('\t')[0] - - assert result == 'sample1_R1.sort' - - # PDF - assert os.path.exists('sample1_R1.geneBodyCoverage.pdf') - - run(dpath('../wrappers/rseqc/geneBody_coverage'), snakefile, check, input_data_func, tmpdir, use_conda=True) - - -def test_gB_cov_png(sample1_se_tiny_bam, sample1_se_tiny_bam_bai, annotation_bed12, tmpdir): - snakefile = ''' - rule geneBody_coverage: - input: - bam='sample1_R1.sort.bam', - bai='sample1_R1.sort.bam.bai', - bed='dm6.bed12' - output: - txt='sample1_R1.geneBodyCoverage.txt', - r='sample1_R1.geneBodyCoverage.r', - img='sample1_R1.geneBodyCoverage.png', - params: - extra: = '-f png' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1_R1.sort.bam', - sample1_se_tiny_bam_bai['bai']: 'sample1_R1.sort.bam.bai', - annotation_bed12: 'dm6.bed12' - } - ) - - def check(): - """ Check that the PNG is created """ - assert os.path.exists('sample1_R1.geneBodyCoverage.png') - - -@pytest.mark.skip -def test_tin(sample1_se_tiny_bam, sample1_se_tiny_bam_bai, annotation_bed12, tmpdir): - snakefile = ''' - rule tin: - input: - bam='sample1_R1.sort.bam', - bai='sample1_R1.sort.bam.bai', - bed='dm6.bed12' - output: table='sample1_R1.tin.tsv', - 
summary='sample1_R1.tin.summary.txt' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1_R1.sort.bam', - sample1_se_tiny_bam_bai['bai']: 'sample1_R1.sort.bam.bai', - annotation_bed12: 'dm6.bed12' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - - # R code - with open('sample1_R1.tin.tsv', 'r') as handle: - result = handle.readline().strip().split('\t') - - assert result == ['geneID', 'chrom', 'tx_start', 'tx_end', 'TIN'] - - # text - with open('sample1_R1.tin.summary.txt', 'r') as handle: - result = handle.readline().strip().split('\t') - - assert result == ['Bam_file', 'TIN(mean)', 'TIN(median)', 'TIN(stdev)'] - - run(dpath('../wrappers/rseqc/tin'), snakefile, check, input_data_func, tmpdir, use_conda=True) - diff --git a/wrappers/test/test_salmon.py b/wrappers/test/test_salmon.py deleted file mode 100644 index 2e3796fa..00000000 --- a/wrappers/test/test_salmon.py +++ /dev/null @@ -1,83 +0,0 @@ -import os -import pytest -from snakemake.shell import shell -from utils import run, dpath, rm, symlink_in_tempdir, tmpdir_for_func - - -@pytest.fixture(scope='session') -def salmon_index(tmpdir_factory, transcriptome): - d = tmpdir_for_func(tmpdir_factory) - snakefile = ''' - rule salmon: - input: fasta='transcriptome.fa' - output: hash='salmon_index/hash.bin' - log: 'log' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - transcriptome: 'transcriptome.fa', - } - ) - - def check(): - log = open('log').read() - assert '[info] done building index' in log - - run( - dpath('../wrappers/salmon/index'), - snakefile, check, input_data_func, d) - return os.path.join(d, 'salmon_index') - - -def test_salmon_quant(tmpdir, sample1_se_tiny_fq, salmon_index): - snakefile = ''' - rule salmon_quant: - input: - unmatedReads='sample1.fq.gz', - index=['idx/hash.bin', 'idx/sa.bin'] - output: 'sample1/salmon/quant.sf' - params: extra='--libType A' - log: 
'salmon.quant.log' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1.fq.gz', - salmon_index: 'idx', - } - ) - - def check(): - assert open('sample1/salmon/quant.sf').readline() == ( - 'Name\tLength\tEffectiveLength\tTPM\tNumReads\n') - - run( - dpath('../wrappers/salmon/quant'), - snakefile, check, input_data_func, tmpdir) - -def test_salmon_quant_single_index(tmpdir, sample1_se_tiny_fq, salmon_index): - snakefile = ''' - rule salmon_quant: - input: - unmatedReads='sample1.fq.gz', - index='idx/hash.bin' - output: 'sample1/salmon/quant.sf' - params: extra='--libType A' - log: 'salmon.quant.log' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1.fq.gz', - salmon_index: 'idx', - } - ) - - def check(): - assert open('sample1/salmon/quant.sf').readline() == ( - 'Name\tLength\tEffectiveLength\tTPM\tNumReads\n') - - run( - dpath('../wrappers/salmon/quant'), - snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_samtools.py b/wrappers/test/test_samtools.py deleted file mode 100644 index 51ff105a..00000000 --- a/wrappers/test/test_samtools.py +++ /dev/null @@ -1,12 +0,0 @@ -import subprocess as sp -import pytest -from snakemake import shell - - -def test_samtools_sort_and_index(sample1_se_tiny_bam, sample1_se_tiny_bam_bai): - """ - This test is primarily a trigger for the fixtures. - """ - with pytest.raises(sp.CalledProcessError): - shell('samtools view {sample1_se_tiny_bam} 2L:1-100') - shell('samtools view {sample1_se_tiny_bam_bai[bam]} 2L:1-100') diff --git a/wrappers/test/utils.py b/wrappers/test/utils.py deleted file mode 100644 index 74dd396b..00000000 --- a/wrappers/test/utils.py +++ /dev/null @@ -1,152 +0,0 @@ -""" -Stripped-down version of Snakemake's test framework. 
-""" - -import sys -import os -from textwrap import dedent -import subprocess as sp -import tempfile -import hashlib -import urllib -import shutil -import shlex -import inspect - -import pytest -from snakemake import snakemake -from snakemake.shell import shell -from snakemake.utils import makedirs - - -SCRIPTPATH = shutil.which('snakemake') - -# test data url -URL = 'https://github.com/lcdb/lcdb-test-data/blob/add-chipseq/data/{}?raw=true' - - -def tmpdir_for_func(factory): - caller = inspect.stack()[1][3] - return str(factory.mktemp(caller)) - - -def _download_file(fn, d): - """ - Intended to be called from a pytest.fixture function. - - `fn` is a path to a file that is used to fill in `URL`. `d` is a tempdir - likely created by the calling function to which the file will be - downloaded. - - The path to the downloaded file is returned. - """ - url = URL.format(fn) - dest = os.path.join(d, fn) - makedirs(os.path.dirname(dest)) - basename = os.path.basename(fn) - shell('wget -q -O- {url} > {dest}') - return dest - - -def dpath(path): - "path relative to this file" - return os.path.realpath(os.path.join(os.path.dirname(__file__), path)) - - -def md5sum(filename): - data = open(filename, 'rb').read() - return hashlib.md5(data).hexdigest() - - -def run(path, snakefile, check=None, input_data_func=None, tmpdir=None, use_conda=False, **params): - """ - Parameters - ---------- - - path : str - Path to a wrapper directory. - - snakefile : str - Contents of a snakefile. `dedent()` will be run on it. - - check : callable or None - After running the snakefile on the input data, this function will be - called while inside the directory. This function is where the actual - tests (assertions etc) should be performed. - - If None, the snakefile will be run but no tests will be performed on - the output. - - input_data_func : None | callable - If not None, then this callable object will be called with - a single argument corresponding to the temp directory. 
It will be - called after the wrapper and test-case contents have been copied to the - temp dir, but before the test is run. It is expected to create any data - required in whatever directory structure is required. - - tmpdir : None or path - - """ - # store any tempdirs here for later deletion - to_clean_up = [] - - - if tmpdir is None: - tmpdir = tempfile.mkdtemp(prefix='.test', dir=os.path.abspath('.')) - else: - tmpdir = str(tmpdir) - try: - # copy over the wrapper - wrapper_dir = os.path.join(tmpdir, 'wrapper') - os.makedirs(wrapper_dir) - cmds = ( - 'find {} -maxdepth 1 -type f -print0 | xargs -0 cp -t {}' - .format(shlex.quote(path), shlex.quote(wrapper_dir)) - ) - sp.call(cmds, shell=True) - - # write the snakefile, filling in the "wrapper" placeholder - with open(os.path.join(tmpdir, 'Snakefile'), 'w') as fout: - fout.write('shell.executable("/bin/bash")\n') - fout.write(dedent(snakefile)) - - # Create the input data - input_data_func(tmpdir) - - success = snakemake(os.path.join(tmpdir, 'Snakefile'), workdir=tmpdir, stats='stats.txt', - snakemakepath=SCRIPTPATH, config={}, use_conda=use_conda, **params) - assert success, 'expected successful execution' - - # Change to the tmpdir and run the test function - if check is not None: - cwd = os.getcwd() - os.chdir(tmpdir) - check() - os.chdir(cwd) - - finally: - for t in to_clean_up: - shutil.rmtree(t) - #shutil.rmtree(tmpdir) - - -def symlink_in_tempdir(mapping): - """ - Returns a function that can be used for the `input_data_func` to utils.run. - - `mapping` is a dict where keys are 'target' and values are 'linkname'. - - It will symlink the data downloaded by the fixture into the temp dir - created for the test case. 
- """ - def _wrapped(tmpdir): - for k, v in mapping.items(): - _linkname = os.path.join(tmpdir, v) - _target = k - _linkdir = os.path.dirname(_linkname) - shell('mkdir -p {_linkdir} && ln -s {_target} {_linkname}') - return _wrapped - - -def rm(path): - shutil.rmtree(path) diff --git a/wrappers/test_toy.py b/wrappers/test_toy.py deleted file mode 100644 index a8e63a12..00000000 --- a/wrappers/test_toy.py +++ /dev/null @@ -1,100 +0,0 @@ -import os -from textwrap import dedent -import pytest -import utils - -# Each module has a config dict -config = dict() - - -def generic_fixture(key, mapping, factory): - """ - Tries to handle as much of the magic as possible. - - Parameters - ---------- - key : str - Key into the module-level config dict - - mapping : dict - Maps paths from fixtures to input files expected by the snakefile - - tmpdir : str - Path to temporary dir, usually created by utils.tmpdir_for_func - - Returns - ------- - After a successful Snakemake run, returns the dictionary of the config's - `output` key but with paths fixed to be relative to tmpdir. This returned - dict is ready to be used as a fixture by test functions. - """ - conf = config[key] - tmpdir = utils.tmpdir_for_func(factory) - input_data_func = utils.symlink_in_tempdir(mapping) - utils.run(utils.dpath(conf['wrapper']), conf['snakefile'], None, input_data_func, tmpdir) - output = conf['output'].copy() - for k, v in output.items(): - output[k] = os.path.join(tmpdir, v) - return output - - -# In order for the doc generation to find this config info without re-running -# all tests, it needs to be in the module-level dict. It similarly can't be -# added during the fixture function's runtime. -# -# However, the mapping and tmpdir must be provided by the function, so the -# config and the function are tightly coupled. -# -# So we add the item to the dictionary here, right above the function that will -# be using it to keep them tightly coupled in the file. 
-config['hisat2_index'] = dict( - description="Basic example of generating a hisat2 index", - wrapper="../wrappers/hisat2/build", - snakefile=""" - rule hisat2_build: - input: - fasta="2L.fa" - output: - index=expand("hisat2_index/assembly.{n}.ht2", n=range(1,9)) - log: "hisat.log" - wrapper: "file://wrapper" - """, - output={'prefix': 'hisat2_index/assembly'} -) - - -# All the hard work is done in the config and in generic_fixture(). Now we just -# need to set up the correct mapping of fixtures to input files. -@pytest.fixture(scope='module') -def hisat2_index(tmpdir_factory, dm6_fa): - mapping = {dm6_fa: '2L.fa'} - return generic_fixture('hisat2_index', mapping, tmpdir_factory) - -# The actual test. -def test_index(hisat2_index): - assert os.path.exists(hisat2_index['prefix'] + '.1.ht2') - - -def extract_examples_for_wrapper(wrapper): - """ - Returns the examples for the wrapper in markdown format. - - Parameters - ---------- - wrapper : str - Expected to be the value of one of the config dict's `wrapper` keys. - """ - markdown = [] - for k, v in config.items(): - if v['wrapper'] != wrapper: - continue - snakefile = dedent(v['snakefile']) - markdown.append( - dedent( - """ - {} - - ```python""".format(v['description']))) - markdown.append(snakefile) - markdown.append("```") - return "\n".join(markdown) diff --git a/wrappers/wrappers/atropos/README.md b/wrappers/wrappers/atropos/README.md deleted file mode 100644 index 56b28b18..00000000 --- a/wrappers/wrappers/atropos/README.md +++ /dev/null @@ -1,167 +0,0 @@ -# Wrapper for atropos -[Atropos](https://atropos.readthedocs.io/en/latest/index.html) is a fork of -[Cutadapt](http://cutadapt.readthedocs.io/en/stable/index.html) which finds and -removes adapter sequences, primers, poly-A tails and other types of unwanted -sequence from your high-throughput sequencing reads. 
- -# Examples - -Minimal usage: - -``` -rule atropos: - input: fastq='{sample}.fastq' - output: fastq='{sample}.trim.fastq' - threads: 4 - wrapper: - "file://path/to/atropos" -``` - -Use an adapters file and quality-trim reads to Q20: - -``` -rule atropos: - input: fastq='{sample}.fastq' - output: fastq='{sample}.trim.fastq' - params: extra="-a file:adapters.fa -q 20" - threads: 4 - wrapper: - "file://path/to/atropos" -``` - -Optionally provide the adapters file as input in order to trigger a re-run if -it has changed. The wrapper only pays attention to `input.fastq`, so adding -another key doesn't affect the wrapper: - -``` -rule atropos: - input: - fastq='{sample}.fastq', - adapters='adapters.fa' - output: fastq='{sample}.trim.fastq' - params: extra="-a file:adapters.fa -q 20" - threads: 4 - wrapper: - "file://path/to/atropos" -``` - -Example of how to use with other output files. Since the wrapper only pays -attention to `output.fastq`, so other output files can be indicated but their -filenames have to be indicated in `params.`: - -``` -rule atropos: - input: - fastq='{sample}.fastq', - adapters='adapters.fa' - output: - fastq='{sample}.trim.fastq', - short='{sample}.trim.too-short.fastq', - untrimmed='{sample}.untrimmed.fastq', - params: - extra=( - "-a file:adapters.fa " - "-q 20 " - "--too-short-output={sample}.trim.too-short.fastq " - "--untrimmed-output={sample}.untrimmed.fastq" - ) - threads: 4 - wrapper: - "file://path/to/atropos" -``` - -You can also run in pair-end mode. - -``` -rule atropos: - input: - R1='{sample}_r1.fastq', - R2='{sample}_r2.fastq', - adapters='adapters.fa' - output: - R1='{sample}_r1.trim.fastq', - R1='{sample}_r2.trim.fastq' - params: extra="-a file:adapters.fa -A file:adapters.fa -q 20" - threads: 4 - wrapper: - "file://path/to/atropos" -``` - - -## Input - -All inputs are FASTQ files, and they can be optionally gzipped. 
- -### Single-end mode: - -fastq : single-end FASTQ file - -### Paired-end mode: - -R1 : Read 1 FASTQ -R2 : Read 2 FASTQ - -See examples below for other input options including adapters. - -## Output -q -### Single-end mode: - -fastq : Trimmed FASTQ file. - -### Paired-end mode: - -R1 : trimmed R1 FASTQ file -R2 : trimmed R2 FASTQ file - -See examples below for other output options. - -## Log -If a log file is specified, stdout and stderr will be captured there. - -## Threads -One improvement of atropos over cutadapt is the ability to use threads which -are passed to the `-T` option. - -## Params -Additional parameters can be passed to atropos verbatim by supplying a string -in `params.extra`. - - -## Notes - -To dynamically select PE or SE without using `dynamic` support in snakemake, -you can use a PHONY rule and use a function for `params.R2`, like in this -example: - -```python -def _input_func_atropos(wildcards): - """Determine if the sample is PE or SE""" - flags = some function to pull in se or pe info - if 'PE' in flags: - return {'R1': expand(fastqs['r1'], **wildcards)[0], 'R2': expand(fastqs['r2'], **wildcards)[0]} - else: - return {'R1': expand(fastqs['r1'], **wildcards)[0]} - -def _params_r2_atropos(wildcards): - """function to make temp R2 if pe.""" - flags = some function to pull in se or pe info - if 'PE' in flags: - return expand(patterns['atropos']['r2'], **wildcards)[0] + '.tmp.gz' - else: - return None - -rule atropos: - input: unpack(_input_func_atropos) - output: R1=temp(patterns['atropos']['r1']) - params: R2=_params_r2_atropos - threads: 8 - wrapper: wrapper_for('atropos') - -rule atropos_phony: - input: rules.atropos.output - output: temp(patterns['atropos']['r2']) - shell: """ - mv {output[0]}.tmp.gz {output[0]} - """ -``` diff --git a/wrappers/wrappers/atropos/environment.yaml b/wrappers/wrappers/atropos/environment.yaml deleted file mode 100644 index 314bcf2c..00000000 --- a/wrappers/wrappers/atropos/environment.yaml +++ /dev/null @@ -1,4 
+0,0 @@ -channels: - - bioconda -dependencies: - - atropos ==1.1.5 diff --git a/wrappers/wrappers/atropos/wrapper.py b/wrappers/wrappers/atropos/wrapper.py deleted file mode 100644 index b6af4311..00000000 --- a/wrappers/wrappers/atropos/wrapper.py +++ /dev/null @@ -1,80 +0,0 @@ -__author__ = "Ryan Dale" -__copyright__ = "Copyright 2016, Ryan Dale" -__email__ = "dalerr@niddk.nih.gov" -__license__ = "MIT" - -from snakemake.shell import shell - -extra = snakemake.params.get('extra', '') -log = snakemake.log_fmt_shell() -inputs = snakemake.input -outputs = snakemake.output - -if isinstance(inputs, dict) and isinstance(outputs, dict): - # Get inputs - in_R1 = inputs.get('R1', None) - in_R2 = inputs.get('R2', None) - in_FASTQ = inputs.get('fastq', None) - - if (in_R1 is None) and (in_FASTQ is not None): - in_R1 = in_FASTQ - elif (in_R1 is None) and (in_FASTQ is None): - raise KeyError('If providing a dictionary for input/output, you must uese either ' - '`R1` or `fastq` for the first read. If providing a second read you must use `R2`.') - - # Get outputs - out_R1 = outputs.get('R1', None) - out_R2 = outputs.get('R2', snakemake.params.get('R2', None)) - out_FASTQ = outputs.get('fastq', None) - - if (out_R1 is None) and (out_FASTQ is not None): - out_R1 = out_FASTQ - elif (out_R1 is None) and (out_FASTQ is None): - raise KeyError('If providing a dictionary for input/output, you must uese either ' - '`R1` or `fastq` for the first read. 
If providing a second read you must use `R2`.') - -elif isinstance(inputs, list) and isinstance(outputs, list): - # Get inputs - if len(inputs) == 1: - in_R1 = inputs[0] - in_R2 = None - elif len(inputs) == 2: - in_R1 = sorted(inputs)[0] - in_R2 = sorted(inputs)[1] - else: - raise IndexError("If providing a list for input/output, they must have either 1 or 2 values.") - - # Get outputs - if len(outputs) == 1: - out_R1 = outputs[0] - out_R2 = snakemake.params.get('R2', None) - elif len(outputs) == 2: - out_R1 = sorted(outputs)[0] - out_R2 = sorted(outputs)[1] - else: - raise IndexError("If providing a list for input/output, they must have either 1 or 2 values.") - -# Run paired end if both in_R2 and out_R2 are provided -if (in_R2 is not None) and (out_R2 is not None): - shell( - "atropos trim " - "--threads {snakemake.threads} " - "{extra} " - "-pe1 {in_R1} " - "-pe2 {in_R2} " - "-o {out_R1} " - "-p {out_R2} " - "{log}" - ) -elif (in_R1 is not None) and (out_R1 is not None) and (in_R2 is None) and (out_R2 is None): - shell( - "atropos trim " - "{extra} " - "--threads {snakemake.threads} " - "-se {in_R1} " - "-o {out_R1} " - "{log}" - ) -else: - raise ValueError("Input and Output must match. If you give two value for " - "input you must give two values for output.") diff --git a/wrappers/wrappers/average-bigwigs/README.md b/wrappers/wrappers/average-bigwigs/README.md deleted file mode 100644 index af837c1f..00000000 --- a/wrappers/wrappers/average-bigwigs/README.md +++ /dev/null @@ -1,75 +0,0 @@ -# Average bigWigs - -Often we'd like to merge multiple bigWigs together for downstream work -(heatmaps, etc) but there's no single tool to do this. This wrapper runs -`bigWigMerge` on the inputs to sum their values, then uses `awk` to divide by -their values and sort the way bedGraphToBigWig wants them. - -The intermediate bedGraph file will be created in ``$TMPDIR``. 
- -## Examples - -Minimal usage: - -```python -rule average_bigwigs: - input: - bigwigs=[ - 'a.bw', - 'b.bw', - 'c.bw'], - chromsizes='genome.chromsizes' - output: - 'out.bw' - wrapper: - 'file://path/to/wrapper' -``` - -Increase memory used for sorting: - -```python -rule average_bigwigs: - input: - bigwigs=[ - 'a.bw', - 'b.bw', - 'c.bw'], - chromsizes='genome.chromsizes' - output: - 'out.bw' - params: - memory='32G' - wrapper: - 'file://path/to/wrapper' -``` - -Single bigwig just gets symlinked over. - -```python -rule average_bigwigs: - input: - bigwigs='a.bw', - chromsizes='genome.chromsizes' - output: - 'out.bw' - params: - memory='32G' - wrapper: - 'file://path/to/wrapper' -``` - -## Input - -List of bigWig files. - - -## Output - -Single bigWig file created by averaging the inputs - -## Threads -Does not use threads - -## Params - -memory: Passed to `sort` as the `-S` argument. diff --git a/wrappers/wrappers/average-bigwigs/environment.yaml b/wrappers/wrappers/average-bigwigs/environment.yaml deleted file mode 100644 index 64dcd155..00000000 --- a/wrappers/wrappers/average-bigwigs/environment.yaml +++ /dev/null @@ -1,5 +0,0 @@ -channels: - - bioconda -dependencies: - - ucsc-bigwigmerge - - ucsc-bedgraphtobigwig diff --git a/wrappers/wrappers/average-bigwigs/wrapper.py b/wrappers/wrappers/average-bigwigs/wrapper.py deleted file mode 100644 index 94be840a..00000000 --- a/wrappers/wrappers/average-bigwigs/wrapper.py +++ /dev/null @@ -1,32 +0,0 @@ -import os, sys -sys.path.append(os.path.abspath('../../')) -from lib import utils -import tempfile -from snakemake.shell import shell -# Inspired by http://wresch.github.io/2014/01/31/merge-bigwig-files.html - -# If memory was supplied, we'll use that for sorting. 
-if 'memory' in snakemake.params: - mem_arg = '-S {snakemake.params.memory}' -else: - mem_arg = '' - -if len(snakemake.input.bigwigs) == 1: - utils.make_relative_symlink(snakemake.input.bigwigs[0], snakemake.output[0]) - -else: - - # bigWigMerge outputs sum; we need to divide each by n. - f = 1.0 / len(snakemake.input.bigwigs) - - tmp = tempfile.NamedTemporaryFile(delete=False).name - tmpdir = tempfile.gettempdir() - - shell( - 'export LC_ALL=C; ' - 'bigWigMerge {snakemake.input.bigwigs} stdout 2> {snakemake.log} ' - """| awk 'BEGIN{{OFS="\t"}}{{$4={f}*$4; print}}' """ - '| sort {mem_arg} -T {tmpdir} -k1,1 -k2,2n > {tmp} ' - '&& bedGraphToBigWig {tmp} {snakemake.input.chromsizes} ' - '{snakemake.output} &>> {snakemake.log}' - ) diff --git a/wrappers/wrappers/combos/merge_and_dedup/README.md b/wrappers/wrappers/combos/merge_and_dedup/README.md deleted file mode 100644 index b768e7d2..00000000 --- a/wrappers/wrappers/combos/merge_and_dedup/README.md +++ /dev/null @@ -1,66 +0,0 @@ -# Merge and deduplicate - -Merges BAM files and then deduplicates the output. However if only one BAM file -is created, the file is simply symlinked. - -This wrapper is often needed in ChIP-seq to merge technical replicates. The -same fragment could have been sequenced in multiple tech reps, resulting in -duplicate reads in the merged output even though each individual BAM already -had duplicates removed. - -This method has an advantage over merging first and then deduping in separate -rules when we want to retain both individual (per tech rep) deduped BAMs as -well as merged deduped BAMs. Since the deduping has already happened once for -each tech rep, we want to avoid doing so again if no merging happens. 
- -## Examples - -Minimal usage: - -```python -rule merge_and_dedup: - input: 'a1.bam', 'a2.bam' - output: - bam='a-merged.bam', - metrics='a-merged.bam.metrics' - wrapper: - 'file://path/to/wrapper' -``` - -In the following case, a symlink will be created since no merging needs to be -performed on a single file: - -```python -rule merge_and_dedup: - input: 'a1.bam' - output: - bam='a-merged.bam', - metrics='a-merged.bam.metrics' - wrapper: - 'file://path/to/wrapper' -``` - - -## Input - -Single BAM or list of BAMs. - -## Output - -- `bam`: output bam file -- `metrics`: optional output metrics file. Default is to use - `{snakemake.output.bam}.metrics`. - -## Threads - -Threads are passed to `samtools merge`. - -## Params - -- `samtools_merge_extra`: addtional args passed verbatim to `samtools merge` - -- `markduplicates_extra`: addtional args passed verbatim to `markduplicates_extra` - -- `java_args`: passed to MarkDuplicates, often used to provide more memory - (e.g., `-Xmx32g`). Be sure to increase the corresponding rule's memory - resource to account for the additional allocation diff --git a/wrappers/wrappers/combos/merge_and_dedup/environment.yaml b/wrappers/wrappers/combos/merge_and_dedup/environment.yaml deleted file mode 100644 index b3e77ddb..00000000 --- a/wrappers/wrappers/combos/merge_and_dedup/environment.yaml +++ /dev/null @@ -1,7 +0,0 @@ -channels: - - bioconda - - conda-forge - -dependencies: - - picard - - samtools diff --git a/wrappers/wrappers/demo/README.md b/wrappers/wrappers/demo/README.md deleted file mode 100644 index a87fb3aa..00000000 --- a/wrappers/wrappers/demo/README.md +++ /dev/null @@ -1,69 +0,0 @@ -# Demo wrapper - -This wrapper demonstrates current best-practices. - -The target audience of the wrapper's README should be yourself six months from -now, under a tight deadline, frantically looking for that rule you wrote so you -can copy/paste into a custom Snakefile. - -Examples should come first. 
There should be at least a minimal example and -a reasonably complicated example. To be complete you can add links to docs, -a brief description of the tool, and example output. - -This demo wrapper simply copies input files to output files. - -## Examples - -Minimal usage: - -```python -rule demo: - input: 'a.txt' - output: 'b.txt' - wrapper: - 'file://path/to/wrapper' -``` - -"paired-end" usage: - -```python -rule demo: - input: - R1='a1.txt', - R2='a2.txt' - output: - R1='b1.txt', - R2='b2.txt' - wrapper: - 'file://path/to/wrapper' -``` - -## Input - -Input file formats for this wrapper can be anything. - -### Single-end mode: - -Expects a single unnamed input file. - -### Paired-end mode: - -Expects two input files with keys `R1` and `R2`. - -## Output - -Output files are simply copies of input. - -### Single-end mode: - -Expects a single unnamed output file - -### Paired-end mode: - -Expects two output files with keys `R1` and `R2`. - -## Threads -Does not use threads - -## Params -Does not use params diff --git a/wrappers/wrappers/demo/environment.yaml b/wrappers/wrappers/demo/environment.yaml deleted file mode 100644 index f56993b2..00000000 --- a/wrappers/wrappers/demo/environment.yaml +++ /dev/null @@ -1,4 +0,0 @@ -channels: - - defaults -dependencies: - - python=3 diff --git a/wrappers/wrappers/demo/wrapper.py b/wrappers/wrappers/demo/wrapper.py deleted file mode 100644 index 158ce409..00000000 --- a/wrappers/wrappers/demo/wrapper.py +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env python - -from snakemake.shell import shell - -# All wrappers must be able to handle an optional params.extra. -extra = snakemake.params.get('extra', '') - - -# This lets us handle whether to write to a log file or to write to stdout. -# See snakemake.script.log_fmt_shell for details. 
-log = snakemake.log_fmt_shell() - - -# This demo shows how to handle paired-end and single-end input data as two -# different cases, depending on whether the rule's input included an "R2" key -# or not. -paired_end = ( - 'R1' in snakemake.input.keys() and - 'R2' in snakemake.input.keys() -) - -if paired_end: - shell('cp {snakemake.input.R1} {snakemake.output.R1}') - shell('cp {snakemake.input.R2} {snakemake.output.R2}') - -else: - shell("cp {snakemake.input} {snakemake.output} {log}") diff --git a/wrappers/wrappers/dupradar/README.md b/wrappers/wrappers/dupradar/README.md deleted file mode 100644 index 0667bd9c..00000000 --- a/wrappers/wrappers/dupradar/README.md +++ /dev/null @@ -1,83 +0,0 @@ -# Wrapper for dupRadar - -dupRadar provides an easy way to distinguish between artifactual vs natural -duplicate reads in RNA-Seq data. Prior to dupRadar only global duplication rates -were used and they don't take into account the effect of gene expression levels. -dupRadar relates *duplication rates* and *length normalized read counts* of every -gene to model the dependency of both variables. 
- -[Link to homepage](https://www.bioconductor.org/packages/release/bioc/html/dupRadar.html) - -[Link to manual](https://www.bioconductor.org/packages/devel/bioc/vignettes/dupRadar/inst/doc/dupRadar.html) - -## Example - -Single-end, not stranded: - -```python -rule dupRadar: - input: - bam='sample1.bam', - annotation='dm6.gtf', - output: - density_scatter='sample1.density_scatter.png', - expression_histogram='sample1.expression_histogram.png', - expression_boxplot='sample1.expression_boxplot.png', - expression_barplot='sample1.expression_barplot.png', - multimapping_histogram='sample1.multimapping_histogram.png', - dataframe='sample1.dupradar.tsv' - wrapper: - wrapper_for('dupRadar') -``` - -Paired-end, stranded: - -```python -rule dupRadar: - input: - bam='{sample_dir}/{sample}/{sample}.cutadapt.hisat2.unique.sort.dedup.bam', - annotation='annotations/dm6.gtf', - output: - density_scatter='sample1.density_scatter.png', - expression_histogram='sample1.expression_histogram.png', - expression_boxplot='sample1.expression_boxplot.png', - expression_barplot='sample1.expression_barplot.png', - dataframe='sample1.dupradar.tsv' - params: - paired=True, - stranded=True - wrapper: - wrapper_for('dupRadar') -``` - -## Input -* `bam`: BAM file with mapped reads has to be duplicate marked using either - Picard or BamUtil - -* `annotation`: GTF file contaning features to count the reads falling on the - features. - -## Output -Output plots are described in the [dupRadar -vignette)[http://bioconductor.org/packages/release/bioc/vignettes/dupRadar/inst/doc/dupRadar.html]. -See that page for descriptions of outputs and how to interpret them. - -* `density_scatter`: expression vs percent duplication -* `expression_boxplot`: expression vs percent duplication, binned into boxes -* `expression_histogram`: standard histogram of expression (RPKM) -* `expression_barplot`: percentage duplication in 5% expression bins. 
-* `multimapping_histogram`: histogram showing fraction of reads coming from - multimapping reads -* `dataframe`: results from `analyzeDuprates` saved as a TSV for downstream - analysis. Following the vignette, we also add the fraction of multimappers in - each gene as the column `mhRate`. -* `model`: Slope and intercept of the dupsExpFit -* `curve`: Simplified curve of the GLM for downstream plotting - -## Threads -Threads are passed to dupRadar and are in turn passed to featureCounts, which -it calls automatically. - -## Params -* `paired`: True | False. Default False. -* `stranded`: True | False | "reverse". Default False. diff --git a/wrappers/wrappers/dupradar/environment.yaml b/wrappers/wrappers/dupradar/environment.yaml deleted file mode 100644 index d59b35e1..00000000 --- a/wrappers/wrappers/dupradar/environment.yaml +++ /dev/null @@ -1,10 +0,0 @@ -channels: - - conda-forge - - bioconda - - lcdb -dependencies: - - python=3 - - bioconductor-dupradar - - r-kernsmooth - - r-base >=3.5.1 - - ghostscript diff --git a/wrappers/wrappers/dupradar/wrapper.py b/wrappers/wrappers/dupradar/wrapper.py deleted file mode 100644 index e9ef30d6..00000000 --- a/wrappers/wrappers/dupradar/wrapper.py +++ /dev/null @@ -1,94 +0,0 @@ -import tempfile -from snakemake.shell import shell -import os, sys -sys.path.append(os.path.abspath('../..')) -from lib import helpers - -extra = snakemake.params.get('extra', '') -try: - log = snakemake.log -except AttributeError: - log = None - -stranded = snakemake.params.get('stranded', False) -try: - stranded_int = {False: 0, True: 1, 'reverse': 2}[stranded] -except KeyError: - raise ValueError('"stranded" must be True|False|"reverse"') - -paired = snakemake.params.get('paired', False) -try: - paired_bool= {True: 'TRUE', False: 'FALSE'}[paired] -except KeyError: - raise ValueError('"paired" must be True or False') - -tempdir = tempfile.mkdtemp() - -# To avoid issues with png() related to X11 and cairo, we can use bitmap() instead. 
-# (thanks -# http://stackoverflow.com/questions/24999983/ -# r-unable-to-start-device-png-capabilities-has-true-for-png -# #comment52353278_25064603 ) - -script = """ -library(dupRadar) -bam <- "{snakemake.input.bam}" -gtf <- "{snakemake.input.annotation}" -dm <- analyzeDuprates(bam, gtf, {stranded_int}, {paired_bool}, {snakemake.threads}, tmpDir = "{tempdir}") - -dm$mhRate <- (dm$allCountsMulti - dm$allCounts) / dm$allCountsMulti -bitmap(file="{snakemake.output.multimapping_histogram}") -hist(dm$mhRate, breaks=50, main=basename(bam), - xlab="Multimapping rate per gene", ylab="Frequency") -dev.off() - -bitmap(file="{snakemake.output.density_scatter}") -duprateExpDensPlot(dm, main=basename(bam)) -dev.off() - -bitmap(file="{snakemake.output.expression_histogram}") -expressionHist(dm) -dev.off() - -bitmap(file="{snakemake.output.expression_boxplot}") -par(mar=c(10,4,4,2)+.1) -duprateExpBoxplot(dm, main=basename(bam)) -dev.off() - -bitmap(file="{snakemake.output.expression_barplot}") -readcountExpBoxplot(dm) -dev.off() - -write.table(dm, file="{snakemake.output.dataframe}", sep="\\t") - -# The following is from -# https://github.com/ewels/NGI-RNAseq/blob/master/bin/dupRadar.r - -fit <- duprateExpFit(DupMat=dm) -df <- data.frame(intercept=as.numeric(fit$intercept), slope=c(fit$slope)) -cat("# dupRadar model params\\n", file="{snakemake.output.model}") -write.table(df, file="{snakemake.output.model}", sep="\\t", append=TRUE, row.names=FALSE) - -# Get numbers from dupRadar GLM -curve_x <- sort(log10(dm$RPK)) -curve_y = 100*predict(fit$glm, data.frame(x=curve_x), type="response") -# Remove all of the infinite values -infs = which(curve_x %in% c(-Inf,Inf)) -curve_x = curve_x[-infs] -curve_y = curve_y[-infs] -# Reduce number of data points -curve_x <- curve_x[seq(1, length(curve_x), 10)] -curve_y <- curve_y[seq(1, length(curve_y), 10)] -# Convert x values back to real counts -curve_x = 10^curve_x -# Write to file -write.table( - cbind(curve_x, curve_y), - 
file="{snakemake.output.curve}", - quote=FALSE, row.names=FALSE -) -""".format(**locals()) - -tmp = tempfile.NamedTemporaryFile(delete=False).name -helpers.rscript(script, tmp, log=log) -shell("rm -r {tempdir}") diff --git a/wrappers/wrappers/epic2/environment.yaml b/wrappers/wrappers/epic2/environment.yaml deleted file mode 100644 index cacda5da..00000000 --- a/wrappers/wrappers/epic2/environment.yaml +++ /dev/null @@ -1,8 +0,0 @@ -channels: - - bioconda - - conda-forge -dependencies: - - epic2 - - numpy - - bedtools - - ucsc-bedsort=377 diff --git a/wrappers/wrappers/fastq-dump/environment.yaml b/wrappers/wrappers/fastq-dump/environment.yaml deleted file mode 100644 index 6653b6cc..00000000 --- a/wrappers/wrappers/fastq-dump/environment.yaml +++ /dev/null @@ -1,5 +0,0 @@ -channels: - - conda-forge - - bioconda -dependencies: - - sra-tools>=3 diff --git a/wrappers/wrappers/fastq-dump/wrapper.py b/wrappers/wrappers/fastq-dump/wrapper.py deleted file mode 100644 index 507efe43..00000000 --- a/wrappers/wrappers/fastq-dump/wrapper.py +++ /dev/null @@ -1,41 +0,0 @@ -from snakemake import shell -output = snakemake.output -log = snakemake.log - -srr = snakemake.params.sampletable.loc[snakemake.wildcards.sample, 'Run'] - -if hasattr(snakemake.params, "limit"): - limit = f'-X {snakemake.params.limit}' -else: - limit = "" - -# Two different paths depending on the layout. In both cases, we -# want to avoid creating the final output until the very end, to -# avoid incomplete downloads. -if snakemake.params.is_paired: - # For PE we need to use --split-files, which also means using - # the slower --gzip - shell( - 'fastq-dump ' - '{srr} ' - '--gzip ' - '--split-files ' - '{limit} ' - '&> {log}' - ) - - # The filenames are predictable, so we can move them as needed. - shell('mv {srr}_1.fastq.gz {output[0]}') - shell('mv {srr}_2.fastq.gz {output[1]}') - -else: - # For SE, we can use the faster stdout | gzip, and move it - # directly when done. 
- shell( - 'fastq-dump ' - '{srr} ' - '-Z ' - '{limit} ' - '2> {log} | gzip -c > {output[0]}.tmp ' - '&& mv {output[0]}.tmp {output[0]} ' - ) diff --git a/wrappers/wrappers/fastq_screen/README.md b/wrappers/wrappers/fastq_screen/README.md deleted file mode 100644 index efd36a32..00000000 --- a/wrappers/wrappers/fastq_screen/README.md +++ /dev/null @@ -1,61 +0,0 @@ -# Wrapper for fastq_screen - -[`fastq_screen`](http://www.bioinformatics.babraham.ac.uk/projects/fastq_screen) -screens a library of sequences in FASTQ format against a set of sequence -databases identifying the composition of the library and possible contaminants. - -Fastq screen uses a configuration file pointing to different database. For example: - -``` -DATABASE ecoli /data/Escherichia_coli/Bowtie2Index/genome BOWTIE2 -DATABASE hg19 /data/hg19/Bowtie2Index/genome BOWTIE2 -DATABASE mm10 /data/mm10/Bowtie2Index/genome BOWTIE2 -``` - -This configuration file is automatically generated by the wrapper based on -which indexes are given as inputs (see **Example**). Currently the wrapper only -supports bowtie2 and defaults to using a subset of 100000 reads. Which can be -overridden using `params.subset` setting. Furthermore, `params.extra` is -passed arguments verbatim to `fastq_screen`, for example -`extra="--illumina1_3"` or `extra="--bowtie2 '--trim5=8'"`. - -Note that `fastq_screen` hard-codes the output filenames. This wrapper moves -the hard-coded output files to those specified by the rule. Currently the -wrapper does not save png's generated by fastq screen. It does, however, support -the contextual saving of tagged and/or filtered output fastqs from fastq_screen. -If desired, combinations of "--tag" and/or "--filter [filter_codes]" should be -provided to the run via the "extra" parameter in the Snakemake rule. The output -fastqs will *not* be tracked by Snakemake. They will be named as -"{snakemake.output.txt}.tagged.fastq.gz" or "{snakemake.output.txt}.tagged_filter.fastq.gz" -respectively. 
- -## Example: - -``` -rule fastq_screen: - input: - fastq="samples/{sample}.fastq.gz", - ecoli=["/data/Escherichia_coli/Bowtie2Index/genome.1.bt2", "/data/Escherichia_coli/Bowtie2Index/genome.2.bt2"], - hg19=["/data/hg19/Bowtie2Index/genome.1.bt2", "/data/hg19/Bowtie2Index/genome.2.bt2"], - mm10=["/data/mm10/Bowtie2Index/genome.1.bt2", "/data/mm10/Bowtie2Index/genome.2.bt2"] - output: - txt="qc/{sample}.fastq_screen.txt" - params: - subset=100000, - aligner='bowtie2' - threads: 8 - wrapper: - "file:wrapper" -``` - -## Input - -* `fastq` is a FASTQ file, gzipped or not. - -* Additional arguments are used as labels and their values will be used to - generate database location. - -## Output - -`txt`: a text file containing the fraction of reads mapping to each provided -index diff --git a/wrappers/wrappers/fastq_screen/environment.yaml b/wrappers/wrappers/fastq_screen/environment.yaml deleted file mode 100644 index 360a727c..00000000 --- a/wrappers/wrappers/fastq_screen/environment.yaml +++ /dev/null @@ -1,7 +0,0 @@ -channels: - - conda-forge - - bioconda -dependencies: - - python=3 - - fastq-screen - - bowtie2 diff --git a/wrappers/wrappers/fastq_screen/wrapper.py b/wrappers/wrappers/fastq_screen/wrapper.py deleted file mode 100644 index 9b262cc1..00000000 --- a/wrappers/wrappers/fastq_screen/wrapper.py +++ /dev/null @@ -1,72 +0,0 @@ -import os -from snakemake.shell import shell -import sys -sys.path.append(os.path.abspath('../..')) -from lib import aligners -import tempfile - -__author__ = "Ryan Dale" -__copyright__ = "Copyright 2016, Ryan Dale" -__email__ = "dalerr@niddk.nih.gov" -__license__ = "MIT" - -# Pull in parameters -extra = snakemake.params.get('extra', '') -aligner = snakemake.params.get('aligner', 'bowtie2') -subset = snakemake.params.get('subset', 100000) - -if aligner == 'bowtie2': - parse_index = aligners.prefix_from_bowtie2_index - -# Make log -log = snakemake.log_fmt_shell() - -# snakemake.params.fastq_screen_config can be either a dict or a 
string. If -# string, interpret as a filename pointing to the fastq_screen config file. -# Otherwise, create a new tempfile out of the contents of the dict: - -tmp = tempfile.NamedTemporaryFile(delete=False).name -with open(tmp, 'w') as fout: - for k, v in snakemake.input.items(): - if k != 'fastq': - label = k - if isinstance(v, str): - v = [v] - index = parse_index(v) - fout.write( - '\t'.join(['DATABASE', label, index, aligner.upper()]) + '\n') - config_file = tmp - -# fastq_screen hard-codes filenames according to this prefix. We will send -# hard-coded output to a temp dir, and then move them later. -tempdir = tempfile.mkdtemp() - -# Note that we assume only R1 is coming in. -prefix = os.path.basename(snakemake.input.fastq[0].split('.fastq')[0]) - -shell( - "fastq_screen --outdir {tempdir} " - "--force " - "--aligner {aligner} " - "--conf {config_file} " - "--subset {subset} " - "--threads {snakemake.threads} " - "{extra} " - "{snakemake.input.fastq} " - "{log}" -) - -# Move output to the filenames specified by the rule -shell("cp {tempdir}/{prefix}_screen.txt {snakemake.output.txt}") - -# Check for the output of the --tag option to fastq_screen -if os.path.isfile("{tempdir}/{prefix}.tagged.fastq.gz"): - shell("cp {tempdir}/{prefix}.tagged.fastq.gz {snakemake.output.txt}.tagged.fastq.gz") - -# Check for the output of the --filter XXXXXX option to fastq_screen -if os.path.isfile("{tempdir}/{prefix}.tagged_filter.fastq.gz"): - shell("cp {tempdir}/{prefix}.tagged_filter.fastq.gz {snakemake.output.txt}.tagged_filter.fastq.gz") - -# Clean up temp -shell("rm -r {tempdir}") -shell("rm {tmp}") diff --git a/wrappers/wrappers/fastqc/README.md b/wrappers/wrappers/fastqc/README.md deleted file mode 100644 index 678bf9be..00000000 --- a/wrappers/wrappers/fastqc/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# Wrapper for FastQC - -[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) performs -quality control for high-throughput sequencing data. 
- -## Input -FASTQ, SAM, or BAM file. FastQC will auto-detect, but you can also use -`--format` and one of bam, sam, bam_mapped, sam_mapped or fastq in the -params.extra field (see example). - -## Output -- html: an html file containing the report for the sample -- zip: a zip file containing the images and text file of results - -## Threads -Supports threads, passed in as the `--threads` arg - -## Params -Additional parameters can be passed to FastQC verbatim by supplying a string in params.extra. - -# Example - -``` -rule fastqc: - input: 'samples/{sample}.fastq' - output: - html='samples/{sample}.fastqc.html', - zip='samples/{sample}.fastqc.zip' - params: extra="--contaminants adapters.tsv --format fastq" - wrapper: - "file://path/to/fastqc" -``` diff --git a/wrappers/wrappers/fastqc/environment.yaml b/wrappers/wrappers/fastqc/environment.yaml deleted file mode 100644 index 3d0dee62..00000000 --- a/wrappers/wrappers/fastqc/environment.yaml +++ /dev/null @@ -1,9 +0,0 @@ -channels: - - bioconda - - conda-forge -dependencies: - # for fastqc running in minimal containers, which complain about missing - # fonts - - openjdk >=8.0.144 - - font-ttf-dejavu-sans-mono - - fastqc diff --git a/wrappers/wrappers/fastqc/wrapper.py b/wrappers/wrappers/fastqc/wrapper.py deleted file mode 100644 index 32032bbd..00000000 --- a/wrappers/wrappers/fastqc/wrapper.py +++ /dev/null @@ -1,48 +0,0 @@ -__author__ = "Ryan Dale" -__copyright__ = "Copyright 2016, Ryan Dale" -__email__ = "dalerr@niddk.nih.gov" -__license__ = "MIT" - -import os -from snakemake.shell import shell -from snakemake.utils import makedirs - -# fastqc creates a zip file and an html file but the filename is hard-coded by -# replacing fastq|fastq.gz|fq|fq.gz|bam with _fastqc.zip|_fastqc.html in the -# input file's basename. -# -# So we identify that file and move it to the expected output after fastqc is -# done. 
- -outfile = os.path.basename(snakemake.input[0]) -outdir = os.path.dirname(snakemake.output.html) -if outdir == '': - outdir = '.' - -strip = ['.fastq', '.fq', '.gz', '.bam'] -for s in strip: - outfile = outfile.replace(s, '') -out_zip = os.path.join(outdir, outfile + '_fastqc.zip') -out_html = os.path.join(outdir, outfile + '_fastqc.html') - -extra = snakemake.params.get('extra', '') -log = snakemake.log_fmt_shell() - -shell( - 'fastqc ' - '--threads {snakemake.threads} ' - '--noextract ' - '--quiet ' - '--outdir {outdir} ' - '{extra} ' - '{snakemake.input} ' - '{log} ' -) - -def same_file(x, y): - return os.path.abspath(x) == os.path.abspath(y) - -if not same_file(out_zip,snakemake.output.zip): - shell('mv {out_zip} {snakemake.output.zip}') -if not same_file(out_html, snakemake.output.html): - shell('mv {out_html} {snakemake.output.html}') diff --git a/wrappers/wrappers/macs2/callpeak/README.md b/wrappers/wrappers/macs2/callpeak/README.md deleted file mode 100644 index bafad838..00000000 --- a/wrappers/wrappers/macs2/callpeak/README.md +++ /dev/null @@ -1,61 +0,0 @@ -# MACS2 - -Wraps the `macs2 callpeak` subprogram to call ChIP-seq peaks on input BAM -files. - -## Examples - -Minimal usage. MACS2 outputs a whole directory; this directory is the dirname -of `output.bed`. Note the specification of the genome size in `params.extra`. - -```python -rule macs2: - input: - treatment='ip.bam', - control='input.bam', - chromsizes='dm6.chromsizes' - output: - bed='out/peaks.bed' - extra: '-g dm' - wrapper: - 'file://path/to/wrapper' -``` - -MACS2 supports multiple ip and input samples (they are concatenated). 
This also -shows broad peak-calling, asks MACS2 to create scaled bedgraphs, and adds them as -output files so downstream rules can use them: - -```python -rule macs2: - input: - treatment=['ip1.bam', 'ip2.bam'], - control=['input1.bam', 'input2.bam'], - chromsizes='dm6.chromsizes' - output: - bed='out/peaks.bed' - params: extra='-g dm --bdg --SPMR --broad' - wrapper: - 'file://path/to/wrapper' -``` - -## Input - -`treatment`: single BAM or list of BAMs for IP - -`control`: single BAM or list of BAMs for input - -`chromsizes`: Chromsizes table, used to ensure peak boundaries do not extend -outside of chromosome limits. - -## Output - -`bed`: BED file of called peaks. This is symlinked from the -`*_peaks.narrowPeak` or `*_peaks.broadPeak` file created by MACS2. - -Other files are created, these can be added as additional named outputs for use -by downstream rules, however the wrapper only pays attention to -`snakemake.output.bed`. - - -## Params -Additional params in `extra` will be passed verbatim to `macs2 callpeak`. diff --git a/wrappers/wrappers/macs2/callpeak/environment.yaml b/wrappers/wrappers/macs2/callpeak/environment.yaml deleted file mode 100644 index 51d04270..00000000 --- a/wrappers/wrappers/macs2/callpeak/environment.yaml +++ /dev/null @@ -1,8 +0,0 @@ -channels: - - bioconda - - conda-forge -dependencies: - - macs2 - - numpy - - bedtools - - ucsc-bedsort=377 diff --git a/wrappers/wrappers/sicer/README.md b/wrappers/wrappers/sicer/README.md deleted file mode 100644 index 9be29101..00000000 --- a/wrappers/wrappers/sicer/README.md +++ /dev/null @@ -1,59 +0,0 @@ -# SICER - -Wraps the `sicer` program to call ChIP-seq peaks on input BED files. - -## Examples - -Minimal usage. SICER is the best operating piece of hot garbage you'll ever find. 
-It has a completely fixed set of input parameters it requires, hard-coded genome -data in SICER/lib/GenomeData.py (submit bug report in bioconda if you need -additions), and it can't be run from the same directory at the same time due to -hard coded output filenames. It's a proper mess boss. - -```python -rule sicer: - input: - ip='ip.bed', - control='input.bed', - redundancy_threshold=1, - window_size=200, - fragment_size=150, - effective_genome_fraction=0.75, - gap_size=600, - fdr=0.01 - output: - bed='out/peaks.bed' - wrapper: - 'file://path/to/wrapper' -``` - - -## Input - -`ip`: single BED for IP - -`control`: single BED for input - -`redundancy_threshold`: cutoff count above which duplicates are removed - -`window_size`: SICER resolution; 200 recommended for histones - -`fragment_size`: twice the shift from the beginning to the center of a read - -`effective_genome_fraction`: percentage of mappable genome; only set it here if you want to override the genome build in config.yaml - -`gap_size`: nonnegative integer multiple of window size. used to merge contiguous regions (higher means more liberal merging). - -`fdr`: FDR cutoff for calling significant regions. - -## Output - -`bed`: BED file of called peaks. This is a delicately processed version of `*island.bed` from SICER. - -Other files are created, these can be added as additional named outputs for use -by downstream rules, however the wrapper only pays attention to -`snakemake.output.bed`. - - -## Params -Do not use `extra` for this rule. 
diff --git a/wrappers/wrappers/sicer/environment.yaml b/wrappers/wrappers/sicer/environment.yaml deleted file mode 100644 index 44cd4d76..00000000 --- a/wrappers/wrappers/sicer/environment.yaml +++ /dev/null @@ -1,10 +0,0 @@ -channels: - - bioconda - - conda-forge -dependencies: - - python=2 - - numpy - - sicer - - bedtools - - ucsc-bedsort=377 - - ucsc-wigtobigwig=377 diff --git a/wrappers/wrappers/sicer/wrapper.py b/wrappers/wrappers/sicer/wrapper.py deleted file mode 100644 index 7fd29a9e..00000000 --- a/wrappers/wrappers/sicer/wrapper.py +++ /dev/null @@ -1,147 +0,0 @@ -import tempfile -import os -import glob -from snakemake import shell - -logfile = None - -# as SICER's interface is rather strict, this wrapper enforces named variables -# instead of 'extra' arbitrary string - -def get_value(key, key2=None): - """ - Get the value from params.block if it exists, otherwise from params. - - If key2 is not None, it's a different key to extract from the same params.block. - - Raises ValueError if nothing is configured. 
- """ - if key2 is None: - key2 = key - val = snakemake.params.block.get(key, snakemake.params.get(key)) - else: - val = snakemake.params.block.get(key, snakemake.params.block.get(key2)) - - if val is None: - raise ValueError( - "SICER requires the specification of '{0}'".format(key)) - return val - -redundancy_threshold = get_value('redundancy_threshold') -window_size = get_value('window_size') -fragment_size = get_value('fragment_size') -effective_genome_fraction = get_value('effective_genome_fraction', 'reference_effective_genome_fraction') -gap_size = get_value('gap_size') -fdr = get_value('fdr') -genome_build = get_value('genome_build', 'reference_genome_build') - -outdir, basebed = os.path.split(snakemake.output.bed) -label = snakemake.params.block['label'] - -tmpdir = tempfile.mkdtemp() -cwd = os.getcwd() - -# SICER expects bed input format, not bam as in other peak callers -shell( - 'bamToBed -i {snakemake.input.ip} > {tmpdir}/ip.bed ; ' - 'bamToBed -i {snakemake.input.control} > {tmpdir}/in.bed ' -) - -# SICER emits a single hard-coded file that does not respect output directory. -# So move each run into its own temp directory to avoid collisions with -# other processes. -os.chdir(tmpdir) - -shell( - # there is a CI-specific bug, in which the python symlink is not correctly resolved to python2.7; - # so as a really desperate hack, modify SICER's python calls to directly touch 2.7 - """sed 's/^python/$CONDA_PREFIX\/bin\/python2.7/' """ - """$CONDA_PREFIX/share/sicer*/SICER.sh > {tmpdir}/SICER.sh && chmod u+x {tmpdir}/SICER.sh """ -) -shell( - # run SICER - """{tmpdir}/SICER.sh {tmpdir} ip.bed in.bed {tmpdir} """ - """{genome_build} {redundancy_threshold} {window_size} """ - """{fragment_size} {effective_genome_fraction} {gap_size} {fdr} > tmp.output 2>&1 """ -) - -# Move back once the run is complete. 
-os.chdir(cwd) - -# one of the results files gets converted to the broadPeak format ala macs -resultsfile = glob.glob(os.path.join(tmpdir, '*-islands-summary-FDR*')) -if len(resultsfile) == 1: - hit = resultsfile[0] - basehit = os.path.basename(resultsfile[0]) -elif len(resultsfile) > 1: - raise ValueError( - "Multiple islands-summary-FDR files found in {1}: {0}" - .format(os.listdir(tmpdir), tmpdir) - ) -else: - raise ValueError("No islands-summary-FDR file found in {1}: {0}".format(os.listdir(tmpdir), tmpdir)) - -# "summary graph for [the run] in bedGraph format" -summary_graph = glob.glob(os.path.join(tmpdir, '*-W{0}.graph*'.format(window_size))) -if len(summary_graph) == 1: - summary_graph = summary_graph[0] -else: - raise ValueError("SICER graph output file not found") - -# the bedGraph file above, normalized by library size per million, in wig format -normalized_prefilter_wig = glob.glob(os.path.join(tmpdir, '*-W{0}-normalized.wig'.format(window_size))) -if len(normalized_prefilter_wig) == 1: - normalized_prefilter_wig = normalized_prefilter_wig[0] -else: - raise ValueError("SICER normalized prefilter wig file not found") - -# "summary of all candidate islands with their statistical significance -candidate_islands = glob.glob(os.path.join(tmpdir, '*-W{0}-G{1}-islands-summary'.format(window_size, gap_size))) -if len(candidate_islands) == 1: - candidate_islands = candidate_islands[0] -else: - raise ValueError("SICER candidate islands file not found") - -# "delineation of significant islands" -significant_islands = glob.glob(os.path.join(tmpdir, '*-W{0}-G{1}-FDR*-island.bed'.format(window_size, gap_size))) -if len(significant_islands) == 1: - significant_islands = significant_islands[0] -else: - raise ValueError("SICER significant islands file not found") - -# "library of raw redundancy-removed reads on significant islands -redundancy_removed = glob.glob(os.path.join(tmpdir, '*-W{0}-G{1}-FDR*-islandfiltered.bed'.format(window_size, gap_size))) -if 
len(redundancy_removed) == 1: - redundancy_removed = redundancy_removed[0] -else: - raise ValueError("SICER redundancy removed library file not found") - -# "wig file for the island-filtered redundancy-removed reads -normalized_postfilter_wig = glob.glob(os.path.join(tmpdir, '*-W{0}-G{1}-FDR*-islandfiltered-normalized.wig'.format(window_size, gap_size))) -if len(normalized_postfilter_wig) == 1: - normalized_postfilter_wig = normalized_postfilter_wig[0] -else: - raise ValueError("SICER normalized postfilter wig file not found") - -shell( - "export LC_COLLATE=C; " - # format the output in broadPeak format - # note that SICER can emit p-values of 0 and in that case this file will contain "inf" entries - """awk -F"\\t" -v lab={label} """ - """'{{printf("%s\\t%d\\t%d\\t%s_peak_%d\\t%d\\t.\\t%g\\t%g\\t%g\\n", $1, """ - """$2, $3-1, lab, NR, -10*log($6)/log(10), $7, -log($6)/log(10), -log($8)/log(10))}}' """ - "{hit} > {snakemake.output.bed}.tmp && " - # sort the bed file, just to be sure - "bedSort {snakemake.output.bed}.tmp {snakemake.output.bed} && " - # rename the assorted output files - "mv {resultsfile} {snakemake.output.bed}-islands-summary-significant && " - "mv {summary_graph} {snakemake.output.bed}.graph && " - "wigToBigWig {normalized_prefilter_wig} {snakemake.input.chromsizes} {snakemake.output.bed}-normalized-prefilter.bigWig && " - "wigToBigWig {normalized_postfilter_wig} {snakemake.input.chromsizes} {snakemake.output.bed}-normalized-postfilter.bigWig && " - "mv {candidate_islands} {snakemake.output.bed}-islands-summary && " - "mv {significant_islands} {snakemake.output.bed}-island.bed && " - "mv {redundancy_removed} {snakemake.output.bed}-islandfiltered.bed && " - "mv {tmpdir}/tmp.output {snakemake.output.bed}.log && " - # clean up the temp directory - "rm {snakemake.output.bed}.tmp && rm -Rf {tmpdir}" -) diff --git a/wrappers/wrappers/spp/README.md b/wrappers/wrappers/spp/README.md deleted file mode 100644 index a8eb7c43..00000000 --- 
a/wrappers/wrappers/spp/README.md +++ /dev/null @@ -1,175 +0,0 @@ -# spp - -Wraps the [`spp`](http://compbio.med.harvard.edu/Supplements/ChIP-seq/) peak-caller. - -This is a rather complicated wrapper. See input and output sections below for -details. - - -## Examples - -Minimal usage: - -```python -rule spp: - input: - ip="ip.bam", - control="control.bam", - chromsizes='dm6.chromsizes' - output: "peaks.bed" - wrapper: - 'file://path/to/wrapper' -``` - -Specify parameters (see below for options): - - -```python -rule spp: - input: - ip="ip.bam", - control="control.bam", - chromsizes='dm6.chromsizes' - output: "peaks.bed" - params: block={'fdr': 0.1} - - wrapper: - 'file://path/to/wrapper' -``` - -Specify additional output files: - -```python -rule spp: - input: - ip="ip.bam", - control="control.bam", - chromsizes='dm6.chromsizes' - output: - bed="peaks.bed" - enrichment_estimates="enrichment_est.bedgraph", - smoothed_enrichment_mle="enrichment_mle.bedgraph", - rdata="image.RData" - params: block={'fdr': 0.1} - log: "spp.log" -``` - -The works, with multiple replicate BAMs to be merged, keeping the tempfiles, -increasing the memory available to MarkDuplicates, all the output files, -adjusting spp params, and using 8 threads for merging and duplicates removal: - - -```python -rule spp: - input: - ip=["ip.bam", "ip2.bam"], - control=["control.bam", "control2.bam", "control3.bam"], - chromsizes='dm6.chromsizes' - output: - bed="peaks.bed" - enrichment_estimates="enrichment_est.bedgraph", - smoothed_enrichment_mle="enrichment_mle.bedgraph", - rdata="image.RData" - log: 'spp.log' - threads: 8 - params: - block={'fdr': 0.1, 'bins': 10}, - java_args='-Xmx64g' - keep_tempfiles=True - log: "spp.log" -``` - -## Input - -`ip`, `control`: BAM files. Duplicates should already be removed. - -`chromsizes`: Chromsizes table, used to ensure peak boundaries do not extend -outside of chromosome limits. 
- -SPP itself only supports a single BAM file for IP and a single BAM file for -control. However, to support the common case of pooling replicates to gain -coverage, this wrapper does handle multiple BAMs. - -If more than one BAM is provided for either IP or control, the BAMs are merged -and then duplicates are removed from the merged file (to handle reads that -occur in both replicates, which would otherwise cause spp to complain) are -then removed using MarkDuplicates. This merged, deduped BAM is then provided to -SPP. - -The merged BAM, merged-and-deduped BAM, and metrics file (from MarkDuplicates) -are created as temp files. The temp filenames are indicated in the log. If you -need these for debugging, set `params: keep_tempfiles=True` to keep them. - -## Output - -The only required output is `bed`. Others, if specified, will trigger their -respective creation. - -`bed`: narrowPeak format. - -`smoothed_enrichment_mle`: BEDGRAPH file (even though SPP calls it a "WIG") of -smoothed enrichment using the `smoothed.enrichment.mle` method from SPP. -Optional, if not specified it will not be created. - -`enrichment_estimates`: BEDGRAPH file (even though SPP calls it a "WIG") of -enrichment estimates using the `get.conservative.fold.enrichment.profile` -function from SPP. Optional, if not specified will not be created. - -`rdata`: Saves an image of the workspace. Handy for debugging. Optional, if not -specified will not be created. - -An R script named after the BED file (`{snakemake.output.bed}.R`), will be -written to the output directory. This can be run from the same directory as the -snakefile was run from for debugging purposes. - -## Threads -We do not run SPP in parallel mode due to trouble with running the `snow` -library on clusters (it seems to crash unexpectedly and intermittently). -However, for multiple BAMs, we pass the threads to samtools and MarkDuplicates. 
- -## Params - -### wrapper params - -`keep_tempfiles`: bool; if True then tempfiles created by merging and deduping -replicate BAMs will be retained for debugging purposes. - -`java_args`: str; additional args provided to picard, e.g., `java_args="-Xmx64g"` - -### spp params - -Since SPP doesn't have a command-line interface, we can't use the "extras=" -mechanism to pass params verbatim. Instead, the R script created by the wrapper -supports the following parameters, provided as keys to the `block` param to -make it easier to work with the chipseq config format. For example: - -```python -params: - block={'bins': 5, 'fdr': 0.1}, - java_args='-Xmx64g' -``` - -`srange`: tuple; controls the range of lags over which to calculate -cross-correlation. Default is `(50, 500)` - -`bins`: integer; controls how the binding characteristics will be binned. Default -is `5`. - -`tecfilter`: bool; passed to `find.binding.positions` function. Default is True; -set to False to prevent the exclusion of large regions with higher input than -expected. - -`remove_anomalies`: bool; enable/disable the remove.tag.anomalies step. Defualt -is False (do not remove anomalies). Setting to True can increase the time -dramatically. - -`fdr`: float; false discovery rate when calling peaks. Default is `0.05`. - -`whs`: int. window half-size. Used if the auto-calculated -`binding.characteristics` is NA. Default is `500`. - -`zthr`: float. Z threshold used when adding broad regions. Default is `3`. - -`bandwidth`: int. Bandwith for smoothing WIG file. Default is `200`. - -`step`: int; step size for smoothing WIG file. Default is `100`. 
diff --git a/wrappers/wrappers/spp/environment.yaml b/wrappers/wrappers/spp/environment.yaml deleted file mode 100644 index 42dd8086..00000000 --- a/wrappers/wrappers/spp/environment.yaml +++ /dev/null @@ -1,11 +0,0 @@ -channels: - - conda-forge - - bioconda - - defaults - -dependencies: - - picard - - bedtools - - samtools - - r-spp - - r >=3.5.1 diff --git a/wrappers/wrappers/spp/wrapper.py b/wrappers/wrappers/spp/wrapper.py deleted file mode 100644 index 364c3ba1..00000000 --- a/wrappers/wrappers/spp/wrapper.py +++ /dev/null @@ -1,256 +0,0 @@ -from textwrap import dedent -import tempfile -from snakemake.shell import shell -log = snakemake.log_fmt_shell(append=True) - -# Since we'll be appending the output from multiple commands to the same log, -# we want to ensure that the provided log file is empty to start -if snakemake.log: - shell('cat /dev/null > {snakemake.log}') - -java_args = snakemake.params.get('java_args', '') -keep_tempfiles = snakemake.params.get('keep_tempfiles', False) - -registered_for_deletion = [ - snakemake.output.bed + '.tmp', - snakemake.output.bed + '.tmp.genome', -] - - -def merge_and_dedup(bams): - """ - spp only handles one replicate at a time. To support pooled samples, we - merge and remove duplicates, storing the result in a tempfile. 
- - If only one item is provided, return it immediately - """ - - if len(bams) == 1: - return bams - - merged = tempfile.NamedTemporaryFile(delete=False, prefix='merged', suffix='.bam').name - merged_and_deduped = tempfile.NamedTemporaryFile(delete=False, prefix='merged_and_duped', suffix='.bam').name - metrics = tempfile.NamedTemporaryFile(delete=False, prefix='metrics', suffix='.txt').name - - shell('echo "tempfiles created by merge_and_dedup: {merged} {merged_and_deduped} {metrics}" {log}') - - if not keep_tempfiles: - registered_for_deletion.extend([merged, merged_and_deduped, metrics]) - - bams = ' '.join(bams) - shell( - 'samtools merge ' - '-f ' - '-@ {snakemake.threads} ' - '{merged} ' - '{bams} ' - '{log} ' - ) - shell( - 'picard ' - '{java_args} ' - 'MarkDuplicates ' - 'INPUT={merged} ' - 'OUTPUT={merged_and_deduped} ' - 'METRICS_FILE={metrics} ' - 'REMOVE_DUPLICATES=true ' - '{log} ' - ) - return merged_and_deduped - - -def Rbool(x): - """ - Convert to R boolean string used to fill in a template - """ - if x: - return 'TRUE' - return 'FALSE' - - -# ---------------------------------------------------------------------------- -# DEFAULTS -# -extra = snakemake.params.block.get('extra', {}) - -DEFAULTS = { - # srange controls the range of lags over which to calculate cross-correlation - 'srange': (50, 500), - # bins controls how the binding characteristics will be binned - 'bins': 5, - # enable/disable the remove.tag.anomalies step - 'remove_anomalies': False, - # false discovery rate when calling peaks - 'fdr': 0.05, - # window half-size. Used if binding.characteristics is NA. - 'whs': 500, - # Z threshold used when adding broad regions. 
- 'zthr': 3, - # bandwith for smoothing WIG file - 'bandwidth': 200, - # step for smoothing WIG file - 'step': 100, - # Set to False to disable the filtering of large regions with high input signal - 'tecfilter': True, -} - -params = {} -for k, v in DEFAULTS.items(): - v = extra.get(k, v) - if isinstance(v, bool): - v = Rbool(v) - params[k] = v - -# ---------------------------------------------------------------------------- - -# R_template is incrementally built up so that we can intersperse comments and -# to keep things better organized. It will be filled in with `**locals()` at -# the end. - -ip = merge_and_dedup(snakemake.input.ip) -control = merge_and_dedup(snakemake.input.control) - - -R_template = """ -library(spp) -chip.data <- read.bam.tags("{ip}") -input.data <- read.bam.tags("{control}") -""" - - -# -R_template += """ -for (chrom in names(chip.data$tags)){{ - if (length(chip.data$tags[[chrom]]) < 10){{ - print(paste("Chromosome", chrom, "has <10 reads; removing from analysis")) - chip.data$tags[[chrom]] <- NULL - chip.data$quality[[chrom]] <- NULL - input.data$tags[[chrom]] <- NULL - input.data$quality[[chrom]] <- NULL - }} -}} -""" - -# Use configured srange and bins, if provided. `accept.all.tags=TRUE` is -# hard-coded since we were getting errors if FALSE. -R_template += """ -binding.characteristics <- get.binding.characteristics( - chip.data, - srange=c({params[srange][0]}, {params[srange][1]}), - bin={params[bins]}, - accept.all.tags=TRUE, - remove.tag.anomalies={params[remove_anomalies]} -) -""" - -R_template += """ -# Extract info from binding characteristics -tag.shift <- round(binding.characteristics$peak$x/2) -detection.window.halfsize <- binding.characteristics$whs -if (!is.finite(detection.window.halfsize)){{ - detection.window.halfsize <- {params[whs]} -}} -""" - -R_template += """ -# Reset data to tags, and remove any chromosomes with no data. 
-# (tags is a list, names are chromosomes and values are integer vectors) - -chip.data <- chip.data$tags -input.data <- input.data$tags - -chip.data[sapply(chip.data, is.null)] <- NULL -input.data[sapply(input.data, is.null)] <- NULL -""" - - -if 'smoothed_enrichment_mle' in snakemake.output.keys(): - R_template += dedent(""" - smoothed.enrichment.estimate <- get.smoothed.enrichment.mle( - chip.data, - input.data, - bandwidth={params[bandwidth]}, - step={params[step]}, - tag.shift=tag.shift) - writewig( - smoothed.enrichment.estimate, - "{snakemake.output.smoothed_enrichment_mle}", - feature="" - ) - """) - -if 'enrichment_estimates' in snakemake.output.keys(): - R_template += dedent(""" - enrichment.estimates <- get.conservative.fold.enrichment.profile( - chip.data, input.data, fws=500, step=100, alpha=0.01 - ) - writewig(enrichment.estimates, "{snakemake.output.enrichment_estimates}", feature="") - rm(enrichment.estimates) - """) - -R_template += """ -# Get peaks -bp <- find.binding.positions( - signal.data=chip.data, - control.data=input.data, - fdr={params[fdr]}, - whs=detection.window.halfsize, - tec.filter={params[tecfilter]} -) -""" - -R_template += """ -# Add broad regions to peaks -bp <- add.broad.peak.regions( - chip.data, - input.data, - bp, - window.size=detection.window.halfsize, - z.thr={params[zthr]} -) -write.narrowpeak.binding(bp, "{snakemake.output.bed}.tmp") -""" - -# Save image for later introspection or debugging -if 'rdata' in snakemake.output.keys(): - R_template += dedent(""" - save.image("{snakemake.output.rdata}") - """) - -# write the filled-in template to the output directory for later debugging -script_filename = snakemake.output.bed + '.R' -with open(script_filename, 'w') as fout: - fout.write(R_template.format(**locals())) - -# Run it -shell('Rscript {script_filename} {log}') - -# Fix the output file so that it doesn't have negative numbers and so it fits -# inside the genome -shell( - """awk -F "\\t" '{{OFS="\\t"; print $1, "0", 
$2}}' """ - "{snakemake.input.chromsizes} " - "> {snakemake.output.bed}.tmp.genome" -) -shell( - "sort -k1,1 -k2,2n {snakemake.output.bed}.tmp | " - """awk -F "\\t" '{{OFS="\\t"; if (($2>0) && ($3>0)) print $0}}' | """ - "bedtools intersect -a - -b {snakemake.output.bed}.tmp.genome > {snakemake.output.bed}" -) - -# SPP's writewig() adds a header and is space-separated, so this turns it into -# a proper bedGraph file ready for conversion to bigwig. -if 'enrichment_estimates' in snakemake.output.keys(): - shell('grep -v "track" {snakemake.output.enrichment_estimates} ' - '| sed "s/ /\\t/g" > {snakemake.output.enrichment_estimates}.tmp ' - '&& mv {snakemake.output.enrichment_estimates}.tmp ' - '{snakemake.output.enrichment_estimates}') - -if 'smoothed_enrichment_mle' in snakemake.output.keys(): - shell('grep -v "track" {snakemake.output.smoothed_enrichment_mle} ' - '| sed "s/ /\\t/g" > {snakemake.output.smoothed_enrichment_mle}.tmp ' - '&& mv {snakemake.output.smoothed_enrichment_mle}.tmp ' - '{snakemake.output.smoothed_enrichment_mle}') - -for fn in registered_for_deletion: - shell('rm -v {fn} {log}') From 9f0036654e0dbb7f1e0973d30995cd6d16399710 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 20 Jan 2025 10:15:48 -0500 Subject: [PATCH 073/196] resources to strings --- workflows/chipseq/Snakefile | 97 +++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 46 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 9c8a2f37..98152f5f 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -6,7 +6,6 @@ import pandas as pd sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") from lib import utils from lib import chipseq -from lib.utils import autobump, gb, hours configfile: "config/config.yaml" @@ -59,9 +58,9 @@ if utils.detect_sra(sampletable): is_paired=is_paired, # extra="-X 100000", # [enable for test] resources: - mem_mb=gb(1), - disk_mb=autobump(gb=1), - 
runtime=autobump(hours=2) + mem="1g", + disk="1g", + runtime="2h", run: srr = sampletable.loc[wildcards.sample, "Run"] extra = params.get("extra", "") @@ -85,8 +84,8 @@ rule symlinks: expand(patterns["fastq"], n=n, allow_missing=True), threads: 1 resources: - mem_mb=100, - runtime=10, + mem="1g", + runtime="10m", run: assert len(output) == len(input), (input, output) for src, linkname in zip(input, output): @@ -109,8 +108,8 @@ rule cutadapt: "data/chipseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", threads: 6 resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", params: extra=( ( @@ -154,8 +153,8 @@ rule fastqc: html="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html", zip="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip", resources: - mem_mb=gb(8), - runtime=autobump(hours=2), + mem="8g", + runtime="2h", log: "{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.log", run: @@ -198,8 +197,8 @@ rule bowtie2: patterns["bam"] + ".log", threads: 16 resources: - mem_mb=gb(32), - runtime=autobump(hours=2), + mem="32g", + runtime="2h", params: extra="", run: @@ -235,8 +234,8 @@ rule unique: patterns["unique"], threads: 1 resources: - mem_mb=gb(1), - runtime=autobump(hours=2), + mem="1g", + runtime="2h", params: # NOTE: the quality score chosen here should reflect the scores output # by the aligner used. 
For example, STAR uses 255 as max mapping @@ -253,8 +252,8 @@ rule fastq_count: "{sample_dir}/{sample}/{sample}{suffix}.fastq.gz.libsize", threads: 1 resources: - mem_mb=gb(1), - runtime=autobump(hours=2), + mem="1g", + runtime="2h", shell: "zcat {input} | echo $((`wc -l`/4)) > {output}" @@ -266,8 +265,8 @@ rule bam_count: "{sample_dir}/{sample}/{suffix}.bam.libsize", threads: 1 resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", shell: "samtools view -c {input} > {output}" @@ -279,8 +278,8 @@ rule bam_index: bai="{prefix}.bam.bai", threads: 1 resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", shell: "samtools index {input} {output}" @@ -295,9 +294,9 @@ rule markduplicates: patterns["markduplicates"]["bam"] + ".log", threads: 1 resources: - mem_mb=gb(32), - runtime=autobump(hours=2), - disk_mb=gb(100), + mem="32g", + disk="100g", + runtime="2h", params: java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g' # [enable for test] @@ -326,9 +325,9 @@ rule merge_techreps: patterns["merged_techreps"] + ".log", threads: 1 resources: - mem_mb=gb(32), - runtime=autobump(hours=2), - disk_mb=gb(100), + mem="32g", + disk="100g", + runtime="2h", params: java_args="-Xmx32g", # [disable for test] # java_args='-Xmx2g' # [enable for test] @@ -348,8 +347,8 @@ if is_paired: patterns["collectinsertsizemetrics"]["metrics"] + ".log", threads: 1 resources: - mem_mb=gb(32), - runtime=autobump(hours=2), + mem="32g", + runtime="2h", params: java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g' # [enable for test] @@ -373,8 +372,8 @@ rule bigwig: patterns["bigwig"] + ".log", threads: 1 resources: - mem_mb=gb(16), - runtime=autobump(hours=2), + mem="16g", + runtime="2h", shell: "bamCoverage " "--bam {input.bam} " @@ -416,8 +415,8 @@ rule fingerprint: patterns["fingerprint"]["metrics"] + ".log", threads: 1 resources: - mem_mb=gb(32), - runtime=autobump(hours=2), + mem="32g", + runtime="2h", run: if 
len(input.control) == 0: jsdsample_arg = "" @@ -461,8 +460,8 @@ rule macs2: output: bed=patterns["peaks"]["macs2"], resources: - mem_mb=gb(16), - runtime=autobump(hours=2), + mem="16g", + runtime="2h", log: patterns["peaks"]["macs2"] + ".log", params: @@ -496,8 +495,8 @@ rule epic2: output: bed=patterns["peaks"]["epic2"], resources: - mem_mb=gb(16), - runtime=autobump(hours=2), + mem="16g", + runtime="2h", log: patterns["peaks"]["epic2"] + ".log", params: @@ -517,8 +516,8 @@ rule bed_to_bigbed: output: "{prefix}.bigbed", resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", log: "{prefix}.bigbed.log", script: @@ -536,8 +535,8 @@ rule multibigwigsummary: tab=patterns["multibigwigsummary"]["tab"], threads: 16 resources: - mem_mb=gb(16), - runtime=autobump(hours=2), + mem="16g", + runtime="2h", run: # from the input files, figure out the sample name. labels = " ".join([i.split("/")[-2] for i in input]) @@ -562,8 +561,8 @@ rule plotcorrelation: heatmap=patterns["plotcorrelation"]["heatmap"], tab=patterns["plotcorrelation"]["tab"], resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", shell: "plotCorrelation " "--corData {input} " @@ -587,8 +586,8 @@ rule idxstats: output: txt=patterns["samtools"]["idxstats"], resources: - mem_mb=gb(16), - runtime=autobump(hours=2), + mem="16g", + runtime="2h", log: patterns["samtools"]["idxstats"] + ".log", shell: @@ -601,6 +600,9 @@ rule flagstat: bai=patterns["markduplicates"]["bam"] + ".bai", output: patterns["samtools"]["flagstat"], + resources: + mem="8g", + runtime="2h", log: patterns["samtools"]["flagstat"] + ".log", shell: @@ -613,6 +615,9 @@ rule samtools_stats: bai=patterns["markduplicates"]["bam"] + ".bai", output: patterns["samtools"]["stats"], + resources: + mem="8g", + runtime="2h", log: patterns["samtools"]["stats"] + ".log", shell: @@ -647,8 +652,8 @@ rule multiqc: patterns["multiqc"] + ".log", threads: 1 resources: - mem_mb=gb(2), - runtime=autobump(hours=2), 
+ mem="2g", + runtime="2h", run: analysis_directory = "data" outdir = os.path.dirname(output[0]) From 65d2e3ba9b4281fb029a57fbc8367629aa25c952 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 20 Jan 2025 14:23:26 -0500 Subject: [PATCH 074/196] rm chipseq patterns --- workflows/chipseq/Snakefile | 179 +++++++++--------- .../chipseq/config/chipseq_patterns.yaml | 75 -------- 2 files changed, 91 insertions(+), 163 deletions(-) delete mode 100644 workflows/chipseq/config/chipseq_patterns.yaml diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 98152f5f..4dabbb52 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -20,7 +20,7 @@ sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" n = ["1", "2"] if is_paired else ["1"] SAMPLES = sampletable.iloc[:, 0].values -patterns = yaml.safe_load(open("config/chipseq_patterns.yaml"))["patterns_by_sample"] +LABELS = sampletable.label.values peaks = chipseq.add_bams_to_peak_calling(config) @@ -36,8 +36,8 @@ localrules: rule targets: input: - patterns["multiqc"], - expand(patterns["bigwig"], label=sampletable.label), + "data/chipseq_aggregation/multiqc.html", + expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", label=LABELS), [v["bed"] for k, v in peaks.items()], @@ -81,7 +81,8 @@ rule symlinks: else sampletable.loc[wc.sample, ["orig_filename"]] ), output: - expand(patterns["fastq"], n=n, allow_missing=True), + expand("data/chipseq_samples/{sample}/{sample}_R{n}.fastq.gz", n=n, + allow_missing=True), threads: 1 resources: mem="1g", @@ -95,15 +96,19 @@ rule symlinks: rule symlink_targets: input: expand( - "data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=n + "data/chipseq_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=n ), rule cutadapt: input: - fastq=expand(patterns["fastq"], n=n, allow_missing=True), + fastq=expand( + 
"data/chipseq_samples/{sample}/{sample}_R{n}.fastq.gz", + n=n, allow_missing=True), output: - fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), + fastq=expand( + "data/chipseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz", + n=n, allow_missing=True), log: "data/chipseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", threads: 6 @@ -180,7 +185,8 @@ rule fastqc: rule bowtie2: input: - fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), + fastq=expand( + "data/chipseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz", n=n, allow_missing=True), index=multiext( f"{REFERENCES}/bowtie2/genome", ".1.bt2", @@ -192,9 +198,9 @@ rule bowtie2: ".fa", ), output: - bam=temporary(patterns["bam"]), + bam=temporary("data/chipseq_samples/{sample}/{sample}.cutadapt.bam"), log: - patterns["bam"] + ".log", + "data/chipseq_samples/{sample}/{sample}.cutadapt.bam.log", threads: 16 resources: mem="32g", @@ -229,9 +235,9 @@ rule bowtie2: rule unique: input: - patterns["bam"], + "data/chipseq_samples/{sample}/{sample}.cutadapt.bam", output: - patterns["unique"], + "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.bam", threads: 1 resources: mem="1g", @@ -286,12 +292,12 @@ rule bam_index: rule markduplicates: input: - bam=patterns["unique"], + bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.bam", output: - bam=patterns["markduplicates"]["bam"], - metrics=patterns["markduplicates"]["metrics"], + bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam", + metrics="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.metrics" log: - patterns["markduplicates"]["bam"] + ".log", + "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.log" threads: 1 resources: mem="32g", @@ -315,14 +321,14 @@ rule markduplicates: rule merge_techreps: input: lambda wc: expand( - patterns["markduplicates"]["bam"], + "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam", sample=utils.get_techreps(sampletable, wc.label), ), 
output: - bam=patterns["merged_techreps"], - metrics=patterns["merged_techreps"] + ".metrics", + bam="data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", + metrics="data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.metrics", log: - patterns["merged_techreps"] + ".log", + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.log" threads: 1 resources: mem="32g", @@ -339,12 +345,12 @@ if is_paired: rule collectinsertsizemetrics: input: - bam=patterns["markduplicates"]["bam"], + bam="data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", output: - pdf=patterns["collectinsertsizemetrics"]["pdf"], - metrics=patterns["collectinsertsizemetrics"]["metrics"], + pdf="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.pdf", + metrics="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics", log: - patterns["collectinsertsizemetrics"]["metrics"] + ".log", + "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics.log" threads: 1 resources: mem="32g", @@ -364,12 +370,12 @@ if is_paired: rule bigwig: input: - bam=patterns["merged_techreps"], - bai=patterns["merged_techreps"] + ".bai", + bam="data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", + bai="data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", output: - patterns["bigwig"], + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", log: - patterns["bigwig"] + ".log", + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig.log", threads: 1 resources: mem="16g", @@ -382,7 +388,7 @@ rule bigwig: "--minMappingQuality 20 " "--ignoreDuplicates " # Can't use the CPM normalization for testing due to <1000 reads total - # in example data; keep uncommented when running in production + # in example data "--normalizeUsing CPM " # [disable for test] 
"--extendReads 300 " "&> {log}" @@ -396,23 +402,25 @@ rule fingerprint: Note: uses the merged techreps. """ input: - bams=lambda wc: expand(patterns["merged_techreps"], label=wc.ip_label), + bams=lambda wc: expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=wc.ip_label), control=lambda wc: expand( - patterns["merged_techreps"], + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=chipseq.merged_input_for_ip(sampletable, wc.ip_label), ), - bais=lambda wc: expand(patterns["merged_techreps"] + ".bai", label=wc.ip_label), + bais=lambda wc: expand( + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", + label=wc.ip_label), control_bais=lambda wc: expand( - patterns["merged_techreps"] + ".bai", + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", label=chipseq.merged_input_for_ip(sampletable, wc.ip_label), ), output: - plot=patterns["fingerprint"]["plot"], - raw_counts=patterns["fingerprint"]["raw_counts"], - metrics=patterns["fingerprint"]["metrics"], + plot="data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.png", + raw_counts="data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.tab", + metrics="data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics", threads: 8 log: - patterns["fingerprint"]["metrics"] + ".log", + "data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics.log", threads: 1 resources: mem="32g", @@ -444,26 +452,23 @@ rule fingerprint: rule macs2: - """ - Run the macs2 peak caller - """ input: ip=lambda wc: expand( - patterns["merged_techreps"], + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=chipseq.samples_for_run(config, wc.macs2_run, "macs2", "ip"), ), control=lambda wc: expand( - patterns["merged_techreps"], + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=chipseq.samples_for_run(config, 
wc.macs2_run, "macs2", "control"), ), chromsizes=rules.chromsizes.output, output: - bed=patterns["peaks"]["macs2"], + bed="data/chipseq_peaks/macs2/{macs2_run}/peaks.bed", resources: mem="16g", runtime="2h", log: - patterns["peaks"]["macs2"] + ".log", + "data/chipseq_peaks/macs2/{macs2_run}/peaks.bed.log", params: block=lambda wc: chipseq.block_for_run(config, wc.macs2_run, "macs2"), script: @@ -471,34 +476,31 @@ rule macs2: rule epic2: - """ - Run the epic2 peak caller - """ input: ip=lambda wc: expand( - patterns["merged_techreps"], + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=chipseq.samples_for_run(config, wc.epic2_run, "epic2", "ip"), ), control=lambda wc: expand( - patterns["merged_techreps"], + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=chipseq.samples_for_run(config, wc.epic2_run, "epic2", "control"), ), bai=lambda wc: expand( - patterns["merged_techreps"] + ".bai", + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", label=chipseq.samples_for_run(config, wc.epic2_run, "epic2", "ip"), ) + expand( - patterns["merged_techreps"] + ".bai", + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", label=chipseq.samples_for_run(config, wc.epic2_run, "epic2", "control"), ), chromsizes=rules.chromsizes.output, output: - bed=patterns["peaks"]["epic2"], + bed="data/chipseq_peaks/epic2/{epic2_run}/peaks.bed", resources: mem="16g", runtime="2h", log: - patterns["peaks"]["epic2"] + ".log", + "data/chipseq_peaks/epic2/{epic2_run}/peaks.bed.log" params: block=lambda wc: chipseq.block_for_run(config, wc.epic2_run, "epic2"), is_paired=is_paired, @@ -529,10 +531,10 @@ rule multibigwigsummary: Summarize the bigWigs across genomic bins """ input: - expand(patterns["bigwig"], label=sampletable.label), + expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", label=sampletable.label), output: - npz=patterns["multibigwigsummary"]["npz"], - 
tab=patterns["multibigwigsummary"]["tab"], + npz="data/chipseq_aggregation/deeptools/multibigwigsummary_matrix.npz", + tab="data/chipseq_aggregation/deeptools/multibigwigsummary.tab", threads: 16 resources: mem="16g", @@ -556,10 +558,10 @@ rule plotcorrelation: Plot a heatmap of correlations across all samples """ input: - patterns["multibigwigsummary"]["npz"], + npz="data/chipseq_aggregation/deeptools/multibigwigsummary_matrix.npz", output: - heatmap=patterns["plotcorrelation"]["heatmap"], - tab=patterns["plotcorrelation"]["tab"], + tab="data/chipseq_aggregation/deeptools/plotcorrelation.tab", + heatmap="data/chipseq_aggregation/deeptools/correlation_heatmap.png", resources: mem="2g", runtime="2h", @@ -581,75 +583,76 @@ rule plotcorrelation: rule idxstats: input: - bam=patterns["markduplicates"]["bam"], - bai=patterns["markduplicates"]["bam"] + ".bai", + bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam", + bai="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.bai", output: - txt=patterns["samtools"]["idxstats"], + txt="data/chipseq_samples/{sample}/samtools_idxstats_{sample}.txt", resources: mem="16g", runtime="2h", log: - patterns["samtools"]["idxstats"] + ".log", + "data/chipseq_samples/{sample}/samtools_idxstats_{sample}.txt.log" shell: "samtools idxstats {input.bam} 2> {log} 1> {output.txt}" rule flagstat: input: - bam=patterns["markduplicates"]["bam"], - bai=patterns["markduplicates"]["bam"] + ".bai", + bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam", + bai="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.bai", output: - patterns["samtools"]["flagstat"], + "data/chipseq_samples/{sample}/samtools_flagstat_{sample}.txt", resources: mem="8g", runtime="2h", log: - patterns["samtools"]["flagstat"] + ".log", + "data/chipseq_samples/{sample}/samtools_flagstat_{sample}.txt.log" shell: "samtools flagstat {input.bam} > {output}" rule samtools_stats: input: - 
bam=patterns["markduplicates"]["bam"], - bai=patterns["markduplicates"]["bam"] + ".bai", + bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam", + bai="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.bai", output: - patterns["samtools"]["stats"], + "data/chipseq_samples/{sample}/samtools_stats_{sample}.txt", resources: mem="8g", runtime="2h", log: - patterns["samtools"]["stats"] + ".log", + "data/chipseq_samples/{sample}/samtools_stats_{sample}.txt.log" shell: "samtools stats {input.bam} > {output}" rule multiqc: input: - expand(patterns["bam"], sample=SAMPLES), - expand(patterns["fastqc"]["raw"], sample=SAMPLES), - expand(patterns["fastqc"]["cutadapt"], sample=SAMPLES), - expand(patterns["fastqc"]["bam"], sample=SAMPLES), - expand(patterns["bigwig"], label=sampletable.label), - expand(patterns["samtools"]["idxstats"], sample=SAMPLES), - expand(patterns["samtools"]["flagstat"], sample=SAMPLES), - expand(patterns["samtools"]["stats"], sample=SAMPLES), - expand(patterns["merged_techreps"], label=sampletable.label), + expand("data/chipseq_samples/{sample}/{sample}.cutadapt.bam", sample=SAMPLES), + expand("data/chipseq_samples/{sample}/fastqc/{sample}_R1.fastq.gz_fastqc.zip", sample=SAMPLES), + expand("data/chipseq_samples/{sample}/fastqc/{sample}_R1.cutadapt.fastq.gz_fastqc.zip", sample=SAMPLES), + expand("data/chipseq_samples/{sample}/fastqc/{sample}.cutadapt.unique.nodups.bam_fastqc.zip", sample=SAMPLES), + expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", label=sampletable.label), + expand("data/chipseq_samples/{sample}/samtools_stats_{sample}.txt", sample=SAMPLES), + expand("data/chipseq_samples/{sample}/samtools_flagstat_{sample}.txt", sample=SAMPLES), + expand("data/chipseq_samples/{sample}/samtools_idxstats_{sample}.txt", sample=SAMPLES), + expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=sampletable.label), expand( - patterns["fingerprint"]["metrics"], + 
"data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics", ip_label=sampletable.loc[sampletable.antibody != "input", "label"], ), - expand(patterns["collectinsertsizemetrics"], sample=SAMPLES) - if is_paired - else [], + expand( + "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics", + sample=SAMPLES + ) if is_paired else [], [v["bigbed"] for v in peaks.values()], - patterns["multibigwigsummary"]["tab"], - patterns["plotcorrelation"]["tab"], + "data/chipseq_aggregation/deeptools/plotcorrelation.tab", + "data/chipseq_aggregation/deeptools/multibigwigsummary.tab", config="config/multiqc_config.yaml", output: - patterns["multiqc"], + "data/chipseq_aggregation/multiqc.html", log: - patterns["multiqc"] + ".log", + "data/chipseq_aggregation/multiqc.html.log", threads: 1 resources: mem="2g", diff --git a/workflows/chipseq/config/chipseq_patterns.yaml b/workflows/chipseq/config/chipseq_patterns.yaml deleted file mode 100644 index 90b511c9..00000000 --- a/workflows/chipseq/config/chipseq_patterns.yaml +++ /dev/null @@ -1,75 +0,0 @@ -patterns_by_sample: - - fastq: 'data/chipseq_samples/{sample}/{sample}_R{n}.fastq.gz' - cutadapt: 'data/chipseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz' - bam: 'data/chipseq_samples/{sample}/{sample}.cutadapt.bam' - - fastqc: - raw: 'data/chipseq_samples/{sample}/fastqc/{sample}_R1.fastq.gz_fastqc.zip' - cutadapt: 'data/chipseq_samples/{sample}/fastqc/{sample}_R1.cutadapt.fastq.gz_fastqc.zip' - bam: 'data/chipseq_samples/{sample}/fastqc/{sample}.cutadapt.unique.nodups.bam_fastqc.zip' - - libsizes: - fastq: 'data/chipseq_samples/{sample}/{sample}_R1.fastq.gz.libsize' - cutadapt: 'data/chipseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize' - bam: 'data/chipseq_samples/{sample}/{sample}.cutadapt.bam.libsize' - unique: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.bam.libsize' - nodups: 
'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.libsize' - - fastq_screen: 'data/chipseq_samples/{sample}/{sample}.cutadapt.screen.txt' - libsizes_table: 'data/chipseq_aggregation/libsizes_table.tsv' - libsizes_yaml: 'data/chipseq_aggregation/libsizes_table_mqc.yaml' - multiqc: 'data/chipseq_aggregation/multiqc.html' - unique: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.bam' - - markduplicates: - bam: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam' - bai: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.bai' - metrics: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.metrics' - - merged_techreps: 'data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam' - - bigwig: 'data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig' - - fingerprint: - plot: 'data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.png' - raw_counts: 'data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.tab' - metrics: 'data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics' - - multibigwigsummary: - npz: 'data/chipseq_aggregation/deeptools/multibigwigsummary_matrix.npz' - tab: 'data/chipseq_aggregation/deeptools/multibigwigsummary.tab' - - plotcorrelation: - tab: 'data/chipseq_aggregation/deeptools/plotcorrelation.tab' - heatmap: 'data/chipseq_aggregation/deeptools/correlation_heatmap.png' - - collectinsertsizemetrics: - pdf: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.pdf' - metrics: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics' - - samtools: - idxstats: 'data/rnaseq_samples/{sample}/samtools_idxstats_{sample}.txt' - flagstat: 'data/rnaseq_samples/{sample}/samtools_flagstat_{sample}.txt' - stats: 'data/rnaseq_samples/{sample}/samtools_stats_{sample}.txt' - - peaks: - macs2: 
'data/chipseq_peaks/macs2/{macs2_run}/peaks.bed' - spp: 'data/chipseq_peaks/spp/{spp_run}/peaks.bed' - sicer: 'data/chipseq_peaks/sicer/{sicer_run}/peaks.bed' - epic2: 'data/chipseq_peaks/epic2/{epic2_run}/peaks.bed' - -patterns_by_peaks: - peaks: - macs2: 'data/chipseq_peaks/macs2/{macs2_run}/peaks.bed' - spp: 'data/chipseq_peaks/spp/{spp_run}/peaks.bed' - sicer: 'data/chipseq_peaks/sicer/{sicer_run}/peaks.bed' - epic2: 'data/chipseq_peaks/epic2/{epic2_run}/peaks.bed' - bigbed: - macs2: 'data/chipseq_peaks/macs2/{macs2_run}/peaks.bigbed' - spp: 'data/chipseq_peaks/spp/{spp_run}/peaks.bigbed' - sicer: 'data/chipseq_peaks/sicer/{sicer_run}/peaks.bigbed' - epic2: 'data/chipseq_peaks/epic2/{epic2_run}/peaks.bigbed' - -patterns_by_aggregate: - merged_bigwig: 'data/chipseq_aggregation/merged_bigwigs/{merged_bigwig_label}.bigwig' From 3b57a27e1288c841c11b5c7f256beef257df8dd9 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 20 Jan 2025 14:37:00 -0500 Subject: [PATCH 075/196] update chipseq_trackhub.py --- workflows/chipseq/chipseq_trackhub.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/workflows/chipseq/chipseq_trackhub.py b/workflows/chipseq/chipseq_trackhub.py index e2bf9ecb..d069b015 100644 --- a/workflows/chipseq/chipseq_trackhub.py +++ b/workflows/chipseq/chipseq_trackhub.py @@ -25,7 +25,6 @@ from trackhub.upload import upload_hub, stage_hub from lib import chipseq -from lib.patterns_targets import ChIPSeqConfig ap = argparse.ArgumentParser() ap.add_argument('config', help='Main config.yaml file') @@ -53,8 +52,6 @@ genome=hub_config['hub']['genome'] ) -c = ChIPSeqConfig(config, os.path.join(os.path.dirname(args.config), 'chipseq_patterns.yaml')) - # Set up subgroups based on unique values from columns specified in the config df = pandas.read_csv(config['sampletable'], comment='#', sep='\t') cols = hub_config['subgroups']['columns'] @@ -82,8 +79,7 @@ SubGroupDefinition( name='algorithm', label='algorithm', mapping={ 'macs2': 'macs2', 
- 'spp': 'spp', - 'sicer': 'sicer', + 'epic2': 'epic2', 'NA': 'NA', })) @@ -146,8 +142,7 @@ def decide_color(samplename): for label in df['label'].unique(): - # ASSUMPTION: bigwig filename pattern - bigwig = c.patterns['bigwig'].format(label=label) + bigwig = f"data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig" subgroup = df[df.loc[:, 'label'] == label].to_dict('records')[0] subgroup = { From 4e86e1668141691a0b20fde3e02a90518d4dc9d0 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 20 Jan 2025 14:41:00 -0500 Subject: [PATCH 076/196] update rnaseq_trackhub.py --- workflows/rnaseq/rnaseq_trackhub.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/workflows/rnaseq/rnaseq_trackhub.py b/workflows/rnaseq/rnaseq_trackhub.py index 91273574..6fe17f80 100644 --- a/workflows/rnaseq/rnaseq_trackhub.py +++ b/workflows/rnaseq/rnaseq_trackhub.py @@ -9,8 +9,6 @@ """ import os -import sys -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) import re from pprint import pprint import pandas @@ -22,8 +20,6 @@ from trackhub.upload import upload_hub, stage_hub import argparse -from lib.patterns_targets import RNASeqConfig - ap = argparse.ArgumentParser() ap.add_argument('config', help='Main config.yaml file') ap.add_argument('hub_config', help='Track hub config YAML file') @@ -41,7 +37,6 @@ for cfg in args.additional_configs: update_config(config, yaml.load(open(cfg), Loader=yaml.FullLoader)) -c = RNASeqConfig(config, os.path.join(os.path.dirname(args.config), 'rnaseq_patterns.yaml')) hub, genomes_file, genome, trackdb = default_hub( hub_name=hub_config['hub']['name'], @@ -126,7 +121,7 @@ def decide_color(samplename): for direction in 'pos', 'neg': # ASSUMPTION: bigwig filename pattern - bigwig = c.patterns['bigwig'][direction].format(sample=sample) + bigwig = f"data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.{direction}.bigwig" subgroup = df[df.iloc[:, 0] == sample].to_dict('records')[0] subgroup = { From 
69376c919ce561dfa3d7ba90d4d34aa068173d82 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 29 Mar 2025 14:27:32 +0000 Subject: [PATCH 077/196] rm colocalization workflow --- workflows/colocalization/Snakefile | 230 --------------- workflows/colocalization/config/config.yaml | 8 - workflows/colocalization/run_test.sh | 3 - .../scripts/colocalization_heatmap.py | 267 ------------------ .../colocalization/scripts/heatmap_env.yaml | 7 - 5 files changed, 515 deletions(-) delete mode 100644 workflows/colocalization/Snakefile delete mode 100644 workflows/colocalization/config/config.yaml delete mode 100755 workflows/colocalization/run_test.sh delete mode 100644 workflows/colocalization/scripts/colocalization_heatmap.py delete mode 100644 workflows/colocalization/scripts/heatmap_env.yaml diff --git a/workflows/colocalization/Snakefile b/workflows/colocalization/Snakefile deleted file mode 100644 index cb5a7991..00000000 --- a/workflows/colocalization/Snakefile +++ /dev/null @@ -1,230 +0,0 @@ -import sys -sys.path.insert(0, srcdir('../..')) -import os -from textwrap import dedent -import yaml -import tempfile -import pandas as pd -from lib import helpers, aligners -from lib import utils -from lib import common -from lib.patterns_targets import RNASeqConfig, ChIPSeqConfig -import os -from snakemake.utils import makedirs -import pandas -import yaml -import numpy as np - -configfile: 'config/config.yaml' - -chipseq_config = ChIPSeqConfig('config/config.yaml', 'config/chipseq_patterns.yaml', workdir='../chipseq') - -subworkflow chipseq: - configfile: chipseq_config.path - workdir: '../chipseq' - -subworkflow references: - configfile: chipseq_config.path - workdir: '../chipseq' - -subworkflow external: - workdir: '../external' - -chipseq_refdict, chipseq_args = common.references_dict(chipseq_config.config) - -# The rule to create the chromsizes file is in the references workflow; the -# path to it can be determined from the config file 
(though it is awkwardly -# nested) -chromsizes = references( - chipseq_refdict[ - chipseq_config.config['organism'] - ][ - chipseq_config.config['aligner']['tag'] - ]['chromsizes'] -) - -# In the existing config file, we assume that all BED files are from the -# `external` workflow. - -for k, v in config['beds'].items(): - config['beds'][k] = external(v) - -# If ADD_CHIPSEQ_PEAKS is True, we will addn all the called peaks to the bed -# files to check for colocalization. -ADD_CHIPSEQ_PEAKS = True -# ADD_CHIPSEQ_PEAKS = False # [TEST SETTINGS -1] - -if ADD_CHIPSEQ_PEAKS: - peaks = chipseq(utils.flatten(chipseq_config.targets['peaks'])) - for fn in peaks: - toks = fn.split('/') - peakcaller = toks[-3] - label = toks[-2] - key = peakcaller + '_' + label - config['beds'][key] = fn - - -targets = expand( - '{outdir}/{algorithm}/{domain}/{query}/{query}_vs_{reference}.txt', - outdir=config['output'], - domain=config['domains'].keys(), - query=config['beds'].keys(), - reference=config['beds'].keys(), - algorithm=['IntervalStats', 'jaccard', 'fisher'], -) - -# Currently-supported options {algorithm: (possible values)} -# IntervalStats: (f_05, f_01, f_001) -# jaccard: (jaccard) -# fisher: (pval) -pattern = '{outdir}/{algorithm}/{domain}/{value}_heatmap.pdf' -targets += expand(pattern, outdir=config['output'], domain=config['domains'], - algorithm='IntervalStats', value=['f_01']) -targets += expand(pattern, outdir=config['output'], domain=config['domains'], - algorithm='jaccard', value=['jaccard']) -targets += expand(pattern, outdir=config['output'], domain=config['domains'], - algorithm='fisher', value=['pval']) - -rule targets: - input: targets - - -rule sorted_chromsizes: - input: chromsizes - output: os.path.join(config['output'], config['organism'] + '.sorted.chromsizes') - shell: - 'sort -k1,1 {input} > {output}' - -rule chromsizes_bed: - input: rules.sorted_chromsizes.output - output: os.path.join(config['output'], config['organism'] + '.bed') - shell: - """awk 
'{{OFS="\\t"; print $1,"0",$2}}' {input} > {output}""" - - -rule jaccard: - input: - domain=lambda wc: config['domains'][getattr(wc, 'domain')], - query=lambda wc: config['beds'][getattr(wc, 'query')], - reference=lambda wc: config['beds'][getattr(wc, 'reference')], - chromsizes=rules.sorted_chromsizes.output - output: '{outdir}/jaccard/{domain}/{query}/{query}_vs_{reference}.txt' - shell: - """ - bedtools intersect -a {input.query} -b {input.domain} | sort -k1,1 -k2n > {output}.query.jaccard - bedtools intersect -a {input.reference} -b {input.domain} | sort -k1,1 -k2n > {output}.reference.jaccard - bedtools jaccard -a {output}.query.jaccard -b {output}.reference.jaccard -g {input.chromsizes} > {output} - rm {output}.query.jaccard {output}.reference.jaccard - """ - - -rule fisher: - input: - domain=lambda wc: config['domains'][getattr(wc, 'domain')], - query=lambda wc: config['beds'][getattr(wc, 'query')], - reference=lambda wc: config['beds'][getattr(wc, 'reference')], - chromsizes=rules.sorted_chromsizes.output - output: '{outdir}/fisher/{domain}/{query}/{query}_vs_{reference}.txt' - shell: - """ - bedtools intersect -a {input.query} -b {input.domain} | sort -k1,1 -k2n > {output}.query.fisher - bedtools intersect -a {input.reference} -b {input.domain} | sort -k1,1 -k2n > {output}.reference.fisher - bedtools fisher -a {output}.query.fisher -b {output}.reference.fisher -g {input.chromsizes} > {output} - rm {output}.query.fisher {output}.reference.fisher - """ - - -rule intervalstats: - input: - domain=lambda wc: config['domains'][getattr(wc, 'domain')], - query=lambda wc: config['beds'][getattr(wc, 'query')], - reference=lambda wc: config['beds'][getattr(wc, 'reference')], - output: '{outdir}/IntervalStats/{domain}/{query}/{query}_vs_{reference}.txt' - log: '{outdir}/IntervalStats/{domain}/{query}/{query}_vs_{reference}.log' - run: - if input.query == input.reference: - run_self = '--self' - else: - run_self = '' - shell( - 'IntervalStats ' - '--query {input.query} 
' - '--reference {input.reference} ' - '--output {output}.full ' - '--domain {input.domain} ' - '{run_self} &> {log}' - ) - - # Summarize the output into a faster-to-parse file used by downstream - # analysis code. - # - # Output has columns: - # - # - n_{05,01,001}: number of significant associations at {0.05, 0.01, - # 0.001} respectively - # - # - f_{05,01,001}: fraction of total that are signficant - # - # - n: number of features - # - # - query, reference: labels - # - # - filename: "all" filename containing the details in case anything - # needs re-calculation. - _df = pandas.read_table( - str(output[0]) + '.full', - names=['query', 'closest_ref', 'length', 'distance', - 'numerator', 'denominator', 'pval']) - - n = float(len(_df)) - - def frac(x): - if n == 0: - return np.nan - return x / n - - n_05 = sum(_df.pval < 0.05) - n_01 = sum(_df.pval < 0.01) - n_001 = sum(_df.pval < 0.001) - f_05 = frac(n_05) - f_01 = frac(n_01) - f_001 = frac(n_001) - - df = pandas.DataFrame( - [ - dict( - query=wildcards.query, - filename=str(output[0]) + '.full', - reference=wildcards.reference, - n=float(n), - n_05=n_05, - n_01=n_01, - n_001=n_001, - f_05=f_05, - f_01=f_01, - f_001=f_001, - ) - ] - ) - df.to_csv(str(output[0]), sep='\t', index=False) - - -rule heatmap: - input: - expand( - '{{outdir}}/{{algorithm}}/{{domain}}/{query}/{query}_vs_{reference}.txt', - query=list(config['beds'].keys()), - reference=list(config['beds'].keys()) - ) - output: - '{outdir}/{algorithm}/{domain}/{value}_heatmap.pdf' - - shell: - 'python scripts/colocalization_heatmap.py ' - '--domain {wildcards.domain} ' - '--algorithm {wildcards.algorithm} ' - '--value {wildcards.value} ' - '--outdir {config[output]} ' - '--output {output}' - -# vim: ft=python diff --git a/workflows/colocalization/config/config.yaml b/workflows/colocalization/config/config.yaml deleted file mode 100644 index 40734704..00000000 --- a/workflows/colocalization/config/config.yaml +++ /dev/null @@ -1,8 +0,0 @@ -beds: - # from 
the external workflow - SuHw_Kc: data/suhw_kc.bed - CTCF_Kc: data/ctcf_kc.bed -domains: - dm6: results/dm6.bed -output: results -organism: dm6 diff --git a/workflows/colocalization/run_test.sh b/workflows/colocalization/run_test.sh deleted file mode 100755 index 7aacb413..00000000 --- a/workflows/colocalization/run_test.sh +++ /dev/null @@ -1,3 +0,0 @@ -set -e -python -m doctest ../../ci/preprocessor.py -python ../../ci/preprocessor.py Snakefile > Snakefile.test && snakemake -s Snakefile.test "$@" diff --git a/workflows/colocalization/scripts/colocalization_heatmap.py b/workflows/colocalization/scripts/colocalization_heatmap.py deleted file mode 100644 index b337fbb4..00000000 --- a/workflows/colocalization/scripts/colocalization_heatmap.py +++ /dev/null @@ -1,267 +0,0 @@ -import matplotlib -matplotlib.use('agg') -import os -import glob -import pandas as pd -import numpy as np -import seaborn as sns -from scipy.spatial import distance -from scipy.cluster import hierarchy -from matplotlib import pyplot as plt -import argparse - -ap = argparse.ArgumentParser() -ap.add_argument('--domain') -ap.add_argument('--algorithm') -ap.add_argument('--value') -ap.add_argument('--outdir') -ap.add_argument('--output') -args = ap.parse_args() - -domain = args.domain -algorithm = args.algorithm -value = args.value -outdir = args.outdir -output = args.output - - -def dataframe_for_domain(domain, algorithm): - """ - Read all files within a directory and build the dataframe. - - Empty files are listed as NaNs in the dataframe. 
- """ - df = [] - files = glob.glob(os.path.join(outdir, algorithm, domain, '*', '*.txt')) - for filename in files: - query, reference = os.path.basename(filename).replace('.txt', '').split('_vs_') - try: - _df = pd.read_csv(filename, comment='#', sep='\t') - except pd.errors.EmptyDataError: - _df = pd.DataFrame([dict(value=np.nan)]) - - _df['query'] = query - _df['reference'] = reference - df.append( - _df.iloc[0].to_dict() - ) - return pd.DataFrame(df) - - -# Cluster methods -METRIC = 'correlation' -METHOD = 'average' - - -def dataframe_for_value(domain, algorithm, value): - - df = dataframe_for_domain(domain, algorithm) - - vmin, vmax = None, None - - # For IntervalStats, Use the "fraction of intervals with p<0.01" as the - # value. - # - # These are all positive values. NaNs are set to 0, and the diagonal is - # set to 1.0 (i.e., 100% of intervals are significant with respect to - # each other) - if algorithm == 'IntervalStats': - piv = df.pivot(index='query', columns='reference', values=value) - fill_piv = piv.fillna(0) - vmax = fill_piv.max().max() - np.fill_diagonal(fill_piv.values, 1) - units = 'fraction pvals < 0.%s' % (value.split('_')[-1]) - title = 'IntervalStats' - - # For GAT log2foldchange, set anything with qval > 0.05 to - # logfoldchange = 0. Diagonal is filled with 0 (log2foldchange of 1). - # NaNs are also set to 0. - elif algorithm == 'GAT' and value == 'l2fold': - piv = df.pivot(index='query', columns='reference', values='l2fold') - - # used for checking - mask = df.pivot(index='query', columns='reference', - values='qvalue') - title = 'GAT foldchange' - piv[mask > 0.05] = 0 - piv = piv.fillna(0) - fill_piv = piv - np.fill_diagonal(fill_piv.values, 0) - units = 'log2fold' - - # For GAT fractions, we set the upper and lower triangles of the matrix - # to the "track" and "annotation" overlaps in GAT terminology. We also - # get a significance value here (qval) so we set the fraction overlap - # to zero for anything with qval > 0.05. 
- elif algorithm == 'GAT' and value == 'fractions': - segment_frac = df.pivot(index='query', columns='reference', - values='percent_overlap_size_track') - annotation_frac = df.pivot(index='query', columns='reference', - values='percent_overlap_size_annotation') - mask = df.pivot(index='query', columns='reference', values='qvalue') - piv = segment_frac - lower_tri_mask = np.ones(piv.shape, dtype='bool') - lower_tri_mask[np.tril_indices(len(piv))] = False - piv[lower_tri_mask] = annotation_frac[lower_tri_mask] - piv[mask > 0.05] = 0 - fill = 0 - fill_piv = piv - units = 'percentage overlap' - title = 'GAT percentage nucleotide overlap' - - # For fisher, we want to plot the -log10(two-tail pval). - # - # So we keep track of the ratio, flip pvals where ratio <1, and replace - # inf and -inf with the otherwise max and min values respectively. NaNs - # are given a -log10(pval) = 0 (so a pval of 1.0). - elif algorithm == 'fisher' and value == 'pval': - piv = df.pivot(index='query', columns='reference', - values='two-tail') - mask_left = df.pivot(index='query', columns='reference', - values='left') - mask_right = df.pivot(index='query', columns='reference', - values='right') - mask_ratio = df.pivot(index='query', columns='reference', - values='ratio') - flip = mask_ratio < 1 - piv = -np.log10(piv) - piv[flip] *= -1 - mx = piv.replace([np.inf], 0).max().max() - mn = piv.replace([-np.inf], 0).min().min() - piv = piv.replace([np.inf], mx) - piv = piv.replace([-np.inf], mn) - fill_piv = piv.fillna(0) - units = '-log10(pval)' - title = 'Fisher' - - #################################################### - # TODO: also plot fisher ratio - #################################################### - - # For jaccard, we plot the value directly. While the value can range - # [0, 1], in practice we rarely find such good overlap. 
- elif algorithm == 'jaccard' and value == 'jaccard': - piv = df.pivot(index='query', columns='reference', values='jaccard') - fill_piv = piv - units = 'Jaccard statistic' - vmin, vmax = (0, .3) - title = 'Jaccard' - - return dict( - fill_piv=fill_piv, - vmin=vmin, - vmax=vmax, - units=units, - title=title - ) - - -def plot_heatmap(fill_piv, vmin, vmax, title, units, metric='euclidean', - method='average', idx=None, clustermap_kwargs=dict()): - """ - Plot a clustered heatmap of the provided values. Rows are clustered - identically as columns so that the diagonal represents the self-self - comparisons. - - Parameters - ---------- - - fill_piv : pandas.DataFrame - A prepared dataframe where rownames == colnames and where -inf, inf, - and NaN have been filled in with finite values. - - vmin, vmax : float - Colormap limits. NOT CURRENTLY USED. - - title : str - Title for plot - - units : str - Units to use in colorbar - - metric : str - Clustering metric. See `scipy.distance` for available options. - - method : clustering method - Hierarchical clustering linkage method. See `scipy.hierarchy` for - available options. - - idx : None or index - If not None, then this index is used to subset `fill_piv`. - - clustermap_kwargs : dict - Additional arguments passed to seaborn.clustermap. - """ - - - fill_piv = fill_piv.astype(float) - # subset if requested - if idx is not None: - fill_piv = fill_piv.loc[idx, idx] - - # Distance matrix, setting NaN to zero if necessary - dist = distance.pdist(fill_piv.values, metric=metric) - dist[np.isnan(dist)] = 0 - dist[dist < 0] = 0 - - # ward actually uses values directly rather than using the distance matrix. - if method == 'ward': - vals = fill_piv.values - else: - vals = dist - - # Here we compute the row linkage and provide that to sns.clustermap as - # both row and column linkages so that the same clustering is used. This - # gets us the self-self colocalization on the diagonal. 
- row_linkage = hierarchy.linkage(vals, method=method) - - # catch and fix errors in dendrogram before sending to clustermap - mx = row_linkage[np.isfinite(row_linkage)].max() - mn = row_linkage[np.isfinite(row_linkage)].min() - # row_linkage[np.isinf(row_linkage)] = mx - # scipy.clip(row_linkage, 0, mx, row_linkage) - ind = hierarchy.dendrogram(row_linkage, no_plot=True)['leaves'] - - - a = sns.clustermap(fill_piv, row_linkage=row_linkage, - col_linkage=row_linkage, **clustermap_kwargs) - - # Fix labels - for txt in a.ax_heatmap.get_xticklabels(): - txt.set_rotation(90) - for txt in a.ax_heatmap.get_yticklabels(): - txt.set_rotation(0) - - # Use the provided units to label the colorbar - a.cax.set_ylabel(units) - - # Add figure-level title and tweak margins. - fig = plt.gcf() - fig.suptitle(title, weight='bold', size=20) - fig.subplots_adjust(right=0.8, bottom=0.2) - return a - - -v = dataframe_for_value(domain, algorithm, value) - -if (v['fill_piv'] < 0).values.any() & (v['fill_piv'] > 0).values.any(): - center = 0 - cmap = 'RdBu_r' -else: - center = None - cmap = sns.cubehelix_palette(as_cmap=True) - - -fig = plot_heatmap( - fill_piv=v['fill_piv'], - vmin=v['vmin'], - vmax=v['vmax'], - title=v['title'], - units=v['units'], - metric='euclidean', - method='average', - idx=None, - clustermap_kwargs=dict(center=center, cmap=cmap) -) - -fig.savefig(output) diff --git a/workflows/colocalization/scripts/heatmap_env.yaml b/workflows/colocalization/scripts/heatmap_env.yaml deleted file mode 100644 index 668a0d76..00000000 --- a/workflows/colocalization/scripts/heatmap_env.yaml +++ /dev/null @@ -1,7 +0,0 @@ -channels: - - conda-forge -dependencies: - - matplotlib - - pandas - - seaborn - - scipy From 1e50d55a40b3e8446b81d8fca0a1f620b4050c17 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 29 Mar 2025 14:40:37 +0000 Subject: [PATCH 078/196] rm references and colocalization tests --- .circleci/config.yml | 39 
--------------------------------------- 1 file changed, 39 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 16e5b5f0..66de1446 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -140,7 +140,6 @@ variables: cp $ORIG/workflows/rnaseq/run_test.sh $DEPLOY/workflows/rnaseq/run_test.sh cp $ORIG/workflows/rnaseq/run_downstream_test.sh $DEPLOY/workflows/rnaseq/run_downstream_test.sh cp $ORIG/workflows/references/run_test.sh $DEPLOY/workflows/references/run_test.sh - # cp $ORIG/workflows/colocalization/run_test.sh $DEPLOY/workflows/colocalization/run_test.sh mkdir $DEPLOY/ci mkdir $DEPLOY/test @@ -271,17 +270,6 @@ variables: - # -------------------------------------------------------------------------- - # Standard colocalization workflow - colocalization-step: &colocalization-step - run: - name: colocalization workflow - command: | - cd $DEPLOY/workflows/colocalization - source /opt/miniforge/etc/profile.d/conda.sh - conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test colocalization --run-workflow -k -p -j2 --use-conda --orig $ORIG - # -------------------------------------------------------------------------- # Syntax note: All of the steps above, with their "&step-name" labels, can be # referred to by a corresponding "*step-name" below. 
The "<<: *defaults" @@ -389,23 +377,6 @@ jobs: - *get-data - *rnaseq-misc-step - # colocalization: - # <<: *defaults - # steps: - # - checkout - # - *restore_cache - # - *set-path - # - *get-data - # - *colocalization-step - - # references: - # <<: *defaults - # steps: - # - checkout - # - *restore_cache - # - *set-path - # - *get-data - # - *references-step build-docs: <<: *defaults @@ -479,14 +450,6 @@ workflows: requires: - initial-setup - pytest - # - references: - # requires: - # - initial-setup - # - pytest - # - colocalization: - # requires: - # - initial-setup - # - pytest - build-docs: requires: - initial-setup @@ -496,5 +459,3 @@ workflows: - rnaseq-misc - chipseq - chipseq-misc - - references - # - colocalization From f37e666fee4e5e979dec6306fc856ec5a2473c9f Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 2 Apr 2025 01:57:10 +0000 Subject: [PATCH 079/196] rm rnaseq_patterns --- workflows/rnaseq/config/rnaseq_patterns.yaml | 51 -------------------- 1 file changed, 51 deletions(-) delete mode 100644 workflows/rnaseq/config/rnaseq_patterns.yaml diff --git a/workflows/rnaseq/config/rnaseq_patterns.yaml b/workflows/rnaseq/config/rnaseq_patterns.yaml deleted file mode 100644 index 35681125..00000000 --- a/workflows/rnaseq/config/rnaseq_patterns.yaml +++ /dev/null @@ -1,51 +0,0 @@ -strand_check: - fastq: 'strand_check/{sample}/{sample}_R{n}.strandedness.fastq' - bam: 'strand_check/{sample}/{sample}.strandedness.bam' - tsv: 'strand_check/{sample}/{sample}.strandedness' -fastq: 'data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz' -sra_fastq: 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz' -cutadapt: 'data/rnaseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz' -bam: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam' -fastqc: - raw: 'data/rnaseq_samples/{sample}/fastqc/{sample}_R1.fastq.gz_fastqc.zip' - cutadapt: 'data/rnaseq_samples/{sample}/fastqc/{sample}_R1.cutadapt.fastq.gz_fastqc.zip' - bam: 
'data/rnaseq_samples/{sample}/fastqc/{sample}.cutadapt.bam_fastqc.zip' -libsizes: - fastq: 'data/rnaseq_samples/{sample}/{sample}_R1.fastq.gz.libsize' - cutadapt: 'data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize' - bam: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.libsize' -fastq_screen: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.screen.txt' -featurecounts: - per_sample: 'data/rnaseq_samples/{sample}/{sample}_featurecounts.txt' - aggregated: 'data/rnaseq_aggregation/featurecounts.txt' -libsizes_table: 'data/rnaseq_aggregation/libsizes_table.tsv' -libsizes_yaml: 'data/rnaseq_aggregation/libsizes_table_mqc.yaml' -rrna_percentages_table: 'data/rnaseq_aggregation/rrna_percentages_table.tsv' -rrna_percentages_yaml: 'data/rnaseq_aggregation/rrna_percentages_table_mqc.yaml' -rrna: - bam: 'data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam' - libsize: 'data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.libsize' -multiqc: 'data/rnaseq_aggregation/multiqc.html' -markduplicates: - bam: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam' - bai: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.bai' - metrics: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.metrics' -collectrnaseqmetrics: - metrics: 'data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics' -preseq: 'data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt' -salmon: 'data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf' -kallisto: 'data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5' -rseqc: - infer_experiment: 'data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt' - read_distribution: 'data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt' -bigwig: - pos: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig' - neg: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig' -downstream: - rnaseq: 'downstream/rnaseq.html' -patterns_by_aggregate: - merged_bigwig: 
'data/rnaseq_aggregation/merged_bigwigs/{merged_bigwig_label}.bigwig' -samtools: - idxstats: 'data/rnaseq_samples/{sample}/idxstat_{sample}.txt' - flagstat: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat' - stats: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats' From 6ab28076a7fc4724e6477586fdd836da5dd76b47 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 2 Apr 2025 01:57:47 +0000 Subject: [PATCH 080/196] move sra rule to separate file --- workflows/rnaseq/Snakefile | 43 ++------------------------------------ workflows/rnaseq/sra.smk | 40 +++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 41 deletions(-) create mode 100644 workflows/rnaseq/sra.smk diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 7247bbc2..470afbb8 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -36,47 +36,8 @@ rule all: include: "../references/Snakefile" -if utils.detect_sra(sampletable): - sampletable["orig_filename"] = expand( - "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=1 - ) - - if is_paired: - sampletable["orig_filename_R2"] = expand( - "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", - sample=SAMPLES, - n=2, - ) - - rule fastq_dump: - output: - fastq=expand( - "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", - n=n, - allow_missing=True, - ), - log: - "original_data/sra_samples/{sample}/{sample}.fastq.gz.log", - params: - is_paired=is_paired, - # extra="-X 100000", # [enable for test] - resources: - mem="1g", - disk="1g", - runtime="2h", - run: - srr = sampletable.loc[wildcards.sample, "Run"] - extra = params.get("extra", "") - if is_paired: - shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") - shell("mv {srr}_1.fastq.gz {output[0]}") - shell("mv {srr}_2.fastq.gz {output[1]}") - else: - shell( - "fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp" - ) 
- shell("mv {output[0]}.tmp {output[0]}") - +# If the sampletable is from SRA, handle it here. +include: "sra.smk" rule symlinks: diff --git a/workflows/rnaseq/sra.smk b/workflows/rnaseq/sra.smk new file mode 100644 index 00000000..5ee5f53b --- /dev/null +++ b/workflows/rnaseq/sra.smk @@ -0,0 +1,40 @@ +if utils.detect_sra(sampletable): + sampletable["orig_filename"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=1 + ) + + if is_paired: + sampletable["orig_filename_R2"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + sample=SAMPLES, + n=2, + ) + + rule fastq_dump: + output: + fastq=expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + n=n, + allow_missing=True, + ), + log: + "original_data/sra_samples/{sample}/{sample}.fastq.gz.log", + params: + is_paired=is_paired, + # extra="-X 100000", # [enable for test] + resources: + mem="1g", + disk="1g", + runtime="2h", + run: + srr = sampletable.loc[wildcards.sample, "Run"] + extra = params.get("extra", "") + if is_paired: + shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") + shell("mv {srr}_1.fastq.gz {output[0]}") + shell("mv {srr}_2.fastq.gz {output[1]}") + else: + shell( + "fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp" + ) + shell("mv {output[0]}.tmp {output[0]}") From 091e21538d03e6c6800f67cc8fdf121e30c9c22d Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 2 Apr 2025 01:58:12 +0000 Subject: [PATCH 081/196] move strand check to separate file --- workflows/rnaseq/Snakefile | 81 ++----------------------------- workflows/rnaseq/strand_check.smk | 75 ++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 78 deletions(-) create mode 100644 workflows/rnaseq/strand_check.smk diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 470afbb8..91c036d4 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -35,6 +35,9 
@@ rule all: include: "../references/Snakefile" +# Optionally run `snakemake strand_check` to do a preliminary run on +# automatically-subset data to evaluate strandedness. +include: "strand_check.smk" # If the sampletable is from SRA, handle it here. include: "sra.smk" @@ -66,84 +69,6 @@ rule symlink_targets: ), -# Optionally run ``snakemake strand_check`` to do a preliminary run on -# automatically-subset data to evaluate strandedness. -rule sample_strand_check: - input: - fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), - index=expand(rules.bowtie2_index.output, label="genome"), - bed12=rules.conversion_bed12.output, - output: - strandedness="strand_check/{sample}/{sample}.strandedness", - bam=temporary("strand_check/{sample}/{sample}.strandedness.bam"), - bai=temporary("strand_check/{sample}/{sample}.strandedness.bam.bai"), - fastqs=temporary( - expand( - "strand_check/{sample}/{sample}_R{n}.strandedness.fastq", - n=n, - allow_missing=True, - ) - ), - log: - "strand_check/{sample}/{sample}.strandedness.log", - threads: 6 - resources: - mem="8g", - runtime="2h", - run: - prefix = os.path.commonprefix(input.index).rstrip(".") - nreads = int(1e5 * 4) - if is_paired: - shell( - "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" - ) - shell( - "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}" - ) - fastqs = f"-1 {output.fastqs[0]} -2 {output.fastqs[1]} " - else: - shell( - "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" - ) - fastqs = f"-U {output.fastqs[0]} " - shell( - "bowtie2 " - "-x {prefix} " - "{fastqs} " - "--no-unal " - "--threads {threads} 2> {log} " - "| samtools view -Sb - " - "| samtools sort - -o {output.bam} " - ) - shell("samtools index {output.bam}") - shell( - "infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}" - ) - - -rule strand_check: - input: - expand("strand_check/{sample}/{sample}.strandedness", 
sample=SAMPLES), - output: - html="strand_check/strandedness.html", - filelist=temporary("strand_check/filelist"), - log: - "strand_check/strandedness.log", - resources: - mem="1g", - runtime="2h", - run: - with open(output.filelist, "w") as fout: - for i in input: - fout.write(i + "\n") - shell( - "multiqc " - "--force " - "--module rseqc " - "--file-list {output.filelist} " - "--filename {output.html} &> {log}" - ) - rule cutadapt: input: diff --git a/workflows/rnaseq/strand_check.smk b/workflows/rnaseq/strand_check.smk new file mode 100644 index 00000000..9c8a3467 --- /dev/null +++ b/workflows/rnaseq/strand_check.smk @@ -0,0 +1,75 @@ +rule sample_strand_check: + input: + fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), + index=expand(rules.bowtie2_index.output, label="genome"), + bed12=rules.conversion_bed12.output, + output: + strandedness="strand_check/{sample}/{sample}.strandedness", + bam=temporary("strand_check/{sample}/{sample}.strandedness.bam"), + bai=temporary("strand_check/{sample}/{sample}.strandedness.bam.bai"), + fastqs=temporary( + expand( + "strand_check/{sample}/{sample}_R{n}.strandedness.fastq", + n=n, + allow_missing=True, + ) + ), + log: + "strand_check/{sample}/{sample}.strandedness.log", + threads: 6 + resources: + mem="8g", + runtime="2h", + run: + prefix = os.path.commonprefix(input.index).rstrip(".") + nreads = int(1e5 * 4) + if is_paired: + shell( + "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" + ) + shell( + "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}" + ) + fastqs = f"-1 {output.fastqs[0]} -2 {output.fastqs[1]} " + else: + shell( + "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" + ) + fastqs = f"-U {output.fastqs[0]} " + shell( + "bowtie2 " + "-x {prefix} " + "{fastqs} " + "--no-unal " + "--threads {threads} 2> {log} " + "| samtools view -Sb - " + "| samtools sort - -o {output.bam} " + ) + shell("samtools 
index {output.bam}") + shell( + "infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}" + ) + + +rule strand_check: + input: + expand("strand_check/{sample}/{sample}.strandedness", sample=SAMPLES), + output: + html="strand_check/strandedness.html", + filelist=temporary("strand_check/filelist"), + log: + "strand_check/strandedness.log", + resources: + mem="1g", + runtime="2h", + run: + with open(output.filelist, "w") as fout: + for i in input: + fout.write(i + "\n") + shell( + "multiqc " + "--force " + "--module rseqc " + "--file-list {output.filelist} " + "--filename {output.html} &> {log}" + ) From baf15c923953d5d443c48fc7dec7a07b3afe6a0f Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 2 Apr 2025 01:58:28 +0000 Subject: [PATCH 082/196] move params that don't depend on config back into rules more params inside rules more params in rule --- workflows/chipseq/Snakefile | 26 ++++++---------- workflows/rnaseq/Snakefile | 60 +++++++++++++------------------------ 2 files changed, 30 insertions(+), 56 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 4dabbb52..5fbdbe1c 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -115,18 +115,6 @@ rule cutadapt: resources: mem="2g", runtime="2h", - params: - extra=( - ( - "--nextseq-trim 20 " - "--overlap 6 " - "--minimum-length 25 " - "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - ) - + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " - if is_paired - else "" - ), run: if is_paired: shell( @@ -134,7 +122,11 @@ rule cutadapt: "-o {output[0]} " "-p {output[1]} " "-j {threads} " - "{params.extra} " + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " "{input.fastq[0]} " "{input.fastq[1]} " "&> {log}" @@ -144,7 +136,10 @@ rule cutadapt: "cutadapt " "-o {output[0]} " "-j {threads} " - "{params.extra} " + "--nextseq-trim 20 " + 
"--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " "{input.fastq[0]} " "&> {log}" ) @@ -205,8 +200,6 @@ rule bowtie2: resources: mem="32g", runtime="2h", - params: - extra="", run: prefix = os.path.commonprefix(input.index).rstrip(".") sam = output.bam.replace(".bam", ".sam") @@ -222,7 +215,6 @@ rule bowtie2: "--no-unal " "--threads {threads} " "-S {sam} " - "{params.extra} " "> {log} 2>&1" ) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 91c036d4..d8239552 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -83,18 +83,6 @@ rule cutadapt: resources: mem="2g", runtime="2h", - params: - extra=( - ( - "--nextseq-trim 20 " - "--overlap 6 " - "--minimum-length 25 " - "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - ) - + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " - if is_paired - else "" - ), run: if is_paired: shell( @@ -102,6 +90,11 @@ rule cutadapt: "-o {output[0]} " "-p {output[1]} " "-j {threads} " + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " "{params.extra} " "{input.fastq[0]} " "{input.fastq[1]} " @@ -112,6 +105,10 @@ rule cutadapt: "cutadapt " "-o {output[0]} " "-j {threads} " + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " "{params.extra} " "{input.fastq[0]} " "&> {log}" @@ -390,21 +387,16 @@ rule rRNA: resources: mem="2g", runtime="2h", - params: - extra=( - "-k 1 " - "--no-unal " - ), run: prefix = os.path.commonprefix(input.index).rstrip(".") sam = output.bam.replace(".bam", ".sam") - shell( "bowtie2 " "-x {prefix} " "-U {input.fastq} " "--threads {threads} " - "{params.extra} " + "-k 1 " + "--no-unal " "-S {sam} " "> {log} 2>&1" ) @@ -630,13 +622,6 @@ rule salmon: resources: mem="32g", runtime="2h", - params: - extra=( - "--libType=A " - "--gcBias " - "--seqBias " - "--validateMappings " - ), run: outdir = 
os.path.dirname(output[0]) index_dir = os.path.dirname(input.index) @@ -649,7 +634,10 @@ rule salmon: "--index {index_dir} " "--output {outdir} " "--threads {threads} " - "{params.extra} " + "--libType=A " + "--gcBias " + "--seqBias " + "--validateMappings " "{fastq_arg} " "&> {log}" ) @@ -787,19 +775,16 @@ rule bigwig_neg: "fr-firststrand": "--filterRNAstrand reverse ", "fr-secondstrand": "--filterRNAstrand forward ", }[config["stranded"]], - extra=( - "--minMappingQuality 20 " - "--smoothLength 10 " - "--normalizeUsing BPM " # [disable for test] - ), run: shell( "bamCoverage " "--bam {input.bam} " "-o {output} " "-p {threads} " - "{params.extra} " "{params.strand_arg} " + "--minMappingQuality 20 " + "--smoothLength 10 " + "--normalizeUsing BPM " # [disable for test] "&> {log}" ) @@ -822,18 +807,15 @@ rule bigwig_pos: "fr-firststrand": "--filterRNAstrand forward ", "fr-secondstrand": "--filterRNAstrand reverse ", }[config["stranded"]], - extra=( - "--minMappingQuality 20 " - "--smoothLength 10 " - "--normalizeUsing BPM " # [disable for test] - ), run: shell( "bamCoverage " "--bam {input.bam} " "-o {output} " "-p {threads} " - "{params.extra} " + "--minMappingQuality 20 " + "--smoothLength 10 " + "--normalizeUsing BPM " # [disable for test] "{params.strand_arg} " "&> {log}" ) From 26d0834ca263054311927ab1ef4c3d6daefc5c2a Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 2 Apr 2025 01:58:59 +0000 Subject: [PATCH 083/196] support only star 1-pass mode --- workflows/references/Snakefile | 29 --- workflows/rnaseq/Snakefile | 276 +++++++--------------------- workflows/rnaseq/config/config.yaml | 2 - 3 files changed, 62 insertions(+), 245 deletions(-) diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile index 682f1bfe..6ee892f8 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -140,35 +140,6 @@ rule star_index: shell("ln -s {input.fasta} {genomedir}") -rule hisat2_index: - 
input: - f"{REFERENCES}/genome.fa", - output: - multiext( - f"{REFERENCES}/hisat2/genome", - ".1.ht2", - ".2.ht2", - ".3.ht2", - ".4.ht2", - ".5.ht2", - ".6.ht2", - ".7.ht2", - ".8.ht2", - ".fa", - ), - log: - f"{REFERENCES}/logs/hisat2.log", - resources: - mem="32g", - disk="50g", - runtime="8h", - threads: 8 - run: - index = os.path.commonprefix(output).rstrip(".") - shell("hisat2-build" " --threads {threads}" " {input}" " {index}" " &> {log}") - shell("ln -s {input} {output[-1]}") - - rule transcriptome_fasta: input: fasta=f"{REFERENCES}/genome.fa", diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index d8239552..dd736780 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -148,222 +148,70 @@ rule fastqc: shell("mv {out_html} {output.html}") -if config["aligner"] == "hisat2": - - rule hisat2: - input: - fastq=rules.cutadapt.output, - index=rules.hisat2_index.output, - output: - bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), - log: - "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log", - threads: 16 - resources: - mem="32g", - runtime="8h", - params: - extra="", - run: - prefix = os.path.commonprefix(input.index).rstrip(".") - sam = output.bam.replace(".bam", ".sam") - - if is_paired: - assert len(input.fastq) == 2 - fastqs = "-1 {0} -2 {1} ".format(*input.fastq) - else: - assert len(input.fastq) == 1 - fastqs = "-U {0} ".format(input.fastq) - shell( - "hisat2 " - "-x {prefix} " - "{fastqs} " - "--no-unal " - "--threads {threads} " - "-S {sam} " - "{params.extra} " - "> {log} 2>&1" - ) - - shell( - "samtools view -Sb {sam} " - "| samtools sort - -o {output.bam} -O BAM " - "&& rm {sam}" - ) - - - -if config["aligner"].startswith("star"): - if os.getenv("TMPDIR"): - tmpdir_arg = "--outTmpDir $TMPDIR/star " - else: - tmpdir_arg = "" - # STAR can be run in 1-pass or 2-pass modes. 
Since we may be running it - # more than once in almost the same way, we pull out the shell command here - # and use it below. - STAR_CMD = ( - "STAR " - "--runThreadN {threads} " - "--genomeDir {genomedir} " - "--readFilesIn {input.fastq} " - "--readFilesCommand zcat " - "--outFileNamePrefix {prefix} " - "{tmpdir_arg} " - "{params.extra} " - ) - STAR_PARAMS = ( - # NOTE: The STAR docs indicate that the following parameters are - # standard options for ENCODE long-RNA-seq pipeline. Comments are from - # the STAR docs. - "--outFilterType BySJout " # reduces number of spurious junctions - "--outFilterMultimapNmax 20 " # if more than this many multimappers, consider unmapped - "--alignSJoverhangMin 8 " # min overhang for unannotated junctions - "--alignSJDBoverhangMin 1 " # min overhang for annotated junctions - "--outFilterMismatchNmax 999 " # max mismatches per pair - "--outFilterMismatchNoverReadLmax 0.04 " # max mismatches per pair relative to read length - "--alignIntronMin 20 " # min intron length - "--alignIntronMax 1000000 " # max intron length - "--alignMatesGapMax 1000000 " # max distance between mates - "--outSAMunmapped None " # do not report aligned reads in output - ) - logfile_extensions = ["Log.progress.out", "Log.out", "Log.final.out", "Log.std.out"] - -if config["aligner"] == "star": - - rule star: - "Align with STAR (1-pass mode)" - input: - fastq=rules.cutadapt.output, - index=rules.star_index.output, - annotation=f"{REFERENCES}/annotation.gtf", - output: - bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), - sjout=temporary( - "data/rnaseq_samples/{sample}/{sample}.cutadapt.star.SJ.out.tab" - ), - log: - "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log", - threads: 16 - resources: - mem="64g", - runtime="8h", - disk="80g", - params: - extra=STAR_PARAMS, - run: - genomedir = os.path.dirname(input.index[0]) - outdir = os.path.dirname(output[0]) - prefix = output.bam.replace(".bam", ".star.") - shell( - STAR_CMD - + ( - 
"--outSAMtype BAM SortedByCoordinate " - "--outStd BAM_SortedByCoordinate > {output.bam} " - "2> {log} " - ) - ) - - # move various hard-coded log files to log directory - logfiles = expand(prefix + "{ext}", ext=logfile_extensions) - shell( - "mkdir -p {outdir}/star_logs " "&& mv {logfiles} {outdir}/star_logs" - ) - - -if config["aligner"] == "star-twopass": - - rule star_pass1: - "First pass of alignment with STAR to get the junctions" - input: - fastq=rules.cutadapt.output, - index=rules.star_index.output, - annotation=f"{REFERENCES}/annotation.gtf", - output: - sjout=temporary( - "data/rnaseq_samples/{sample}/{sample}.cutadapt.star.SJ.out.tab" - ), - log: - "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass1.log", - threads: 16 - resources: - mem="64g", - runtime="8h", - disk="80g", - params: - extra=STAR_PARAMS, - run: - genomedir = os.path.dirname(input.index[0]) - outdir = os.path.dirname(output[0]) - prefix = output.sjout.replace("SJ.out.tab", "") - shell( - STAR_CMD - + ( - # In this first pass, we don't actually care about the - # alignment -- just the detected junctions. So we output - # the SAM to /dev/null. 
- "--outStd SAM > /dev/null " - "2> {log} " - ) - ) - - # move various hard-coded log files to log directory - logfiles = expand(prefix + "{ext}", ext=logfile_extensions) - shell( - "mkdir -p {outdir}/star-pass1_logs " - "&& mv {logfiles} {outdir}/star-pass1_logs" - ) - - rule star_pass2: - """ - Second pass of alignment with STAR using splice junctions across all - samples to get the final BAM - """ - input: - fastq=rules.cutadapt.output, - index=rules.star_index.output, - annotation=f"{REFERENCES}/annotation.gtf", - sjout=expand(rules.star_pass1.output, sample=SAMPLES), - output: - bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), - sjout=temporary( - "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass2.SJ.out.tab" - ), - log: - "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass2.log", - threads: 16 - resources: - mem="64g", - runtime="8h", - disk="80g", - params: - extra=STAR_PARAMS, - run: - genomedir = os.path.dirname(input.index[0]) - outdir = os.path.dirname(output[0]) - prefix = output.bam.replace(".bam", ".star-pass2.") - shell( - STAR_CMD - + ( - # In contrast to pass 1, we will be keeping these BAMs -- - # so sort them - "--outSAMtype BAM SortedByCoordinate " - # Splice junction databases from all samples in the first - # pass. 
- "--sjdbFileChrStartEnd {input.sjout} " - "--outStd BAM_SortedByCoordinate > {output.bam} " - "2> {log} " - ) - ) - - # move various hard-coded log files to log directory - logfiles = expand(prefix + "{ext}", ext=logfile_extensions) - shell( - "mkdir -p {outdir}/star-pass2_logs " - "&& mv {logfiles} {outdir}/star-pass2_logs" - ) +rule star: + "Align with STAR (1-pass mode)" + input: + fastq=rules.cutadapt.output, + index=rules.star_index.output, + annotation=f"{REFERENCES}/annotation.gtf", + output: + bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), + sjout=temporary( + "data/rnaseq_samples/{sample}/{sample}.cutadapt.star.SJ.out.tab" + ), + log: + "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log", + threads: 16 + resources: + mem="64g", + runtime="8h", + disk="80g", + run: + genomedir = os.path.dirname(input.index[0]) + outdir = os.path.dirname(output[0]) + prefix = output.bam.replace(".bam", ".star.") + if os.getenv("TMPDIR"): + tmpdir_arg = "--outTmpDir $TMPDIR/star " + else: + tmpdir_arg = "" + shell( + "STAR " + "--runThreadN {threads} " + "--genomeDir {genomedir} " + "--readFilesIn {input.fastq} " + "--readFilesCommand zcat " + "--outFileNamePrefix {prefix} " + "{tmpdir_arg} " + "--outSAMtype BAM SortedByCoordinate " + "--outStd BAM_SortedByCoordinate > {output.bam} " + + # NOTE: The STAR docs indicate that the following parameters are + # standard options for ENCODE long-RNA-seq pipeline. Comments are from + # the STAR docs. 
+ "--outFilterType BySJout " # reduces number of spurious junctions + "--outFilterMultimapNmax 20 " # if more than this many multimappers, consider unmapped + "--alignSJoverhangMin 8 " # min overhang for unannotated junctions + "--alignSJDBoverhangMin 1 " # min overhang for annotated junctions + "--outFilterMismatchNmax 999 " # max mismatches per pair + "--outFilterMismatchNoverReadLmax 0.04 " # max mismatches per pair relative to read length + "--alignIntronMin 20 " # min intron length + "--alignIntronMax 1000000 " # max intron length + "--alignMatesGapMax 1000000 " # max distance between mates + "--outSAMunmapped None " # do not report aligned reads in output + "2> {log} " + ) - shell("rm -r {prefix}_STARgenome") + # move various hard-coded log files to log directory + logfile_extensions = + logfiles = expand( + prefix + "{ext}", + ext=["Log.progress.out", "Log.out", "Log.final.out", "Log.std.out"] + ) + shell( + "mkdir -p {outdir}/star_logs " + "&& mv {logfiles} {outdir}/star_logs" + ) rule rRNA: diff --git a/workflows/rnaseq/config/config.yaml b/workflows/rnaseq/config/config.yaml index 2cbd3d66..26f5aba9 100644 --- a/workflows/rnaseq/config/config.yaml +++ b/workflows/rnaseq/config/config.yaml @@ -23,5 +23,3 @@ patterns: 'config/rnaseq_patterns.yaml' stranded: 'fr-firststrand' # for dUTP libraries # 'fr-secondstrand' # for ligation libraries # 'unstranded' # for libraries without strand specificity - -aligner: 'star' From 76affb6447800547ce17dfa9ee85fcda3e93bd84 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 2 Apr 2025 02:02:41 +0000 Subject: [PATCH 084/196] updates to env.yml --- env.yml | 230 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 133 insertions(+), 97 deletions(-) diff --git a/env.yml b/env.yml index 9bbc8a71..a4341cb0 100644 --- a/env.yml +++ b/env.yml @@ -9,17 +9,20 @@ dependencies: - alsa-lib=1.2.13 - amply=0.1.6 - annotated-types=0.7.0 + - anyio=4.9.0 - appdirs=1.4.4 - - 
argcomplete=3.5.2 + - argcomplete=3.6.1 - argh=0.31.3 - argparse-dataclass=2.0.0 - asttokens=3.0.0 - - attrs=24.3.0 - - babel=2.16.0 - - beautifulsoup4=4.12.3 + - attrs=25.3.0 + - babel=2.17.0 + - backports=1.0 + - backports.tarfile=1.2.0 + - beautifulsoup4=4.13.3 - bedtools=2.31.1 - binutils_impl_linux-64=2.43 - - biopython=1.84 + - biopython=1.85 - boost-cpp=1.85.0 - bowtie=1.3.1 - bowtie2=2.5.4 @@ -30,9 +33,9 @@ dependencies: - bx-python=0.13.0 - bzip2=1.0.8 - c-ares=1.34.4 - - ca-certificates=2024.12.14 - - cairo=1.18.2 - - certifi=2024.12.14 + - ca-certificates=2025.1.31 + - cairo=1.18.4 + - certifi=2025.1.31 - cffi=1.17.1 - charset-normalizer=3.4.1 - click=8.1.8 @@ -49,16 +52,19 @@ dependencies: - configargparse=1.7 - connection_pool=0.0.3 - contourpy=1.3.1 - - curl=8.11.1 + - cryptography=44.0.2 + - curl=8.12.1 - cutadapt=5.0 - cycler=0.12.1 - - datrie=0.8.2 - - decorator=5.1.1 - - deeptools=3.5.5 + - dbus=1.13.6 + - decorator=5.2.1 + - deeptools=3.5.6 - deeptoolsintervals=0.1.9 + - distlib=0.3.9 - dnaio=1.2.2 - docutils=0.21.2 - dpath=2.2.0 + - editables=0.5 - eido=0.2.4 - epic2=0.0.52 - et_xmlfile=2.0.0 @@ -68,6 +74,7 @@ dependencies: - expat=2.6.4 - fastq-screen=0.16.0 - fastqc=0.12.1 + - filelock=3.18.0 - font-ttf-dejavu-sans-mono=2.37 - font-ttf-inconsolata=3.000 - font-ttf-source-code-pro=2.038 @@ -75,8 +82,8 @@ dependencies: - fontconfig=2.15.0 - fonts-conda-ecosystem=1 - fonts-conda-forge=1 - - fonttools=4.55.3 - - freetype=2.12.1 + - fonttools=4.56.0 + - freetype=2.13.3 - fribidi=1.0.10 - gcc_impl_linux-64=14.2.0 - gffread=0.12.7 @@ -88,38 +95,50 @@ dependencies: - graphite2=1.3.13 - gsl=1.16 - gxx_impl_linux-64=14.2.0 - - h2=4.1.0 - - harfbuzz=10.1.0 + - h11=0.14.0 + - h2=4.2.0 + - harfbuzz=11.0.0 + - hatch=1.14.0 + - hatchling=1.27.0 - hdf5=1.14.3 - hisat2=2.2.1 - - hpack=4.0.0 + - hpack=4.1.0 - html5lib=1.1 - htslib=1.21 + - httpcore=1.0.7 + - httpx=0.28.1 - humanfriendly=10.0 - - humanize=4.11.0 - - hyperframe=6.0.1 + - humanize=4.12.2 + - 
hyperframe=6.1.0 + - hyperlink=21.0.0 - icu=75.1 - idna=3.10 - imagesize=1.4.1 - immutables=0.21 - - importlib-metadata=8.5.0 + - importlib-metadata=8.6.1 - importlib_resources=6.5.2 - iniconfig=2.0.0 - intervalstats=1.01 - - ipython=8.31.0 - - isa-l=2.31.0 + - ipython=9.0.2 + - ipython_pygments_lexers=1.1.1 + - isa-l=2.31.1 + - jaraco.classes=3.4.0 + - jaraco.context=6.0.1 + - jaraco.functools=4.1.0 - jedi=0.19.2 - - jinja2=3.1.5 + - jeepney=0.9.0 + - jinja2=3.1.6 - jsonschema=4.23.0 - jsonschema-specifications=2024.10.1 - jupyter_core=5.7.2 - kaleido-core=0.2.1 - kallisto=0.51.1 - kernel-headers_linux-64=3.10.0 + - keyring=25.6.0 - keyutils=1.6.1 - kiwisolver=1.4.7 - krb5=1.21.3 - - lcms2=2.16 + - lcms2=2.17 - ld_impl_linux-64=2.43 - lerc=4.0.0 - libaec=1.1.3 @@ -132,36 +151,36 @@ dependencies: - libbrotlienc=1.1.0 - libcblas=3.9.0 - libcups=2.3.3 - - libcurl=8.11.1 - - libdeflate=1.23 - - libedit=3.1.20240808 + - libcurl=8.12.1 + - libdeflate=1.22 + - libedit=3.1.20250104 - libev=4.33 - libexpat=2.6.4 - - libffi=3.4.2 + - libffi=3.4.6 - libgcc=14.2.0 - libgcc-devel_linux-64=14.2.0 - libgcc-ng=14.2.0 - libgd=2.3.3 + - libgff=2.0.0 - libgfortran=14.2.0 - - libgfortran-ng=14.2.0 - libgfortran5=14.2.0 - - libglib=2.82.2 + - libglib=2.84.0 - libgomp=14.2.0 - libhwloc=2.11.2 - - libiconv=1.17 + - libiconv=1.18 - libjemalloc=5.3.0 - libjpeg-turbo=3.0.0 - liblapack=3.9.0 - liblapacke=3.9.0 - - liblzma=5.6.3 - - liblzma-devel=5.6.3 + - liblzma=5.6.4 + - liblzma-devel=5.6.4 - libnghttp2=1.64.0 - libnsl=2.0.1 - - libopenblas=0.3.28 - - libopenssl-static=3.4.0 - - libpng=1.6.45 + - libopenblas=0.3.29 + - libopenssl-static=3.4.1 + - libpng=1.6.47 - libsanitizer=14.2.0 - - libsqlite=3.47.2 + - libsqlite=3.49.1 - libssh2=1.11.1 - libstdcxx=14.2.0 - libstdcxx-devel_linux-64=14.2.0 @@ -171,41 +190,44 @@ dependencies: - libwebp-base=1.5.0 - libxcb=1.17.0 - libxcrypt=4.4.36 - - libxml2=2.13.5 + - libxml2=2.13.7 - libzlib=1.3.1 - logmuse=0.2.8 - - logomaker=0.8 + - logomaker=0.8.6 
- macs2=2.2.9.1 - make=4.4.1 - markdown=3.6 - markdown-it-py=3.0.0 - markupsafe=3.0.2 - mathjax=2.7.7 - - matplotlib-base=3.10.0 + - matplotlib-base=3.10.1 - matplotlib-inline=0.1.7 - mdurl=0.1.2 - - multiqc=1.26 + - more-itertools=10.6.0 + - multiqc=1.28 - munkres=1.1.4 - mysql-connector-c=6.1.11 + - narwhals=1.32.0 - natsort=8.4.0 - nbformat=5.10.4 - - ncbi-vdb=3.1.1 + - ncbi-vdb=3.2.1 - ncurses=6.5 - networkx=3.4.2 - nspr=4.36 - - nss=3.107 - - numpy=2.2.1 + - nss=3.110 + - numpy=2.2.4 - numpydoc=1.8.0 - - openjdk=23.0.1 + - openjdk=23.0.2 - openjpeg=2.5.3 - openpyxl=3.1.5 - - openssl=3.4.0 + - openssl=3.4.1 - ossuuid=1.6.2 - packaging=24.2 - pandas=2.2.3 - - pandoc=3.6.1 - - pango=1.54.0 + - pandoc=3.6.4 + - pango=1.56.3 - parso=0.8.4 + - pathspec=0.12.1 - patsy=1.0.1 - pbzip2=1.1.13 - pcre2=10.44 @@ -226,7 +248,7 @@ dependencies: - perl-file-path=2.18 - perl-file-temp=0.2304 - perl-file-which=1.24 - - perl-gd=2.56 + - perl-gd=2.83 - perl-gdgraph=1.54 - perl-gdtextutil=0.86 - perl-importer=0.026 @@ -237,6 +259,7 @@ dependencies: - perl-sub-info=0.002 - perl-term-table=0.024 - perl-test-fatal=0.016 + - perl-test-nowarnings=1.06 - perl-test-warnings=0.031 - perl-test2-suite=0.000163 - perl-try-tiny=0.31 @@ -250,40 +273,41 @@ dependencies: - pickleshare=0.7.5 - pigz=2.8 - pillow=11.1.0 - - pip=24.3.1 + - pip=25.0.1 - pixman=0.44.2 - pkgutil-resolve-name=1.3.10 - plac=1.4.3 - - platformdirs=4.3.6 - - plotly=5.24.1 + - platformdirs=4.3.7 + - plotly=6.0.1 - pluggy=1.5.0 - preseq=2.0.2 - - prompt-toolkit=3.0.48 - - psutil=6.1.1 + - prompt-toolkit=3.0.50 + - psutil=7.0.0 - pthread-stubs=0.4 - ptyprocess=0.7.0 - pulp=2.8.0 - pure_eval=0.2.3 - - py2bit=0.3.0 - - pyaml-env=1.2.1 - - pybedtools=0.11.0 - - pybigwig=0.3.23 + - py2bit=0.3.3 + - pyaml-env=1.2.2 + - pybedtools=0.12.0 + - pybigwig=0.3.24 - pycparser=2.22 - - pydantic=2.10.4 + - pydantic=2.10.6 - pydantic-core=2.27.2 - pyfaidx=0.8.1.3 - pygments=2.19.1 - - pyparsing=3.2.1 + - pyparsing=3.2.3 - pysam=0.22.1 - 
pysocks=1.7.1 - - pytest=8.3.4 + - pytest=8.3.5 - pytest-xdist=3.6.1 - python=3.11.11 - python-dateutil=2.9.0.post0 + - python-dotenv=1.1.0 - python-fastjsonschema=2.21.1 - - python-isal=1.7.1 + - python-isal=1.7.2 - python-kaleido=0.2.1 - - python-tzdata=2024.2 + - python-tzdata=2025.2 - python-zlib-ng=0.5.1 - python_abi=3.11 - pytz=2024.1 @@ -292,67 +316,76 @@ dependencies: - qhull=2020.2 - r-base=4.2.3 - readline=8.2 - - referencing=0.35.1 + - referencing=0.36.2 + - regex=2024.11.6 - requests=2.32.3 - reretry=0.11.8 - rich=13.9.4 - - rich-click=1.8.5 - - rpds-py=0.22.3 + - rich-click=1.8.8 + - roman-numerals-py=3.1.0 + - rpds-py=0.24.0 - rseqc=5.0.4 - salmon=1.10.3 - samtools=1.21 - - scipy=1.15.0 + - scipy=1.15.2 - seaborn=0.13.2 - seaborn-base=0.13.2 + - secretstorage=3.3.3 - sed=4.8 - - setuptools=75.6.0 + - setuptools=75.8.2 - shellingham=1.5.4 - - simplejson=3.19.3 + - simplejson=3.20.1 - six=1.17.0 - - slack-sdk=3.34.0 - - slack_sdk=3.34.0 + - slack-sdk=3.35.0 + - slack_sdk=3.35.0 - smart_open=7.1.0 - - smmap=5.0.0 - - snakemake=8.27.0 + - smmap=5.0.2 + - snakemake=9.1.3 - snakemake-interface-common=1.17.4 - - snakemake-interface-executor-plugins=9.3.3 + - snakemake-interface-executor-plugins=9.3.5 + - snakemake-interface-logger-plugins=1.2.3 - snakemake-interface-report-plugins=1.1.0 - - snakemake-interface-storage-plugins=3.3.0 - - snakemake-minimal=8.27.0 + - snakemake-interface-storage-plugins=4.2.1 + - snakemake-minimal=9.1.3 + - sniffio=1.3.1 - snowballstemmer=2.2.0 - soupsieve=2.5 - spectra=0.0.11 - - sphinx=8.1.3 + - sphinx=8.2.3 - sphinxcontrib-applehelp=2.0.0 - sphinxcontrib-devhelp=2.0.0 - sphinxcontrib-htmlhelp=2.1.0 - sphinxcontrib-jsmath=1.0.1 - sphinxcontrib-qthelp=2.0.0 - sphinxcontrib-serializinghtml=1.1.10 - - sqlite=3.47.2 - - sra-tools=3.1.1 + - sqlite=3.49.1 + - sra-tools=3.2.0 - stack_data=0.6.3 + - staden_io_lib=1.15.0 - star=2.7.11b - statsmodels=0.14.4 - subread=2.0.8 - sysroot_linux-64=2.17 - tabulate=0.9.0 - tbb=2022.0.0 - - 
tenacity=9.0.0 - throttler=1.2.2 + - tiktoken=0.9.0 - tk=8.6.13 - tktable=2.10 - tomli=2.2.1 + - tomli-w=1.2.0 + - tomlkit=0.13.2 - tqdm=4.67.1 - trackhub=1.0 - traitlets=5.14.3 - - typeguard=4.4.1 - - typer=0.15.1 - - typer-slim=0.15.1 - - typer-slim-standard=0.15.1 - - typing-extensions=4.12.2 - - typing_extensions=4.12.2 - - tzdata=2024b + - trove-classifiers=2025.3.19.19 + - typeguard=4.4.2 + - typer=0.15.2 + - typer-slim=0.15.2 + - typer-slim-standard=0.15.2 + - typing-extensions=4.13.0 + - typing_extensions=4.13.0 + - tzdata=2025b - ubiquerg=0.8.0 - ucsc-bedgraphtobigwig=472 - ucsc-bedsort=469 @@ -366,17 +399,20 @@ dependencies: - ucsc-stringify=472 - ucsc-twobittofa=472 - ucsc-wigtobigwig=472 - - unicodedata2=15.1.0 + - unicodedata2=16.0.0 - urllib3=2.3.0 + - userpath=1.9.2 + - uv=0.6.10 - veracitools=0.1.3 + - virtualenv=20.29.3 - wcwidth=0.2.13 - webencodings=0.5.1 - wheel=0.45.1 - - wrapt=1.17.0 + - wrapt=1.17.2 - xopen=2.0.2 - xorg-libice=1.1.2 - - xorg-libsm=1.2.5 - - xorg-libx11=1.8.10 + - xorg-libsm=1.2.6 + - xorg-libx11=1.8.12 - xorg-libxau=1.0.12 - xorg-libxdmcp=1.1.5 - xorg-libxext=1.3.6 @@ -386,13 +422,13 @@ dependencies: - xorg-libxrender=0.9.12 - xorg-libxt=1.3.1 - xorg-libxtst=1.2.5 - - xz=5.6.3 - - xz-gpl-tools=5.6.3 - - xz-tools=5.6.3 + - xz=5.6.4 + - xz-gpl-tools=5.6.4 + - xz-tools=5.6.4 - yaml=0.2.5 - - yte=1.5.5 + - yte=1.7.0 - zipp=3.21.0 - zlib=1.3.1 - - zlib-ng=2.2.3 + - zlib-ng=2.2.4 - zstandard=0.23.0 - - zstd=1.5.6 + - zstd=1.5.7 From 0937a0a56bb5136ebd96f6ebecd90c16007b2156 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 2 Oct 2025 17:01:41 +0000 Subject: [PATCH 085/196] add draft of decisions.rst --- docs/decisions.rst | 85 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 docs/decisions.rst diff --git a/docs/decisions.rst b/docs/decisions.rst new file mode 100644 index 00000000..7abbc78d --- /dev/null +++ b/docs/decisions.rst @@ -0,0 +1,85 
@@
+Decision log
+============
+
+This document keeps track of the reasoning behind various architecture decisions.
+
+References
+----------
+Here are use-cases we have that are common enough to warrant supporting:
+
+- References should support multiple workflows (ChIP-seq, RNA-seq, etc)
+ - This implies that the references dir should be in the
+ ``workflows`` directory or above.
+ - For example, this may mean a STAR index for RNA-seq, a bowtie2 index for
+ rRNA contamination, and another bowtie2 index for ChIP-seq.
+
+- References should support different organisms in different workflows. There
+ should be only one organism per workflow though.
+
+- References should be re-created for each project.
+ - What we've found is that if we have a central location for the references
+ (shared by multiple deployments of lcdb-wf over the years) then we get
+ conflicts where one deployment's aligner version is more recent, causing
+ errors when using the index for an older version.
+ - To keep using this, we'd need to version indexes based on aligner version.
+ - However, when writing up methods for a paper we need to be able to trace
+ back what commands were run to generate the reference, including additional
+ patching that may have taken place (as is supported by the references
+ workflow).
+ - Re-using indexes is space- and time-efficient in the short term, but has
+ shown to be inefficient in time and reproducibility in the long term.
+ - Keeping everything in the same deployment directory also helps with the
+ archiving process.
+
+Naming:
+
+- Top level should be organism. Doesn't really matter in the case of
+ a single-organism workflow.
+- Next should be what has historically been called "tag". This could be the
+ assembly name for genomic indexes, or some combination of assembly +
+ annotation for transcriptome.
+- If we're assuming "deployment-local" references, these no longer have to be
If we have a mouse reference with a transgene, we can just + call it "mouse/mm39" but have the transgene patched into it, and not worry + about conflicting (or worse, overwriting!) a central reference with the same + name that didn't have the transgene. +- Fasta files are included next to their respective index. + +This example uses the ``dmel`` organism and ``test`` tag which is configured by +default for tests. + +This uses ``$ORG/$TAG//$TOOL`` as the path +template. This lets us keep the fastq file used for building the various +indexes alongside the indexes. + +:: + + references_data/ + ├── dmel + ├── rRNA + │ └── genome + │ ├── bowtie2 + │ │ └── dmel_rRNA.* + │ └── dmel_rRNA.fasta + └── test + ├── annotation + │ ├── dmel_test.bed12 + │ ├── dmel_test.gtf + │ └── dmel_test.refflat + ├── genome + │ ├── bowtie2 + │ │ └── dmel_test.* + │ ├── star + │ │ └── dmel_test + │ │ └── + │ ├── dmel_test.chromsizes + │ ├── dmel_test.fasta + │ ├── dmel_test.fasta.fai + └── transcriptome + ├── kallisto + │ └── dmel_test + │ └── transcripts.idx + ├── salmon + │ └── dmel_test + │ └── + └── dmel_test.fasta From 4a570e09051e1060930939a13a457e9b768f835b Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 10 Jul 2025 09:28:19 -0400 Subject: [PATCH 086/196] default to conda rather than mamba as front-end --- deploy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy.py b/deploy.py index 7ad7e1ac..38df1687 100755 --- a/deploy.py +++ b/deploy.py @@ -367,7 +367,7 @@ def build_envs(dest, conda_frontend="mamba"): ap.add_argument( "--conda-frontend", help="Set program (conda or mamba) to use when creating environments. 
Default is %(default)s.", - default="mamba", + default="conda", ) ap.add_argument( "--rsync-args", From e03f8816dca74aaed151a0a385a8d786c9e877b9 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 10 Jul 2025 09:28:42 -0400 Subject: [PATCH 087/196] support additional packages during deployment i.e., snakemake-executor-plugin-cluster-generic --- deploy.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/deploy.py b/deploy.py index 38df1687..c5c7cb39 100755 --- a/deploy.py +++ b/deploy.py @@ -267,7 +267,7 @@ def deployment_json(source, dest): info("Wrote details of deployment to {log}".format(**locals())) -def build_envs(dest, conda_frontend="mamba"): +def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="conda"): """ Build conda environments. @@ -279,15 +279,25 @@ def build_envs(dest, conda_frontend="mamba"): the command line with --dest) in which the env and env-r yaml files should already exist. Envs will be created in here. + additional_main : list + Other packages to install, e.g., a snakemake plugin needed for + a cluster profile, into the main environment. + + additional_r : list + Other packages to install into the R environment. 
+ conda_frontend : 'mamba' | 'conda' Which front-end to use (terminology borrowed from Snakemake) + """ mapping = [ - ("./env", "env.yml"), - ("./env-r", "env-r.yml"), + ("./env", "env.yml", additional_main), + ("./env-r", "env-r.yml", additional_r), ] - for env, yml in mapping: + for env, yml, additional in mapping: info("Building environment " + os.path.join(dest, env)) + if additional: + info(f"Adding {additional} to environment") try: # conda and mamba can be hard to kill, possibly because they're @@ -305,6 +315,8 @@ def build_envs(dest, conda_frontend="mamba"): "--file", yml, ] + if additional: + cmds += additional p = sp.Popen(cmds, universal_newlines=True, cwd=dest) p.wait() @@ -375,6 +387,20 @@ def build_envs(dest, conda_frontend="mamba"): default="-rlt" ) + ap.add_argument( + "--additional-main", + help="""Additional packages to install in main environment (only + relevant with --build-envs). For example, + 'snakemake-executor-plugin-cluster-generic' to support a cluster + profile.""", + nargs="+" + ) + ap.add_argument( + "--additional-r", + help="Additional packages to install in R environment (only relevant with --build-envs)", + nargs="+" + ) + ap.add_argument( "--mismatch-ok", action="store_true", @@ -398,7 +424,28 @@ def build_envs(dest, conda_frontend="mamba"): rsync(include, source, dest, args.rsync_args) deployment_json(source, dest) + if args.additional_main and additional_main_from_env_var: + print( + "ERROR: Unset LCDBWF_ADDITIONAL_MAIN env var if you want to use the --additional-main argument." + ) + sys.exit(1) + + if additional_main_from_env_var: + if args.additional_main: + print( + "ERROR: Unset LCDBWF_ADDITIONAL_MAIN env var if you want to use the --additional-main argument." 
+ ) + sys.exit(1) + additional_main = [additional_main_from_env_var] + else: + additional_main = args.additional_main + if args.build_envs: - build_envs(dest, conda_frontend=args.conda_frontend) + build_envs( + dest, + additional_main=additional_main, + additional_r=args.additional_r, + conda_frontend=args.conda_frontend, + ) warning("Deployment complete in {args.dest}".format(**locals())) From e86bbc216d50fdaab0d23b71ced279b5d0204701 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 10 Jul 2025 09:53:53 -0400 Subject: [PATCH 088/196] pep8 on deploy.py --- deploy.py | 121 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 63 insertions(+), 58 deletions(-) diff --git a/deploy.py b/deploy.py index c5c7cb39..4396654f 100755 --- a/deploy.py +++ b/deploy.py @@ -8,14 +8,13 @@ import subprocess as sp import datetime import json -import fnmatch import logging import hashlib from pathlib import Path from distutils import filelist # Determine default staging area, used in help -default_staging = "/tmp/{0}-lcdb-wf-staging".format(os.getenv('USER')) +default_staging = "/tmp/{0}-lcdb-wf-staging".format(os.getenv("USER")) usage = f""" This script assists in the deployment of relevant code from the lcdb-wf @@ -74,52 +73,51 @@ def error(s): logging.error(RED + s + RESET) -def write_include_file(source, flavor='all'): +def write_include_file(source, flavor="all"): # Patterns follow that of MANIFEST.in # (https://packaging.python.org/en/latest/guides/using-manifest-in/), # and distutils.filelist is used below to parse them. 
PATTERN_DICT = { - 'rnaseq': [ - 'include workflows/rnaseq/Snakefile', - 'recursive-include workflows/rnaseq/config *', - 'include workflows/rnaseq/rnaseq_trackhub.py', - 'recursive-include workflows/rnaseq/downstream *.Rmd', - 'recursive-include workflows/rnaseq/downstream *.yaml', + "rnaseq": [ + "include workflows/rnaseq/Snakefile", + "recursive-include workflows/rnaseq/config *", + "include workflows/rnaseq/rnaseq_trackhub.py", + "recursive-include workflows/rnaseq/downstream *.Rmd", + "recursive-include workflows/rnaseq/downstream *.yaml", ], - 'chipseq': [ - 'include workflows/chipseq/Snakefile', - 'recursive-include workflows/chipseq/config *', - 'include workflows/chipseq/chipseq_trackhub.py', + "chipseq": [ + "include workflows/chipseq/Snakefile", + "recursive-include workflows/chipseq/config *", + "include workflows/chipseq/chipseq_trackhub.py", ], - 'all': [ - 'recursive-include wrappers *', - 'recursive-include include *', - 'recursive-include lib *', - 'include env.yml env-r.yml .gitignore', - 'include workflows/references/Snakefile', - 'recursive-include workflows/references/config *', - 'global-exclude __pycache__', + "all": [ + "recursive-include wrappers *", + "recursive-include include *", + "recursive-include lib *", + "include env.yml env-r.yml .gitignore", + "include workflows/references/Snakefile", + "recursive-include workflows/references/config *", + "global-exclude __pycache__", + ], + "full": [ + "include workflows/colocalization/Snakefile", + "recursive-include workflows/colocalization/config *", + "recursive-include workflows/colocalization/scripts *", + "recursive-include workflows/figures *", + "recursive-include workflows/external *", ], - 'full': [ - 'include workflows/colocalization/Snakefile', - 'recursive-include workflows/colocalization/config *', - 'recursive-include workflows/colocalization/scripts *', - 'recursive-include workflows/figures *', - 'recursive-include workflows/external *', - ] - } patterns = [] - if flavor in 
('full', 'rnaseq'): - patterns.extend(PATTERN_DICT['rnaseq']) - if flavor in ('full', 'chipseq'): - patterns.extend(PATTERN_DICT['chipseq']) - if flavor == 'full': - patterns.extend(PATTERN_DICT['full']) - patterns.extend(PATTERN_DICT['all']) + if flavor in ("full", "rnaseq"): + patterns.extend(PATTERN_DICT["rnaseq"]) + if flavor in ("full", "chipseq"): + patterns.extend(PATTERN_DICT["chipseq"]) + if flavor == "full": + patterns.extend(PATTERN_DICT["full"]) + patterns.extend(PATTERN_DICT["all"]) def fastwalk(path): """ @@ -128,13 +126,13 @@ def fastwalk(path): """ path = str(path) for root, dirs, files in os.walk(path, topdown=True): - if 'conda-meta' in dirs: + if "conda-meta" in dirs: dirs[:] = [] files[:] = [] for d in dirs: - yield os.path.join(root, d).replace(path + '/', '') + yield os.path.join(root, d).replace(path + "/", "") for f in files: - yield os.path.join(root, f).replace(path + '/', '') + yield os.path.join(root, f).replace(path + "/", "") f = filelist.FileList() f.allfiles = list(fastwalk(source)) @@ -153,9 +151,9 @@ def fastwalk(path): to_transfer = list(set(under_version_control).intersection(f.files)) include = tempfile.NamedTemporaryFile(delete=False).name - with open(include, 'w') as fout: - fout.write('\n\n') - fout.write('\n'.join(to_transfer)) + with open(include, "w") as fout: + fout.write("\n\n") + fout.write("\n".join(to_transfer)) return include @@ -188,8 +186,8 @@ def check_md5(f): full_here = Path(__file__).resolve() full_there = Path(dest) / "deploy.py" error( - "Files {full_here} and {full_there} do not match! ".format(**locals()) + - "The deploy script you are running appears to be out of date. " + f"Files {full_here} and {full_there} do not match! " + + "The deploy script you are running appears to be out of date. 
" "Please get an updated copy from https://github.com/lcdb/lcdb-wf, perhaps " "with 'wget https://raw.githubusercontent.com/lcdb/lcdb-wf/master/deploy.py'" ) @@ -322,16 +320,21 @@ def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="co except KeyboardInterrupt: print("") - error("Killing running {conda_frontend} job, '".format(**locals()) + " ".join(cmds)) + error( + "Killing running {conda_frontend} job, '".format(**locals()) + + " ".join(cmds) + ) p.kill() sys.exit(1) if p.returncode: - error("Error running {conda_frontend}, '".format(**locals()) + " ".join(cmds)) + error( + "Error running {conda_frontend}, '".format(**locals()) + " ".join(cmds) + ) sys.exit(1) full_env = Path(dest) / env - info("Created env {full_env}".format(**locals())) + info(f"Created env {full_env}") if __name__ == "__main__": @@ -340,7 +343,9 @@ def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="co ap.add_argument( "--flavor", default="full", - help="""Options are {0}. Default is full.""".format(['full', 'rnaseq', 'chipseq']), + help="""Options are {0}. Default is full.""".format( + ["full", "rnaseq", "chipseq"] + ), ) ap.add_argument( "--dest", help="""Destination directory in which to copy files""", required=True @@ -352,7 +357,7 @@ def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="co help=f"""Make a new clone to a staging area (at the location specified by --staging which defaults to {default_staging}) and deploy from there. Useful if using this script as a standalone tool. You can also - use --branch to configure which branch to deploy from that clone.""" + use --branch to configure which branch to deploy from that clone.""", ) ap.add_argument( @@ -384,7 +389,7 @@ def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="co ap.add_argument( "--rsync-args", help="Options for rsync when deploying to a new directory. 
Default is %(default)s.", - default="-rlt" + default="-rlt", ) ap.add_argument( @@ -393,25 +398,25 @@ def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="co relevant with --build-envs). For example, 'snakemake-executor-plugin-cluster-generic' to support a cluster profile.""", - nargs="+" + nargs="+", ) ap.add_argument( "--additional-r", help="Additional packages to install in R environment (only relevant with --build-envs)", - nargs="+" + nargs="+", ) - ap.add_argument( - "--mismatch-ok", - action="store_true", - help="Used for testing") + ap.add_argument("--mismatch-ok", action="store_true", help="Used for testing") args = ap.parse_args() dest = args.dest flavor = args.flavor if args.staging and not args.clone: - print("ERROR: --staging was specified but --clone was not. Did you want to use --clone?", file=sys.stderr) - sys.exit(1) + print( + "ERROR: --staging was specified but --clone was not. Did you want to use --clone?", + file=sys.stderr, + ) + sys.exit(1) if args.clone: if args.staging is None: args.staging = default_staging From b1fc75e62bcc750ba6faafc8830ee6982d19dc04 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Mon, 14 Jul 2025 08:53:34 -0400 Subject: [PATCH 089/196] support for setting additional-main from env var --- deploy.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/deploy.py b/deploy.py index 4396654f..0c6b2e6e 100755 --- a/deploy.py +++ b/deploy.py @@ -339,6 +339,8 @@ def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="co if __name__ == "__main__": + additional_main_from_env_var = os.getenv("LCDBWF_ADDITIONAL_MAIN", []) + ap = argparse.ArgumentParser(usage=usage) ap.add_argument( "--flavor", @@ -397,7 +399,8 @@ def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="co help="""Additional packages to install in main environment (only relevant with --build-envs). 
For example, 'snakemake-executor-plugin-cluster-generic' to support a cluster - profile.""", + profile. You can use the env var LCDBWF_ADDITIONAL_MAIN to supply this + argument automatically instead.""", nargs="+", ) ap.add_argument( From 6eb46333f0254b66fcae1bed4a348239fc9993fb Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 29 Jul 2025 21:55:33 -0400 Subject: [PATCH 090/196] deploy.py actually installs additional --- deploy.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/deploy.py b/deploy.py index 0c6b2e6e..1981804c 100755 --- a/deploy.py +++ b/deploy.py @@ -294,8 +294,6 @@ def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="co ] for env, yml, additional in mapping: info("Building environment " + os.path.join(dest, env)) - if additional: - info(f"Adding {additional} to environment") try: # conda and mamba can be hard to kill, possibly because they're @@ -313,8 +311,12 @@ def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="co "--file", yml, ] + p = sp.Popen(cmds, universal_newlines=True, cwd=dest) + p.wait() + if additional: - cmds += additional + info(f"Adding {additional} to environment") + cmds = [conda_frontend, "install", "-y", "-p", env] + additional p = sp.Popen(cmds, universal_newlines=True, cwd=dest) p.wait() @@ -432,12 +434,6 @@ def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="co rsync(include, source, dest, args.rsync_args) deployment_json(source, dest) - if args.additional_main and additional_main_from_env_var: - print( - "ERROR: Unset LCDBWF_ADDITIONAL_MAIN env var if you want to use the --additional-main argument." 
- ) - sys.exit(1) - if additional_main_from_env_var: if args.additional_main: print( From 93b89a6ee06c62f92678acbcb7f8ffdb686f97e0 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 2 Oct 2025 14:07:30 -0400 Subject: [PATCH 091/196] try disabling pre-install --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 66de1446..b216a899 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -104,7 +104,7 @@ variables: # https://docs.conda.io/projects/conda-build/en/latest/resources/link-scripts.html, # post-link scripts should not depend on any installed or # to-be-installed conda packages...but they do. - conda install -n base r-base yq + # conda install -n base r-base yq time conda env create -n $LCDBWF_ENV --file env.yml time conda env create -n $LCDBWF_ENV_R --file env-r.yml From d2ebe753bebcc18215fdb08bfe72c89f5199508a Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 3 Oct 2025 09:08:16 -0400 Subject: [PATCH 092/196] don't copy test runner for references --- .circleci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index b216a899..b28f0491 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -139,7 +139,6 @@ variables: cp $ORIG/workflows/chipseq/run_test.sh $DEPLOY/workflows/chipseq/run_test.sh cp $ORIG/workflows/rnaseq/run_test.sh $DEPLOY/workflows/rnaseq/run_test.sh cp $ORIG/workflows/rnaseq/run_downstream_test.sh $DEPLOY/workflows/rnaseq/run_downstream_test.sh - cp $ORIG/workflows/references/run_test.sh $DEPLOY/workflows/references/run_test.sh mkdir $DEPLOY/ci mkdir $DEPLOY/test From 22d414fe0bc37d994c12cafbe70794bf80c53965 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 4 Oct 2025 00:56:22 +0000 Subject: [PATCH 093/196] fix typo --- workflows/rnaseq/Snakefile | 1 - 1 file 
changed, 1 deletion(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index dd736780..111b61b5 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -203,7 +203,6 @@ rule star: ) # move various hard-coded log files to log directory - logfile_extensions = logfiles = expand( prefix + "{ext}", ext=["Log.progress.out", "Log.out", "Log.final.out", "Log.std.out"] From e91b14ead8f28e572681272c7055222e315a6984 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 4 Oct 2025 01:34:50 +0000 Subject: [PATCH 094/196] new syntax style for markduplicates --- workflows/chipseq/Snakefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 5fbdbe1c..9c2767d4 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -302,11 +302,11 @@ rule markduplicates: "picard " "{params.java_args} " "MarkDuplicates " - "INPUT={input.bam} " - "OUTPUT={output.bam} " - "REMOVE_DUPLICATES=true " - "METRICS_FILE={output.metrics} " - "VALIDATION_STRINGENCY=LENIENT " + "-INPUT {input.bam} " + "-OUTPUT {output.bam} " + "-REMOVE_DUPLICATES true " + "-METRICS_FILE {output.metrics} " + "-VALIDATION_STRINGENCY LENIENT " "&> {log}" From a7d973782b36da110d20f4bab601b2e07d14e483 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 4 Oct 2025 01:35:10 +0000 Subject: [PATCH 095/196] refactor chipseq config --- workflows/chipseq/config/config.yaml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/workflows/chipseq/config/config.yaml b/workflows/chipseq/config/config.yaml index a8d10142..75466ad6 100644 --- a/workflows/chipseq/config/config.yaml +++ b/workflows/chipseq/config/config.yaml @@ -23,6 +23,9 @@ references_dir: 'references_data' peaks_dir: 'data/chipseq_peaks' +fasta: + url: 
"https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" + postprocess: 'lib.utils.gzipped' chipseq: # The peak_calling section is a list of dicts, each one defining a single @@ -113,7 +116,3 @@ merged_bigwigs: aligner: index: 'bowtie2' tag: 'test' - -include_references: - - '../../include/reference_configs/Drosophila_melanogaster.yaml' - - '../../include/reference_configs/test.yaml' From 9ee9e06d301d2ceff4ea73a093fac541648c7f90 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 5 Oct 2025 17:48:44 +0000 Subject: [PATCH 096/196] invalidate cache --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index b28f0491..bedc4c18 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -26,7 +26,7 @@ variables: save_cache: &save_cache save_cache: - key: v5-{{ checksum "env.yml" }}-{{ checksum "env-r.yml" }} + key: v0-{{ checksum "env.yml" }}-{{ checksum "env-r.yml" }} paths: - /opt/miniforge @@ -38,7 +38,7 @@ variables: restore_cache: &restore_cache restore_cache: keys: - - v5-{{ checksum "env.yml" }}-{{ checksum "env-r.yml" }} + - v0-{{ checksum "env.yml" }}-{{ checksum "env-r.yml" }} # -------------------------------------------------------------------------- # The path needs to be set each time; in jobs below this will be called as From d567601288cb77c62fef5dc5de0273e9e9fb98b6 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 5 Oct 2025 19:03:32 +0000 Subject: [PATCH 097/196] macs2 -> macs3 and change to macs (with no version) throughout --- docs/chipseq.rst | 2 +- docs/config-yaml.rst | 24 +++++++++---------- docs/developers.rst | 4 ++-- docs/faqs.rst | 2 +- docs/workflows.rst | 2 +- env.yml | 2 +- include/requirements.txt | 2 +- lib/chipseq.py | 8 +++---- .../{macs2_callpeak.py => macs_callpeak.py} | 2 +- .../complex-dataset-chipseq-config.yaml | 16 ++++++------- 
.../test_configs/test_chipseq_regression.yaml | 2 +- workflows/chipseq/Snakefile | 14 +++++------ workflows/chipseq/chipseq_trackhub.py | 2 +- workflows/chipseq/config/config.yaml | 4 ++-- 14 files changed, 43 insertions(+), 43 deletions(-) rename scripts/{macs2_callpeak.py => macs_callpeak.py} (99%) diff --git a/docs/chipseq.rst b/docs/chipseq.rst index 202e0375..5302e973 100644 --- a/docs/chipseq.rst +++ b/docs/chipseq.rst @@ -20,7 +20,7 @@ Specifically, the workflow does the following: - optionally merges bigWigs to create one signal track for all replicates - runs deepTools plotFingerprint on grouped IP and input for QC and evaluation of enrichment - - calls peaks using macs2, spp, and/or sicer, with support for multiple + - calls peaks using macs, spp, and/or sicer, with support for multiple peak-calling runs using different parameters to assist with assessing performance and to help make decisions for downstream analysis - optionally runs a template diffBind RMarkdown file used for differential binding analysis diff --git a/docs/config-yaml.rst b/docs/config-yaml.rst index c8026325..7d86ceef 100644 --- a/docs/config-yaml.rst +++ b/docs/config-yaml.rst @@ -124,7 +124,7 @@ The major differences between ChIP-seq and RNA-seq configs are: peak_calling: - label: gaf-embryo-1 - algorithm: macs2 + algorithm: macs ip: - gaf-embryo-1 control: @@ -138,7 +138,7 @@ The major differences between ChIP-seq and RNA-seq configs are: - input-embryo-1 - label: gaf-wingdisc-pooled - algorithm: macs2 + algorithm: macs ip: - gaf-wingdisc-1 - gaf-wingdisc-2 @@ -529,7 +529,7 @@ ChIP-seq-only fields ``algorithm``. This way, we can use the same label (e.g., `gaf-embryo-1`) across multiple peak-callers to help organize the output. - The currently-supported peak-callers are ``macs2``, ``spp``, and ``sicer``. + The currently-supported peak-callers are ``macs``, ``spp``, and ``sicer``. They each have corresponding wrappers in the ``wrappers`` directory. 
To add other peak-callers, see :ref:`new-peak-caller`. @@ -537,7 +537,7 @@ ChIP-seq-only fields assessing the peak-calling performance. Here is a minimal example of a peak-calling config section. It defines - a single peak-calling run using the `macs2` algorithm. Note that the + a single peak-calling run using the `macs` algorithm. Note that the ``ip:`` and ``control:`` keys are lists of **labels** from the ChIP-seq sample table's ``label`` column, **not sample IDs** from the first column. @@ -547,18 +547,18 @@ ChIP-seq-only fields peak_calling: - label: gaf-embryo-1 - algorithm: macs2 + algorithm: macs ip: - gaf-embryo-1 control: - input-embryo-1 The above peak-calling config will result in a file - ``data/chipseq_peaks/macs2/gaf-embryo-1/peaks.bed`` (that pattern is + ``data/chipseq_peaks/macs/gaf-embryo-1/peaks.bed`` (that pattern is defined in ``chipseq_patterns.yaml`` if you need to change it). We can specify additional command-line arguments that are passed verbatim - to `macs2` with the ``extra:`` section, for example: + to `macs` with the ``extra:`` section, for example: .. code-block:: yaml @@ -566,7 +566,7 @@ ChIP-seq-only fields peak_calling: - label: gaf-embryo-1 - algorithm: macs2 + algorithm: macs ip: - gaf-embryo-1 control: @@ -574,8 +574,8 @@ ChIP-seq-only fields extra: '--nomodel --extsize 147' - `macs2` supports multiple IP and input files, which internally are merged - by `macs2`. We can supply multiple IP and input labels for biological + `macs` supports multiple IP and input files, which internally are merged + by `macs`. We can supply multiple IP and input labels for biological replicates to get a set of peaks called on pooled samples. Note that we give it a different label so it doesn't overwrite the other peak-calling run we already have configured. 
@@ -586,7 +586,7 @@ ChIP-seq-only fields peak_calling: - label: gaf-embryo-1 - algorithm: macs2 + algorithm: macs ip: - gaf-embryo-1 control: @@ -595,7 +595,7 @@ ChIP-seq-only fields - label: gaf-embryo-pooled - algorithm: macs2 + algorithm: macs ip: - gaf-embryo-1 - gaf-embryo-2 diff --git a/docs/developers.rst b/docs/developers.rst index fc45b00d..9e459a97 100644 --- a/docs/developers.rst +++ b/docs/developers.rst @@ -96,7 +96,7 @@ Testing Adding a new peak-caller ------------------------ -First, write a wrapper for the peak-caller. You can use the ``macs2``, ``spp``, +First, write a wrapper for the peak-caller. You can use the ``macs``, ``spp``, and ``sicer`` wrappers as a guide. A wrapper should expect one or more sorted and indexed BAM files as IP, one or more sorted and indexed BAM files as input. The wrapper should create at least a sorted BED file of peaks, and can @@ -105,7 +105,7 @@ optionally create other supplemental files as well. Next, add the peak-caller to the top of ``lib/patterns_targets.py`` in the ``PEAK_CALLERS`` list. -Then write a rule for the peak-caller, again using ``macs2``, ``spp``, or +Then write a rule for the peak-caller, again using ``macs``, ``spp``, or ``sicer`` rules as a guide. Last, add additional lines in diff --git a/docs/faqs.rst b/docs/faqs.rst index 86d31cb0..77ac5020 100644 --- a/docs/faqs.rst +++ b/docs/faqs.rst @@ -99,7 +99,7 @@ accordingly. A partial exception to this is that the peak-calling for ChIP-seq supports specifying custom parameters for each peak-calling run. For example, when - running macs2 you can specify "--nomodel" for a single peak-calling run, or + running macs you can specify "--nomodel" for a single peak-calling run, or any other parameter supported by the peak-caller. 
However, the BAM files used in peak-calling still need to have used uniform diff --git a/docs/workflows.rst b/docs/workflows.rst index 3ab1ec2d..99bb44cb 100644 --- a/docs/workflows.rst +++ b/docs/workflows.rst @@ -99,7 +99,7 @@ Situtations where we use wrappers: These wrappers call the aligner, followed by samtools sort and view. The end result is that FASTQs go in, and a sorted BAM comes out. - Tools with legacy dependencies like Python 2.7 that must be run in an - independent environment (macs2, sicer, rseqc) + independent environment (sicer, rseqc) - R analyses (particularly spp and dupradar, which build up an R script incrementally before calling it). - Tools that need complicated setup, or handling output files hard-coded by the diff --git a/env.yml b/env.yml index a4341cb0..fe54c60d 100644 --- a/env.yml +++ b/env.yml @@ -194,7 +194,7 @@ dependencies: - libzlib=1.3.1 - logmuse=0.2.8 - logomaker=0.8.6 - - macs2=2.2.9.1 + - macs3=3.0.3 - make=4.4.1 - markdown=3.6 - markdown-it-py=3.0.0 diff --git a/include/requirements.txt b/include/requirements.txt index a2b21ee3..ebd02582 100644 --- a/include/requirements.txt +++ b/include/requirements.txt @@ -14,7 +14,7 @@ hisat2 intervalstats ipython kallisto -macs2 +macs3 multiqc pandas pandoc diff --git a/lib/chipseq.py b/lib/chipseq.py index 62608ed8..7015f83b 100644 --- a/lib/chipseq.py +++ b/lib/chipseq.py @@ -10,14 +10,14 @@ # [ # { # 'label': 'rep1', -# 'algorithm': 'macs2', +# 'algorithm': 'macs', # 'input': ['input_1'], # 'ip': ['ip_1'], # 'extra': '--gs dm', # }, # { # 'label': 'rep2', -# 'algorithm': 'macs2', +# 'algorithm': 'macs', # 'input': ['input_2'], # 'ip': ['ip_2'], # 'extra': '--gs dm', @@ -30,8 +30,8 @@ # This needs to be expanded out to the following patterns: # # [ -# 'data/chipseq_peaks/macs2/rep1/peaks.bigbed', -# 'data/chipseq_peaks/macs2/rep2/peaks.bigbed', +# 'data/chipseq_peaks/macs/rep1/peaks.bigbed', +# 'data/chipseq_peaks/macs/rep2/peaks.bigbed', # ] # # Which in turn needs these bams: diff 
--git a/scripts/macs2_callpeak.py b/scripts/macs_callpeak.py similarity index 99% rename from scripts/macs2_callpeak.py rename to scripts/macs_callpeak.py index d90c17d6..1f1eb120 100644 --- a/scripts/macs2_callpeak.py +++ b/scripts/macs_callpeak.py @@ -18,7 +18,7 @@ genome_count_flag = ' -g ' + effective_genome_count + ' ' cmds = ( - 'macs2 ' + 'macs3 ' 'callpeak ' '-c {snakemake.input.control} ' '-t {snakemake.input.ip} ' diff --git a/test/test_configs/complex-dataset-chipseq-config.yaml b/test/test_configs/complex-dataset-chipseq-config.yaml index 61406e94..ff724701 100644 --- a/test/test_configs/complex-dataset-chipseq-config.yaml +++ b/test/test_configs/complex-dataset-chipseq-config.yaml @@ -44,49 +44,49 @@ merged_bigwigs: chipseq: peak_calling: - label: BRD4-dBET6-1 - algorithm: macs2 + algorithm: macs ip: - BRD4-dBET6-1 control: - input-dBET6-1 - label: BRD4-dBET6-2 - algorithm: macs2 + algorithm: macs ip: - BRD4-dBET6-2 control: - input-dBET6-2 - label: BRD4-DMSO-1 - algorithm: macs2 + algorithm: macs ip: - BRD4-DMSO-1 control: - input-DMSO-1 - label: BRD4-DMSO-2 - algorithm: macs2 + algorithm: macs ip: - BRD4-DMSO-2 control: - input-DMSO-2 - label: MTHFD1-dBET6-1 - algorithm: macs2 + algorithm: macs ip: - MTHFD1-dBET6-1 control: - input-dBET6-1 - label: MTHFD1-dBET6-2 - algorithm: macs2 + algorithm: macs ip: - MTHFD1-dBET6-2 control: - input-dBET6-2 - label: MTHFD1-DMSO-1 - algorithm: macs2 + algorithm: macs ip: - MTHFD1-DMSO-1 control: - input-DMSO-1 - label: MTHFD1-DMSO-2 - algorithm: macs2 + algorithm: macs ip: - MTHFD1-DMSO-2 control: diff --git a/test/test_configs/test_chipseq_regression.yaml b/test/test_configs/test_chipseq_regression.yaml index 8ca61ed0..c59ab9bf 100644 --- a/test/test_configs/test_chipseq_regression.yaml +++ b/test/test_configs/test_chipseq_regression.yaml @@ -7,7 +7,7 @@ chipseq: peak_calling: - label: gaf-wingdisc-1 - algorithm: macs2 + algorithm: macs ip: - gaf-wingdisc-1 control: diff --git a/workflows/chipseq/Snakefile 
b/workflows/chipseq/Snakefile index 9c2767d4..ce1243a9 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -443,28 +443,28 @@ rule fingerprint: -rule macs2: +rule macs: input: ip=lambda wc: expand( "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", - label=chipseq.samples_for_run(config, wc.macs2_run, "macs2", "ip"), + label=chipseq.samples_for_run(config, wc.macs_run, "macs", "ip"), ), control=lambda wc: expand( "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", - label=chipseq.samples_for_run(config, wc.macs2_run, "macs2", "control"), + label=chipseq.samples_for_run(config, wc.macs_run, "macs", "control"), ), chromsizes=rules.chromsizes.output, output: - bed="data/chipseq_peaks/macs2/{macs2_run}/peaks.bed", + bed="data/chipseq_peaks/macs/{macs_run}/peaks.bed", resources: mem="16g", runtime="2h", log: - "data/chipseq_peaks/macs2/{macs2_run}/peaks.bed.log", + "data/chipseq_peaks/macs/{macs_run}/peaks.bed.log", params: - block=lambda wc: chipseq.block_for_run(config, wc.macs2_run, "macs2"), + block=lambda wc: chipseq.block_for_run(config, wc.macs_run, "macs"), script: - "../../scripts/macs2_callpeak.py" + "../../scripts/macs_callpeak.py" rule epic2: diff --git a/workflows/chipseq/chipseq_trackhub.py b/workflows/chipseq/chipseq_trackhub.py index d069b015..5726fc02 100644 --- a/workflows/chipseq/chipseq_trackhub.py +++ b/workflows/chipseq/chipseq_trackhub.py @@ -78,7 +78,7 @@ subgroups.append( SubGroupDefinition( name='algorithm', label='algorithm', mapping={ - 'macs2': 'macs2', + 'macs': 'macs', 'epic2': 'epic2', 'NA': 'NA', })) diff --git a/workflows/chipseq/config/config.yaml b/workflows/chipseq/config/config.yaml index 75466ad6..d35898d2 100644 --- a/workflows/chipseq/config/config.yaml +++ b/workflows/chipseq/config/config.yaml @@ -48,7 +48,7 @@ chipseq: peak_calling: - label: gaf-embryo-1 - algorithm: macs2 + algorithm: macs ip: - gaf-embryo-1 control: @@ -61,7 +61,7 @@ chipseq: extra: 
'--nomodel --extsize 147' - label: gaf-wingdisc-pooled - algorithm: macs2 + algorithm: macs ip: - gaf-wingdisc-1 - gaf-wingdisc-2 From bee444e85b4c5c034d2c5425a84b53e9088d7599 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 5 Oct 2025 19:16:09 +0000 Subject: [PATCH 098/196] rm params.extra for cutadapt --- workflows/rnaseq/Snakefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 111b61b5..5ec50596 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -95,7 +95,6 @@ rule cutadapt: "--minimum-length 25 " "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " - "{params.extra} " "{input.fastq[0]} " "{input.fastq[1]} " "&> {log}" @@ -109,7 +108,6 @@ rule cutadapt: "--overlap 6 " "--minimum-length 25 " "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - "{params.extra} " "{input.fastq[0]} " "&> {log}" ) From f8f7143ef635041e04baefd8dd25e14f68a7480d Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 5 Oct 2025 20:29:36 +0000 Subject: [PATCH 099/196] update env.yml --- env.yml | 407 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 219 insertions(+), 188 deletions(-) diff --git a/env.yml b/env.yml index fe54c60d..f7f89425 100644 --- a/env.yml +++ b/env.yml @@ -4,24 +4,27 @@ channels: dependencies: - _libgcc_mutex=0.1 - _openmp_mutex=4.5 + - _python_abi3_support=1.0 - _r-mutex=1.0.1 - alabaster=1.0.0 - - alsa-lib=1.2.13 + - alsa-lib=1.2.14 - amply=0.1.6 + - anndata=0.12.2 - annotated-types=0.7.0 - - anyio=4.9.0 + - anyio=4.11.0 - appdirs=1.4.4 - - argcomplete=3.6.1 + - argcomplete=3.6.2 - argh=0.31.3 - argparse-dataclass=2.0.0 + - array-api-compat=1.12.0 - asttokens=3.0.0 - attrs=25.3.0 - babel=2.17.0 - backports=1.0 - backports.tarfile=1.2.0 - - beautifulsoup4=4.13.3 + - beautifulsoup4=4.14.2 - bedtools=2.31.1 - - binutils_impl_linux-64=2.43 + - 
binutils_impl_linux-64=2.44 - biopython=1.85 - boost-cpp=1.85.0 - bowtie=1.3.1 @@ -30,51 +33,57 @@ dependencies: - brotli-bin=1.1.0 - brotli-python=1.1.0 - bwidget=1.10.1 - - bx-python=0.13.0 + - bx-python=0.14.0 - bzip2=1.0.8 - - c-ares=1.34.4 - - ca-certificates=2025.1.31 + - c-ares=1.34.5 + - ca-certificates=2025.10.5 + - cached-property=1.5.2 + - cached_property=1.5.2 - cairo=1.18.4 - - certifi=2025.1.31 - - cffi=1.17.1 - - charset-normalizer=3.4.1 - - click=8.1.8 + - certifi=2025.10.5 + - cffi=2.0.0 + - charset-normalizer=3.4.3 + - click=8.3.0 - coin-or-cbc=2.10.12 - coin-or-cgl=0.60.9 - coin-or-clp=1.17.10 - coin-or-osi=0.108.11 - coin-or-utils=2.11.12 - - coincbc=2.10.12 - colorama=0.4.6 - coloredlogs=15.0.1 - colormath=3.0.0 - conda-inject=1.3.2 - - configargparse=1.7 + - configargparse=1.7.1 - connection_pool=0.0.3 - - contourpy=1.3.1 - - cryptography=44.0.2 - - curl=8.12.1 - - cutadapt=5.0 + - contourpy=1.3.3 + - cpython=3.11.13 + - crc32c=2.7.1 + - cryptography=46.0.2 + - curl=8.14.1 + - cutadapt=5.1 - cycler=0.12.1 - - dbus=1.13.6 + - cykhash=2.0.1 + - dbus=1.16.2 - decorator=5.2.1 - deeptools=3.5.6 - deeptoolsintervals=0.1.9 - - distlib=0.3.9 + - deprecated=1.2.18 + - distlib=0.4.0 - dnaio=1.2.2 - docutils=0.21.2 + - donfig=0.8.1.post1 - dpath=2.2.0 - editables=0.5 - eido=0.2.4 - epic2=0.0.52 - et_xmlfile=2.0.0 - - exceptiongroup=1.2.2 + - exceptiongroup=1.3.0 - execnet=2.1.1 - - executing=2.1.0 - - expat=2.6.4 + - executing=2.2.1 + - expat=2.7.1 - fastq-screen=0.16.0 - fastqc=0.12.1 - - filelock=3.18.0 + - filelock=3.19.1 - font-ttf-dejavu-sans-mono=2.37 - font-ttf-inconsolata=3.000 - font-ttf-source-code-pro=2.038 @@ -82,66 +91,70 @@ dependencies: - fontconfig=2.15.0 - fonts-conda-ecosystem=1 - fonts-conda-forge=1 - - fonttools=4.56.0 - - freetype=2.13.3 - - fribidi=1.0.10 - - gcc_impl_linux-64=14.2.0 + - fonttools=4.60.1 + - freetype=2.14.1 + - fribidi=1.0.16 + - gcc_impl_linux-64=15.2.0 - gffread=0.12.7 - gffutils=0.13 - - 
gfortran_impl_linux-64=14.2.0 + - gfortran_impl_linux-64=15.2.0 - giflib=5.2.2 - gitdb=4.0.12 - - gitpython=3.1.44 - - graphite2=1.3.13 + - gitpython=3.1.45 + - graphite2=1.3.14 - gsl=1.16 - - gxx_impl_linux-64=14.2.0 - - h11=0.14.0 - - h2=4.2.0 - - harfbuzz=11.0.0 - - hatch=1.14.0 + - gxx_impl_linux-64=15.2.0 + - h11=0.16.0 + - h2=4.3.0 + - h5py=3.13.0 + - harfbuzz=11.4.5 + - hatch=1.14.1 - hatchling=1.27.0 - hdf5=1.14.3 - hisat2=2.2.1 + - hmmlearn=0.3.3 - hpack=4.1.0 - html5lib=1.1 - - htslib=1.21 - - httpcore=1.0.7 + - htslib=1.22.1 + - httpcore=1.0.9 - httpx=0.28.1 - humanfriendly=10.0 - - humanize=4.12.2 + - humanize=4.13.0 - hyperframe=6.1.0 - hyperlink=21.0.0 - icu=75.1 - idna=3.10 - imagesize=1.4.1 - immutables=0.21 - - importlib-metadata=8.6.1 + - importlib-metadata=8.7.0 - importlib_resources=6.5.2 - iniconfig=2.0.0 - intervalstats=1.01 - - ipython=9.0.2 + - ipython=9.6.0 - ipython_pygments_lexers=1.1.1 - isa-l=2.31.1 - jaraco.classes=3.4.0 - jaraco.context=6.0.1 - - jaraco.functools=4.1.0 + - jaraco.functools=4.3.0 - jedi=0.19.2 - jeepney=0.9.0 - jinja2=3.1.6 - - jsonschema=4.23.0 - - jsonschema-specifications=2024.10.1 - - jupyter_core=5.7.2 + - joblib=1.5.2 + - jsonschema=4.25.1 + - jsonschema-specifications=2025.9.1 + - jupyter_core=5.8.1 - kaleido-core=0.2.1 - kallisto=0.51.1 - - kernel-headers_linux-64=3.10.0 + - kernel-headers_linux-64=5.14.0 - keyring=25.6.0 - - keyutils=1.6.1 - - kiwisolver=1.4.7 + - keyutils=1.6.3 + - kiwisolver=1.4.9 - krb5=1.21.3 - lcms2=2.17 - - ld_impl_linux-64=2.43 + - ld_impl_linux-64=2.44 + - legacy-api-wrap=1.4.1 - lerc=4.0.0 - - libaec=1.1.3 + - libaec=1.1.4 - libblas=3.9.0 - libboost=1.85.0 - libboost-devel=1.85.0 @@ -151,91 +164,100 @@ dependencies: - libbrotlienc=1.1.0 - libcblas=3.9.0 - libcups=2.3.3 - - libcurl=8.12.1 + - libcurl=8.14.1 - libdeflate=1.22 - libedit=3.1.20250104 - libev=4.33 - - libexpat=2.6.4 + - libexpat=2.7.1 - libffi=3.4.6 - - libgcc=14.2.0 - - libgcc-devel_linux-64=14.2.0 - - libgcc-ng=14.2.0 + 
- libfreetype=2.14.1 + - libfreetype6=2.14.1 + - libgcc=15.2.0 + - libgcc-devel_linux-64=15.2.0 + - libgcc-ng=15.2.0 - libgd=2.3.3 - libgff=2.0.0 - - libgfortran=14.2.0 - - libgfortran5=14.2.0 - - libglib=2.84.0 - - libgomp=14.2.0 - - libhwloc=2.11.2 + - libgfortran=15.2.0 + - libgfortran5=15.2.0 + - libglib=2.84.3 + - libgomp=15.2.0 + - libhwloc=2.12.1 - libiconv=1.18 - libjemalloc=5.3.0 - - libjpeg-turbo=3.0.0 + - libjpeg-turbo=3.1.0 - liblapack=3.9.0 - liblapacke=3.9.0 - - liblzma=5.6.4 - - liblzma-devel=5.6.4 - - libnghttp2=1.64.0 + - liblzma=5.8.1 + - liblzma-devel=5.8.1 + - libnghttp2=1.67.0 - libnsl=2.0.1 - - libopenblas=0.3.29 - - libopenssl-static=3.4.1 - - libpng=1.6.47 - - libsanitizer=14.2.0 - - libsqlite=3.49.1 + - libopenblas=0.3.30 + - libopenssl-static=3.5.4 + - libpng=1.6.50 + - libsanitizer=15.2.0 + - libsqlite=3.50.4 - libssh2=1.11.1 - - libstdcxx=14.2.0 - - libstdcxx-devel_linux-64=14.2.0 - - libstdcxx-ng=14.2.0 + - libstdcxx=15.2.0 + - libstdcxx-devel_linux-64=15.2.0 + - libstdcxx-ng=15.2.0 - libtiff=4.7.0 - - libuuid=2.38.1 - - libwebp-base=1.5.0 + - libuuid=2.41.2 + - libwebp-base=1.6.0 - libxcb=1.17.0 - libxcrypt=4.4.36 - - libxml2=2.13.7 + - libxml2=2.14.6 + - libxml2-16=2.14.6 - libzlib=1.3.1 + - llvmlite=0.45.1 - logmuse=0.2.8 - logomaker=0.8.6 - macs3=3.0.3 - make=4.4.1 - - markdown=3.6 - - markdown-it-py=3.0.0 - - markupsafe=3.0.2 + - mariadb-connector-c=3.4.7 + - markdown=3.9 + - markdown-it-py=4.0.0 + - markupsafe=3.0.3 - mathjax=2.7.7 - - matplotlib-base=3.10.1 + - matplotlib-base=3.10.6 - matplotlib-inline=0.1.7 - mdurl=0.1.2 - - more-itertools=10.6.0 - - multiqc=1.28 + - more-itertools=10.8.0 + - msgpack-python=1.1.1 + - multiqc=1.31 - munkres=1.1.4 - mysql-connector-c=6.1.11 - - narwhals=1.32.0 + - narwhals=2.6.0 - natsort=8.4.0 - nbformat=5.10.4 - ncbi-vdb=3.2.1 - ncurses=6.5 - - networkx=3.4.2 - - nspr=4.36 - - nss=3.110 - - numpy=2.2.4 - - numpydoc=1.8.0 - - openjdk=23.0.2 + - networkx=3.5 + - nspr=4.37 + - nss=3.117 + - 
numba=0.62.1 + - numcodecs=0.16.1 + - numpy=2.3.3 + - numpydoc=1.9.0 + - openjdk=24.0.2 - openjpeg=2.5.3 - openpyxl=3.1.5 - - openssl=3.4.1 + - openssl=3.5.4 - ossuuid=1.6.2 - - packaging=24.2 - - pandas=2.2.3 - - pandoc=3.6.4 - - pango=1.56.3 - - parso=0.8.4 + - packaging=25.0 + - pandas=2.3.3 + - pandoc=3.8.1 + - pango=1.56.4 + - parso=0.8.5 - pathspec=0.12.1 - patsy=1.0.1 - pbzip2=1.1.13 - - pcre2=10.44 + - pcre2=10.45 - pephubclient=0.4.4 - peppy=0.40.7 - perl=5.32.1 - perl-alien-build=2.84 - - perl-alien-libxml2=0.17 + - perl-alien-build-plugin-download-gitlab=0.01 + - perl-alien-libxml2=0.20 - perl-business-isbn=3.007 - perl-business-isbn-data=20210112.006 - perl-capture-tiny=0.48 @@ -249,7 +271,7 @@ dependencies: - perl-file-temp=0.2304 - perl-file-which=1.24 - perl-gd=2.83 - - perl-gdgraph=1.54 + - perl-gdgraph=1.56 - perl-gdtextutil=0.86 - perl-importer=0.026 - perl-parent=0.243 @@ -257,13 +279,13 @@ dependencies: - perl-pathtools=3.75 - perl-scope-guard=0.21 - perl-sub-info=0.002 - - perl-term-table=0.024 + - perl-term-table=0.025 - perl-test-fatal=0.016 - perl-test-nowarnings=1.06 - perl-test-warnings=0.031 - perl-test2-suite=0.000163 - perl-try-tiny=0.31 - - perl-uri=5.17 + - perl-uri=5.34 - perl-xml-libxml=2.0210 - perl-xml-namespacesupport=1.12 - perl-xml-sax=1.02 @@ -272,17 +294,17 @@ dependencies: - picard=2.27.5 - pickleshare=0.7.5 - pigz=2.8 - - pillow=11.1.0 - - pip=25.0.1 - - pixman=0.44.2 - - pkgutil-resolve-name=1.3.10 - - plac=1.4.3 - - platformdirs=4.3.7 - - plotly=6.0.1 - - pluggy=1.5.0 + - pillow=11.3.0 + - pip=25.2 + - pixman=0.46.4 + - plac=1.4.5 + - platformdirs=4.4.0 + - plotly=6.3.1 + - pluggy=1.6.0 + - polars-lts-cpu=1.33.1 - preseq=2.0.2 - - prompt-toolkit=3.0.50 - - psutil=7.0.0 + - prompt-toolkit=3.0.52 + - psutil=7.1.0 - pthread-stubs=0.4 - ptyprocess=0.7.0 - pulp=2.8.0 @@ -292,64 +314,70 @@ dependencies: - pybedtools=0.12.0 - pybigwig=0.3.24 - pycparser=2.22 - - pydantic=2.10.6 - - pydantic-core=2.27.2 - - pyfaidx=0.8.1.3 - - 
pygments=2.19.1 - - pyparsing=3.2.3 + - pydantic=2.11.10 + - pydantic-core=2.33.2 + - pyfaidx=0.9.0.3 + - pygments=2.19.2 + - pynndescent=0.5.13 + - pyparsing=3.2.5 - pysam=0.22.1 - pysocks=1.7.1 - - pytest=8.3.5 - - pytest-xdist=3.6.1 - - python=3.11.11 + - pytest=8.4.2 + - pytest-xdist=3.8.0 + - python=3.11.13 - python-dateutil=2.9.0.post0 - - python-dotenv=1.1.0 - - python-fastjsonschema=2.21.1 - - python-isal=1.7.2 + - python-dotenv=1.1.1 + - python-fastjsonschema=2.21.2 + - python-gil=3.11.13 + - python-isal=1.8.0 - python-kaleido=0.2.1 - python-tzdata=2025.2 - - python-zlib-ng=0.5.1 + - python-zlib-ng=1.0.0 - python_abi=3.11 - - pytz=2024.1 - - pyvcf3=1.0.3 - - pyyaml=6.0.2 + - pytz=2025.2 + - pyvcf3=1.0.4 + - pyyaml=6.0.3 - qhull=2020.2 - r-base=4.2.3 - readline=8.2 - referencing=0.36.2 - - regex=2024.11.6 - - requests=2.32.3 + - regex=2025.9.18 + - requests=2.32.5 - reretry=0.11.8 - - rich=13.9.4 - - rich-click=1.8.8 + - rich=14.1.0 + - rich-click=1.9.2 - roman-numerals-py=3.1.0 - - rpds-py=0.24.0 + - rpds-py=0.27.1 - rseqc=5.0.4 - salmon=1.10.3 - - samtools=1.21 - - scipy=1.15.2 + - samtools=1.22.1 + - scanpy=1.11.4 + - scikit-learn=1.7.2 + - scipy=1.16.2 - seaborn=0.13.2 - seaborn-base=0.13.2 - - secretstorage=3.3.3 - - sed=4.8 - - setuptools=75.8.2 + - secretstorage=3.4.0 + - sed=4.9 + - session-info2=0.2.2 + - setuptools=80.9.0 - shellingham=1.5.4 - - simplejson=3.20.1 + - simplejson=3.20.2 - six=1.17.0 - - slack-sdk=3.35.0 - - slack_sdk=3.35.0 - - smart_open=7.1.0 + - slack-sdk=3.36.0 + - slack_sdk=3.36.0 + - smart_open=7.3.1 - smmap=5.0.2 - - snakemake=9.1.3 - - snakemake-interface-common=1.17.4 - - snakemake-interface-executor-plugins=9.3.5 - - snakemake-interface-logger-plugins=1.2.3 - - snakemake-interface-report-plugins=1.1.0 - - snakemake-interface-storage-plugins=4.2.1 - - snakemake-minimal=9.1.3 + - snakemake=9.12.0 + - snakemake-interface-common=1.22.0 + - snakemake-interface-executor-plugins=9.3.9 + - snakemake-interface-logger-plugins=1.2.4 
+ - snakemake-interface-report-plugins=1.2.0 + - snakemake-interface-scheduler-plugins=2.0.1 + - snakemake-interface-storage-plugins=4.2.3 + - snakemake-minimal=9.12.0 - sniffio=1.3.1 - - snowballstemmer=2.2.0 - - soupsieve=2.5 + - snowballstemmer=3.0.1 + - soupsieve=2.8 - spectra=0.0.11 - sphinx=8.2.3 - sphinxcontrib-applehelp=2.0.0 @@ -358,57 +386,59 @@ dependencies: - sphinxcontrib-jsmath=1.0.1 - sphinxcontrib-qthelp=2.0.0 - sphinxcontrib-serializinghtml=1.1.10 - - sqlite=3.49.1 - - sra-tools=3.2.0 + - sqlite=3.50.4 + - sra-tools=3.2.1 - stack_data=0.6.3 - - staden_io_lib=1.15.0 + - staden_io_lib=1.15.1 - star=2.7.11b - - statsmodels=0.14.4 - - subread=2.0.8 - - sysroot_linux-64=2.17 + - statsmodels=0.14.5 + - subread=2.1.1 + - sysroot_linux-64=2.34 - tabulate=0.9.0 - - tbb=2022.0.0 + - tbb=2022.2.0 + - threadpoolctl=3.6.0 - throttler=1.2.2 - - tiktoken=0.9.0 + - tiktoken=0.11.0 - tk=8.6.13 - tktable=2.10 - tomli=2.2.1 - tomli-w=1.2.0 - - tomlkit=0.13.2 + - tomlkit=0.13.3 - tqdm=4.67.1 - trackhub=1.0 - traitlets=5.14.3 - - trove-classifiers=2025.3.19.19 - - typeguard=4.4.2 - - typer=0.15.2 - - typer-slim=0.15.2 - - typer-slim-standard=0.15.2 - - typing-extensions=4.13.0 - - typing_extensions=4.13.0 + - trove-classifiers=2025.9.11.17 + - typeguard=4.4.4 + - typer=0.19.2 + - typer-slim=0.19.2 + - typer-slim-standard=0.19.2 + - typing-extensions=4.15.0 + - typing-inspection=0.4.2 + - typing_extensions=4.15.0 - tzdata=2025b - ubiquerg=0.8.0 - - ucsc-bedgraphtobigwig=472 - - ucsc-bedsort=469 - - ucsc-bedtobigbed=473 - - ucsc-bigwigmerge=469 - - ucsc-fetchchromsizes=469 - - ucsc-genepredtobed=469 - - ucsc-gtftogenepred=469 - - ucsc-liftover=469 - - ucsc-oligomatch=469 - - ucsc-stringify=472 - - ucsc-twobittofa=472 - - ucsc-wigtobigwig=472 + - ucsc-bedgraphtobigwig=482 + - ucsc-bedsort=482 + - ucsc-bedtobigbed=482 + - ucsc-bigwigmerge=482 + - ucsc-fetchchromsizes=482 + - ucsc-genepredtobed=482 + - ucsc-gtftogenepred=482 + - ucsc-liftover=482 + - ucsc-oligomatch=482 + - 
ucsc-twobittofa=482 + - ucsc-wigtobigwig=482 + - umap-learn=0.5.9.post2 - unicodedata2=16.0.0 - - urllib3=2.3.0 + - urllib3=2.5.0 - userpath=1.9.2 - - uv=0.6.10 + - uv=0.8.22 - veracitools=0.1.3 - - virtualenv=20.29.3 - - wcwidth=0.2.13 + - virtualenv=20.34.0 + - wcwidth=0.2.14 - webencodings=0.5.1 - wheel=0.45.1 - - wrapt=1.17.2 + - wrapt=1.17.3 - xopen=2.0.2 - xorg-libice=1.1.2 - xorg-libsm=1.2.6 @@ -416,19 +446,20 @@ dependencies: - xorg-libxau=1.0.12 - xorg-libxdmcp=1.1.5 - xorg-libxext=1.3.6 - - xorg-libxfixes=6.0.1 + - xorg-libxfixes=6.0.2 - xorg-libxi=1.8.2 - xorg-libxrandr=1.5.4 - xorg-libxrender=0.9.12 - xorg-libxt=1.3.1 - xorg-libxtst=1.2.5 - - xz=5.6.4 - - xz-gpl-tools=5.6.4 - - xz-tools=5.6.4 + - xz=5.8.1 + - xz-gpl-tools=5.8.1 + - xz-tools=5.8.1 - yaml=0.2.5 - - yte=1.7.0 - - zipp=3.21.0 + - yte=1.8.1 + - zarr=3.1.3 + - zipp=3.23.0 - zlib=1.3.1 - - zlib-ng=2.2.4 - - zstandard=0.23.0 + - zlib-ng=2.2.5 + - zstandard=0.25.0 - zstd=1.5.7 From 22e3f5d6fd36b0a65ec59fb0e1154b36c02e9a67 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 5 Oct 2025 20:30:58 +0000 Subject: [PATCH 100/196] move relevant references rules to rnaseq --- workflows/rnaseq/Snakefile | 257 ++++++++++++++++++++++++++++++++++++- 1 file changed, 255 insertions(+), 2 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 5ec50596..35c12757 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -33,8 +33,6 @@ rule all: "data/rnaseq_aggregation/multiqc.html", -include: "../references/Snakefile" - # Optionally run `snakemake strand_check` to do a preliminary run on # automatically-subset data to evaluate strandedness. 
include: "strand_check.smk" @@ -62,6 +60,261 @@ rule symlinks: utils.make_relative_symlink(src, linkname) +rule fasta: + output: + temporary(f"{REFERENCES}/genome.fa.gz"), + log: + f"{REFERENCES}/logs/genome.fa.gz.log", + resources: + mem_mb="4g", + runtime="2h", + run: + utils.download_and_postprocess( + urls=config["fasta"]["url"], + postprocess=config["fasta"].get("postprocess", None), + outfile=output[0], + log=log, + ) + + +rule gtf: + output: + temporary(f"{REFERENCES}/annotation.gtf.gz"), + log: + f"{REFERENCES}/logs/annotation.gtf.gz.log", + resources: + mem="4g", + runtime="2h", + run: + utils.download_and_postprocess( + urls=config["gtf"]["url"], + postprocess=config["gtf"].get("postprocess", None), + outfile=output[0], + log=log, + ) + + +rule rrna_fasta: + output: + f"{REFERENCES}/rrna.fa.gz", + log: + f"{REFERENCES}/logs/rrna.fa.log", + resources: + mem="4g", + runtime="2h", + run: + utils.download_and_postprocess( + urls=config["rrna"]["url"], + postprocess=config["rrna"].get("postprocess", None), + outfile=output[0], + log=log, + ) + + +rule unzip: + input: + f"{REFERENCES}/{{prefix}}.gz", + output: + f"{REFERENCES}/{{prefix}}", + resources: + mem="4g", + runtime="2h", + shell: + "gunzip -c {input} > {output}" + + +rule rrna_index: + input: + f"{REFERENCES}/rrna.fa", + output: + f"{REFERENCES}/bowtie2/rrna.1.bt2", + f"{REFERENCES}/bowtie2/rrna.fa", + log: + f"{REFERENCES}/logs/bowtie2_rrna.log", + resources: + mem="32g", + disk="50g", + runtime="8h", + threads: 8 + run: + index = f"{REFERENCES}/bowtie2/rrna" + shell("bowtie2-build" " --threads {threads}" " {input}" " {index}" " &> {log}") + utils.make_relative_symlink(input[0], output[-1]) + + +rule star_index: + input: + fasta=f"{REFERENCES}/genome.fa", + gtf=f"{REFERENCES}/annotation.gtf", + output: + f"{REFERENCES}/star/Genome", + log: + f"{REFERENCES}/logs/star.log", + threads: 8 + resources: + mem="64g", + runtime="8h", + run: + genomedir = os.path.dirname(output[0]) + shell("rm -r {genomedir}") 
+ shell("mkdir -p {genomedir}") + shell( + "STAR " + "--runMode genomeGenerate " + "--runThreadN {threads} " + "--genomeDir {genomedir} " + "--genomeFastaFiles {input.fasta} " + # NOTE: GTF is optional + "--sjdbGTFfile {input.gtf} " + # NOTE: STAR docs say that 100 should work well. + "--sjdbOverhang 100 " + # NOTE: for small genomes, may need to scale this down to + # min(14, log2(GenomeLength) / 2 - 1) + # --genomeSAindexNbases 14 + "&> {log}" + ) + # STAR writes a hard-coded Log.out file to the current working + # directory. So put that on the end of the log file for the rule and + # then clean up. + shell("cat {genomedir}/Log.out >> {log} && rm {genomedir}/Log.out") + shell("ln -s {input.fasta} {genomedir}") + + +rule transcriptome_fasta: + input: + fasta=f"{REFERENCES}/genome.fa", + gtf=f"{REFERENCES}/annotation.gtf", + output: + f"{REFERENCES}/transcriptome.fa", + resources: + mem="4g", + runtime="2h", + shell: + "gffread {input.gtf} -w {output} -g {input.fasta}" + + +rule salmon_index: + input: + f"{REFERENCES}/transcriptome.fa", + output: + f"{REFERENCES}/salmon/versionInfo.json", + log: + f"{REFERENCES}/logs/salmon.log", + params: + outdir=f"{REFERENCES}/salmon", + resources: + mem="32g", + runtime="2h", + run: + outdir = os.path.dirname(output[0]) + shell("salmon index " "--transcripts {input} " "--index {outdir} " "&> {log}") + + +rule conversion_refflat: + input: + f"{REFERENCES}/annotation.gtf.gz", + output: + f"{REFERENCES}/annotation.refflat", + log: + f"{REFERENCES}/logs/annotation.refflat.log", + resources: + mem="2g", + runtime="2h", + shell: + "gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp " + """&& awk '{{print $1"\t"$0}}' {output}.tmp > {output} """ + "&& rm {output}.tmp " + + +rule conversion_bed12: + input: + f"{REFERENCES}/annotation.gtf.gz", + output: + f"{REFERENCES}/annotation.bed12", + resources: + mem="2g", + runtime="2h", + shell: + "gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp " + "&& genePredToBed 
{output}.tmp {output} " + "&& rm {output}.tmp" + + +rule chromsizes: + input: + f"{REFERENCES}/genome.fa.gz", + output: + f"{REFERENCES}/genome.chromsizes", + log: + f"{REFERENCES}/logs/genome.chromsizes.log", + params: + java_args="-Xmx20g", + # java_args='-Xmx2g' # [TEST SETTINGS -1] + resources: + mem="24g", + runtime="2h", + shell: + "export LC_COLLATE=C; " + "rm -f {output}.tmp " + "&& picard " + "{params.java_args} " + "CreateSequenceDictionary R={input} O={output}.tmp &> {log} " + '&& grep "^@SQ" {output}.tmp ' + """| awk '{{print $2, $3}}' """ + '| sed "s/SN://g;s/ LN:/\\t/g" ' + "| sort -k1,1 > {output} " + "&& rm -f {output}.tmp " + + +rule mappings: + """ + Creates gzipped TSV mapping between attributes in the GTF. + """ + input: + gtf=f"{REFERENCES}/annotation.gtf.gz", + output: + f"{REFERENCES}/annotation.mapping.tsv.gz", + params: + include_featuretypes=lambda wildcards, output: conversion_kwargs[ + output[0] + ].get("include_featuretypes", []), + resources: + mem="2g", + runtime="2h", + run: + import gffutils + + # Will want to change the setting back to what it was originally when + # we're done + orig_setting = gffutils.constants.always_return_list + gffutils.constants.always_return_list = False + + include_featuretypes = params.include_featuretypes + + res = [] + for f in gffutils.DataIterator(input[0]): + + ft = f.featuretype + + if include_featuretypes and (ft not in include_featuretypes): + continue + + d = dict(f.attributes) + d["__featuretype__"] = ft + res.append(d) + + df = pandas.DataFrame(res) + + # Depending on how many attributes there were and the + # include_featuretypes settings, this may take a while. 
+ df = df.drop_duplicates() + + df.to_csv(output[0], sep="\t", index=False, compression="gzip") + + # Restore original setting + gffutils.constants.always_return_list = orig_setting + + rule symlink_targets: input: expand( From 0793fa31ffdf86c86981ec070ab3042f630db622 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 5 Oct 2025 20:31:55 +0000 Subject: [PATCH 101/196] convert bowtie2 to shell block --- workflows/rnaseq/Snakefile | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 35c12757..07bdedca 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -485,25 +485,18 @@ rule rRNA: resources: mem="2g", runtime="2h", - run: - prefix = os.path.commonprefix(input.index).rstrip(".") - sam = output.bam.replace(".bam", ".sam") - shell( - "bowtie2 " - "-x {prefix} " - "-U {input.fastq} " - "--threads {threads} " - "-k 1 " - "--no-unal " - "-S {sam} " - "> {log} 2>&1" - ) - - shell( - "samtools view -Sb {sam} " - "| samtools sort - -o {output.bam} -O BAM " - "&& rm {sam}" - ) + shell: + "bowtie2 " + f"-x {REFERENCES}/bowtie2/rrna " + "-U {input.fastq} " + "--threads {threads} " + "-k 1 " + "--no-unal " + "-S {output.bam}.sam " + "> {log} 2>&1 " + "&& samtools view -Sb {output.bam}.sam " + "| samtools sort - -o {output.bam} -O BAM " + "&& rm {output.bam}.sam" rule fastq_count: From 84bb39f42ea4a8c72bfcef837e11be5afd480367 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 5 Oct 2025 20:32:15 +0000 Subject: [PATCH 102/196] move strand check and sra to bottom --- workflows/rnaseq/Snakefile | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 07bdedca..9ebfac5d 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -33,14 +33,6 @@ rule all: 
"data/rnaseq_aggregation/multiqc.html", -# Optionally run `snakemake strand_check` to do a preliminary run on -# automatically-subset data to evaluate strandedness. -include: "strand_check.smk" - -# If the sampletable is from SRA, handle it here. -include: "sra.smk" - - rule symlinks: input: lambda wc: ( @@ -959,3 +951,10 @@ rule multiqc: "{analysis_directory} " "&> {log} " ) + +# Optionally run `snakemake strand_check` to do a preliminary run on +# automatically-subset data to evaluate strandedness. +include: "strand_check.smk" + +# If the sampletable is from SRA, handle it here. +include: "sra.smk" From 822afe2d41fc951db01f57dfbc1d682ef52b603f Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 5 Oct 2025 20:33:18 +0000 Subject: [PATCH 103/196] simplify inputs --- workflows/rnaseq/Snakefile | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 9ebfac5d..25f50a38 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -314,7 +314,6 @@ rule symlink_targets: ), - rule cutadapt: input: fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), @@ -397,7 +396,7 @@ rule star: input: fastq=rules.cutadapt.output, index=rules.star_index.output, - annotation=f"{REFERENCES}/annotation.gtf", + annotation=f"{REFERENCES}/annotation.gtf.gz", output: bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), sjout=temporary( @@ -459,16 +458,7 @@ rule star: rule rRNA: input: fastq="data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz", - index=multiext( - f"{REFERENCES}/bowtie2/rrna", - ".1.bt2", - ".2.bt2", - ".3.bt2", - ".4.bt2", - ".rev.1.bt2", - ".rev.2.bt2", - ".fa", - ), + index=f"{REFERENCES}/bowtie2/rrna.1.bt2", output: bam="data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam", log: From 1c1f6f7e409fc8facbd7a42ba2c7558df26fd39c Mon Sep 17 00:00:00 2001 From: Ryan Dale 
<115406+daler@users.noreply.github.com> Date: Sun, 5 Oct 2025 20:33:32 +0000 Subject: [PATCH 104/196] reflect changes to rule name --- workflows/rnaseq/strand_check.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/rnaseq/strand_check.smk b/workflows/rnaseq/strand_check.smk index 9c8a3467..bd7c45d4 100644 --- a/workflows/rnaseq/strand_check.smk +++ b/workflows/rnaseq/strand_check.smk @@ -1,7 +1,7 @@ rule sample_strand_check: input: fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), - index=expand(rules.bowtie2_index.output, label="genome"), + index=expand(rules.rrna_index.output, label="genome"), bed12=rules.conversion_bed12.output, output: strandedness="strand_check/{sample}/{sample}.strandedness", From cff4e095831c5a0fe6e8e520d228a2c613c47af9 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 5 Oct 2025 20:34:22 +0000 Subject: [PATCH 105/196] minor formatting --- workflows/rnaseq/Snakefile | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 25f50a38..4f5e907e 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -390,7 +390,6 @@ rule fastqc: shell("mv {out_html} {output.html}") - rule star: "Align with STAR (1-pass mode)" input: From 4122012e76a70b5b53a0a80005fec1b0603068e2 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 7 Oct 2025 15:26:42 +0000 Subject: [PATCH 106/196] include additional .smk files when deploying --- deploy.py | 3 +++ workflows/chipseq/sra.smk | 40 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 workflows/chipseq/sra.smk diff --git a/deploy.py b/deploy.py index 1981804c..8270348d 100755 --- a/deploy.py +++ b/deploy.py @@ -82,6 +82,8 @@ def write_include_file(source, flavor="all"): PATTERN_DICT = { "rnaseq": [ "include workflows/rnaseq/Snakefile", + "include 
workflows/rnaseq/strand_check.smk", + "include workflows/rnaseq/sra.smk", "recursive-include workflows/rnaseq/config *", "include workflows/rnaseq/rnaseq_trackhub.py", "recursive-include workflows/rnaseq/downstream *.Rmd", @@ -89,6 +91,7 @@ def write_include_file(source, flavor="all"): ], "chipseq": [ "include workflows/chipseq/Snakefile", + "include workflows/rnaseq/sra.smk", "recursive-include workflows/chipseq/config *", "include workflows/chipseq/chipseq_trackhub.py", ], diff --git a/workflows/chipseq/sra.smk b/workflows/chipseq/sra.smk new file mode 100644 index 00000000..5ee5f53b --- /dev/null +++ b/workflows/chipseq/sra.smk @@ -0,0 +1,40 @@ +if utils.detect_sra(sampletable): + sampletable["orig_filename"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=1 + ) + + if is_paired: + sampletable["orig_filename_R2"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + sample=SAMPLES, + n=2, + ) + + rule fastq_dump: + output: + fastq=expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + n=n, + allow_missing=True, + ), + log: + "original_data/sra_samples/{sample}/{sample}.fastq.gz.log", + params: + is_paired=is_paired, + # extra="-X 100000", # [enable for test] + resources: + mem="1g", + disk="1g", + runtime="2h", + run: + srr = sampletable.loc[wildcards.sample, "Run"] + extra = params.get("extra", "") + if is_paired: + shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") + shell("mv {srr}_1.fastq.gz {output[0]}") + shell("mv {srr}_2.fastq.gz {output[1]}") + else: + shell( + "fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp" + ) + shell("mv {output[0]}.tmp {output[0]}") From d71877f1d7e3409459aebd10a25bfbd0ace00056 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 7 Oct 2025 15:27:18 +0000 Subject: [PATCH 107/196] update decision log --- docs/decisions.rst | 321 +++++++++++++++++++++++++++++++++++++++++++++ 1 file 
changed, 321 insertions(+) diff --git a/docs/decisions.rst b/docs/decisions.rst index 7abbc78d..ac808e63 100644 --- a/docs/decisions.rst +++ b/docs/decisions.rst @@ -83,3 +83,324 @@ indexes alongside the indexes. │ └── dmel_test │ └── └── dmel_test.fasta + +Params +------ +The ``params:`` directive allows `non-file parameters for rules +`__. +Much (perhaps all?) of what can be done in a ``params:`` directive can also be +done in the body of ``run:`` block. On one hand, it can be nice to have a plain +string ``shell:`` block, and put the complexity in the params. But on the other +hand, sometimes it is harder to follow what's happening in params than it would +be in Python in a ``run:`` block. + +This section talks about when and why we use params in lcdb-wf. + +One of the nice things sbout Snakemake is that the rules (in ``shell:`` blocks) +can be quite close to the equivalent command-line call. Since rules in these +Snakefiles are intended to be edited, it makes sense to keep them as close to +the command-line as is reasonable. + +Take the cutadapt rule, for example, where we typically would want to include +the adapters in the call, but it's not uncommon to add other arguments. Here +we're working with a simplified, single-end version of it: + +.. code-block:: python + + rule cutadapt: + input: + fastq='{sample}.fastq.gz" + output: + fastq='{sample}.cutadapt.fastq.gz' + threads: + 8 + shell: + "cutadapt " + "-o {output[0]} " + "-j {threads} " + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + "{input.fastq[0]} " + "&> {log}" + + +Here's an extreme way of adding params where we pull out each argument into +a separate params item. This isn't very flexible and has lots of repetition, so +we probably don't want this:: + +.. 
code-block:: python + + rule cutadapt: + input: + '{sample}.fastq.gz" + output: + '{sample}.cutadapt.fastq.gz' + threads: + 8 + params: + nextseq_trim="--nextseq-trim 20", + overlap="--overlap 6", + minimum_length=25, + a="AGATCGGAAGAGCACACGTCTGAACTCCAGTCA", + shell: + "cutadapt " + "-o {output} " + "-j {threads} " + "{params.nextseq_trim} " + "{params.overlap} " + "{params.minimum_length} " + "{params.a} " + "{input} " + "&> {log}" + +But we could add the arguments to be a single "extra" string and store that +in params, like this: + +.. code-block:: python + + rule cutadapt: + input: + '{sample}.fastq.gz" + output: + '{sample}.cutadapt.fastq.gz' + threads: + 8 + params: + extra=( + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + ) + shell: + "cutadapt " + "-o {output} " + "-j {threads} " + "{params.extra} " + "{input} " + "&> {log}" + +One thing that's nice about this is that the "changeable things" are visually in +a different location. When running Snakemake with `-p` then the params will be +filled in to make one long string, which we could use for debugging. + +But we want to support single- and paired-end reads, and the arguments to +cutadapt depend on that. Here's the actual rule: + +.. 
code-block:: python + + rule cutadapt: input: + fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), + output: + fastq=expand( + "data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.cutadapt.fastq.gz", n=n + ), + log: + "data/rnaseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", + threads: 6 + resources: + mem="2g", + runtime="2h", + run: + if is_paired: + shell( + "cutadapt " + "-o {output[0]} " + "-p {output[1]} " + "-j {threads} " + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " + "{input.fastq[0]} " + "{input.fastq[1]} " + "&> {log}" + ) + else: + shell( + "cutadapt " + "-o {output[0]} " + "-j {threads} " + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + "{input.fastq[0]} " + "&> {log}" + ) + +Notice that we have some shared arguments as well as a PE-specific adapter +argument. Converting this one to params would be something like the following: + +.. 
code-block:: python + + rule cutadapt: input: + fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), + output: + fastq=expand( + "data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.cutadapt.fastq.gz", n=n + ), + log: + "data/rnaseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", + threads: 6 + resources: + mem="2g", + runtime="2h", + params: + shared=( + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + ), + se_pe_specific=( + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " + ) if is_paired else "" + run: + if is_paired: + shell( + "cutadapt " + "-o {output[0]} " + "-p {output[1]} " + "-j {threads} " + "{params.shared} " + "{params.se_pe_specific} " + "{input.fastq[0]} " + "{input.fastq[1]} " + "&> {log}" + ) + else: + shell( + "cutadapt " + "-o {output[0]} " + "-j {threads} " + "{params.shared} " + "{params.se_pe_specific} " + "{input.fastq[0]} " + "&> {log}" + ) + +Note in this case we need to provide ``-o`` and ``-p`` arguments +separately for paired-end. So we still need to have the ``if is_paired`` clause +in the body of the rule. This one could be a little bit confusing with the +``se_pe_specific`` clause, but otherwise it supports both SE and PE. + +What if we split that out into params as well, so that everything SE or PE +specific is handled there? + +.. 
code-block:: python + + rule cutadapt: + input: + fastq=expand( + "data/chipseq_samples/{sample}/{sample}_R{n}.fastq.gz", + n=n, allow_missing=True), + output: + fastq=expand( + "data/chipseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz", + n=n, allow_missing=True), + log: + "data/chipseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", + threads: 6 + resources: + mem="2g", + runtime="2h", + params: + extra=( + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + ), + se_pe_specific=( + "-o {output[0]} " + "-p {output[1]} " + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " + "{input.fastq[0]} " + "{input.fastq[1]} " + if not is_paired else + "{input.fastq[0]} " + "-o {output[0]} " + ) + shell: + "cutadapt " + "-j {threads} " + "{params.se_pe_specific} " + "{params.extra} " + "&> {log}" + +Now it becomes a little harder to understand what's going on, and we may have +gone too far in pulling everything out into params. So maybe an absolute +principle of "everything in params" is not useful. + +Let's take another example, the featureCounts rule for RNA-seq: + +.. code-block:: python + + rule featurecounts: + input: + annotation=rules.gtf.output, + bam=rules.markduplicates.output.bam, + output: + "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt", + log: + "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt.log", + threads: 8 + resources: + mem="16g", + runtime="2h", + params: + strand_arg={ + "unstranded": "-s0", + "fr-firststrand": "-s2", + "fr-secondstrand": "-s1", + }[config["stranded"]], + se_pe_specific=( + "-p --countReadPairs" if is_paired + else "" + ), + extra="", + run: + shell( + "featureCounts " + "{params.strand_arg} " + "{params.se_pe_specific} " + "{params.extra} " + "-T {threads} " + "-a {input.annotation} " + "-o {output} " + "{input.bam} " + "&> {log}" + ) + +Here, it is important to have ``strand_arg`` be in the params. 
To understand +why, imagine if instead we determined that argument inside the ``run:`` block, +and then we changed the config file's stranded entry (``config["stranded"]``). +Then this rule would NOT re-run because the code didn't change -- Snakemake +does not *evaluate* the code in a ``run:`` block to determine if it changed. +However, it *does* evaluate the params. So in this case, it's necessary to keep +the strand argument detection in the params to take advantage of this behavior, +and correctly re-run the rule if the config's strand argument has changed. + +Next, we would want to decide whether *all* arguments should go in ``params:``. +In this case, since we're sort of forced to split out ``strand_arg``, we might +as well split everything out. + +In the end we have these observations: + +- strand-specific arguments *must* be in ``params:`` +- some tools have SE/PE-specific arguments. These need an ``if`` clause + *somewhere*, whether in a ``run:`` block or in ``params:`` +- understandability and configuration flexibility are important goals of lcdb-wf +- factoring out *everything* into params weakens understandibility + + +Guidelines: + +- Stranded arguments must be in params +- SE/PE arguments should be handled inside a ``run:`` block +- Any other arguments should be written in a ``shell:`` block or a ``shell()`` + call directly, to visually match the equivalent command-line call From d4925c46bc661f27f1191ed2a810a7d6eb535587 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 7 Oct 2025 15:27:38 +0000 Subject: [PATCH 108/196] overhaul of rnaseq and chipseq; rm references --- workflows/chipseq/Snakefile | 241 ++++++++++++++++----------- workflows/references/Snakefile | 294 --------------------------------- workflows/rnaseq/Snakefile | 119 +++++-------- 3 files changed, 184 insertions(+), 470 deletions(-) delete mode 100644 workflows/references/Snakefile diff --git a/workflows/chipseq/Snakefile 
b/workflows/chipseq/Snakefile index ce1243a9..1ce812db 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -11,9 +11,6 @@ from lib import chipseq configfile: "config/config.yaml" -include: "../references/Snakefile" - - REFERENCES = config.get("reference_dir", "../../references") sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) @@ -27,6 +24,7 @@ peaks = chipseq.add_bams_to_peak_calling(config) wildcard_constraints: n="[1,2]", sample="|".join(SAMPLES), + ext=".fa|.gtf", localrules: @@ -34,43 +32,93 @@ localrules: symlink_targets, -rule targets: +rule all: input: "data/chipseq_aggregation/multiqc.html", expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", label=LABELS), [v["bed"] for k, v in peaks.items()], -if utils.detect_sra(sampletable): - sampletable['orig_filename'] = expand( - 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=1) +# If the sampletable is from SRA, handle it here. 
+include: "sra.smk" - if is_paired: - sampletable['orig_filename_R2'] = expand( - 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=2) +rule fasta: + output: + temporary(f"{REFERENCES}/genome.fa.gz"), + log: + f"{REFERENCES}/logs/genome.fa.gz.log", + resources: + mem_mb="4g", + runtime="2h", + run: + utils.download_and_postprocess( + urls=config["fasta"]["url"], + postprocess=config["fasta"].get("postprocess", None), + outfile=output[0], + log=log, + ) - rule fastq_dump: - output: - fastq=expand('original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', n=n, allow_missing=True) - log: - 'original_data/sra_samples/{sample}/{sample}.fastq.gz.log' - params: - is_paired=is_paired, - # extra="-X 100000", # [enable for test] - resources: - mem="1g", - disk="1g", - runtime="2h", - run: - srr = sampletable.loc[wildcards.sample, "Run"] - extra = params.get("extra", "") - if is_paired: - shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") - shell("mv {srr}_1.fastq.gz {output[0]}") - shell("mv {srr}_2.fastq.gz {output[1]}") - else: - shell("fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp") - shell("mv {output[0]}.tmp {output[0]}") + +rule chromsizes: + input: + f"{REFERENCES}/genome.fa.gz", + output: + f"{REFERENCES}/genome.chromsizes", + log: + f"{REFERENCES}/logs/genome.chromsizes.log", + params: + java_args="-Xmx20g", # [disable for test] + # java_args='-Xmx2g' # [enable for test] + resources: + mem="24g", + runtime="2h", + shell: + "export LC_COLLATE=C; " + "rm -f {output}.tmp " + "&& picard " + "{params.java_args} " + "CreateSequenceDictionary R={input} O={output}.tmp &> {log} " + '&& grep "^@SQ" {output}.tmp ' + """| awk '{{print $2, $3}}' """ + '| sed "s/SN://g;s/ LN:/\\t/g" ' + "| sort -k1,1 > {output} " + "&& rm -f {output}.tmp " + + +rule unzip: + input: + f"{REFERENCES}/{{prefix}}{{ext}}.gz", + output: + f"{REFERENCES}/{{prefix}}{{ext}}", + resources: + mem="4g", + runtime="2h", + shell: + "gunzip -c 
{input} > {output}" + + +rule bowtie2_index: + input: + f"{REFERENCES}/genome.fa", + output: + f"{REFERENCES}/bowtie2/genome.1.bt2", + f"{REFERENCES}/bowtie2/genome.fa", + log: + f"{REFERENCES}/logs/bowtie2_genome.log", + resources: + mem="32g", + disk="50g", + runtime="8h", + threads: 8 + run: + prefix = subpath(output[0], strip_suffix=".1.bt2") + shell( + "bowtie2-build " + "--threads {threads} " + "{input} " + "{prefix} &> {log}" + ) + utils.make_relative_symlink(input[0], output[-1]) rule symlinks: @@ -119,9 +167,9 @@ rule cutadapt: if is_paired: shell( "cutadapt " + "-j {threads} " "-o {output[0]} " "-p {output[1]} " - "-j {threads} " "--nextseq-trim 20 " "--overlap 6 " "--minimum-length 25 " @@ -134,8 +182,8 @@ rule cutadapt: else: shell( "cutadapt " - "-o {output[0]} " "-j {threads} " + "-o {output[0]} " "--nextseq-trim 20 " "--overlap 6 " "--minimum-length 25 " @@ -145,20 +193,29 @@ rule cutadapt: ) + rule fastqc: input: - "{sample_dir}/{sample}/{sample}{suffix}", + "data/chipseq_samples/{sample}/{sample}{suffix}", threads: 1 output: - html="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html", - zip="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip", + html="data/chipseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.html", + zip="data/chipseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.zip", resources: mem="8g", runtime="2h", log: - "{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.log", + "data/chipseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.log", run: + # Calculate the paths FastQC will create so we can move them to + # specified output files if needed. outdir = os.path.dirname(output.html) or "." 
+ outfile = os.path.basename(input[0]) + for s in [".fastq", ".fq", ".gz", ".bam"]: + outfile = outfile.replace(s, "") + out_zip = os.path.join(outdir, outfile + "_fastqc.zip") + out_html = os.path.join(outdir, outfile + "_fastqc.html") + shell( "fastqc " "--noextract " @@ -167,13 +224,9 @@ rule fastqc: "{input} " "&> {log} " ) - outfile = os.path.basename(input[0]) - for s in [".fastq", ".fq", ".gz", ".bam"]: - outfile = outfile.replace(s, "") - out_zip = os.path.join(outdir, outfile + "_fastqc.zip") + if not os.path.abspath(out_zip) == os.path.abspath(output.zip): shell("mv {out_zip} {output.zip}") - out_html = os.path.join(outdir, outfile + "_fastqc.html") if not os.path.abspath(out_html) == os.path.abspath(output.html): shell("mv {out_html} {output.html}") @@ -182,16 +235,7 @@ rule bowtie2: input: fastq=expand( "data/chipseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz", n=n, allow_missing=True), - index=multiext( - f"{REFERENCES}/bowtie2/genome", - ".1.bt2", - ".2.bt2", - ".3.bt2", - ".4.bt2", - ".rev.1.bt2", - ".rev.2.bt2", - ".fa", - ), + index=f"{REFERENCES}/bowtie2/genome.1.bt2", output: bam=temporary("data/chipseq_samples/{sample}/{sample}.cutadapt.bam"), log: @@ -201,28 +245,26 @@ rule bowtie2: mem="32g", runtime="2h", run: - prefix = os.path.commonprefix(input.index).rstrip(".") - sam = output.bam.replace(".bam", ".sam") - fastqs = ( - f"-1 {input.fastq[0]} -2 {input.fastq[1]}" - if is_paired - else f"-U {input.fastq}" - ) + prefix = subpath(input.index, strip_suffix=".1.bt2") + + if is_paired: + fastqs = f"-1 {input.fastq[0]} -2 {input.fastq[1]}" + else: + fastqs = f"-U {input.fastq}" + shell( "bowtie2 " - "-x {prefix} " + f"-x {prefix} " "{fastqs} " - "--no-unal " "--threads {threads} " - "-S {sam} " - "> {log} 2>&1" - ) - + "--no-unal " + "-S {output.bam}.sam " + "> {log} 2>&1 ") shell( - "samtools view -Sb {sam} " - "| samtools sort - -o {output.bam} -O BAM " - "&& rm {sam}" + "samtools view -Sb {output.bam}.sam " + "| samtools sort -O BAM - -o 
{output.bam}" ) + shell("rm {output.bam}.sam") rule unique: @@ -235,12 +277,15 @@ rule unique: mem="1g", runtime="2h", params: + shell: + "samtools view " + "-b " # NOTE: the quality score chosen here should reflect the scores output # by the aligner used. For example, STAR uses 255 as max mapping # quality. - extra="-q 20", - shell: - "samtools view -b {params.extra} {input} > {output}" + "-q 20 " + "{input} " + "> {output}" rule fastq_count: @@ -304,8 +349,8 @@ rule markduplicates: "MarkDuplicates " "-INPUT {input.bam} " "-OUTPUT {output.bam} " - "-REMOVE_DUPLICATES true " "-METRICS_FILE {output.metrics} " + "-REMOVE_DUPLICATES true " "-VALIDATION_STRINGENCY LENIENT " "&> {log}" @@ -379,20 +424,12 @@ rule bigwig: "-p {threads} " "--minMappingQuality 20 " "--ignoreDuplicates " - # Can't use the CPM normalization for testing due to <1000 reads total - # in example data - "--normalizeUsing CPM " # [disable for test] "--extendReads 300 " + "--normalizeUsing CPM " # [disable for test] "&> {log}" rule fingerprint: - """ - Runs deepTools plotFingerprint to assess how well the ChIP experiment - worked. - - Note: uses the merged techreps. - """ input: bams=lambda wc: expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=wc.ip_label), control=lambda wc: expand( @@ -429,20 +466,19 @@ rule fingerprint: # The JSDsample argument is disabled for testing as it dramatically # increases the run time. 
"{jsdsample_arg} " # [disable for test] - "--smartLabels " - "--extendReads=300 " - "--skipZeros " "--outQualityMetrics {output.metrics} " "--outRawCounts {output.raw_counts} " "--plotFile {output.plot} " # Default is 500k; use fewer to speed up testing: # '--numberOfSamples 50 ' # [enable for test] + "--smartLabels " + "--extendReads=300 " + "--skipZeros " "&> {log} " '&& sed -i "s/NA/0.0/g" {output.metrics} ' ) - rule macs: input: ip=lambda wc: expand( @@ -560,11 +596,11 @@ rule plotcorrelation: shell: "plotCorrelation " "--corData {input} " + "--plotFile {output.heatmap} " + "--outFileCorMatrix {output.tab} " "--corMethod spearman " "--whatToPlot heatmap " - "--plotFile {output.heatmap} " "--colorMap Reds " - "--outFileCorMatrix {output.tab}" # NOTE: if you're expecting negative correlation, try a divergent # colormap and setting the min/max to ensure that the colomap is # centered on zero: @@ -573,7 +609,7 @@ rule plotcorrelation: # '--zMax 1 ' -rule idxstats: +rule samtools_idxstats: input: bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam", bai="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.bai", @@ -588,7 +624,7 @@ rule idxstats: "samtools idxstats {input.bam} 2> {log} 1> {output.txt}" -rule flagstat: +rule samtools_flagstat: input: bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam", bai="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.bai", @@ -620,15 +656,20 @@ rule samtools_stats: rule multiqc: input: - expand("data/chipseq_samples/{sample}/{sample}.cutadapt.bam", sample=SAMPLES), - expand("data/chipseq_samples/{sample}/fastqc/{sample}_R1.fastq.gz_fastqc.zip", sample=SAMPLES), - expand("data/chipseq_samples/{sample}/fastqc/{sample}_R1.cutadapt.fastq.gz_fastqc.zip", sample=SAMPLES), - expand("data/chipseq_samples/{sample}/fastqc/{sample}.cutadapt.unique.nodups.bam_fastqc.zip", sample=SAMPLES), - expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", 
label=sampletable.label), - expand("data/chipseq_samples/{sample}/samtools_stats_{sample}.txt", sample=SAMPLES), - expand("data/chipseq_samples/{sample}/samtools_flagstat_{sample}.txt", sample=SAMPLES), - expand("data/chipseq_samples/{sample}/samtools_idxstats_{sample}.txt", sample=SAMPLES), - expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=sampletable.label), + expand( + rules.fastqc.output.zip, + sample=SAMPLES, + suffix=["_R1.fastq.gz", "_R1.cutadapt.fastq.gz", ".cutadapt.bam"], + ), + expand(rules.cutadapt.output, sample=SAMPLES), + expand(rules.bowtie2.output, sample=SAMPLES), + expand(rules.markduplicates.output, sample=SAMPLES), + expand(rules.unique.output, sample=SAMPLES), + expand(rules.samtools_stats.output, sample=SAMPLES), + expand(rules.samtools_flagstat.output, sample=SAMPLES), + expand(rules.samtools_idxstats.output, sample=SAMPLES), + expand(rules.bigwig.output, label=sampletable.label), + expand(rules.merge_techreps.output, label=sampletable.label), expand( "data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics", ip_label=sampletable.loc[sampletable.antibody != "input", "label"], @@ -638,8 +679,6 @@ rule multiqc: sample=SAMPLES ) if is_paired else [], [v["bigbed"] for v in peaks.values()], - "data/chipseq_aggregation/deeptools/plotcorrelation.tab", - "data/chipseq_aggregation/deeptools/multibigwigsummary.tab", config="config/multiqc_config.yaml", output: "data/chipseq_aggregation/multiqc.html", diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile deleted file mode 100644 index 6ee892f8..00000000 --- a/workflows/references/Snakefile +++ /dev/null @@ -1,294 +0,0 @@ -import os -import sys -import pandas - -sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") -from lib import utils - -REFERENCES = config.get("reference_dir", "../../references") - - -def default_postprocess(origfn, newfn): - shell("mv {origfn} {newfn}") - - -rule fasta: - output: - 
temporary(f"{REFERENCES}/genome.fa.gz"), - log: - f"{REFERENCES}/logs/genome.fa.gz.log", - resources: - mem_mb="4g", - runtime="2h", - run: - utils.download_and_postprocess( - urls=config["fasta"]["url"], - postprocess=config["fasta"].get("postprocess", None), - outfile=output[0], - log=log, - ) - - -rule gtf: - output: - temporary(f"{REFERENCES}/annotation.gtf.gz"), - log: - f"{REFERENCES}/logs/annotation.gtf.gz.log", - resources: - mem="4g", - runtime="2h", - run: - utils.download_and_postprocess( - urls=config["gtf"]["url"], - postprocess=config["gtf"].get("postprocess", None), - outfile=output[0], - log=log, - ) - - -rule rrna: - output: - temporary(f"{REFERENCES}/rrna.fa.gz"), - log: - f"{REFERENCES}/logs/rrna.fa.gz.log", - resources: - mem="4g", - runtime="2h", - run: - utils.download_and_postprocess( - urls=config["rrna"]["url"], - postprocess=config["rrna"].get("postprocess", None), - outfile=output[0], - log=log, - ) - - -rule unzip: - input: - f"{REFERENCES}/{{prefix}}.gz", - output: - f"{REFERENCES}/{{prefix}}", - resources: - mem="4g", - runtime="2h", - shell: - "gunzip -c {input} > {output}" - - -rule bowtie2_index: - input: - f"{REFERENCES}/{{label}}.fa", - output: - multiext( - f"{REFERENCES}/bowtie2/{{label}}", - ".1.bt2", - ".2.bt2", - ".3.bt2", - ".4.bt2", - ".rev.1.bt2", - ".rev.2.bt2", - ".fa", - ), - log: - f"{REFERENCES}/logs/bowtie2_{{label}}.log", - resources: - mem="32g", - disk="50g", - runtime="8h", - threads: 8 - run: - index = os.path.commonprefix(output).rstrip(".") - shell("bowtie2-build" " --threads {threads}" " {input}" " {index}" " &> {log}") - utils.make_relative_symlink(input[0], output[-1]) - - -rule star_index: - input: - fasta=f"{REFERENCES}/genome.fa", - gtf=f"{REFERENCES}/annotation.gtf", - output: - f"{REFERENCES}/star/Genome", - log: - f"{REFERENCES}/logs/star.log", - threads: 8 - resources: - mem="64g", - runtime="8h", - run: - genomedir = os.path.dirname(output[0]) - shell("rm -r {genomedir}") - shell("mkdir -p 
{genomedir}") - shell( - "STAR " - "--runMode genomeGenerate " - "--runThreadN {threads} " - "--genomeDir {genomedir} " - "--genomeFastaFiles {input.fasta} " - # NOTE: GTF is optional - "--sjdbGTFfile {input.gtf} " - # NOTE: STAR docs say that 100 should work well. - "--sjdbOverhang 100 " - # NOTE: for small genomes, may need to scale this down to - # min(14, log2(GenomeLength) / 2 - 1) - # --genomeSAindexNbases 14 - "&> {log}" - ) - # STAR writes a hard-coded Log.out file to the current working - # directory. So put that on the end of the log file for the rule and - # then clean up. - shell("cat {genomedir}/Log.out >> {log} && rm {genomedir}/Log.out") - shell("ln -s {input.fasta} {genomedir}") - - -rule transcriptome_fasta: - input: - fasta=f"{REFERENCES}/genome.fa", - gtf=f"{REFERENCES}/annotation.gtf", - output: - f"{REFERENCES}/transcriptome.fa", - resources: - mem="4g", - runtime="2h", - shell: - "gffread {input.gtf} -w {output} -g {input.fasta}" - - -rule salmon_index: - input: - f"{REFERENCES}/transcriptome.fa", - output: - f"{REFERENCES}/salmon/versionInfo.json", - log: - f"{REFERENCES}/logs/salmon.log", - params: - outdir=f"{REFERENCES}/salmon", - resources: - mem="32g", - runtime="2h", - run: - outdir = os.path.dirname(output[0]) - shell("salmon index " "--transcripts {input} " "--index {outdir} " "&> {log}") - - -rule kallisto_index: - output: - f"{REFERENCES}/kallisto/transcripts.idx", - input: - f"{REFERENCES}/genome.fa", - log: - f"{REFERENCES}/logs/kallisto.log", - resources: - mem="32g", - runtime="2h", - shell: - "kallisto index " - "--index {output} " - "{input} " - "&> {log}" - - -rule conversion_refflat: - input: - f"{REFERENCES}/annotation.gtf", - output: - f"{REFERENCES}/annotation.refflat", - log: - f"{REFERENCES}/logs/annotation.refflat.log", - resources: - mem="2g", - runtime="2h", - shell: - "gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp " - """&& awk '{{print $1"\t"$0}}' {output}.tmp > {output} """ - "&& rm {output}.tmp " - 
- -rule conversion_bed12: - input: - f"{REFERENCES}/annotation.gtf", - output: - f"{REFERENCES}/annotation.bed12", - resources: - mem="2g", - runtime="2h", - shell: - "gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp " - "&& genePredToBed {output}.tmp {output} " - "&& rm {output}.tmp" - - -rule chromsizes: - input: - f"{REFERENCES}/genome.fa", - output: - f"{REFERENCES}/genome.chromsizes", - log: - f"{REFERENCES}/logs/genome.chromsizes.log", - params: - # NOTE: Be careful with the memory here; make sure you have enough - # and/or it matches the resources you're requesting - java_args="-Xmx20g", - # java_args='-Xmx2g' # [TEST SETTINGS -1] - resources: - mem="24g", - runtime="2h", - shell: - "export LC_COLLATE=C; " - "rm -f {output}.tmp " - "&& picard " - "{params.java_args} " - "CreateSequenceDictionary R={input} O={output}.tmp &> {log} " - '&& grep "^@SQ" {output}.tmp ' - """| awk '{{print $2, $3}}' """ - '| sed "s/SN://g;s/ LN:/\\t/g" ' - "| sort -k1,1 > {output} " - "&& rm -f {output}.tmp " - - -rule mappings: - """ - Creates gzipped TSV mapping between attributes in the GTF. 
- """ - input: - gtf=f"{REFERENCES}/annotation.gtf", - output: - f"{REFERENCES}/annotation.mapping.tsv.gz", - params: - include_featuretypes=lambda wildcards, output: conversion_kwargs[ - output[0] - ].get("include_featuretypes", []), - resources: - mem="2g", - runtime="2h", - run: - import gffutils - - # Will want to change the setting back to what it was originally when - # we're done - orig_setting = gffutils.constants.always_return_list - gffutils.constants.always_return_list = False - - include_featuretypes = params.include_featuretypes - - res = [] - for f in gffutils.DataIterator(input[0]): - - ft = f.featuretype - - if include_featuretypes and (ft not in include_featuretypes): - continue - - d = dict(f.attributes) - d["__featuretype__"] = ft - res.append(d) - - df = pandas.DataFrame(res) - - # Depending on how many attributes there were and the - # include_featuretypes settings, this may take a while. - df = df.drop_duplicates() - - df.to_csv(output[0], sep="\t", index=False, compression="gzip") - - # Restore original setting - gffutils.constants.always_return_list = orig_setting diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 4f5e907e..650d9cfd 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -16,6 +16,7 @@ sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" n = ["1", "2"] if is_paired else ["1"] SAMPLES = sampletable.index +sample_dir = "data/rnaseq_samples" wildcard_constraints: @@ -129,8 +130,13 @@ rule rrna_index: runtime="8h", threads: 8 run: - index = f"{REFERENCES}/bowtie2/rrna" - shell("bowtie2-build" " --threads {threads}" " {input}" " {index}" " &> {log}") + prefix = subpath(output[0], strip_suffix=".1.bt2") + shell( + "bowtie2-build " + "--threads {threads} " + "{input} " + "{prefix} &> {log}" + ) utils.make_relative_symlink(input[0], output[-1]) @@ -199,7 +205,12 @@ rule salmon_index: runtime="2h", run: outdir = 
os.path.dirname(output[0]) - shell("salmon index " "--transcripts {input} " "--index {outdir} " "&> {log}") + shell( + "salmon index " + "--transcripts {input} " + "--index {outdir} " + "&> {log}" + ) rule conversion_refflat: @@ -240,8 +251,8 @@ rule chromsizes: log: f"{REFERENCES}/logs/genome.chromsizes.log", params: - java_args="-Xmx20g", - # java_args='-Xmx2g' # [TEST SETTINGS -1] + java_args="-Xmx20g", # [disable for test] + # java_args='-Xmx2g' # [enable for test] resources: mem="24g", runtime="2h", @@ -275,10 +286,6 @@ rule mappings: runtime="2h", run: import gffutils - - # Will want to change the setting back to what it was originally when - # we're done - orig_setting = gffutils.constants.always_return_list gffutils.constants.always_return_list = False include_featuretypes = params.include_featuretypes @@ -303,9 +310,6 @@ rule mappings: df.to_csv(output[0], sep="\t", index=False, compression="gzip") - # Restore original setting - gffutils.constants.always_return_list = orig_setting - rule symlink_targets: input: @@ -370,22 +374,26 @@ rule fastqc: log: "data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.log", run: + # Calculate the paths FastQC will create so we can move them to + # specified output files if needed. outdir = os.path.dirname(output.html) or "." 
+ outfile = os.path.basename(input[0]) + for s in [".fastq", ".fq", ".gz", ".bam"]: + outfile = outfile.replace(s, "") + out_zip = os.path.join(outdir, outfile + "_fastqc.zip") + out_html = os.path.join(outdir, outfile + "_fastqc.html") + shell( "fastqc " "--noextract " "--quiet " "--outdir {outdir} " "{input} " - "2> {log} " + "&> {log} " ) - outfile = os.path.basename(input[0]) - for s in [".fastq", ".fq", ".gz", ".bam"]: - outfile = outfile.replace(s, "") - out_zip = os.path.join(outdir, outfile + "_fastqc.zip") + if not os.path.abspath(out_zip) == os.path.abspath(output.zip): shell("mv {out_zip} {output.zip}") - out_html = os.path.join(outdir, outfile + "_fastqc.html") if not os.path.abspath(out_html) == os.path.abspath(output.html): shell("mv {out_html} {output.html}") @@ -466,19 +474,23 @@ rule rRNA: resources: mem="2g", runtime="2h", - shell: - "bowtie2 " - f"-x {REFERENCES}/bowtie2/rrna " - "-U {input.fastq} " - "--threads {threads} " - "-k 1 " - "--no-unal " - "-S {output.bam}.sam " - "> {log} 2>&1 " - "&& samtools view -Sb {output.bam}.sam " - "| samtools sort - -o {output.bam} -O BAM " - "&& rm {output.bam}.sam" - + params: + run: + prefix = subpath(input.index, strip_suffix=".1.bt2") + shell( + "bowtie2 " + f"-x {prefix} " + "-U {input.fastq} " + "--threads {threads} " + "--no-unal " + "-k 1 " + "-S {output.bam}.sam " + "> {log} 2>&1 ") + shell( + "samtools view -Sb {output.bam}.sam " + "| samtools sort -O BAM - -o {output.bam}" + ) + shell("rm {output.bam}.sam") rule fastq_count: input: @@ -564,11 +576,8 @@ rule featurecounts: "fr-firststrand": "-s2 ", "fr-secondstrand": "-s1 ", }[config["stranded"]], - extra="", run: - p_arg = "" - if is_paired: - p_arg = "-p --countReadPairs " + p_arg = "-p --countReadPairs " if is_paired else "" shell( "featureCounts " "{params.strand_arg} " @@ -715,45 +724,6 @@ rule salmon: ) -rule kallisto: - input: - fastq=rules.cutadapt.output, - index=REFERENCES + "/kallisto/transcripts.idx", - output: - 
"data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5", - log: - "data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5.log", - threads: 8 - resources: - mem="32g", - runtime="2h", - params: - strand_arg={ - "unstranded": "", - "fr-firststrand": "--rf-stranded", - "fr-secondstrand": "--fr-stranded", - }[config["stranded"]], - extra=( - "--bootstrap-samples 100" - if is_paired - else "--single --fragment-length 300 --sd 20 --bootstrap-samples 100" - ), - run: - outdir = os.path.dirname(output[0]) - shell( - "kallisto quant " - "--index {input.index} " - "--output-dir {outdir} " - "--threads {threads} " - "--bootstrap-samples 100 " - "--threads {threads} " - "{params.strand_arg} " - "{params.extra} " - "{input.fastq} " - "&> {log}" - ) - - rule rseqc_infer_experiment: input: bam=rules.markduplicates.output, @@ -903,7 +873,6 @@ rule multiqc: ), expand(rules.markduplicates.output, sample=SAMPLES), expand(rules.salmon.output, sample=SAMPLES), - expand(rules.kallisto.output, sample=SAMPLES), expand(rules.preseq.output, sample=SAMPLES), expand(rules.collectrnaseqmetrics.output, sample=SAMPLES), expand(rules.samtools_stats.output, sample=SAMPLES), From 3b5a3beec95886bc3059167a6944ef160c63235b Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 8 Oct 2025 13:03:43 +0000 Subject: [PATCH 109/196] fix paths in deploy --- deploy.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/deploy.py b/deploy.py index 8270348d..11969246 100755 --- a/deploy.py +++ b/deploy.py @@ -91,7 +91,7 @@ def write_include_file(source, flavor="all"): ], "chipseq": [ "include workflows/chipseq/Snakefile", - "include workflows/rnaseq/sra.smk", + "include workflows/chipseq/sra.smk", "recursive-include workflows/chipseq/config *", "include workflows/chipseq/chipseq_trackhub.py", ], @@ -100,8 +100,7 @@ def write_include_file(source, flavor="all"): "recursive-include include *", "recursive-include lib *", "include env.yml env-r.yml 
.gitignore", - "include workflows/references/Snakefile", - "recursive-include workflows/references/config *", + "recursive-include scripts *", "global-exclude __pycache__", ], "full": [ From fc9af3afe814d43351febc7c760c6f258dff7b26 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 8 Oct 2025 14:02:54 +0000 Subject: [PATCH 110/196] references use params to properly trigger from config changes --- workflows/chipseq/Snakefile | 8 ++++++-- workflows/rnaseq/Snakefile | 21 +++++++++++++++------ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 1ce812db..59292ef3 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -42,6 +42,7 @@ rule all: # If the sampletable is from SRA, handle it here. include: "sra.smk" + rule fasta: output: temporary(f"{REFERENCES}/genome.fa.gz"), @@ -50,10 +51,13 @@ rule fasta: resources: mem_mb="4g", runtime="2h", + params: + urls=config["fasta"]["url"], + postprocess=config["fasta"].get("postprocess", None) run: utils.download_and_postprocess( - urls=config["fasta"]["url"], - postprocess=config["fasta"].get("postprocess", None), + urls=params.urls, + postprocess=params.postprocess, outfile=output[0], log=log, ) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 650d9cfd..5e8fb069 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -61,10 +61,13 @@ rule fasta: resources: mem_mb="4g", runtime="2h", + params: + urls=config["fasta"]["url"], + postprocess=config["fasta"].get("postprocess", None) run: utils.download_and_postprocess( - urls=config["fasta"]["url"], - postprocess=config["fasta"].get("postprocess", None), + urls=params.urls, + postprocess=params.postprocess, outfile=output[0], log=log, ) @@ -78,10 +81,13 @@ rule gtf: resources: mem="4g", runtime="2h", + params: + urls=config["gtf"]["url"], + postprocess=config["gtf"].get("postprocess", None) run: 
utils.download_and_postprocess( - urls=config["gtf"]["url"], - postprocess=config["gtf"].get("postprocess", None), + urls=params.urls, + postprocess=params.postprocess, outfile=output[0], log=log, ) @@ -95,10 +101,13 @@ rule rrna_fasta: resources: mem="4g", runtime="2h", + params: + urls=config["rrna"]["url"], + postprocess=config["rrna"].get("postprocess", None) run: utils.download_and_postprocess( - urls=config["rrna"]["url"], - postprocess=config["rrna"].get("postprocess", None), + urls=params.urls, + postprocess=params.postprocess, outfile=output[0], log=log, ) From 82eb7c2851b968626085b4ee3e76ef9290acd73f Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 8 Oct 2025 14:03:29 +0000 Subject: [PATCH 111/196] no longer mark references as temporary --- workflows/chipseq/Snakefile | 2 +- workflows/rnaseq/Snakefile | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 59292ef3..06dcd8cf 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -45,7 +45,7 @@ include: "sra.smk" rule fasta: output: - temporary(f"{REFERENCES}/genome.fa.gz"), + f"{REFERENCES}/genome.fa.gz", log: f"{REFERENCES}/logs/genome.fa.gz.log", resources: diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 5e8fb069..a70ea5a6 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -55,7 +55,7 @@ rule symlinks: rule fasta: output: - temporary(f"{REFERENCES}/genome.fa.gz"), + f"{REFERENCES}/genome.fa.gz", log: f"{REFERENCES}/logs/genome.fa.gz.log", resources: @@ -75,7 +75,7 @@ rule fasta: rule gtf: output: - temporary(f"{REFERENCES}/annotation.gtf.gz"), + f"{REFERENCES}/annotation.gtf.gz", log: f"{REFERENCES}/logs/annotation.gtf.gz.log", resources: @@ -224,7 +224,7 @@ rule salmon_index: rule conversion_refflat: input: - f"{REFERENCES}/annotation.gtf.gz", + f"{REFERENCES}/annotation.gtf", output: 
f"{REFERENCES}/annotation.refflat", log: @@ -240,7 +240,7 @@ rule conversion_refflat: rule conversion_bed12: input: - f"{REFERENCES}/annotation.gtf.gz", + f"{REFERENCES}/annotation.gtf", output: f"{REFERENCES}/annotation.bed12", resources: @@ -283,7 +283,7 @@ rule mappings: Creates gzipped TSV mapping between attributes in the GTF. """ input: - gtf=f"{REFERENCES}/annotation.gtf.gz", + gtf=f"{REFERENCES}/annotation.gtf", output: f"{REFERENCES}/annotation.mapping.tsv.gz", params: @@ -412,7 +412,7 @@ rule star: input: fastq=rules.cutadapt.output, index=rules.star_index.output, - annotation=f"{REFERENCES}/annotation.gtf.gz", + annotation=f"{REFERENCES}/annotation.gtf", output: bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), sjout=temporary( @@ -569,7 +569,7 @@ rule markduplicates: rule featurecounts: input: - annotation=rules.gtf.output, + annotation=f"{REFERENCES}/annotation.gtf", bam=rules.markduplicates.output.bam, output: "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt", From 99feb61924b86abb7d03b53673efdaf25fa18a40 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 8 Oct 2025 14:03:47 +0000 Subject: [PATCH 112/196] add featurecounts aggreation to multiqc input --- workflows/rnaseq/Snakefile | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index a70ea5a6..80284e27 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -892,6 +892,7 @@ rule multiqc: expand(rules.bigwig_pos.output, sample=SAMPLES), expand(rules.bigwig_neg.output, sample=SAMPLES), rules.rrna_libsizes_table.output, + rules.aggregate_featurecounts.output, ), config="config/multiqc_config.yaml", output: From 523d3c34c72566e863cade9326c168e851d7be31 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 8 Oct 2025 14:04:20 +0000 Subject: [PATCH 113/196] rm kallisto from rnaseq --- workflows/rnaseq/downstream/config.yaml | 
4 ---- workflows/rnaseq/downstream/rnaseq.Rmd | 32 +++++++------------------ 2 files changed, 8 insertions(+), 28 deletions(-) diff --git a/workflows/rnaseq/downstream/config.yaml b/workflows/rnaseq/downstream/config.yaml index 2ec85070..a8a826b3 100644 --- a/workflows/rnaseq/downstream/config.yaml +++ b/workflows/rnaseq/downstream/config.yaml @@ -134,10 +134,6 @@ toggle: # `salmon=TRUE` argument to lcdbwf::make_dds. salmon: FALSE - # Import Kallisto results instead of featureCounts? See similar notes above - # for Salmon. - kallisto: FALSE - # Create diagnostic plots for all dds objects? dds_diagnostics: TRUE diff --git a/workflows/rnaseq/downstream/rnaseq.Rmd b/workflows/rnaseq/downstream/rnaseq.Rmd index d21d13f2..50404469 100644 --- a/workflows/rnaseq/downstream/rnaseq.Rmd +++ b/workflows/rnaseq/downstream/rnaseq.Rmd @@ -15,7 +15,6 @@ knitr::opts_chunk$set( message=FALSE, cache.extra_file_dep_1 = file.info('../config/sampletable.tsv')$mtime, cache.extra_file_dep_2 = file.info('../data/rnaseq_aggregation/featurecounts.txt')$mtime, - cache.extra_file_dep_3 = file.info('../data/rnaseq_samples/*/*.kallisto/abundance.h5')$mtime, cache.extra_file_dep_4 = file.info('../data/rnaseq_samples/*/*.salmon/quant.sf')$mtime ) ``` @@ -90,7 +89,7 @@ colData <- read.table(config$main$sampletable, sep='\t', header=TRUE, stringsAsF rownames(colData) <- colData[,1] ``` -```{r dds_initial, cache=TRUE, config=c(config$main, config$toggle$salmon, config$toggle$kallisto)} +```{r dds_initial, cache=TRUE, config=c(config$main, config$toggle$salmon)} # Convert featureCounts gene-level counts into DESeq2 object, and run # variance-stabiliizing transform. 
dds_initial <- lcdbwf:::make_dds( @@ -106,7 +105,7 @@ vsd <- varianceStabilizingTransformation(dds_initial, blind=TRUE) Here is the sample table with metadata used for this analysis: ```{r print_coldata} -exclude.for.printing <- c('featurecounts.path', 'salmon.path', 'kallisto.path', +exclude.for.printing <- c('featurecounts.path', 'salmon.path', 'orig_filename', 'orig_filename_R2', 'layout', 'sizeFactor') colData(dds_initial) %>% @@ -138,8 +137,8 @@ for(group in config$plotting$covariates_for_plots){ } ``` -```{r sizefactors, results='asis', eval=!(config$toggle$salmon | config$toggle$kallisto)} -# Note that when loading Salmon or Kallisto, DESeq2 does not calculate size +```{r sizefactors, results='asis', eval=!(config$toggle$salmon)} +# Note that when loading Salmon, DESeq2 does not calculate size # factors. lcdbwf:::mdcat(text$sizefactors) @@ -180,13 +179,8 @@ lst <- list( design=~group, salmon=TRUE), - # Example 4: use kallisto - kallisto=list( - sampletable=colData, - design=~group, - kallisto=TRUE), - # Example 5: use LRT + # Example 4: use LRT LRT=list( sampletable=colData, design=~group, @@ -265,20 +259,10 @@ contr_03a_salmon <- lcdbwf:::make_results( ) ``` + ```{r results_04, dependson='dds_list', cache=TRUE} # Example 4: -# - like example 3, but kallisto instead of salmon -contr_03_kallisto <- lcdbwf:::make_results( - dds_name="kallisto", - contrast=c('group', 'treatment', 'control'), - type='normal', - label='Using Kallisto' -) -``` - -```{r results_05, dependson='dds_list', cache=TRUE} -# Example 5: -# - Examples 1-4 use the default DESeq2 test, Wald. +# - Examples 1-3 use the default DESeq2 test, Wald. # - Here, we use the nBinomLRT (LRT) test. # NOTE: Use 'type=NULL' to skip LFC shrinkage as # make_results sets all LRT LFC values to 0. @@ -288,7 +272,7 @@ contr_03_kallisto <- lcdbwf:::make_results( # make_results detects the 'test' type from the # dds object specified with 'dds_name'. 
-contr_05_lrt <- lcdbwf:::make_results( +contr_04_lrt <- lcdbwf:::make_results( dds_name="LRT", type=NULL, label='Using LRT' From 6548c60ab67279c6c2824428192a43e95e9b92eb Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 8 Oct 2025 14:04:27 +0000 Subject: [PATCH 114/196] disable results diagnostics by default --- workflows/rnaseq/downstream/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/rnaseq/downstream/config.yaml b/workflows/rnaseq/downstream/config.yaml index a8a826b3..740984c4 100644 --- a/workflows/rnaseq/downstream/config.yaml +++ b/workflows/rnaseq/downstream/config.yaml @@ -139,7 +139,7 @@ toggle: # Create diagnostic plots for results objects? If TRUE, will check the # config$plotting$diagnostics_for_results list. - results_diagnostics: TRUE + results_diagnostics: FALSE # ANNOTATION ------------------------------------------------------------------- # Configuration specific to annotations and databases From 19be1d9ced7b0f6c1b2e55567e4d772c15d4e8d9 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 08:38:25 -0400 Subject: [PATCH 115/196] more updates to decision log --- docs/decisions.rst | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/docs/decisions.rst b/docs/decisions.rst index ac808e63..74ddb01f 100644 --- a/docs/decisions.rst +++ b/docs/decisions.rst @@ -404,3 +404,31 @@ Guidelines: - SE/PE arguments should be handled inside a ``run:`` block - Any other arguments should be written in a ``shell:`` block or a ``shell()`` call directly, to visually match the equivalent command-line call + +Arguments for and against a separate references workflow +-------------------------------------------------------- + +RNA-seq, ChIP-seq, and the upcoming variant calling all need to do something +with references, including possibly patching them. So we have to deal with this +inherent complexity. 
It initially made sense to put such common rules in the +separate references workflow. + +However, only a subset of the rules in the references workflow are actually +shared across RNA-seq and ChIP-seq -- currently, only the bowtie2 index +(genome-wide ChIP-seq alignment; rRNA screening for RNA-seq), the fasta rule, +chromsizes, and the generic unzip rule. The others (gtf, mappings, +conversion_bed12, conversion_refflat, kallisto_index, salmon_index, +transcriptome_fasta, star_index, rrna) are all unique to RNA-seq. So the +current references workflow is actually mostly an RNA-seq-only references +workflow. + +Furthermore, much of the complexity is handled in the +lib.utils.download_and_postprocess function, rather than in the workflow rules. +We already are using the utils module separately in the ChIP-seq and RNA-seq +workflows, so there's no additional overhead to import it. + +Last, having a workflow split across two Snakefiles hampers the ability to +understand the complete workflow. + +Taken together, it made more sense to eliminate the references workflow +entirely, and port the rules to the respective workflows. 
From 72d77579ddb878072da92edbecff9b91f3ab8d22 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 08:55:44 -0400 Subject: [PATCH 116/196] rm kallisto throughout --- docs/config-yaml.rst | 7 ++--- env.yml | 1 - include/reference_configs/Danio_rerio.yaml | 1 - .../Dictyostelium_discoideum.yaml | 1 - .../Drosophila_melanogaster.yaml | 2 -- include/reference_configs/Gallus_gallus.yaml | 1 - include/reference_configs/Homo_sapiens.yaml | 3 --- include/reference_configs/Macaca_mulatta.yaml | 1 - include/reference_configs/Mus_musculus.yaml | 2 -- .../Plodia_interpunctella.yaml | 1 - .../reference_configs/Rattus_norvegicus.yaml | 1 - .../Saccharomyces_cerevisiae.yaml | 1 - .../Schizosaccharomyces_pombe.yaml | 1 - include/reference_configs/test.yaml | 1 - include/requirements.txt | 1 - lib/lcdbwf/R/dds.R | 27 +++++-------------- .../complex-dataset-rnaseq-config.yaml | 3 --- test/test_configs/test_file_uri.yaml | 4 --- 18 files changed, 10 insertions(+), 49 deletions(-) diff --git a/docs/config-yaml.rst b/docs/config-yaml.rst index 7d86ceef..f492c70a 100644 --- a/docs/config-yaml.rst +++ b/docs/config-yaml.rst @@ -319,9 +319,10 @@ Required for RNA-seq ``stranded`` field `````````````````` This field specifies the strandedness of the library. This is used by - various rule to set the parameters correctly. For example, - ``featureCounts`` will use ``-s0``, ``-s1``, or ``-s2`` accordingly; - ``kallisto`` will use ``--fr-stranded`` if needed, and so on. + various rule to set the parameters correctly. For example, if this is set to ``fr-firststrand`` then + ``featureCounts`` will use ``-s2``; CollectRnaSeqMetrics will use + ``STRAND=SECOND_READ_TRANSCRIPTION_STRAND``, and deepTools bamCoverage will + use ``-filterRNAstrand reverse``. 
This field can take the following options: diff --git a/env.yml b/env.yml index f7f89425..41739e83 100644 --- a/env.yml +++ b/env.yml @@ -144,7 +144,6 @@ dependencies: - jsonschema-specifications=2025.9.1 - jupyter_core=5.8.1 - kaleido-core=0.2.1 - - kallisto=0.51.1 - kernel-headers_linux-64=5.14.0 - keyring=25.6.0 - keyutils=1.6.3 diff --git a/include/reference_configs/Danio_rerio.yaml b/include/reference_configs/Danio_rerio.yaml index 038ef0ff..64f653df 100644 --- a/include/reference_configs/Danio_rerio.yaml +++ b/include/reference_configs/Danio_rerio.yaml @@ -23,7 +23,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' rRNA: genome: diff --git a/include/reference_configs/Dictyostelium_discoideum.yaml b/include/reference_configs/Dictyostelium_discoideum.yaml index 9037d0f6..f703343d 100644 --- a/include/reference_configs/Dictyostelium_discoideum.yaml +++ b/include/reference_configs/Dictyostelium_discoideum.yaml @@ -17,7 +17,6 @@ references: transcriptome: indexes: - "salmon" - - 'kallisto' rRNA: genome: diff --git a/include/reference_configs/Drosophila_melanogaster.yaml b/include/reference_configs/Drosophila_melanogaster.yaml index e228df7a..0e61fcad 100644 --- a/include/reference_configs/Drosophila_melanogaster.yaml +++ b/include/reference_configs/Drosophila_melanogaster.yaml @@ -40,7 +40,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' # Note: the mappings from r6.23 still work well for r6.28. 
r6-28: @@ -71,4 +70,3 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' diff --git a/include/reference_configs/Gallus_gallus.yaml b/include/reference_configs/Gallus_gallus.yaml index a618a5a9..13d6d49a 100644 --- a/include/reference_configs/Gallus_gallus.yaml +++ b/include/reference_configs/Gallus_gallus.yaml @@ -24,7 +24,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' rRNA: genome: diff --git a/include/reference_configs/Homo_sapiens.yaml b/include/reference_configs/Homo_sapiens.yaml index 58d292ec..ff6720f1 100644 --- a/include/reference_configs/Homo_sapiens.yaml +++ b/include/reference_configs/Homo_sapiens.yaml @@ -29,7 +29,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' gencode-v25: @@ -65,7 +64,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' gencode-v19: @@ -90,7 +88,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' rRNA: genome: diff --git a/include/reference_configs/Macaca_mulatta.yaml b/include/reference_configs/Macaca_mulatta.yaml index 111674c7..acefce08 100644 --- a/include/reference_configs/Macaca_mulatta.yaml +++ b/include/reference_configs/Macaca_mulatta.yaml @@ -24,7 +24,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' rRNA: genome: diff --git a/include/reference_configs/Mus_musculus.yaml b/include/reference_configs/Mus_musculus.yaml index ef0eb30f..316bb389 100644 --- a/include/reference_configs/Mus_musculus.yaml +++ b/include/reference_configs/Mus_musculus.yaml @@ -28,7 +28,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' gencode_m12: @@ -52,7 +51,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' rRNA: genome: diff --git a/include/reference_configs/Plodia_interpunctella.yaml b/include/reference_configs/Plodia_interpunctella.yaml index 214e907f..ea3c59ca 100644 --- a/include/reference_configs/Plodia_interpunctella.yaml +++ b/include/reference_configs/Plodia_interpunctella.yaml @@ -25,7 +25,6 @@ references: 
transcriptome: indexes: - 'salmon' - - 'kallisto' rRNA: genome: diff --git a/include/reference_configs/Rattus_norvegicus.yaml b/include/reference_configs/Rattus_norvegicus.yaml index 3405d9f3..e12db2a4 100644 --- a/include/reference_configs/Rattus_norvegicus.yaml +++ b/include/reference_configs/Rattus_norvegicus.yaml @@ -23,7 +23,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' rRNA: genome: diff --git a/include/reference_configs/Saccharomyces_cerevisiae.yaml b/include/reference_configs/Saccharomyces_cerevisiae.yaml index 1f536797..c965f7a5 100644 --- a/include/reference_configs/Saccharomyces_cerevisiae.yaml +++ b/include/reference_configs/Saccharomyces_cerevisiae.yaml @@ -29,7 +29,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' rRNA: genome: diff --git a/include/reference_configs/Schizosaccharomyces_pombe.yaml b/include/reference_configs/Schizosaccharomyces_pombe.yaml index bbef64c3..74dcca1a 100644 --- a/include/reference_configs/Schizosaccharomyces_pombe.yaml +++ b/include/reference_configs/Schizosaccharomyces_pombe.yaml @@ -22,7 +22,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' rRNA: genome: diff --git a/include/reference_configs/test.yaml b/include/reference_configs/test.yaml index a8f80b77..dc68f72d 100644 --- a/include/reference_configs/test.yaml +++ b/include/reference_configs/test.yaml @@ -38,7 +38,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' metadata: reference_genome_build: 'dm6' diff --git a/include/requirements.txt b/include/requirements.txt index ebd02582..dfcb8601 100644 --- a/include/requirements.txt +++ b/include/requirements.txt @@ -13,7 +13,6 @@ gffutils hisat2 intervalstats ipython -kallisto macs3 multiqc pandas diff --git a/lib/lcdbwf/R/dds.R b/lib/lcdbwf/R/dds.R index ae28ef6f..4e6b46bf 100644 --- a/lib/lcdbwf/R/dds.R +++ b/lib/lcdbwf/R/dds.R @@ -4,7 +4,6 @@ salmon.path.func <- function (x) file.path('..', 'data', 'rnaseq_samples', x, paste0(x, '.salmon'), 
'quant.sf') -kallisto.path.func <- function (x) file.path('..', 'data', 'rnaseq_samples', x, paste0(x, '.salmon'), 'quant.sf') @@ -34,11 +33,10 @@ kallisto.path.func <- function (x) file.path('..', 'data', 'rnaseq_samples', x, #' @param strip_dotted_version If TRUE, then remove Ensembl-style dotted #' version numbers from gene IDs (ENSG000001.1 -> ENSG000001) #' -#' @param salmon_pattern, kallisto_pattern Specify the patterns to locations of -#' Salmon or Kallisto files. Use the special placeholder string +#' @param salmon_pattern Specify the pattern to locations of +#' Salmon files. Use the special placeholder string #' `__SAMPLENAME__` which will be replaced with the sample name. Only -#' relevant if one of config$toggle$salmon or config$toggle$kallisto are -#' TRUE. +#' relevant if config$toggle$salmon is TRUE #' #' @param ... Additional arguments will be passed on to the DESeq() call (e.g., #' parallel, fitType, etc) @@ -48,7 +46,6 @@ make_dds <- function(design_data, config=NULL, collapse_by=NULL, strip_dotted_version=NULL, featureCounts='../data/rnaseq_aggregation/featurecounts.txt', salmon_pattern="../data/rnaseq_samples/__SAMPLENAME__/__SAMPLENAME__.salmon/quant.sf", - kallisto_pattern="../data/rnaseq_samples/__SAMPLENAME__/__SAMPLENAME__.kallisto/abundance.h5", ...){ # Note we're using pluck() here for the convenience of setting defaults @@ -65,33 +62,27 @@ make_dds <- function(design_data, config=NULL, collapse_by=NULL, } location <- purrr::pluck(design_data, 'filename', .default=featureCounts) salmon <- purrr::pluck(design_data, 'salmon') - kallisto <- purrr::pluck(design_data, 'kallisto') subset_counts <- purrr::pluck(design_data, 'subset_counts') sample_func <- purrr::pluck(design_data, 'sample_func', .default=lcdbwf_samplename) # Allow overriding of config values. 
if (!is.null(config)){ if (is.null(salmon)) salmon <- config$toggle$salmon - if (is.null(kallisto)) kallisto <- config$toggle$kallisto if (is.null(collapse_by)) collapse_by <- config$main$collapse_by if (is.null(strip_dotted_version)) strip_dotted_version <- config$main$strip_dotted_version } - if (salmon & kallisto){ - stop("Both salmon and kallisto are set to TRUE, not sure how to handle this.") - } - - if (salmon | kallisto){ + if (salmon) { # If these arguments were provided, the corresponding loading functions # don't accept them so we need to remove. Issue a warning as well. if (!is.null(subset_counts) | !is.null(sample_func)){ - warning("Salmon or Kallisto was specified, but additional arguments ", + warning("Salmon was specified, but additional arguments ", "were provided to the loading function.") subset_counts <- NULL sample_func <- NULL } - # For Salmon and Kallisto, we need a tx2gene dataframe. We can get this + # For Salmon, we need a tx2gene dataframe. We can get this # from a TxDb, which in turn can be retrieved from AnnotationHub, which in # turn can be configured with the config object. Luckily, we have the # config object here! 
@@ -104,12 +95,6 @@ make_dds <- function(design_data, config=NULL, collapse_by=NULL, coldata$salmon.path <- sapply(coldata$samplename, function (x) gsub("__SAMPLENAME__", x, salmon_pattern)) txi <- tximport::tximport(coldata[, 'salmon.path'], type='salmon', tx2gene=tx2gene, ignoreTxVersion=strip_dotted_version) dds <- DESeq2::DESeqDataSetFromTximport(txi, colData=coldata, design=design) - - } else if (kallisto) { - coldata$kallisto.path <- sapply(coldata$samplename, function (x) gsub("__SAMPLENAME__", x, kallisto_pattern)) - txi <- tximport::tximport(coldata[, 'kallisto.path'], type='kallisto', tx2gene=tx2gene, ignoreTxVersion=strip_dotted_version) - dds <- DESeq2::DESeqDataSetFromTximport(txi, colData=coldata, design=design) - } else { dds <- lcdbwf:::DESeqDataSetFromCombinedFeatureCounts( location, diff --git a/test/test_configs/complex-dataset-rnaseq-config.yaml b/test/test_configs/complex-dataset-rnaseq-config.yaml index d4a3ed90..ee7264b8 100644 --- a/test/test_configs/complex-dataset-rnaseq-config.yaml +++ b/test/test_configs/complex-dataset-rnaseq-config.yaml @@ -23,9 +23,6 @@ gtf: salmon: tag: "gencode-v28" -kallisto: - tag: "gencode-v28" - fastq_screen: - label: rRNA organism: human diff --git a/test/test_configs/test_file_uri.yaml b/test/test_configs/test_file_uri.yaml index 571078c6..2315525a 100644 --- a/test/test_configs/test_file_uri.yaml +++ b/test/test_configs/test_file_uri.yaml @@ -24,10 +24,6 @@ gtf: salmon: tag: "test" -kallisto: - tag: "test" - - fastq_screen: - label: test organism: filebased From a999bab442bf5c2a365cd73be97402bf481e2316 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 09:56:27 -0400 Subject: [PATCH 117/196] modify gene patterns test settings --- workflows/rnaseq/downstream/gene-patterns.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/rnaseq/downstream/gene-patterns.Rmd b/workflows/rnaseq/downstream/gene-patterns.Rmd index 4c425102..c46e790c 
100644 --- a/workflows/rnaseq/downstream/gene-patterns.Rmd +++ b/workflows/rnaseq/downstream/gene-patterns.Rmd @@ -100,8 +100,8 @@ col <- NULL # NOTE: This is set very low for test data. Default is 15.--------------------- # Minimum cluster size. -# minc <- 1 # [ TEST SETTINGS +1 ] minc <- 15 +# minc <- 1 # [ enable for test ] # NOTE: This is a very low value used for getting the degPatterns to run ----- low.minc <- 1 From f7996d597b64c17d3e7378c9a0d06210b76d6084 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 10:01:19 -0400 Subject: [PATCH 118/196] sra & strand_check rules directly in respective snakefiles --- workflows/chipseq/Snakefile | 42 +++++++++++ workflows/chipseq/sra.smk | 40 ---------- workflows/rnaseq/Snakefile | 118 +++++++++++++++++++++++++++++- workflows/rnaseq/sra.smk | 40 ---------- workflows/rnaseq/strand_check.smk | 75 ------------------- 5 files changed, 157 insertions(+), 158 deletions(-) delete mode 100644 workflows/chipseq/sra.smk delete mode 100644 workflows/rnaseq/sra.smk delete mode 100644 workflows/rnaseq/strand_check.smk diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 06dcd8cf..82a460f3 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -707,3 +707,45 @@ rule multiqc: "{analysis_directory} " "&> {log} " ) + + +if utils.detect_sra(sampletable): + sampletable["orig_filename"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=1 + ) + + if is_paired: + sampletable["orig_filename_R2"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + sample=SAMPLES, + n=2, + ) + + rule fastq_dump: + output: + fastq=expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + n=n, + allow_missing=True, + ), + log: + "original_data/sra_samples/{sample}/{sample}.fastq.gz.log", + params: + is_paired=is_paired, + # extra="-X 100000", # [enable for test] + resources: + mem="1g", 
+ disk="1g", + runtime="2h", + run: + srr = sampletable.loc[wildcards.sample, "Run"] + extra = params.get("extra", "") + if is_paired: + shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") + shell("mv {srr}_1.fastq.gz {output[0]}") + shell("mv {srr}_2.fastq.gz {output[1]}") + else: + shell( + "fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp" + ) + shell("mv {output[0]}.tmp {output[0]}") diff --git a/workflows/chipseq/sra.smk b/workflows/chipseq/sra.smk deleted file mode 100644 index 5ee5f53b..00000000 --- a/workflows/chipseq/sra.smk +++ /dev/null @@ -1,40 +0,0 @@ -if utils.detect_sra(sampletable): - sampletable["orig_filename"] = expand( - "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=1 - ) - - if is_paired: - sampletable["orig_filename_R2"] = expand( - "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", - sample=SAMPLES, - n=2, - ) - - rule fastq_dump: - output: - fastq=expand( - "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", - n=n, - allow_missing=True, - ), - log: - "original_data/sra_samples/{sample}/{sample}.fastq.gz.log", - params: - is_paired=is_paired, - # extra="-X 100000", # [enable for test] - resources: - mem="1g", - disk="1g", - runtime="2h", - run: - srr = sampletable.loc[wildcards.sample, "Run"] - extra = params.get("extra", "") - if is_paired: - shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") - shell("mv {srr}_1.fastq.gz {output[0]}") - shell("mv {srr}_2.fastq.gz {output[1]}") - else: - shell( - "fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp" - ) - shell("mv {output[0]}.tmp {output[0]}") diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 80284e27..eb4eb884 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -922,7 +922,119 @@ rule multiqc: # Optionally run `snakemake strand_check` to do a preliminary run on # automatically-subset data to evaluate strandedness. 
-include: "strand_check.smk" +rule sample_strand_check: + input: + fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), + index=expand(rules.rrna_index.output, label="genome"), + bed12=rules.conversion_bed12.output, + output: + strandedness="strand_check/{sample}/{sample}.strandedness", + bam=temporary("strand_check/{sample}/{sample}.strandedness.bam"), + bai=temporary("strand_check/{sample}/{sample}.strandedness.bam.bai"), + fastqs=temporary( + expand( + "strand_check/{sample}/{sample}_R{n}.strandedness.fastq", + n=n, + allow_missing=True, + ) + ), + log: + "strand_check/{sample}/{sample}.strandedness.log", + threads: 6 + resources: + mem="8g", + runtime="2h", + run: + prefix = os.path.commonprefix(input.index).rstrip(".") + nreads = int(1e5 * 4) + if is_paired: + shell( + "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" + ) + shell( + "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}" + ) + fastqs = f"-1 {output.fastqs[0]} -2 {output.fastqs[1]} " + else: + shell( + "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" + ) + fastqs = f"-U {output.fastqs[0]} " + shell( + "bowtie2 " + "-x {prefix} " + "{fastqs} " + "--no-unal " + "--threads {threads} 2> {log} " + "| samtools view -Sb - " + "| samtools sort - -o {output.bam} " + ) + shell("samtools index {output.bam}") + shell( + "infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}" + ) + + +rule strand_check: + input: + expand("strand_check/{sample}/{sample}.strandedness", sample=SAMPLES), + output: + html="strand_check/strandedness.html", + filelist=temporary("strand_check/filelist"), + log: + "strand_check/strandedness.log", + resources: + mem="1g", + runtime="2h", + run: + with open(output.filelist, "w") as fout: + for i in input: + fout.write(i + "\n") + shell( + "multiqc " + "--force " + "--module rseqc " + "--file-list {output.filelist} " + "--filename {output.html} &> {log}" + 
) -# If the sampletable is from SRA, handle it here. -include: "sra.smk" +if utils.detect_sra(sampletable): + sampletable["orig_filename"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=1 + ) + + if is_paired: + sampletable["orig_filename_R2"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + sample=SAMPLES, + n=2, + ) + + rule fastq_dump: + output: + fastq=expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + n=n, + allow_missing=True, + ), + log: + "original_data/sra_samples/{sample}/{sample}.fastq.gz.log", + params: + is_paired=is_paired, + # extra="-X 100000", # [enable for test] + resources: + mem="1g", + disk="1g", + runtime="2h", + run: + srr = sampletable.loc[wildcards.sample, "Run"] + extra = params.get("extra", "") + if is_paired: + shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") + shell("mv {srr}_1.fastq.gz {output[0]}") + shell("mv {srr}_2.fastq.gz {output[1]}") + else: + shell( + "fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp" + ) + shell("mv {output[0]}.tmp {output[0]}") diff --git a/workflows/rnaseq/sra.smk b/workflows/rnaseq/sra.smk deleted file mode 100644 index 5ee5f53b..00000000 --- a/workflows/rnaseq/sra.smk +++ /dev/null @@ -1,40 +0,0 @@ -if utils.detect_sra(sampletable): - sampletable["orig_filename"] = expand( - "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=1 - ) - - if is_paired: - sampletable["orig_filename_R2"] = expand( - "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", - sample=SAMPLES, - n=2, - ) - - rule fastq_dump: - output: - fastq=expand( - "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", - n=n, - allow_missing=True, - ), - log: - "original_data/sra_samples/{sample}/{sample}.fastq.gz.log", - params: - is_paired=is_paired, - # extra="-X 100000", # [enable for test] - resources: - mem="1g", - disk="1g", - runtime="2h", - run: - srr = 
sampletable.loc[wildcards.sample, "Run"] - extra = params.get("extra", "") - if is_paired: - shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") - shell("mv {srr}_1.fastq.gz {output[0]}") - shell("mv {srr}_2.fastq.gz {output[1]}") - else: - shell( - "fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp" - ) - shell("mv {output[0]}.tmp {output[0]}") diff --git a/workflows/rnaseq/strand_check.smk b/workflows/rnaseq/strand_check.smk deleted file mode 100644 index bd7c45d4..00000000 --- a/workflows/rnaseq/strand_check.smk +++ /dev/null @@ -1,75 +0,0 @@ -rule sample_strand_check: - input: - fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), - index=expand(rules.rrna_index.output, label="genome"), - bed12=rules.conversion_bed12.output, - output: - strandedness="strand_check/{sample}/{sample}.strandedness", - bam=temporary("strand_check/{sample}/{sample}.strandedness.bam"), - bai=temporary("strand_check/{sample}/{sample}.strandedness.bam.bai"), - fastqs=temporary( - expand( - "strand_check/{sample}/{sample}_R{n}.strandedness.fastq", - n=n, - allow_missing=True, - ) - ), - log: - "strand_check/{sample}/{sample}.strandedness.log", - threads: 6 - resources: - mem="8g", - runtime="2h", - run: - prefix = os.path.commonprefix(input.index).rstrip(".") - nreads = int(1e5 * 4) - if is_paired: - shell( - "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" - ) - shell( - "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}" - ) - fastqs = f"-1 {output.fastqs[0]} -2 {output.fastqs[1]} " - else: - shell( - "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" - ) - fastqs = f"-U {output.fastqs[0]} " - shell( - "bowtie2 " - "-x {prefix} " - "{fastqs} " - "--no-unal " - "--threads {threads} 2> {log} " - "| samtools view -Sb - " - "| samtools sort - -o {output.bam} " - ) - shell("samtools index {output.bam}") - shell( - "infer_experiment.py -r 
{input.bed12} -i {output.bam} > {output} 2> {log}" - ) - - -rule strand_check: - input: - expand("strand_check/{sample}/{sample}.strandedness", sample=SAMPLES), - output: - html="strand_check/strandedness.html", - filelist=temporary("strand_check/filelist"), - log: - "strand_check/strandedness.log", - resources: - mem="1g", - runtime="2h", - run: - with open(output.filelist, "w") as fout: - for i in input: - fout.write(i + "\n") - shell( - "multiqc " - "--force " - "--module rseqc " - "--file-list {output.filelist} " - "--filename {output.html} &> {log}" - ) From ad559319f125e23e7b9ddf072495aebf46fd54b4 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 10:33:34 -0400 Subject: [PATCH 119/196] rm include sra.smk --- workflows/chipseq/Snakefile | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 82a460f3..1393e034 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -39,10 +39,6 @@ rule all: [v["bed"] for k, v in peaks.items()], -# If the sampletable is from SRA, handle it here. -include: "sra.smk" - - rule fasta: output: f"{REFERENCES}/genome.fa.gz", @@ -709,6 +705,7 @@ rule multiqc: ) +# If the sampletable is from SRA, handle it here. 
if utils.detect_sra(sampletable): sampletable["orig_filename"] = expand( "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=1 From 7023a8d3313a4316cc0f7de04a44eab9bfd74932 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 10:34:05 -0400 Subject: [PATCH 120/196] rm *.smk from deployment --- deploy.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/deploy.py b/deploy.py index 11969246..6e98596b 100755 --- a/deploy.py +++ b/deploy.py @@ -82,8 +82,6 @@ def write_include_file(source, flavor="all"): PATTERN_DICT = { "rnaseq": [ "include workflows/rnaseq/Snakefile", - "include workflows/rnaseq/strand_check.smk", - "include workflows/rnaseq/sra.smk", "recursive-include workflows/rnaseq/config *", "include workflows/rnaseq/rnaseq_trackhub.py", "recursive-include workflows/rnaseq/downstream *.Rmd", @@ -91,7 +89,6 @@ def write_include_file(source, flavor="all"): ], "chipseq": [ "include workflows/chipseq/Snakefile", - "include workflows/chipseq/sra.smk", "recursive-include workflows/chipseq/config *", "include workflows/chipseq/chipseq_trackhub.py", ], From 687af4f80f2a8a4305bbd8c7ea0f548a4bda6f99 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 15:21:34 +0000 Subject: [PATCH 121/196] rm strand_check --- docs/config-yaml.rst | 21 ---------- lib/utils.py | 4 +- test/workflow_test_params.yaml | 14 ------- workflows/rnaseq/Snakefile | 77 ---------------------------------- 4 files changed, 1 insertion(+), 115 deletions(-) diff --git a/docs/config-yaml.rst b/docs/config-yaml.rst index f492c70a..ad1d3fb3 100644 --- a/docs/config-yaml.rst +++ b/docs/config-yaml.rst @@ -343,27 +343,6 @@ Required for RNA-seq Rules that require information about strand will check the config file at run time and raise an error if this field doesn't exist. 
- If you don't know the strandedness of the library, run the Snakefile in - such a way to only run the ``strand_check`` rule: - - .. code-block:: bash - - snakemake -j 2 strand_check - - Or, when using the Slurm wrapper on cluster, - - .. code-block:: bash - - sbatch ../../include/WRAPPER_SLURM strand_check - - When complete, there will be a MultiQC HTML file in the ``strand_check/`` - directory that you can inspect to make your choice. - - This will align the first 10,000 reads to the specified reference and run - RSeQC's ``infer_experiment.py`` on the results and then run MultiQC on just - those output files. - - .. versionadded:: 1.8 Optional fields ~~~~~~~~~~~~~~~ diff --git a/lib/utils.py b/lib/utils.py index 0e5cc9e2..c7e4bf7b 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -712,9 +712,7 @@ def strand_arg_lookup(config, lookup): raise ConfigurationError( "Starting in v1.8, 'stranded' is required in the config file. " "Values can be 'unstranded', 'fr-firststrand' (R1 aligns antisense to original transcript), " - "or 'fr-secondstrand' (R1 aligns sense to original transcript). If you are not sure, " - "run the workflow with only the 'strand_check' rule, like " - "'snakemake -j 5 strand_check'." + "or 'fr-secondstrand' (R1 aligns sense to original transcript)." ) if config.stranded not in lookup: keys = list(lookup.keys()) diff --git a/test/workflow_test_params.yaml b/test/workflow_test_params.yaml index 5d74fac9..2255cdda 100644 --- a/test/workflow_test_params.yaml +++ b/test/workflow_test_params.yaml @@ -27,20 +27,6 @@ rnaseq: --configfile __ORIG__/test/test_configs/test_rnaseq_config.yaml --config sampletable=__ORIG__/test/test_configs/test_sra_sampletable_SE_only.tsv - strandedness-pe: - desc: Tests running the strandedness pre-check using paired-end data. 
- args: | - --until strand_check - --configfile __ORIG__/test/test_configs/test_rnaseq_config.yaml - --config sampletable=__ORIG__/test/test_configs/test_pe_sampletable.tsv - - strandedness-se: - desc: Tests running the strandedness pre-check using single-ended data. - args: | - --until strand_check - --configfile __ORIG__/test/test_configs/test_rnaseq_config.yaml - --config sampletable=__ORIG__/test/test_configs/two_samples.tsv - star-2pass: desc: Tests running STAR in 2-pass mode. Only runs until the star_pass2 rule. args: | diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index eb4eb884..10be673d 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -920,83 +920,6 @@ rule multiqc: "&> {log} " ) -# Optionally run `snakemake strand_check` to do a preliminary run on -# automatically-subset data to evaluate strandedness. -rule sample_strand_check: - input: - fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), - index=expand(rules.rrna_index.output, label="genome"), - bed12=rules.conversion_bed12.output, - output: - strandedness="strand_check/{sample}/{sample}.strandedness", - bam=temporary("strand_check/{sample}/{sample}.strandedness.bam"), - bai=temporary("strand_check/{sample}/{sample}.strandedness.bam.bai"), - fastqs=temporary( - expand( - "strand_check/{sample}/{sample}_R{n}.strandedness.fastq", - n=n, - allow_missing=True, - ) - ), - log: - "strand_check/{sample}/{sample}.strandedness.log", - threads: 6 - resources: - mem="8g", - runtime="2h", - run: - prefix = os.path.commonprefix(input.index).rstrip(".") - nreads = int(1e5 * 4) - if is_paired: - shell( - "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" - ) - shell( - "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}" - ) - fastqs = f"-1 {output.fastqs[0]} -2 {output.fastqs[1]} " - else: - shell( - "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" - ) 
- fastqs = f"-U {output.fastqs[0]} " - shell( - "bowtie2 " - "-x {prefix} " - "{fastqs} " - "--no-unal " - "--threads {threads} 2> {log} " - "| samtools view -Sb - " - "| samtools sort - -o {output.bam} " - ) - shell("samtools index {output.bam}") - shell( - "infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}" - ) - - -rule strand_check: - input: - expand("strand_check/{sample}/{sample}.strandedness", sample=SAMPLES), - output: - html="strand_check/strandedness.html", - filelist=temporary("strand_check/filelist"), - log: - "strand_check/strandedness.log", - resources: - mem="1g", - runtime="2h", - run: - with open(output.filelist, "w") as fout: - for i in input: - fout.write(i + "\n") - shell( - "multiqc " - "--force " - "--module rseqc " - "--file-list {output.filelist} " - "--filename {output.html} &> {log}" - ) if utils.detect_sra(sampletable): sampletable["orig_filename"] = expand( From 1e56148e1d63976a848fe3982a8871b5da0edee9 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 12:59:38 -0400 Subject: [PATCH 122/196] rm more strandedness --- .circleci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index bedc4c18..5959188e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -262,7 +262,6 @@ variables: # configs from the original clone rather than the deployed directory. 
$DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-pe -k -p -j2 --use-conda --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-se -k -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --strandedness-pe -k -p -j2 --use-conda --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-2pass -k -p -j2 --use-conda --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-1pass -k -p -j2 --use-conda --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --pe -k -p -j2 --use-conda --orig $ORIG From a9d467709712cf67d889a86ae1743d71d9eac949 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 15:23:12 -0400 Subject: [PATCH 123/196] rm now-irrelvant tests --- test/workflow_test_params.yaml | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/test/workflow_test_params.yaml b/test/workflow_test_params.yaml index 2255cdda..5c483e59 100644 --- a/test/workflow_test_params.yaml +++ b/test/workflow_test_params.yaml @@ -1,4 +1,5 @@ # This file configures arguments for running various workflows that are pulled +# # into the test/lcdb-wf-test runner script automatically. It is a way of # # NOTE: @@ -27,22 +28,6 @@ rnaseq: --configfile __ORIG__/test/test_configs/test_rnaseq_config.yaml --config sampletable=__ORIG__/test/test_configs/test_sra_sampletable_SE_only.tsv - star-2pass: - desc: Tests running STAR in 2-pass mode. Only runs until the star_pass2 rule. 
- args: | - --until star_pass2 - --configfile __ORIG__/test/test_configs/test_rnaseq_config.yaml - --config sampletable=__ORIG__/test/test_configs/star_2pass.tsv - --config aligner="star-twopass" - - hisat2: - desc: Tests running HISAT2 - args: | - --until hisat2 - --configfile __ORIG__/test/test_configs/test_rnaseq_config.yaml - --config sampletable=__ORIG__/test/test_configs/hisat2.tsv - --config aligner=hisat2 - pe: desc: Tests paired-end data args: | From 0b2b3ce946f99c9aa425faaaa2717c19d1e0f3e9 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 16:04:25 -0400 Subject: [PATCH 124/196] rm more star 2pass tests --- .circleci/config.yml | 1 - test/lcdb-wf-test | 2 -- test/test_configs/star_2pass.tsv | 3 --- 3 files changed, 6 deletions(-) delete mode 100644 test/test_configs/star_2pass.tsv diff --git a/.circleci/config.yml b/.circleci/config.yml index 5959188e..8195050c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -262,7 +262,6 @@ variables: # configs from the original clone rather than the deployed directory. 
$DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-pe -k -p -j2 --use-conda --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-se -k -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-2pass -k -p -j2 --use-conda --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-1pass -k -p -j2 --use-conda --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --pe -k -p -j2 --use-conda --orig $ORIG diff --git a/test/lcdb-wf-test b/test/lcdb-wf-test index 21f6978c..8e8525fb 100755 --- a/test/lcdb-wf-test +++ b/test/lcdb-wf-test @@ -141,8 +141,6 @@ class Runner(object): %(prog)s rnaseq --run-workflow --sra-se %(prog)s rnaseq --run-workflow --strandedness-pe %(prog)s rnaseq --run-workflow --strandedness-se - %(prog)s rnaseq --run-workflow --star-2pass - %(prog)s rnaseq --run-workflow --hisat2 %(prog)s rnaseq --run-workflow --pe # Since there are a lot of parameters here, see diff --git a/test/test_configs/star_2pass.tsv b/test/test_configs/star_2pass.tsv deleted file mode 100644 index 8cf98eb0..00000000 --- a/test/test_configs/star_2pass.tsv +++ /dev/null @@ -1,3 +0,0 @@ -samplename group layout orig_filename -sample1-star-2pass control SE data/example_data/rnaseq_sample1PE_1.fq.gz -sample2-star-2pass control SE data/example_data/rnaseq_sample2.fq.gz From 9902f642a113e0f68345a00777d59bf8a915cc46 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 19:50:25 -0400 Subject: [PATCH 125/196] rm another test --- .circleci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 8195050c..bee8727d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -262,7 +262,6 @@ variables: # configs from the original clone rather than the deployed directory. 
$DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-pe -k -p -j2 --use-conda --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-se -k -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-1pass -k -p -j2 --use-conda --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --pe -k -p -j2 --use-conda --orig $ORIG From 8ffc2bab2778f5988761a469d0beb19576237495 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 10 Oct 2025 20:12:58 -0400 Subject: [PATCH 126/196] decision log on references --- docs/decisions.rst | 162 +++++++++++++++++++++++++-------------------- 1 file changed, 89 insertions(+), 73 deletions(-) diff --git a/docs/decisions.rst b/docs/decisions.rst index 74ddb01f..5dcb3460 100644 --- a/docs/decisions.rst +++ b/docs/decisions.rst @@ -7,82 +7,98 @@ References ---------- Here are use-cases we have that are common enough to warrant supporting: -- References should support multiple workflows (ChIP-seq, RNA-seq, etc) - - This implies that the means the references dir should be in the - ``workflows`` directory or above. - - For example, this may mean a STAR index for RNA-seq, a bowtie2 index for - rRNA contamination, and another bowtie2 index for ChIP-seq. - -- References should support different organisms in different workflows. There - should beo only one organism per workflow though. - -- References should be re-created for each project. - - What we've found is that if we have a central location for the references - (shared by multiple deployments of lcdb-wf over the years) then we get - conflicts where one deployment's aligner version is more recent, causing - errors when using the index for an older version. - - To keep using this, we'd need to version indexes based on aligner version. 
- - However, when writing up methods for a paper we need to be able to trace - back what commands were run to generate the reference, including additional - patching that may have taken place (as is supported by the references - workflow). - - Re-using indexes is space- and time-efficient in the short term, but has - shown to be inefficient in time and reproducibility in the long term. - - Keeping everything in the same deployment director also helps with the - archiving process. - -Naming: - -- Top level should be organsim. Doesn't really matter in the case of - a single-organism workflow. -- Next should be what has historically been called "tag". This could be the - assembly name for genomic indexes, or some combination of assembly - + annotation for transcriptome. -- If we're assuming "deployment-local" references, these no longer have to be - globally unique. If we have a mouse reference with a transgene, we can just - call it "mouse/mm39" but have the transgene patched into it, and not worry - about conflicting (or worse, overwriting!) a central reference with the same - name that didn't have the transgene. -- Fasta files are included next to their respective index. - -This example uses the ``dmel`` organism and ``test`` tag which is configured by -default for tests. - -This uses ``$ORG/$TAG//$TOOL`` as the path -template. This lets us keep the fastq file used for building the various -indexes alongside the indexes. +**References should support multiple workflows (ChIP-seq, RNA-seq, etc)** + +- This implies that the means the references dir should be in the ``workflows`` + directory or above. +- For example, this may mean a STAR index for RNA-seq, a bowtie2 index for rRNA + contamination, and another bowtie2 index for ChIP-seq. + +**References should support different organisms in different workflows. 
There +should be only one organism per workflow though.** + +- For example, ``workflows/mouse-rnaseq`` and ``workflows/human-rnaseq`` should + be supported in the same project. + + +**References should be re-created for each project.** + +- Historically we had a central location for the references (shared by multiple + deployments of lcdb-wf over the years) but we got conflicts where one + deployment's aligner version was more recent, causing errors when using the + index for an older version. +- To keep using this, we'd need to version indexes based on aligner version. +- However, when writing up methods for a paper we need to be able to trace + back what commands were run to generate the reference, including additional + patching that may have taken place (as is supported by the references + workflow). +- Re-using indexes is space- and time-efficient in the short term, but has + shown to be inefficient in time and reproducibility in the long term. +- Keeping everything in the same deployment directory also helps with the + archiving process. +- We were hesitant to update the references in the central location due to + being unsure of what was depending on them. +- Overall, making the decision that the time and space cost to re-make + references for each project is worth the gain in simplicity and isolation. + +Reference nomenclature and directory structure +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Options considered: + +1. ``references`` (top-level of project, shared by all workflows) +2. ``workflows//references`` (workflow-specific) + +The location ``workflows/references`` is functionally similar to top-level +``references`` (in a parent directory of individual workflows) but references +is no longer a workflow so it doesn't make sense to have it right in the +``workflows`` directory. + +Recall that in lcdb-wf <2.0, we have organism and then tag. 
For example, we +might have configurations available for different human genome assemblies +(hg19, hg38) and in the central location we needed to differentiate between +them (e.g. ``references/human/hg19/``). + +If we assume a single organism per workflow, and that the references are +workflow-specific, then we don't need any of this. +``workflows//references/genome.fa`` for example should cover it. + +This becomes inefficient in the case where there are multiple workflows, all +for the same organism and all the same workflow type. However in such cases, +manually creating symlinks can get around this, and I think it's an acceptable +workaround for the benefit of simplified references more generally. :: - references_data/ - ├── dmel - ├── rRNA - │ └── genome - │ ├── bowtie2 - │ │ └── dmel_rRNA.* - │ └── dmel_rRNA.fasta - └── test - ├── annotation - │ ├── dmel_test.bed12 - │ ├── dmel_test.gtf - │ └── dmel_test.refflat - ├── genome - │ ├── bowtie2 - │ │ └── dmel_test.* - │ ├── star - │ │ └── dmel_test - │ │ └── - │ ├── dmel_test.chromsizes - │ ├── dmel_test.fasta - │ ├── dmel_test.fasta.fai - └── transcriptome - ├── kallisto - │ └── dmel_test - │ └── transcripts.idx - ├── salmon - │ └── dmel_test - │ └── - └── dmel_test.fasta + workflows/rnaseq/references + genome.fasta + genome.chromsizes + rrna.fasta + annotation.gtf + annotation.bed12 + annotation.refflat + transcriptome.fasta + star/ + genome.fasta + + bowtie2/ + rrna.fasta + + salmon/ + transcriptome.fasta + + +For ChIP-seq: + +:: + + workflows/chipseq/references + genome.fasta + genome.chromsizes + bowtie2/ + genome.fasta + + Params ------ From 82f91a9517d5bddf0b6924562e13460540b56a63 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 10 Oct 2025 20:15:05 -0400 Subject: [PATCH 127/196] config file cleanup --- workflows/chipseq/config/config.yaml | 46 ---------------------------- workflows/rnaseq/config/config.yaml | 17 +++++----- 2 files changed, 7 insertions(+), 56 
deletions(-) diff --git a/workflows/chipseq/config/config.yaml b/workflows/chipseq/config/config.yaml index d35898d2..268dcf59 100644 --- a/workflows/chipseq/config/config.yaml +++ b/workflows/chipseq/config/config.yaml @@ -1,28 +1,5 @@ -# NOTE: all paths are relative to the calling Snakefile. -# -# sampletable: TSV file defining sample metadata. -# First column must have header name "samplename". sampletable: 'config/sampletable.tsv' -# Which key in the `references` dict below to use -organism: 'dmel' - -# What reference genome used -# Check the assembly in https://www.ncbi.nlm.nih.gov/datasets/genome/ -# options: -# - 'mm10' for mouse -# - 'hg38' or 'hg19' for human -# - 'dm6' for drosophila -# - 'danRer11' for zebrafish -# - 'sacCer3' for yeast -# - 'rn6' for rat -# genome: 'dm6' - -# If not specified here, use the environment variable REFERENCES_DIR. -references_dir: 'references_data' - -peaks_dir: 'data/chipseq_peaks' - fasta: url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" postprocess: 'lib.utils.gzipped' @@ -70,7 +47,6 @@ chipseq: - input-wingdisc-2 extra: '--nomodel --extsize 147' - - label: gaf-wingdisc-pooled-1 algorithm: epic2 ip: @@ -94,25 +70,3 @@ chipseq: - gaf-wingdisc-2 control: [] extra: '' - -fastq_screen: - - label: rRNA - organism: dmel - tag: test - - label: Fly - organism: dmel - tag: test - -merged_bigwigs: - input-wingdisc: - - input-wingdisc-1 - - input-wingdisc-2 - gaf-wingdisc: - - gaf-wingdisc-1 - - gaf-wingdisc-2 - gaf-embryo: - - gaf-embryo-1 - -aligner: - index: 'bowtie2' - tag: 'test' diff --git a/workflows/rnaseq/config/config.yaml b/workflows/rnaseq/config/config.yaml index 26f5aba9..657c92a4 100644 --- a/workflows/rnaseq/config/config.yaml +++ b/workflows/rnaseq/config/config.yaml @@ -1,3 +1,10 @@ +sampletable: 'config/sampletable.tsv' + +# See https://rnabio.org/module-09-appendix/0009/12/01/StrandSettings for more info. 
+stranded: 'fr-firststrand' # for dUTP libraries +# 'fr-secondstrand' # for ligation libraries +# 'unstranded' # for libraries without strand specificity + fasta: url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" postprocess: 'lib.utils.gzipped' @@ -13,13 +20,3 @@ rrna: postprocess: function: 'lib.utils.filter_fastas' args: 'Drosophila melanogaster' - - -sampletable: 'config/sampletable.tsv' - -patterns: 'config/rnaseq_patterns.yaml' - -# See https://rnabio.org/module-09-appendix/0009/12/01/StrandSettings for more info. -stranded: 'fr-firststrand' # for dUTP libraries -# 'fr-secondstrand' # for ligation libraries -# 'unstranded' # for libraries without strand specificity From c89b1eb4a1c39df8fababa054f6c45eb02f99c5c Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 11 Oct 2025 00:38:52 +0000 Subject: [PATCH 128/196] simplify references for rnaseq --- workflows/rnaseq/Snakefile | 75 +++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 10be673d..2ac4a01b 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -10,7 +10,6 @@ from lib import utils configfile: "config/config.yaml" -REFERENCES = config.get("reference_dir", "../../references") sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" @@ -55,9 +54,9 @@ rule symlinks: rule fasta: output: - f"{REFERENCES}/genome.fa.gz", + "references/genome.fa.gz", log: - f"{REFERENCES}/logs/genome.fa.gz.log", + "references/logs/genome.fa.gz.log", resources: mem_mb="4g", runtime="2h", @@ -75,9 +74,9 @@ rule fasta: rule gtf: output: - f"{REFERENCES}/annotation.gtf.gz", + "references/annotation.gtf.gz", log: - f"{REFERENCES}/logs/annotation.gtf.gz.log", + 
"references/logs/annotation.gtf.gz.log", resources: mem="4g", runtime="2h", @@ -95,9 +94,9 @@ rule gtf: rule rrna_fasta: output: - f"{REFERENCES}/rrna.fa.gz", + "references/rrna.fa.gz", log: - f"{REFERENCES}/logs/rrna.fa.log", + "references/logs/rrna.fa.log", resources: mem="4g", runtime="2h", @@ -115,9 +114,9 @@ rule rrna_fasta: rule unzip: input: - f"{REFERENCES}/{{prefix}}.gz", + "references/{prefix}.gz", output: - f"{REFERENCES}/{{prefix}}", + "references/{prefix}", resources: mem="4g", runtime="2h", @@ -127,12 +126,12 @@ rule unzip: rule rrna_index: input: - f"{REFERENCES}/rrna.fa", + "references/rrna.fa", output: - f"{REFERENCES}/bowtie2/rrna.1.bt2", - f"{REFERENCES}/bowtie2/rrna.fa", + "references/bowtie2/rrna.1.bt2", + "references/bowtie2/rrna.fa", log: - f"{REFERENCES}/logs/bowtie2_rrna.log", + "references/logs/bowtie2_rrna.log", resources: mem="32g", disk="50g", @@ -151,12 +150,12 @@ rule rrna_index: rule star_index: input: - fasta=f"{REFERENCES}/genome.fa", - gtf=f"{REFERENCES}/annotation.gtf", + fasta="references/genome.fa", + gtf="references/annotation.gtf", output: - f"{REFERENCES}/star/Genome", + "references/star/Genome", log: - f"{REFERENCES}/logs/star.log", + "references/logs/star.log", threads: 8 resources: mem="64g", @@ -189,10 +188,10 @@ rule star_index: rule transcriptome_fasta: input: - fasta=f"{REFERENCES}/genome.fa", - gtf=f"{REFERENCES}/annotation.gtf", + fasta="references/genome.fa", + gtf="references/annotation.gtf", output: - f"{REFERENCES}/transcriptome.fa", + "references/transcriptome.fa", resources: mem="4g", runtime="2h", @@ -202,13 +201,13 @@ rule transcriptome_fasta: rule salmon_index: input: - f"{REFERENCES}/transcriptome.fa", + "references/transcriptome.fa", output: - f"{REFERENCES}/salmon/versionInfo.json", + "references/salmon/versionInfo.json", log: - f"{REFERENCES}/logs/salmon.log", + "references/logs/salmon.log", params: - outdir=f"{REFERENCES}/salmon", + outdir="references/salmon", resources: mem="32g", runtime="2h", @@ 
-224,11 +223,11 @@ rule salmon_index: rule conversion_refflat: input: - f"{REFERENCES}/annotation.gtf", + "references/annotation.gtf", output: - f"{REFERENCES}/annotation.refflat", + "references/annotation.refflat", log: - f"{REFERENCES}/logs/annotation.refflat.log", + "references/logs/annotation.refflat.log", resources: mem="2g", runtime="2h", @@ -240,9 +239,9 @@ rule conversion_refflat: rule conversion_bed12: input: - f"{REFERENCES}/annotation.gtf", + "references/annotation.gtf", output: - f"{REFERENCES}/annotation.bed12", + "references/annotation.bed12", resources: mem="2g", runtime="2h", @@ -254,11 +253,11 @@ rule conversion_bed12: rule chromsizes: input: - f"{REFERENCES}/genome.fa.gz", + "references/genome.fa.gz", output: - f"{REFERENCES}/genome.chromsizes", + "references/genome.chromsizes", log: - f"{REFERENCES}/logs/genome.chromsizes.log", + "references/logs/genome.chromsizes.log", params: java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g' # [enable for test] @@ -283,9 +282,9 @@ rule mappings: Creates gzipped TSV mapping between attributes in the GTF. 
""" input: - gtf=f"{REFERENCES}/annotation.gtf", + gtf="references/annotation.gtf", output: - f"{REFERENCES}/annotation.mapping.tsv.gz", + "references/annotation.mapping.tsv.gz", params: include_featuretypes=lambda wildcards, output: conversion_kwargs[ output[0] @@ -412,7 +411,7 @@ rule star: input: fastq=rules.cutadapt.output, index=rules.star_index.output, - annotation=f"{REFERENCES}/annotation.gtf", + annotation="references/annotation.gtf", output: bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), sjout=temporary( @@ -474,7 +473,7 @@ rule star: rule rRNA: input: fastq="data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz", - index=f"{REFERENCES}/bowtie2/rrna.1.bt2", + index="references/bowtie2/rrna.1.bt2", output: bam="data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam", log: @@ -569,7 +568,7 @@ rule markduplicates: rule featurecounts: input: - annotation=f"{REFERENCES}/annotation.gtf", + annotation="references/annotation.gtf", bam=rules.markduplicates.output.bam, output: "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt", @@ -703,7 +702,7 @@ rule preseq: rule salmon: input: fastq=rules.cutadapt.output, - index=REFERENCES + "/salmon/versionInfo.json", + index="references/salmon/versionInfo.json", output: "data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf", log: From 7fa4b69dda381b8dbfe3870535804847f281c2bc Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 11 Oct 2025 00:43:37 +0000 Subject: [PATCH 129/196] simplify references for chipseq --- workflows/chipseq/Snakefile | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 1393e034..08aa75b9 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -11,7 +11,6 @@ from lib import chipseq configfile: "config/config.yaml" -REFERENCES = config.get("reference_dir", "../../references") sampletable = 
pd.read_table(config["sampletable"], sep="\t", comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" @@ -41,9 +40,9 @@ rule all: rule fasta: output: - f"{REFERENCES}/genome.fa.gz", + "references/genome.fa.gz", log: - f"{REFERENCES}/logs/genome.fa.gz.log", + "references/logs/genome.fa.gz.log", resources: mem_mb="4g", runtime="2h", @@ -61,11 +60,11 @@ rule fasta: rule chromsizes: input: - f"{REFERENCES}/genome.fa.gz", + "references/genome.fa.gz", output: - f"{REFERENCES}/genome.chromsizes", + "references/genome.chromsizes", log: - f"{REFERENCES}/logs/genome.chromsizes.log", + "references/logs/genome.chromsizes.log", params: java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g' # [enable for test] @@ -87,9 +86,9 @@ rule chromsizes: rule unzip: input: - f"{REFERENCES}/{{prefix}}{{ext}}.gz", + "references/{prefix}{ext}.gz", output: - f"{REFERENCES}/{{prefix}}{{ext}}", + "references/{prefix}{ext}", resources: mem="4g", runtime="2h", @@ -99,12 +98,12 @@ rule unzip: rule bowtie2_index: input: - f"{REFERENCES}/genome.fa", + "references/genome.fa", output: - f"{REFERENCES}/bowtie2/genome.1.bt2", - f"{REFERENCES}/bowtie2/genome.fa", + "references/bowtie2/genome.1.bt2", + "references/bowtie2/genome.fa", log: - f"{REFERENCES}/logs/bowtie2_genome.log", + "references/logs/bowtie2_genome.log", resources: mem="32g", disk="50g", @@ -235,7 +234,7 @@ rule bowtie2: input: fastq=expand( "data/chipseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz", n=n, allow_missing=True), - index=f"{REFERENCES}/bowtie2/genome.1.bt2", + index="references/bowtie2/genome.1.bt2", output: bam=temporary("data/chipseq_samples/{sample}/{sample}.cutadapt.bam"), log: From 2989fcd609288e4d781911ac7ac3f60e60713fe2 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 11 Oct 2025 00:51:29 +0000 Subject: [PATCH 130/196] snakefmt on rnaseq --- workflows/rnaseq/Snakefile | 46 
+++++++++++++------------------------- 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 2ac4a01b..6f5d3153 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -62,7 +62,7 @@ rule fasta: runtime="2h", params: urls=config["fasta"]["url"], - postprocess=config["fasta"].get("postprocess", None) + postprocess=config["fasta"].get("postprocess", None), run: utils.download_and_postprocess( urls=params.urls, @@ -82,7 +82,7 @@ rule gtf: runtime="2h", params: urls=config["gtf"]["url"], - postprocess=config["gtf"].get("postprocess", None) + postprocess=config["gtf"].get("postprocess", None), run: utils.download_and_postprocess( urls=params.urls, @@ -102,7 +102,7 @@ rule rrna_fasta: runtime="2h", params: urls=config["rrna"]["url"], - postprocess=config["rrna"].get("postprocess", None) + postprocess=config["rrna"].get("postprocess", None), run: utils.download_and_postprocess( urls=params.urls, @@ -139,12 +139,7 @@ rule rrna_index: threads: 8 run: prefix = subpath(output[0], strip_suffix=".1.bt2") - shell( - "bowtie2-build " - "--threads {threads} " - "{input} " - "{prefix} &> {log}" - ) + shell("bowtie2-build --threads {threads} {input} {prefix} &> {log}") utils.make_relative_symlink(input[0], output[-1]) @@ -213,12 +208,7 @@ rule salmon_index: runtime="2h", run: outdir = os.path.dirname(output[0]) - shell( - "salmon index " - "--transcripts {input} " - "--index {outdir} " - "&> {log}" - ) + shell("salmon index --transcripts {input} --index {outdir} &> {log}") rule conversion_refflat: @@ -259,8 +249,8 @@ rule chromsizes: log: "references/logs/genome.chromsizes.log", params: - java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g' # [enable for test] + java_args="-Xmx20g", # [disable for test] resources: mem="24g", runtime="2h", @@ -294,6 +284,7 @@ rule mappings: runtime="2h", run: import gffutils + gffutils.constants.always_return_list = False include_featuretypes = 
params.include_featuretypes @@ -382,8 +373,6 @@ rule fastqc: log: "data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.log", run: - # Calculate the paths FastQC will create so we can move them to - # specified output files if needed. outdir = os.path.dirname(output.html) or "." outfile = os.path.basename(input[0]) for s in [".fastq", ".fq", ".gz", ".bam"]: @@ -442,7 +431,6 @@ rule star: "{tmpdir_arg} " "--outSAMtype BAM SortedByCoordinate " "--outStd BAM_SortedByCoordinate > {output.bam} " - # NOTE: The STAR docs indicate that the following parameters are # standard options for ENCODE long-RNA-seq pipeline. Comments are from # the STAR docs. @@ -462,12 +450,9 @@ rule star: # move various hard-coded log files to log directory logfiles = expand( prefix + "{ext}", - ext=["Log.progress.out", "Log.out", "Log.final.out", "Log.std.out"] - ) - shell( - "mkdir -p {outdir}/star_logs " - "&& mv {logfiles} {outdir}/star_logs" + ext=["Log.progress.out", "Log.out", "Log.final.out", "Log.std.out"], ) + shell("mkdir -p {outdir}/star_logs && mv {logfiles} {outdir}/star_logs") rule rRNA: @@ -482,7 +467,6 @@ rule rRNA: resources: mem="2g", runtime="2h", - params: run: prefix = subpath(input.index, strip_suffix=".1.bt2") shell( @@ -493,13 +477,15 @@ rule rRNA: "--no-unal " "-k 1 " "-S {output.bam}.sam " - "> {log} 2>&1 ") + "> {log} 2>&1 " + ) shell( "samtools view -Sb {output.bam}.sam " "| samtools sort -O BAM - -o {output.bam}" ) shell("rm {output.bam}.sam") + rule fastq_count: input: fastq="{sample_dir}/{sample}/{sample}{suffix}.fastq.gz", @@ -553,8 +539,8 @@ rule markduplicates: runtime="2h", disk="100g", params: - java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g' # [enable for test] + java_args="-Xmx20g", # [disable for test] shell: "picard " "{params.java_args} " @@ -610,7 +596,7 @@ rule aggregate_featurecounts: threads: 1 resources: mem="8g", - runtime="1h" + runtime="1h", run: for i, file in enumerate(input): df = pd.read_csv(file, sep="\t", comment="#") 
@@ -659,8 +645,8 @@ rule collectrnaseqmetrics: mem="32g", runtime="2h", params: + # java_args='-Xmx2g', # [enable for test] java_args="-Xmx20g", # [disable for test] - # java_args='-Xmx2g', # [enable for test] strand_arg={ "unstranded": "STRAND=NONE ", "fr-firststrand": "STRAND=SECOND_READ_TRANSCRIPTION_STRAND ", @@ -942,8 +928,8 @@ if utils.detect_sra(sampletable): log: "original_data/sra_samples/{sample}/{sample}.fastq.gz.log", params: - is_paired=is_paired, # extra="-X 100000", # [enable for test] + is_paired=is_paired, resources: mem="1g", disk="1g", From 288c7077029c8376c71d42f42ebf6fdd7cea846a Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 11 Oct 2025 00:56:07 +0000 Subject: [PATCH 131/196] snakefmt on chipseq --- workflows/chipseq/Snakefile | 98 ++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 46 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 08aa75b9..46a37a5e 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -34,7 +34,10 @@ localrules: rule all: input: "data/chipseq_aggregation/multiqc.html", - expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", label=LABELS), + expand( + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", + label=LABELS, + ), [v["bed"] for k, v in peaks.items()], @@ -48,7 +51,7 @@ rule fasta: runtime="2h", params: urls=config["fasta"]["url"], - postprocess=config["fasta"].get("postprocess", None) + postprocess=config["fasta"].get("postprocess", None), run: utils.download_and_postprocess( urls=params.urls, @@ -66,8 +69,8 @@ rule chromsizes: log: "references/logs/genome.chromsizes.log", params: - java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g' # [enable for test] + java_args="-Xmx20g", # [disable for test] resources: mem="24g", runtime="2h", @@ -111,12 +114,7 @@ rule bowtie2_index: threads: 8 run: prefix = subpath(output[0], 
strip_suffix=".1.bt2") - shell( - "bowtie2-build " - "--threads {threads} " - "{input} " - "{prefix} &> {log}" - ) + shell("bowtie2-build --threads {threads} {input} {prefix} &> {log}") utils.make_relative_symlink(input[0], output[-1]) @@ -128,8 +126,11 @@ rule symlinks: else sampletable.loc[wc.sample, ["orig_filename"]] ), output: - expand("data/chipseq_samples/{sample}/{sample}_R{n}.fastq.gz", n=n, - allow_missing=True), + expand( + "data/chipseq_samples/{sample}/{sample}_R{n}.fastq.gz", + n=n, + allow_missing=True, + ), threads: 1 resources: mem="1g", @@ -151,11 +152,15 @@ rule cutadapt: input: fastq=expand( "data/chipseq_samples/{sample}/{sample}_R{n}.fastq.gz", - n=n, allow_missing=True), + n=n, + allow_missing=True, + ), output: fastq=expand( "data/chipseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz", - n=n, allow_missing=True), + n=n, + allow_missing=True, + ), log: "data/chipseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", threads: 6 @@ -192,7 +197,6 @@ rule cutadapt: ) - rule fastqc: input: "data/chipseq_samples/{sample}/{sample}{suffix}", @@ -206,8 +210,6 @@ rule fastqc: log: "data/chipseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.log", run: - # Calculate the paths FastQC will create so we can move them to - # specified output files if needed. outdir = os.path.dirname(output.html) or "." 
outfile = os.path.basename(input[0]) for s in [".fastq", ".fq", ".gz", ".bam"]: @@ -233,7 +235,10 @@ rule fastqc: rule bowtie2: input: fastq=expand( - "data/chipseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz", n=n, allow_missing=True), + "data/chipseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz", + n=n, + allow_missing=True, + ), index="references/bowtie2/genome.1.bt2", output: bam=temporary("data/chipseq_samples/{sample}/{sample}.cutadapt.bam"), @@ -258,7 +263,8 @@ rule bowtie2: "--threads {threads} " "--no-unal " "-S {output.bam}.sam " - "> {log} 2>&1 ") + "> {log} 2>&1 " + ) shell( "samtools view -Sb {output.bam}.sam " "| samtools sort -O BAM - -o {output.bam}" @@ -275,13 +281,9 @@ rule unique: resources: mem="1g", runtime="2h", - params: shell: "samtools view " "-b " - # NOTE: the quality score chosen here should reflect the scores output - # by the aligner used. For example, STAR uses 255 as max mapping - # quality. "-q 20 " "{input} " "> {output}" @@ -331,17 +333,17 @@ rule markduplicates: bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.bam", output: bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam", - metrics="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.metrics" + metrics="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.metrics", log: - "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.log" + "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.log", threads: 1 resources: mem="32g", disk="100g", runtime="2h", params: - java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g' # [enable for test] + java_args="-Xmx20g", # [disable for test] shell: "picard " "{params.java_args} " @@ -364,15 +366,15 @@ rule merge_techreps: bam="data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", metrics="data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.metrics", log: - 
"data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.log" + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.log", threads: 1 resources: mem="32g", disk="100g", runtime="2h", params: - java_args="-Xmx32g", # [disable for test] # java_args='-Xmx2g' # [enable for test] + java_args="-Xmx32g", # [disable for test] script: "../../scripts/merge_and_dedup.py" @@ -386,14 +388,14 @@ if is_paired: pdf="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.pdf", metrics="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics", log: - "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics.log" + "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics.log", threads: 1 resources: mem="32g", runtime="2h", params: - java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g' # [enable for test] + java_args="-Xmx20g", # [disable for test] shell: "picard " "{params.java_args} " @@ -424,20 +426,24 @@ rule bigwig: "--minMappingQuality 20 " "--ignoreDuplicates " "--extendReads 300 " - "--normalizeUsing CPM " # [disable for test] + "--normalizeUsing CPM " "&> {log}" rule fingerprint: input: - bams=lambda wc: expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=wc.ip_label), + bams=lambda wc: expand( + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", + label=wc.ip_label, + ), control=lambda wc: expand( "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=chipseq.merged_input_for_ip(sampletable, wc.ip_label), ), bais=lambda wc: expand( "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", - label=wc.ip_label), + label=wc.ip_label, + ), control_bais=lambda wc: expand( "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", 
label=chipseq.merged_input_for_ip(sampletable, wc.ip_label), @@ -478,6 +484,7 @@ rule fingerprint: ) + rule macs: input: ip=lambda wc: expand( @@ -527,7 +534,7 @@ rule epic2: mem="16g", runtime="2h", log: - "data/chipseq_peaks/epic2/{epic2_run}/peaks.bed.log" + "data/chipseq_peaks/epic2/{epic2_run}/peaks.bed.log", params: block=lambda wc: chipseq.block_for_run(config, wc.epic2_run, "epic2"), is_paired=is_paired, @@ -558,7 +565,10 @@ rule multibigwigsummary: Summarize the bigWigs across genomic bins """ input: - expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", label=sampletable.label), + expand( + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", + label=sampletable.label, + ), output: npz="data/chipseq_aggregation/deeptools/multibigwigsummary_matrix.npz", tab="data/chipseq_aggregation/deeptools/multibigwigsummary.tab", @@ -600,12 +610,6 @@ rule plotcorrelation: "--corMethod spearman " "--whatToPlot heatmap " "--colorMap Reds " - # NOTE: if you're expecting negative correlation, try a divergent - # colormap and setting the min/max to ensure that the colomap is - # centered on zero: - # '--colorMap RdBu_r ' - # '--zMin -1 ' - # '--zMax 1 ' rule samtools_idxstats: @@ -618,7 +622,7 @@ rule samtools_idxstats: mem="16g", runtime="2h", log: - "data/chipseq_samples/{sample}/samtools_idxstats_{sample}.txt.log" + "data/chipseq_samples/{sample}/samtools_idxstats_{sample}.txt.log", shell: "samtools idxstats {input.bam} 2> {log} 1> {output.txt}" @@ -633,7 +637,7 @@ rule samtools_flagstat: mem="8g", runtime="2h", log: - "data/chipseq_samples/{sample}/samtools_flagstat_{sample}.txt.log" + "data/chipseq_samples/{sample}/samtools_flagstat_{sample}.txt.log", shell: "samtools flagstat {input.bam} > {output}" @@ -648,7 +652,7 @@ rule samtools_stats: mem="8g", runtime="2h", log: - "data/chipseq_samples/{sample}/samtools_stats_{sample}.txt.log" + "data/chipseq_samples/{sample}/samtools_stats_{sample}.txt.log", shell: "samtools stats 
{input.bam} > {output}" @@ -675,8 +679,10 @@ rule multiqc: ), expand( "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics", - sample=SAMPLES - ) if is_paired else [], + sample=SAMPLES, + ) + if is_paired + else [], [v["bigbed"] for v in peaks.values()], config="config/multiqc_config.yaml", output: @@ -727,8 +733,8 @@ if utils.detect_sra(sampletable): log: "original_data/sra_samples/{sample}/{sample}.fastq.gz.log", params: - is_paired=is_paired, # extra="-X 100000", # [enable for test] + is_paired=is_paired, resources: mem="1g", disk="1g", From 7f281bc1748101b8cbd3ca4ae38ab27242f04989 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 11 Oct 2025 01:09:17 +0000 Subject: [PATCH 132/196] update .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index ab3fd51e..b1f7c8ca 100644 --- a/.gitignore +++ b/.gitignore @@ -66,3 +66,6 @@ workflows/rnaseq/downstream/rnaseq.html ._* Rplots.pdf /lib/include/* + +workflows/*/references + From 917c90f8ec3f4d7eaf5ee57b986e3ebdcac69d0e Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 11 Oct 2025 09:44:49 -0400 Subject: [PATCH 133/196] hard-code peaks dir in chipseq_trackhub.py --- workflows/chipseq/chipseq_trackhub.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/workflows/chipseq/chipseq_trackhub.py b/workflows/chipseq/chipseq_trackhub.py index 5726fc02..4e520be2 100644 --- a/workflows/chipseq/chipseq_trackhub.py +++ b/workflows/chipseq/chipseq_trackhub.py @@ -38,6 +38,8 @@ # details config = yaml.load(open(args.config), Loader=yaml.FullLoader) +peaks_dir = "data/chipseq_peaks" + if args.additional_configs: for cfg in args.additional_configs: update_config(config, yaml.load(open(cfg), Loader=yaml.FullLoader)) @@ -208,14 +210,14 @@ def decide_color(samplename): # ASSUMPTION: BED filename pattern bed_filename = os.path.join( 
- config['peaks_dir'], + peaks_dir, algorithm, label, 'peaks.bed') # ASSUMPTION: bigBed filename pattern bigbed_filename = os.path.join( - config['peaks_dir'], + peaks_dir, algorithm, label, 'peaks.bigbed') @@ -241,7 +243,7 @@ def decide_color(samplename): if algorithm == "sicer": subgroup['peaks'] = 'no' - prefilter_wig = glob.glob(os.path.join(config['peaks_dir'], + prefilter_wig = glob.glob(os.path.join(peaks_dir, algorithm, label, '*prefilter.bigWig')) @@ -249,7 +251,7 @@ def decide_color(samplename): prefilter_wig = prefilter_wig[0] else: raise ValueError('SICER output for {0} has no prefilter bigWig file'.format(label)) - postfilter_wig = glob.glob(os.path.join(config['peaks_dir'], + postfilter_wig = glob.glob(os.path.join(peaks_dir, algorithm, label, '*postfilter.bigWig')) From e298d290ba281cad9488b946ab57064e92a7a69b Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 11 Oct 2025 15:41:54 +0000 Subject: [PATCH 134/196] use .gz for those rules that can --- workflows/chipseq/Snakefile | 14 +------------- workflows/rnaseq/Snakefile | 12 ++++++------ 2 files changed, 7 insertions(+), 19 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 46a37a5e..8b2b9004 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -87,21 +87,9 @@ rule chromsizes: "&& rm -f {output}.tmp " -rule unzip: - input: - "references/{prefix}{ext}.gz", - output: - "references/{prefix}{ext}", - resources: - mem="4g", - runtime="2h", - shell: - "gunzip -c {input} > {output}" - - rule bowtie2_index: input: - "references/genome.fa", + "references/genome.fa.gz", output: "references/bowtie2/genome.1.bt2", "references/bowtie2/genome.fa", diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 6f5d3153..8ccb0ba4 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -116,7 +116,7 @@ rule unzip: input: "references/{prefix}.gz", output: - "references/{prefix}", + 
temporary("references/{prefix}"), resources: mem="4g", runtime="2h", @@ -126,10 +126,10 @@ rule unzip: rule rrna_index: input: - "references/rrna.fa", + "references/rrna.fa.gz", output: "references/bowtie2/rrna.1.bt2", - "references/bowtie2/rrna.fa", + "references/bowtie2/rrna.fa.gz", log: "references/logs/bowtie2_rrna.log", resources: @@ -213,7 +213,7 @@ rule salmon_index: rule conversion_refflat: input: - "references/annotation.gtf", + "references/annotation.gtf.gz", output: "references/annotation.refflat", log: @@ -229,7 +229,7 @@ rule conversion_refflat: rule conversion_bed12: input: - "references/annotation.gtf", + "references/annotation.gtf.gz", output: "references/annotation.bed12", resources: @@ -272,7 +272,7 @@ rule mappings: Creates gzipped TSV mapping between attributes in the GTF. """ input: - gtf="references/annotation.gtf", + gtf="references/annotation.gtf.gz", output: "references/annotation.mapping.tsv.gz", params: From e4642674ed0ef9ddb152b6e653a57438d6e0b5be Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 11 Oct 2025 15:57:52 +0000 Subject: [PATCH 135/196] temporarily name-sort PE bams for featurecounts --- workflows/rnaseq/Snakefile | 56 ++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 8ccb0ba4..b610419c 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -552,14 +552,36 @@ rule markduplicates: "&> {log}" -rule featurecounts: +rule namesorted_bam: input: - annotation="references/annotation.gtf", bam=rules.markduplicates.output.bam, output: - "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt", + temporary( + "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.namesort.bam" + ), + threads: 1 + resources: + mem="16g", + runtime="2h", + shell: + "samtools sort -n -o {output} {input}" + + +rule featurecounts: + input: + 
annotation="references/annotation.gtf.gz", + bam=expand( + ( + rules.namesorted_bam.output + if is_paired + else rules.markduplicates.output.bam + ), + sample=SAMPLES, + ), + output: + "data/rnaseq_aggregation/featurecounts.txt", log: - "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt.log", + "data/rnaseq_aggregation/featurecounts.txt.log", threads: 8 resources: mem="16g", @@ -584,30 +606,6 @@ rule featurecounts: ) -rule aggregate_featurecounts: - input: - expand( - "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt", sample=SAMPLES - ), - output: - "data/rnaseq_aggregation/featurecounts.txt", - log: - "data/rnaseq_aggregation/featurecounts.txt.log", - threads: 1 - resources: - mem="8g", - runtime="1h", - run: - for i, file in enumerate(input): - df = pd.read_csv(file, sep="\t", comment="#") - df = df.set_index("Geneid", drop=False) - if i == 0: - final = df - continue - final[df.columns[-1]] = df[df.columns[-1]] - final.to_csv(output[0], sep="\t", index=False) - - rule rrna_libsizes_table: input: rrna=expand( @@ -877,7 +875,7 @@ rule multiqc: expand(rules.bigwig_pos.output, sample=SAMPLES), expand(rules.bigwig_neg.output, sample=SAMPLES), rules.rrna_libsizes_table.output, - rules.aggregate_featurecounts.output, + rules.featurecounts.output, ), config="config/multiqc_config.yaml", output: From 087e008f94a78d2eec923575b2782ef7a71268ac Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 14 Oct 2025 22:16:00 +0000 Subject: [PATCH 136/196] improve mappings.tsv generation --- lib/utils.py | 63 ++++++++++++++++++++++++++++++++++++++ workflows/rnaseq/Snakefile | 41 ++++++------------------- 2 files changed, 73 insertions(+), 31 deletions(-) diff --git a/lib/utils.py b/lib/utils.py index c7e4bf7b..8593f534 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -4,6 +4,7 @@ import gzip import os import re +import sys import subprocess import warnings from collections.abc import Iterable @@ -15,6 +16,8 @@ from Bio import SeqIO 
from snakemake.io import expand, regex_from_filepattern from snakemake.shell import shell +import gffutils +import csv # Small helper functions @@ -1190,4 +1193,64 @@ def wrapper_for(path): def detect_sra(sampletable): return 'Run' in sampletable.columns and any(sampletable['Run'].str.startswith('SRR')) + +def mappings_tsv(gtf, tsv, exclude_featuretypes=None, include_featuretypes=None, include_attributes=None): + """ + Create a TSV file of attributes found in a GTF file. + + Parameters + ---------- + + gtf, tsv : str + Input and output filenames respectively + + exclude_featuretypes, include_featuretypes : list + Mutually exclusive; use these to restrict the features considered. + E.g., we likely don't need entries for start_codon if those are in the + GTF. + + include_attributes : list + Restrict the attributes reported in the TSV. Should at least have + a column for gene ID and transcript ID in order for downstream RNA-seq + work. + """ + + if exclude_featuretypes and include_featuretypes: + raise ValueError("Both include_featuretypes and exclude_featuretypes were specified.") + + res = [] + keys = set(['__featuretype__']) + seen = set() + for f in gffutils.DataIterator(gtf): + ft = f.featuretype + if exclude_featuretypes and ft in exclude_featuretypes: + continue + if include_featuretypes and ft not in include_featuretypes: + continue + d = dict(f.attributes) + keys.update(d.keys()) + d["__featuretype__"] = ft + h = hash(str(d)) + if h in seen: + continue + seen.update([h]) + res.append(d) + + def unlist_dict(d): + for k, v in d.items(): + if isinstance(v, list): + d[k] = "|".join(v) + return d + + if include_attributes: + sorted_keys = sorted(include_attributes) + else: + sorted_keys = sorted(keys) + with open(tsv, 'w') as fout: + writer = csv.DictWriter(fout, fieldnames=sorted_keys, restval="", delimiter='\t') + writer.writeheader() + for row in res: + writer.writerow(unlist_dict(row)) + + # vim: ft=python diff --git a/workflows/rnaseq/Snakefile 
b/workflows/rnaseq/Snakefile index b610419c..ab82deeb 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -31,6 +31,7 @@ localrules: rule all: input: "data/rnaseq_aggregation/multiqc.html", + "references/annotation.mapping.tsv", rule symlinks: @@ -268,46 +269,24 @@ rule chromsizes: rule mappings: - """ - Creates gzipped TSV mapping between attributes in the GTF. - """ input: gtf="references/annotation.gtf.gz", output: - "references/annotation.mapping.tsv.gz", - params: - include_featuretypes=lambda wildcards, output: conversion_kwargs[ - output[0] - ].get("include_featuretypes", []), + tsv="references/annotation.mapping.tsv", resources: mem="2g", runtime="2h", run: - import gffutils - - gffutils.constants.always_return_list = False - - include_featuretypes = params.include_featuretypes - - res = [] - for f in gffutils.DataIterator(input[0]): - - ft = f.featuretype - - if include_featuretypes and (ft not in include_featuretypes): - continue - - d = dict(f.attributes) - d["__featuretype__"] = ft - res.append(d) - - df = pandas.DataFrame(res) + mappings_args = dict( + exclude_featuretypes=None, + include_featuretypes=None, + include_attributes=None, + ) + print(config["annotation"].get("mappings", {})) - # Depending on how many attributes there were and the - # include_featuretypes settings, this may take a while. 
- df = df.drop_duplicates() + mappings_args.update(config["annotation"].get("mappings", {})) - df.to_csv(output[0], sep="\t", index=False, compression="gzip") + utils.mappings_tsv(input.gtf, output.tsv, **mappings_args) rule symlink_targets: From a2cc1855a9ed682f37df50438f8610b5b3578697 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 14 Oct 2025 22:16:33 +0000 Subject: [PATCH 137/196] minor refactoring in utils.py --- lib/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/utils.py b/lib/utils.py index 8593f534..b74fc7e4 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -265,7 +265,7 @@ def extract_wildcards(pattern, target): return m.groupdict() -def _is_gzipped(fn): +def is_gzipped(fn): """ Filename-independent method of checking if a file is gzipped or not. Uses the magic number. @@ -280,7 +280,7 @@ def openfile(tmp, mode): """ Returns an open file handle; auto-detects gzipped files. """ - if _is_gzipped(tmp): + if is_gzipped(tmp): return gzip.open(tmp, mode) else: return open(tmp, mode) @@ -783,6 +783,8 @@ def twobit_to_fasta(tmpfiles, outfile): shell("cat {fastas} | gzip -c > {outfile}") shell("rm {fastas}") +def default_postprocess(origfn, newfn): + shell("mv {origfn} {newfn}") def download_and_postprocess(urls, postprocess, outfile, log): """ @@ -865,8 +867,6 @@ def func(infiles, outfile, *args, **kwargs): """ - def default_postprocess(origfn, newfn): - shell("mv {origfn} {newfn}") if not isinstance(postprocess, list): postprocess = [postprocess] @@ -990,7 +990,7 @@ def default_postprocess(origfn, newfn): for i in to_delete: if os.path.exists(i): shell("rm {i}") - if not _is_gzipped(outfile): + if not is_gzipped(outfile): raise ValueError(f"{outfile} does not appear to be gzipped.") @@ -1181,7 +1181,7 @@ def gff2gtf(gff, gtf): """ Converts a gff file to a gtf format using the gffread function from Cufflinks """ - if _is_gzipped(gff[0]): + if is_gzipped(gff[0]): shell("gzip -d 
-S .gz.0.tmp {gff} -c | gffread - -T -o- | gzip -c > {gtf}") else: shell("gffread {gff} -T -o- | gzip -c > {gtf}") From a4485ec0ccbbd40085b2afcea7cf9582d9664432 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 14 Oct 2025 22:17:30 +0000 Subject: [PATCH 138/196] fasta -> genome and gtf -> annotation in configs --- workflows/rnaseq/Snakefile | 10 +++++----- workflows/rnaseq/config/config.yaml | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index ab82deeb..b299dcdd 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -62,8 +62,8 @@ rule fasta: mem_mb="4g", runtime="2h", params: - urls=config["fasta"]["url"], - postprocess=config["fasta"].get("postprocess", None), + urls=config["genome"]["url"], + postprocess=config["genome"].get("postprocess", None), run: utils.download_and_postprocess( urls=params.urls, @@ -73,7 +73,7 @@ rule fasta: ) -rule gtf: +rule annotation: output: "references/annotation.gtf.gz", log: @@ -82,8 +82,8 @@ rule gtf: mem="4g", runtime="2h", params: - urls=config["gtf"]["url"], - postprocess=config["gtf"].get("postprocess", None), + urls=config["annotation"]["url"], + postprocess=config["annotation"].get("postprocess", None), run: utils.download_and_postprocess( urls=params.urls, diff --git a/workflows/rnaseq/config/config.yaml b/workflows/rnaseq/config/config.yaml index 657c92a4..9047b4ab 100644 --- a/workflows/rnaseq/config/config.yaml +++ b/workflows/rnaseq/config/config.yaml @@ -5,11 +5,11 @@ stranded: 'fr-firststrand' # for dUTP libraries # 'fr-secondstrand' # for ligation libraries # 'unstranded' # for libraries without strand specificity -fasta: +genome: url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" postprocess: 'lib.utils.gzipped' -gtf: +annotation: url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/annotation/dm6.small.gtf" 
postprocess: 'lib.utils.gzipped' From ba3c7ce21a752839fbb31570f0908d4da2cbb4ed Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 14 Oct 2025 22:18:14 +0000 Subject: [PATCH 139/196] add faidx rule --- workflows/rnaseq/Snakefile | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index b299dcdd..c0574925 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -72,6 +72,16 @@ rule fasta: log=log, ) +rule faidx: + input: + "references/genome.fa" + output: + "references/genome.fa.fai" + resources: + mem_mb="4g", + runtime="2h", + shell: + "samtools faidx {input}" rule annotation: output: From 8a320202ff0d6f985c4e34574725cfbd9a86e08d Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 14 Oct 2025 22:19:09 +0000 Subject: [PATCH 140/196] pep8 on postprocess.utils --- lib/postprocess/utils.py | 64 ++++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/lib/postprocess/utils.py b/lib/postprocess/utils.py index f8fc64a6..1e254ef1 100644 --- a/lib/postprocess/utils.py +++ b/lib/postprocess/utils.py @@ -44,7 +44,7 @@ def extract_from_zip(tmpfiles, outfile, path_in_zip): shutil.rmtree(extraction_dir) -def match_gtf_9th(tmpfiles, outfile, strmatch, optstrand = "None"): +def match_gtf_9th(tmpfiles, outfile, strmatch, optstrand="None"): """ Matches string to the 9th field of GTF and an optional strand that defaults to None; if the pattern is found and the provided strand match then the line is excluded @@ -63,21 +63,26 @@ def match_gtf_9th(tmpfiles, outfile, strmatch, optstrand = "None"): optstrand : str String to match to the strand. 
Default is None """ - regex_strmatch = re.compile(r'|'.join(strmatch)) + regex_strmatch = re.compile(r"|".join(strmatch)) - with gzip.open(outfile, 'wt') as fout: + with gzip.open(outfile, "wt") as fout: for tmpfn in tmpfiles: - with openfile(tmpfn, 'rt') as tmp: + with openfile(tmpfn, "rt") as tmp: for line in tmp: if line.startswith("#"): fout.write(line) else: - toks = line.split('\t') - if not (regex_strmatch.search(toks[8]) != None and toks[6] == optstrand): + toks = line.split("\t") + if not ( + regex_strmatch.search(toks[8]) != None + and toks[6] == optstrand + ): fout.write(line) + # match_gtf_9th(['/home/esnaultcm/Downloads/Rattus_norvegicus.Rnor_6.0.94.gtf.gz'], "test.gz", ['ENSRNOG00000046319'], '-') + def convert_gtf_chroms(tmpfiles, outfile, conv_table): """ Convert chrom names in GTF file according to conversion table. @@ -95,28 +100,32 @@ def convert_gtf_chroms(tmpfiles, outfile, conv_table): read lookup table, so it can be file://, a path relative to the snakefile, or an http://, https://, or ftp:// URL. 
""" - lookup = pd.read_csv( - conv_table, sep='\t', header=None, names=('a', 'b') - ).set_index('a')['b'].to_dict() + lookup = ( + pd.read_csv(conv_table, sep="\t", header=None, names=("a", "b")) + .set_index("a")["b"] + .to_dict() + ) - with gzip.open(outfile, 'wt') as fout: + with gzip.open(outfile, "wt") as fout: for tmpfn in tmpfiles: - with openfile(tmpfn, 'rt') as tmp: + with openfile(tmpfn, "rt") as tmp: for line in tmp: if not line.startswith("#"): - toks = line.split('\t') + toks = line.split("\t") chrom = toks[0] if chrom in lookup.keys(): - toks[0]= lookup[chrom] - line = '\t'.join(toks) + toks[0] = lookup[chrom] + line = "\t".join(toks) else: raise ValueError( 'Chromosome "{chrom}" not found in conversion table ' - '"{conv_table}"' - .format(chrom=chrom, conv_table=conv_table) + '"{conv_table}"'.format( + chrom=chrom, conv_table=conv_table + ) ) fout.write(line) + def convert_fasta_chroms(tmpfiles, outfile, conv_table): """ Convert chrom names in fasta file according to conversion table. @@ -135,26 +144,29 @@ def convert_fasta_chroms(tmpfiles, outfile, conv_table): snakefile, or an http://, https://, or ftp:// URL. 
""" - lookup = pd.read_csv( - conv_table, sep='\t', header=None, names=('a', 'b') - ).set_index('a')['b'].to_dict() + lookup = ( + pd.read_csv(conv_table, sep="\t", header=None, names=("a", "b")) + .set_index("a")["b"] + .to_dict() + ) - with gzip.open(outfile, 'wt') as fout: + with gzip.open(outfile, "wt") as fout: for tmpfn in tmpfiles: - with openfile(tmpfn, 'rt') as tmp: + with openfile(tmpfn, "rt") as tmp: for line in tmp: if line.startswith(">"): line = line.rstrip("\n") - toks = line.split(' ') + toks = line.split(" ") chrom = toks[0].lstrip(">") chrom = chrom.rstrip("\n") if chrom in lookup.keys(): - toks[0]= ">" + lookup[chrom] - line = ' '.join(toks) + "\n" + toks[0] = ">" + lookup[chrom] + line = " ".join(toks) + "\n" else: raise ValueError( 'Chromosome "{chrom}" not found in conversion table ' - '"{conv_table}"' - .format(chrom=chrom, conv_table=conv_table) + '"{conv_table}"'.format( + chrom=chrom, conv_table=conv_table + ) ) fout.write(line) From 69f3ed6533e10740c35a12a146b865ae41fe4393 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 14 Oct 2025 22:19:27 +0000 Subject: [PATCH 141/196] support gtf and fasta filtering on regexps to support hg19 from gencode, which doesn't provide primary assembly and associated gtf --- lib/postprocess/utils.py | 112 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 107 insertions(+), 5 deletions(-) diff --git a/lib/postprocess/utils.py b/lib/postprocess/utils.py index 1e254ef1..18fa5296 100644 --- a/lib/postprocess/utils.py +++ b/lib/postprocess/utils.py @@ -1,16 +1,118 @@ -import sys +import gzip +import logging import os import re -import gzip -import zipfile -import shutil +import sys import tempfile +import zipfile + +import gffutils import pandas as pd +from snakemake.shell import shell here = os.path.dirname(os.path.abspath(__file__)) sys.path.insert(0, os.path.join(here, "../../lib")) -from utils import openfile +from .. 
import utils as u + +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + + +def ensure_single_unzipped(tmpfiles, outfile): + """ + Sometimes it makes things easier in downstream code to assume there's + a single uncompressed file to work with. + """ + all_gzipped = all([u.is_gzipped(i) for i in tmpfiles]) + none_gzipped = all([not u.is_gzipped(i) for i in tmpfiles]) + + if all_gzipped: + shell("zcat {tmpfiles} > {outfile}") + return outfile + + elif none_gzipped: + shell("cat {tmpfiles} > {outfile}") + return outfile + else: + raise ValueError("Mixture of compressed and uncompressed files") + + +def _patterns(include_patterns, exclude_patterns): + """ + Return a function that will include/exclude strings based on the patterns + provided. + """ + + if include_patterns and exclude_patterns: + raise ValueError("include_patterns and exclude_patterns are mutually exclusive") + patterns = [] + if include_patterns: + for p in include_patterns: + patterns.append(re.compile(p)) + + def keep(s): + for p in patterns: + if p.search(s): + logger.info(f"Keeping {s} because it matches {p}") + return True + return False + + elif exclude_patterns: + for p in exclude_patterns: + patterns.append(re.compile(p)) + + def keep(s): + for p in patterns: + if p.search(s): + logger.info(f"Excluding {s} because it matches {p}") + return False + return True + + else: + raise ValueError( + "Expecting exactly one of include_patterns or exclude_patterns" + ) + + return keep + + +def filter_fasta_chroms( + tmpfiles, outfile, include_patterns=None, exclude_patterns=None +): + # samtools won't work with gzip (only bgzip) files, so the lowest common + # denominator is to use uncompressed. 
+ working_file = ensure_single_unzipped(tmpfiles, outfile + ".tmp") + if include_patterns and exclude_patterns: + raise ValueError("include_patterns and exclude_patterns are mutually exclusive") + + logger.info(f"Finding chrom names and putting them in {working_file}.record_names") + shell( + 'grep ">" {working_file} | cut -f1 -d " " | sed "s/>//g" > {working_file}.record_names' + ) + + keep = _patterns(include_patterns, exclude_patterns) + with open(outfile + ".keep", "w") as fout, open( + working_file + ".record_names", "r" + ) as fin: + for line in fin: + line = line.replace(">", "").strip() + chrom = line.split()[0] + if keep(chrom): + fout.write(chrom + "\n") + shell("samtools faidx -r {outfile}.keep {working_file} | bgzip -c > {outfile}") + # shell("rm {outfile}.tmp {outfile}.tmp.fai {outfile}.keep") + shell("rm {tmpfiles}") + + +def filter_gtf_chroms(tmpfiles, outfile, include_patterns=None, exclude_patterns=None): + working_file = ensure_single_unzipped(tmpfiles, outfile + ".tmp") + keep = _patterns(include_patterns, exclude_patterns) + with gzip.open(outfile, "wt") as fout: + for feature in gffutils.DataIterator(working_file): + if keep(feature.chrom): + fout.write(str(feature) + "\n") + shell("rm {tmpfiles}") def extract_from_zip(tmpfiles, outfile, path_in_zip): From b4870cd8a28fc40b33e6cc1defb62b50ecef7080 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 14 Oct 2025 22:20:06 +0000 Subject: [PATCH 142/196] add reference config templates for human --- .../Homo_sapiens/GENCODE.yaml | 10 + .../Homo_sapiens/GENCODE_v19.yaml | 176 ++++++++++++++++++ .../Homo_sapiens/GRCh37.yaml | 1 + .../Homo_sapiens/hg19.yaml | 1 + 4 files changed, 188 insertions(+) create mode 100644 include/reference_config_templates/Homo_sapiens/GENCODE.yaml create mode 100644 include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml create mode 120000 include/reference_config_templates/Homo_sapiens/GRCh37.yaml create mode 120000 
include/reference_config_templates/Homo_sapiens/hg19.yaml diff --git a/include/reference_config_templates/Homo_sapiens/GENCODE.yaml b/include/reference_config_templates/Homo_sapiens/GENCODE.yaml new file mode 100644 index 00000000..dd4ae34f --- /dev/null +++ b/include/reference_config_templates/Homo_sapiens/GENCODE.yaml @@ -0,0 +1,10 @@ +# This config is intended to always point to the latest GENCODE version. If +# there is a newer version, please update and submit a pull request. +# +# https://www.gencodegenes.org/human/ +# +genome: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/GRCh38.primary_assembly.genome.fa.gz" + +annotation: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/gencode.v49.primary_assembly.annotation.gtf.gz" diff --git a/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml b/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml new file mode 100644 index 00000000..c2e0bf6a --- /dev/null +++ b/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml @@ -0,0 +1,176 @@ +# This is the last GENCODE release for hg19 / GRCh37. See +# https://www.gencodegenes.org/human/release_19.html. +# +# A primary assembly is not available like it is for GRCh38, so we make one by +# selecting the main chromosomes and unassembled contigs. It's not obvious +# which ones are the unassembled contigs, but the original fasta file has +# space-separated record names like this: +# +# >chr20 20 +# >chr21 21 +# >chr22 22 +# >chrX X +# >chrY Y +# >chrM MT +# >GL877870.2 HG1007_PATCH +# >GL877872.1 HG1032_PATCH +# >GL383535.1 HG104_HG975_PATCH +# >JH159133.1 HG1063_PATCH +# +# Spot-checking the entries, those that have PATCH in the line are assembly +# patches; those with HSCHR and HG*TEST are alt loci. None of those should be +# in a primary assembly. 
So the "include_pattern" list below was obtained with +# the following command: +# +# zcat GRCh37.p13.genome.fa.gz \ +# | grep -Ev "HS|PATCH|HG" \ +# | cut -f1 -d " " \ +# | sed "s/>//g" +# +# Spot-checking the remaining non-chr, they do all appear to be unassembled +# contigs, which we do want. +# +# So we can use this list of chroms to filter both the fasta as well as the gtf. +# +genome: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/GRCh37.p13.genome.fa.gz" + postprocess: + function: "lib.postprocess.utils.filter_fasta_chroms" + kwargs: + include_patterns: + - chr.* + - GL000191.1 + - GL000192.1 + - GL000193.1 + - GL000194.1 + - GL000195.1 + - GL000196.1 + - GL000197.1 + - GL000198.1 + - GL000199.1 + - GL000200.1 + - GL000201.1 + - GL000202.1 + - GL000203.1 + - GL000204.1 + - GL000205.1 + - GL000206.1 + - GL000207.1 + - GL000208.1 + - GL000209.1 + - GL000210.1 + - GL000211.1 + - GL000212.1 + - GL000213.1 + - GL000214.1 + - GL000215.1 + - GL000216.1 + - GL000217.1 + - GL000218.1 + - GL000219.1 + - GL000220.1 + - GL000221.1 + - GL000222.1 + - GL000223.1 + - GL000224.1 + - GL000225.1 + - GL000226.1 + - GL000227.1 + - GL000228.1 + - GL000229.1 + - GL000230.1 + - GL000231.1 + - GL000232.1 + - GL000233.1 + - GL000234.1 + - GL000235.1 + - GL000236.1 + - GL000237.1 + - GL000238.1 + - GL000239.1 + - GL000240.1 + - GL000241.1 + - GL000242.1 + - GL000243.1 + - GL000244.1 + - GL000245.1 + - GL000246.1 + - GL000247.1 + - GL000248.1 + - GL000249.1 + + +annotation: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/gencode.v19.chr_patch_hapl_scaff.annotation.gtf.gz" + postprocess: + function: "lib.postprocess.utils.filter_gtf_chroms" + kwargs: + include_patterns: + - chr.* + - GL000191.1 + - GL000192.1 + - GL000193.1 + - GL000194.1 + - GL000195.1 + - GL000196.1 + - GL000197.1 + - GL000198.1 + - GL000199.1 + - GL000200.1 + - GL000201.1 + - GL000202.1 + - GL000203.1 + - GL000204.1 + - GL000205.1 + - GL000206.1 + - 
GL000207.1 + - GL000208.1 + - GL000209.1 + - GL000210.1 + - GL000211.1 + - GL000212.1 + - GL000213.1 + - GL000214.1 + - GL000215.1 + - GL000216.1 + - GL000217.1 + - GL000218.1 + - GL000219.1 + - GL000220.1 + - GL000221.1 + - GL000222.1 + - GL000223.1 + - GL000224.1 + - GL000225.1 + - GL000226.1 + - GL000227.1 + - GL000228.1 + - GL000229.1 + - GL000230.1 + - GL000231.1 + - GL000232.1 + - GL000233.1 + - GL000234.1 + - GL000235.1 + - GL000236.1 + - GL000237.1 + - GL000238.1 + - GL000239.1 + - GL000240.1 + - GL000241.1 + - GL000242.1 + - GL000243.1 + - GL000244.1 + - GL000245.1 + - GL000246.1 + - GL000247.1 + - GL000248.1 + - GL000249.1 + +rrna: + url: + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_138.2/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_138.2/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' + postprocess: + function: 'lib.utils.filter_fastas' + args: 'Homo sapiens' diff --git a/include/reference_config_templates/Homo_sapiens/GRCh37.yaml b/include/reference_config_templates/Homo_sapiens/GRCh37.yaml new file mode 120000 index 00000000..99b7f940 --- /dev/null +++ b/include/reference_config_templates/Homo_sapiens/GRCh37.yaml @@ -0,0 +1 @@ +GENCODE_v19.yaml \ No newline at end of file diff --git a/include/reference_config_templates/Homo_sapiens/hg19.yaml b/include/reference_config_templates/Homo_sapiens/hg19.yaml new file mode 120000 index 00000000..99b7f940 --- /dev/null +++ b/include/reference_config_templates/Homo_sapiens/hg19.yaml @@ -0,0 +1 @@ +GENCODE_v19.yaml \ No newline at end of file From 40a64ff3ebaa29327218ba390f606f87cc67c2a3 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 14 Oct 2025 22:35:41 +0000 Subject: [PATCH 143/196] verbose arg for filtering --- lib/postprocess/utils.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/lib/postprocess/utils.py b/lib/postprocess/utils.py 
index 18fa5296..1963fa76 100644 --- a/lib/postprocess/utils.py +++ b/lib/postprocess/utils.py @@ -4,6 +4,7 @@ import re import sys import tempfile +import shutil import zipfile import gffutils @@ -38,7 +39,7 @@ def ensure_single_unzipped(tmpfiles, outfile): raise ValueError("Mixture of compressed and uncompressed files") -def _patterns(include_patterns, exclude_patterns): +def _patterns(include_patterns, exclude_patterns, verbose=False): """ Return a function that will include/exclude strings based on the patterns provided. @@ -51,10 +52,11 @@ def _patterns(include_patterns, exclude_patterns): for p in include_patterns: patterns.append(re.compile(p)) - def keep(s): + def keep(s): for p in patterns: if p.search(s): - logger.info(f"Keeping {s} because it matches {p}") + if verbose: + logger.info(f"Keeping {s} because it matches {p}") return True return False @@ -65,7 +67,8 @@ def keep(s): def keep(s): for p in patterns: if p.search(s): - logger.info(f"Excluding {s} because it matches {p}") + if verbose: + logger.info(f"Excluding {s} because it matches {p}") return False return True @@ -107,7 +110,7 @@ def filter_fasta_chroms( def filter_gtf_chroms(tmpfiles, outfile, include_patterns=None, exclude_patterns=None): working_file = ensure_single_unzipped(tmpfiles, outfile + ".tmp") - keep = _patterns(include_patterns, exclude_patterns) + keep = _patterns(include_patterns, exclude_patterns, verbose=False) with gzip.open(outfile, "wt") as fout: for feature in gffutils.DataIterator(working_file): if keep(feature.chrom): @@ -182,8 +185,6 @@ def match_gtf_9th(tmpfiles, outfile, strmatch, optstrand="None"): fout.write(line) -# match_gtf_9th(['/home/esnaultcm/Downloads/Rattus_norvegicus.Rnor_6.0.94.gtf.gz'], "test.gz", ['ENSRNOG00000046319'], '-') - def convert_gtf_chroms(tmpfiles, outfile, conv_table): """ From 999e12299fcc151c5be92d18e696927c6e8b98e9 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 17 Oct 2025 02:37:04 +0000 Subject: 
[PATCH 144/196] updates to decision log --- docs/decisions.rst | 97 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/docs/decisions.rst b/docs/decisions.rst index 5dcb3460..117deca7 100644 --- a/docs/decisions.rst +++ b/docs/decisions.rst @@ -448,3 +448,100 @@ understand the complete workflow. Taken together, it made more sense to eliminate the references workflow entirely, and port the rules to the respective workflows. + +featureCounts all-in-one or individually +---------------------------------------- + +featureCounts can accept a list of BAMs and run everything in one shot, or can +be run once per sample and then manually aggregated later. Previously, we +provided all BAMs. However, for paired-end BAMs, featureCounts will internally +name sort each BAM before counting. It does this serially. The result is +possibly substantial memory usage and a lot of time. + +One approach could be to temporarily name-sort BAMs in a separate rule, +conditional on paired-end reads, and the featureCounts rule would need to have +conditional input filenames as well. This adds a little bit of complexity for +the benefit of being able to more finely control resource usage. Another +approach would be to run featureCounts independently on each BAM, allowing it +to name-sort each one independently in parallel, and then manually aggregate +the featureCounts output of each. + +Since the conditional inclusion of a namesorted rule was straightforward (a +matter of choosing the input file for the featureCounts rule), it made the most +sense to run featureCounts once, providing it all samples. + +Selection of reference genomes and annotations +---------------------------------------------- + +Where possible, we select "primary" assemblies -- those with the canonical +chromosomes and unassembled contigs (scaffolds) but NOT haplotypes, alternate +loci, or assembly patches. + +`Heng Li's blog post +`__ on +the subject is a useful guideline.
To summarize, we want to exclude alt contigs +/ haplotypes because they may create multimapping issues, and we want to +include unassembled contigs because excluding them will artificially decrease +alignment percentage. + +Since lcdb-wf is intended to be used with arbitrary organisms, the PAR and +mitochondrial sequences mentioned there are not relevant in general. + +Ideally, we would have a tool that, given the URLs for raw fastq and gtf, + +1. Displays the set of chromosomes +2. Infers if there are any that look like rDNA or mtDNA +3. Ensures the GTF matches the fasta match chromosomes +4. Accepts a template config to assess to process + + +Annotations +----------- + +We use the most comprehensive annotations. For human and mouse, this is the +GENCODE "comprehensive" annotation for the primary assembly, which will include +many more than just protein-coding transcripts. For example, here are the +frequencies of ``transcript_type`` values in GENCODE v19's comprehensive +annotation: + +:: + + 1726632 protein_coding + 214952 nonsense_mediated_decay + 154780 processed_transcript + 135772 retained_intron + 54584 lincRNA + 44207 antisense + 22976 processed_pseudogene + 15313 pseudogene + 11202 unprocessed_pseudogene + 9477 miRNA + 7090 transcribed_unprocessed_pseudogene + 6149 misc_RNA + 5783 snRNA + 4521 snoRNA + 3148 sense_intronic + 1662 polymorphic_pseudogene + 1610 rRNA + 1430 unitary_pseudogene + 1417 sense_overlapping + 1117 IG_V_gene + 1091 transcribed_processed_pseudogene + 1035 non_stop_decay + 755 TR_V_gene + 681 IG_V_pseudogene + 300 TR_J_gene + 185 IG_C_gene + 152 IG_D_gene + 100 3prime_overlapping_ncrna + 99 TR_V_pseudogene + 80 IG_J_gene + 66 Mt_tRNA + 56 TR_C_gene + 36 IG_C_pseudogene + 12 TR_J_pseudogene + 12 TR_D_gene + 9 IG_J_pseudogene + 6 Mt_rRNA + 3 translated_processed_pseudogene + From 98a75b2727bd7ec1495d22fbc691505623518e05 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 17 Oct 2025 14:18:22 +0000 
Subject: [PATCH 145/196] add gencode_m25 config for mouse --- .../Mus_musculus/GENCODE_M25.yaml | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml diff --git a/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml b/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml new file mode 100644 index 00000000..b959d3a5 --- /dev/null +++ b/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml @@ -0,0 +1,20 @@ +# This is the latest GENCODE release for GRCm38/mm10. +# +# Primary assembly and associated annotations are directly available from GENCODE, +# https://www.gencodegenes.org/mouse/release_M25.html + +species: "Mus musculus" + +genome: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/GRCm38.primary_assembly.genome.fa.gz" + +annotation: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.primary_assembly.annotation.gtf.gz" + +rrna: + url: + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_138.2/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_138.2/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' + postprocess: + function: 'lib.utils.filter_fastas' + args: 'Mus musculus' From a0610ed61a4c2aba40c49fd47f2ad4f5371e1c68 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 19 Oct 2025 00:32:15 +0000 Subject: [PATCH 146/196] move postprocess.utils to postprocess --- .../Homo_sapiens/GENCODE.yaml | 2 + .../Homo_sapiens/GENCODE_v19.yaml | 4 +- .../Mus_musculus/GENCODE_M25.yaml | 3 + lib/postprocess/__init__.py | 280 ++++++++++++++++++ lib/postprocess/utils.py | 275 ----------------- lib/utils.py | 2 - 6 files changed, 287 insertions(+), 279 deletions(-) delete mode 100644 lib/postprocess/utils.py diff --git 
a/include/reference_config_templates/Homo_sapiens/GENCODE.yaml b/include/reference_config_templates/Homo_sapiens/GENCODE.yaml index dd4ae34f..86d0538d 100644 --- a/include/reference_config_templates/Homo_sapiens/GENCODE.yaml +++ b/include/reference_config_templates/Homo_sapiens/GENCODE.yaml @@ -5,6 +5,8 @@ # genome: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/GRCh38.primary_assembly.genome.fa.gz" + postprocess: lib.postprocess.default annotation: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/gencode.v49.primary_assembly.annotation.gtf.gz" + postprocess: lib.postprocess.default diff --git a/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml b/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml index c2e0bf6a..f32dc7e9 100644 --- a/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml +++ b/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml @@ -35,7 +35,7 @@ genome: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/GRCh37.p13.genome.fa.gz" postprocess: - function: "lib.postprocess.utils.filter_fasta_chroms" + function: "lib.postprocess.filter_fasta_chroms" kwargs: include_patterns: - chr.* @@ -103,7 +103,7 @@ genome: annotation: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/gencode.v19.chr_patch_hapl_scaff.annotation.gtf.gz" postprocess: - function: "lib.postprocess.utils.filter_gtf_chroms" + function: "lib.postprocess.filter_gtf_chroms" kwargs: include_patterns: - chr.* diff --git a/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml b/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml index b959d3a5..a077e399 100644 --- a/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml +++ b/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml @@ -7,9 +7,12 @@ species: "Mus musculus" genome: url: 
"https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/GRCm38.primary_assembly.genome.fa.gz" + postprocess: lib.postprocess.default + annotation: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.primary_assembly.annotation.gtf.gz" + postprocess: lib.postprocess.default rrna: url: diff --git a/lib/postprocess/__init__.py b/lib/postprocess/__init__.py index b6e690fd..3d7fbbfe 100644 --- a/lib/postprocess/__init__.py +++ b/lib/postprocess/__init__.py @@ -1 +1,281 @@ +import gzip +import logging +import os +import re +import sys +import tempfile +import shutil +import zipfile + +import gffutils +import pandas as pd +from snakemake.shell import shell + +here = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, os.path.join(here, "../../lib")) +from .. import utils as u + from . import * + +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + + +def default(origfn, newfn): + shell("mv {origfn} {newfn}") + + +def ensure_single_unzipped(tmpfiles, outfile): + """ + Sometimes it makes things easier in downstream code to assume there's + a single uncompressed file to work with. + """ + all_gzipped = all([u.is_gzipped(i) for i in tmpfiles]) + none_gzipped = all([not u.is_gzipped(i) for i in tmpfiles]) + + if all_gzipped: + shell("zcat {tmpfiles} > {outfile}") + return outfile + + elif none_gzipped: + shell("cat {tmpfiles} > {outfile}") + return outfile + + else: + raise ValueError("Mixture of compressed and uncompressed files") + + +def _patterns(include_patterns, exclude_patterns, verbose=False): + """ + Return a function that will include/exclude strings based on the patterns + provided. 
+ """ + + if include_patterns and exclude_patterns: + raise ValueError("include_patterns and exclude_patterns are mutually exclusive") + patterns = [] + if include_patterns: + for p in include_patterns: + patterns.append(re.compile(p)) + + def keep(s): + for p in patterns: + if p.search(s): + if verbose: + logger.info(f"Keeping {s} because it matches {p}") + return True + return False + + elif exclude_patterns: + for p in exclude_patterns: + patterns.append(re.compile(p)) + + def keep(s): + for p in patterns: + if p.search(s): + if verbose: + logger.info(f"Excluding {s} because it matches {p}") + return False + return True + + else: + raise ValueError( + "Expecting exactly one of include_patterns or exclude_patterns" + ) + + return keep + + +def filter_fasta_chroms( + tmpfiles, outfile, include_patterns=None, exclude_patterns=None +): + # samtools won't work with gzip (only bgzip) files, so the lowest common + # denominator is to use uncompressed. + working_file = ensure_single_unzipped(tmpfiles, outfile + ".tmp") + if include_patterns and exclude_patterns: + raise ValueError("include_patterns and exclude_patterns are mutually exclusive") + + logger.info(f"Finding chrom names and putting them in {working_file}.record_names") + shell( + 'grep ">" {working_file} | cut -f1 -d " " | sed "s/>//g" > {working_file}.record_names' + ) + + keep = _patterns(include_patterns, exclude_patterns) + with open(outfile + ".keep", "w") as fout, open( + working_file + ".record_names", "r" + ) as fin: + for line in fin: + line = line.replace(">", "").strip() + chrom = line.split()[0] + if keep(chrom): + fout.write(chrom + "\n") + shell("samtools faidx -r {outfile}.keep {working_file} | bgzip -c > {outfile}") + # shell("rm {outfile}.tmp {outfile}.tmp.fai {outfile}.keep") + shell("rm {tmpfiles}") + + +def filter_gtf_chroms(tmpfiles, outfile, include_patterns=None, exclude_patterns=None): + working_file = ensure_single_unzipped(tmpfiles, outfile + ".tmp") + keep = 
_patterns(include_patterns, exclude_patterns, verbose=False) + with gzip.open(outfile, "wt") as fout: + for feature in gffutils.DataIterator(working_file): + if keep(feature.chrom): + fout.write(str(feature) + "\n") + shell("rm {tmpfiles}") + + +def extract_from_zip(tmpfiles, outfile, path_in_zip): + """ + Parameters + ---------- + + tmpfiles : list + One-item list containing zip file + + outfile : str + gzipped output file to create + + path_in_zip : str + Path within zipfile to extract. You can identify the path using unzip + -l x.zip from bash. + """ + assert len(tmpfiles) == 1, f"expected single zip file, got {tmpfiles}" + + extraction_dir = tempfile.mkdtemp() + + with zipfile.ZipFile(tmpfiles[0], "r") as z: + z.extract(path_in_zip, path=extraction_dir) + + full_path_to_extracted = os.path.join(extraction_dir, path_in_zip) + + with open(full_path_to_extracted, "rb") as fin: + with gzip.open(outfile, "wb") as fout: + shutil.copyfileobj(fin, fout) + + shutil.rmtree(extraction_dir) + + +def match_gtf_9th(tmpfiles, outfile, strmatch, optstrand="None"): + """ + Matches string to the 9th field of GTF and an optional strand that defaults to None; + if the pattern is found and the provided strand match then the line is excluded + + Parameters + ---------- + tmpfiles : str + GTF files + + outfile : str + gzipped output GTF file + + strmatch : list + List of strings to match in the 9th field of the GTF. Must be list + + optstrand : str + String to match to the strand. Default is None + """ + regex_strmatch = re.compile(r"|".join(strmatch)) + + with gzip.open(outfile, "wt") as fout: + for tmpfn in tmpfiles: + with openfile(tmpfn, "rt") as tmp: + for line in tmp: + if line.startswith("#"): + fout.write(line) + else: + toks = line.split("\t") + if not ( + regex_strmatch.search(toks[8]) != None + and toks[6] == optstrand + ): + fout.write(line) + + + +def convert_gtf_chroms(tmpfiles, outfile, conv_table): + """ + Convert chrom names in GTF file according to conversion table. 
+ + Parameters + ---------- + tmpfiles : str + GTF files to look through + + outfile : str + gzipped output GTF file + + conv_table : str + Lookup table file for the chromosome name conversion. Uses pandas to + read lookup table, so it can be file://, a path relative to the + snakefile, or an http://, https://, or ftp:// URL. + """ + lookup = ( + pd.read_csv(conv_table, sep="\t", header=None, names=("a", "b")) + .set_index("a")["b"] + .to_dict() + ) + + with gzip.open(outfile, "wt") as fout: + for tmpfn in tmpfiles: + with openfile(tmpfn, "rt") as tmp: + for line in tmp: + if not line.startswith("#"): + toks = line.split("\t") + chrom = toks[0] + if chrom in lookup.keys(): + toks[0] = lookup[chrom] + line = "\t".join(toks) + else: + raise ValueError( + 'Chromosome "{chrom}" not found in conversion table ' + '"{conv_table}"'.format( + chrom=chrom, conv_table=conv_table + ) + ) + fout.write(line) + + +def convert_fasta_chroms(tmpfiles, outfile, conv_table): + """ + Convert chrom names in fasta file according to conversion table. + + Parameters + ---------- + tmpfiles : str + fasta files to look through + + outfile : str + gzipped output fasta file + + conv_table : str + Lookup table file for the chromosome name conversion. Uses pandas to + read lookup table, so it can be file://, a path relative to the + snakefile, or an http://, https://, or ftp:// URL. 
+ """ + + lookup = ( + pd.read_csv(conv_table, sep="\t", header=None, names=("a", "b")) + .set_index("a")["b"] + .to_dict() + ) + + with gzip.open(outfile, "wt") as fout: + for tmpfn in tmpfiles: + with openfile(tmpfn, "rt") as tmp: + for line in tmp: + if line.startswith(">"): + line = line.rstrip("\n") + toks = line.split(" ") + chrom = toks[0].lstrip(">") + chrom = chrom.rstrip("\n") + if chrom in lookup.keys(): + toks[0] = ">" + lookup[chrom] + line = " ".join(toks) + "\n" + else: + raise ValueError( + 'Chromosome "{chrom}" not found in conversion table ' + '"{conv_table}"'.format( + chrom=chrom, conv_table=conv_table + ) + ) + fout.write(line) diff --git a/lib/postprocess/utils.py b/lib/postprocess/utils.py deleted file mode 100644 index 1963fa76..00000000 --- a/lib/postprocess/utils.py +++ /dev/null @@ -1,275 +0,0 @@ -import gzip -import logging -import os -import re -import sys -import tempfile -import shutil -import zipfile - -import gffutils -import pandas as pd -from snakemake.shell import shell - -here = os.path.dirname(os.path.abspath(__file__)) -sys.path.insert(0, os.path.join(here, "../../lib")) -from .. import utils as u - -logger = logging.getLogger(__name__) -logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) - - -def ensure_single_unzipped(tmpfiles, outfile): - """ - Sometimes it makes things easier in downstream code to assume there's - a single uncompressed file to work with. - """ - all_gzipped = all([u.is_gzipped(i) for i in tmpfiles]) - none_gzipped = all([not u.is_gzipped(i) for i in tmpfiles]) - - if all_gzipped: - shell("zcat {tmpfiles} > {outfile}") - return outfile - - elif none_gzipped: - shell("cat {tmpfiles} > {outfile}") - return outfile - - else: - raise ValueError("Mixture of compressed and uncompressed files") - - -def _patterns(include_patterns, exclude_patterns, verbose=False): - """ - Return a function that will include/exclude strings based on the patterns - provided. 
- """ - - if include_patterns and exclude_patterns: - raise ValueError("include_patterns and exclude_patterns are mutually exclusive") - patterns = [] - if include_patterns: - for p in include_patterns: - patterns.append(re.compile(p)) - - def keep(s): - for p in patterns: - if p.search(s): - if verbose: - logger.info(f"Keeping {s} because it matches {p}") - return True - return False - - elif exclude_patterns: - for p in exclude_patterns: - patterns.append(re.compile(p)) - - def keep(s): - for p in patterns: - if p.search(s): - if verbose: - logger.info(f"Excluding {s} because it matches {p}") - return False - return True - - else: - raise ValueError( - "Expecting exactly one of include_patterns or exclude_patterns" - ) - - return keep - - -def filter_fasta_chroms( - tmpfiles, outfile, include_patterns=None, exclude_patterns=None -): - # samtools won't work with gzip (only bgzip) files, so the lowest common - # denominator is to use uncompressed. - working_file = ensure_single_unzipped(tmpfiles, outfile + ".tmp") - if include_patterns and exclude_patterns: - raise ValueError("include_patterns and exclude_patterns are mutually exclusive") - - logger.info(f"Finding chrom names and putting them in {working_file}.record_names") - shell( - 'grep ">" {working_file} | cut -f1 -d " " | sed "s/>//g" > {working_file}.record_names' - ) - - keep = _patterns(include_patterns, exclude_patterns) - with open(outfile + ".keep", "w") as fout, open( - working_file + ".record_names", "r" - ) as fin: - for line in fin: - line = line.replace(">", "").strip() - chrom = line.split()[0] - if keep(chrom): - fout.write(chrom + "\n") - shell("samtools faidx -r {outfile}.keep {working_file} | bgzip -c > {outfile}") - # shell("rm {outfile}.tmp {outfile}.tmp.fai {outfile}.keep") - shell("rm {tmpfiles}") - - -def filter_gtf_chroms(tmpfiles, outfile, include_patterns=None, exclude_patterns=None): - working_file = ensure_single_unzipped(tmpfiles, outfile + ".tmp") - keep = 
_patterns(include_patterns, exclude_patterns, verbose=False) - with gzip.open(outfile, "wt") as fout: - for feature in gffutils.DataIterator(working_file): - if keep(feature.chrom): - fout.write(str(feature) + "\n") - shell("rm {tmpfiles}") - - -def extract_from_zip(tmpfiles, outfile, path_in_zip): - """ - Parameters - ---------- - - tmpfiles : list - One-item list containing zip file - - outfile : str - gzipped output file to create - - path_in_zip : str - Path within zipfile to extract. You can identify the path using unzip - -l x.zip from bash. - """ - assert len(tmpfiles) == 1, f"expected single zip file, got {tmpfiles}" - - extraction_dir = tempfile.mkdtemp() - - with zipfile.ZipFile(tmpfiles[0], "r") as z: - z.extract(path_in_zip, path=extraction_dir) - - full_path_to_extracted = os.path.join(extraction_dir, path_in_zip) - - with open(full_path_to_extracted, "rb") as fin: - with gzip.open(outfile, "wb") as fout: - shutil.copyfileobj(fin, fout) - - shutil.rmtree(extraction_dir) - - -def match_gtf_9th(tmpfiles, outfile, strmatch, optstrand="None"): - """ - Matches string to the 9th field of GTF and an optional strand that defaults to None; - if the pattern is found and the provided strand match then the line is excluded - - Parameters - ---------- - tmpfiles : str - GTF files - - outfile : str - gzipped output GTF file - - strmatch : list - List of strings to match in the 9th field of the GTF. Must be list - - optstrand : str - String to match to the strand. Default is None - """ - regex_strmatch = re.compile(r"|".join(strmatch)) - - with gzip.open(outfile, "wt") as fout: - for tmpfn in tmpfiles: - with openfile(tmpfn, "rt") as tmp: - for line in tmp: - if line.startswith("#"): - fout.write(line) - else: - toks = line.split("\t") - if not ( - regex_strmatch.search(toks[8]) != None - and toks[6] == optstrand - ): - fout.write(line) - - - -def convert_gtf_chroms(tmpfiles, outfile, conv_table): - """ - Convert chrom names in GTF file according to conversion table. 
- - Parameters - ---------- - tmpfiles : str - GTF files to look through - - outfile : str - gzipped output GTF file - - conv_table : str - Lookup table file for the chromosome name conversion. Uses pandas to - read lookup table, so it can be file://, a path relative to the - snakefile, or an http://, https://, or ftp:// URL. - """ - lookup = ( - pd.read_csv(conv_table, sep="\t", header=None, names=("a", "b")) - .set_index("a")["b"] - .to_dict() - ) - - with gzip.open(outfile, "wt") as fout: - for tmpfn in tmpfiles: - with openfile(tmpfn, "rt") as tmp: - for line in tmp: - if not line.startswith("#"): - toks = line.split("\t") - chrom = toks[0] - if chrom in lookup.keys(): - toks[0] = lookup[chrom] - line = "\t".join(toks) - else: - raise ValueError( - 'Chromosome "{chrom}" not found in conversion table ' - '"{conv_table}"'.format( - chrom=chrom, conv_table=conv_table - ) - ) - fout.write(line) - - -def convert_fasta_chroms(tmpfiles, outfile, conv_table): - """ - Convert chrom names in fasta file according to conversion table. - - Parameters - ---------- - tmpfiles : str - fasta files to look through - - outfile : str - gzipped output fasta file - - conv_table : str - Lookup table file for the chromosome name conversion. Uses pandas to - read lookup table, so it can be file://, a path relative to the - snakefile, or an http://, https://, or ftp:// URL. 
- """ - - lookup = ( - pd.read_csv(conv_table, sep="\t", header=None, names=("a", "b")) - .set_index("a")["b"] - .to_dict() - ) - - with gzip.open(outfile, "wt") as fout: - for tmpfn in tmpfiles: - with openfile(tmpfn, "rt") as tmp: - for line in tmp: - if line.startswith(">"): - line = line.rstrip("\n") - toks = line.split(" ") - chrom = toks[0].lstrip(">") - chrom = chrom.rstrip("\n") - if chrom in lookup.keys(): - toks[0] = ">" + lookup[chrom] - line = " ".join(toks) + "\n" - else: - raise ValueError( - 'Chromosome "{chrom}" not found in conversion table ' - '"{conv_table}"'.format( - chrom=chrom, conv_table=conv_table - ) - ) - fout.write(line) diff --git a/lib/utils.py b/lib/utils.py index b74fc7e4..0ae3edc7 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -783,8 +783,6 @@ def twobit_to_fasta(tmpfiles, outfile): shell("cat {fastas} | gzip -c > {outfile}") shell("rm {fastas}") -def default_postprocess(origfn, newfn): - shell("mv {origfn} {newfn}") def download_and_postprocess(urls, postprocess, outfile, log): """ From 999fde27a5ccd7d11be894f61f97389bb98a332d Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 19 Oct 2025 00:32:32 +0000 Subject: [PATCH 147/196] match chipseq to rnaseq fasta->genome --- workflows/chipseq/Snakefile | 4 ++-- workflows/chipseq/config/config.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 8b2b9004..4f347eb7 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -50,8 +50,8 @@ rule fasta: mem_mb="4g", runtime="2h", params: - urls=config["fasta"]["url"], - postprocess=config["fasta"].get("postprocess", None), + urls=config["genome"]["url"], + postprocess=config["genome"].get("postprocess", None), run: utils.download_and_postprocess( urls=params.urls, diff --git a/workflows/chipseq/config/config.yaml b/workflows/chipseq/config/config.yaml index 268dcf59..d20f6e36 100644 --- 
a/workflows/chipseq/config/config.yaml +++ b/workflows/chipseq/config/config.yaml @@ -1,6 +1,6 @@ sampletable: 'config/sampletable.tsv' -fasta: +genome: url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" postprocess: 'lib.utils.gzipped' From 854c362d9ddd1bb6ee5380d24c509a9cfe96189a Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 19 Oct 2025 02:39:41 +0000 Subject: [PATCH 148/196] minor cleanups in rnaseq snakefile --- .../Homo_sapiens/GENCODE.yaml | 2 ++ .../Homo_sapiens/GENCODE_v19.yaml | 11 +++----- .../Mus_musculus/GENCODE_M25.yaml | 10 +------ lib/utils.py | 7 ++--- workflows/rnaseq/Snakefile | 26 ++++++++++++++----- 5 files changed, 29 insertions(+), 27 deletions(-) diff --git a/include/reference_config_templates/Homo_sapiens/GENCODE.yaml b/include/reference_config_templates/Homo_sapiens/GENCODE.yaml index 86d0538d..507877bb 100644 --- a/include/reference_config_templates/Homo_sapiens/GENCODE.yaml +++ b/include/reference_config_templates/Homo_sapiens/GENCODE.yaml @@ -3,6 +3,8 @@ # # https://www.gencodegenes.org/human/ # +organism: "Homo sapiens" + genome: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/GRCh38.primary_assembly.genome.fa.gz" postprocess: lib.postprocess.default diff --git a/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml b/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml index f32dc7e9..18648cd6 100644 --- a/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml +++ b/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml @@ -32,6 +32,9 @@ # # So we can use this list of chroms to filter both the fasta as well as the gtf. 
# + +organism: "Homo sapiens" + genome: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/GRCh37.p13.genome.fa.gz" postprocess: @@ -166,11 +169,3 @@ annotation: - GL000247.1 - GL000248.1 - GL000249.1 - -rrna: - url: - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_138.2/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_138.2/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' - postprocess: - function: 'lib.utils.filter_fastas' - args: 'Homo sapiens' diff --git a/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml b/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml index a077e399..99120cbf 100644 --- a/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml +++ b/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml @@ -3,7 +3,7 @@ # Primary assembly and associated annotations are directly available from GENCODE, # https://www.gencodegenes.org/mouse/release_M25.html -species: "Mus musculus" +organism: "Mus musculus" genome: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/GRCm38.primary_assembly.genome.fa.gz" @@ -13,11 +13,3 @@ genome: annotation: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.primary_assembly.annotation.gtf.gz" postprocess: lib.postprocess.default - -rrna: - url: - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_138.2/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_138.2/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' - postprocess: - function: 'lib.utils.filter_fastas' - args: 'Mus musculus' diff --git a/lib/utils.py b/lib/utils.py index 0ae3edc7..a0f90a7f 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -723,7 +723,7 @@ def strand_arg_lookup(config, lookup): return lookup[config.stranded] -def filter_fastas(tmpfiles, outfile, pattern): 
+def filter_rrna_fastas(tmpfiles, outfile, pattern): """ Extract records from fasta file(s) given a search pattern. @@ -742,7 +742,8 @@ def filter_fastas(tmpfiles, outfile, pattern): Look for this string in each record's description """ - + if pattern is None: + raise ValueError("Pattern cannot be None") def gen(): for tmp in tmpfiles: handle = gzip.open(tmp, "rt") @@ -751,7 +752,7 @@ def gen(): if pattern not in rec.description: continue rec.seq = rec.seq.back_transcribe() - rec.description = rec.name + # rec.description = rec.name yield rec with gzip.open(outfile, "wt") as fout: diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index c0574925..1e4f3ab2 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -112,12 +112,22 @@ rule rrna_fasta: mem="4g", runtime="2h", params: - urls=config["rrna"]["url"], - postprocess=config["rrna"].get("postprocess", None), + organism=config.get("organism", None), + silva_release="138.1", run: + # SILVA database fasta file with all species + urls=[ + f'https://www.arb-silva.de/fileadmin/silva_databases/release_{params.silva_release.replace('.', '_')}/Exports/SILVA_{params.silva_release}_LSURef_NR99_tax_silva.fasta.gz', + f'https://www.arb-silva.de/fileadmin/silva_databases/release_{params.silva_release.replace('.', '_')}/Exports/SILVA_{params.silva_release}_SSURef_NR99_tax_silva.fasta.gz', + ] + + # Keep only sequences for the configured organism utils.download_and_postprocess( - urls=params.urls, - postprocess=params.postprocess, + urls=urls, + postprocess={ + "function": "lib.utils.filter_rrna_fastas", + "args": params.organism, + }, outfile=output[0], log=log, ) @@ -127,7 +137,7 @@ rule unzip: input: "references/{prefix}.gz", output: - temporary("references/{prefix}"), + "references/{prefix}", resources: mem="4g", runtime="2h", @@ -176,7 +186,7 @@ rule star_index: "--runThreadN {threads} " "--genomeDir {genomedir} " "--genomeFastaFiles {input.fasta} " - # NOTE: GTF is optional + # NOTE: 
GTF is optional but highly recommended by STAR docs "--sjdbGTFfile {input.gtf} " # NOTE: STAR docs say that 100 should work well. "--sjdbOverhang 100 " @@ -198,11 +208,13 @@ rule transcriptome_fasta: gtf="references/annotation.gtf", output: "references/transcriptome.fa", + log: + "references/transcriptome.log", resources: mem="4g", runtime="2h", shell: - "gffread {input.gtf} -w {output} -g {input.fasta}" + "gffread {input.gtf} -w {output} -g {input.fasta} &> {log}" rule salmon_index: From e6ef55410aca7ad4ea122180fd31ed986cdbaa44 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 19 Oct 2025 02:40:08 +0000 Subject: [PATCH 149/196] more simplification of references --- workflows/rnaseq/config/config.yaml | 16 ---------------- workflows/rnaseq/run_test.sh | 2 +- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/workflows/rnaseq/config/config.yaml b/workflows/rnaseq/config/config.yaml index 9047b4ab..2c34c6d9 100644 --- a/workflows/rnaseq/config/config.yaml +++ b/workflows/rnaseq/config/config.yaml @@ -4,19 +4,3 @@ sampletable: 'config/sampletable.tsv' stranded: 'fr-firststrand' # for dUTP libraries # 'fr-secondstrand' # for ligation libraries # 'unstranded' # for libraries without strand specificity - -genome: - url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" - postprocess: 'lib.utils.gzipped' - -annotation: - url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/annotation/dm6.small.gtf" - postprocess: 'lib.utils.gzipped' - -rrna: - url: - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' - postprocess: - function: 'lib.utils.filter_fastas' - args: 'Drosophila melanogaster' diff --git a/workflows/rnaseq/run_test.sh b/workflows/rnaseq/run_test.sh index 
7aacb413..fc76064e 100755 --- a/workflows/rnaseq/run_test.sh +++ b/workflows/rnaseq/run_test.sh @@ -1,3 +1,3 @@ set -e python -m doctest ../../ci/preprocessor.py -python ../../ci/preprocessor.py Snakefile > Snakefile.test && snakemake -s Snakefile.test "$@" +python ../../ci/preprocessor.py Snakefile > Snakefile.test && snakemake -s Snakefile.test --configfile ../../include/reference_config_templates/test.yaml "$@" From b37edbf5230bcb772e6e55e59c63088d5916aa03 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 19 Oct 2025 02:40:22 +0000 Subject: [PATCH 150/196] decision log updates --- docs/decisions.rst | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docs/decisions.rst b/docs/decisions.rst index 117deca7..71d2a20b 100644 --- a/docs/decisions.rst +++ b/docs/decisions.rst @@ -545,3 +545,19 @@ annotation: 6 Mt_rRNA 3 translated_processed_pseudogene +Erring on the side of too many annotations (i.e., using the comprehensive +annotation instead of a curated version) will result in more features, which at +face value might make the FDR adjustment more harsh in DESeq2. But DESeq2's +independent filtering (not even testing those features with so few reads that +they would not reach significance) guards against this. + +Zipping/unzipping references +---------------------------- +STAR requires uncompressed FASTA and GTF files to build the +index (gffread likewise needs the uncompressed genome fasta and +GTF to extract the transcriptome). Making the uncompressed +files temporary means running the risk of another rule needing +them and re-triggering costly STAR alignment. The extra +storage cost of leaving an uncompressed fasta (~3 GB) around is +minimal compared to the scale of all other data, and guards +against inadvertently re-running all alignment jobs. 
From a82e10c85a00e3dfb56d0dfd792b74745889b4de Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 19 Oct 2025 02:40:35 +0000 Subject: [PATCH 151/196] add test config --- include/reference_config_templates/test.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 include/reference_config_templates/test.yaml diff --git a/include/reference_config_templates/test.yaml b/include/reference_config_templates/test.yaml new file mode 100644 index 00000000..ceb36877 --- /dev/null +++ b/include/reference_config_templates/test.yaml @@ -0,0 +1,9 @@ +organism: 'Drosophila melanogaster' + +genome: + url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" + postprocess: 'lib.utils.gzipped' + +annotation: + url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/annotation/dm6.small.gtf" + postprocess: 'lib.utils.gzipped' From 2910b85d6319de4bb6ac37a4bbe937e5f91cb575 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 19 Oct 2025 02:41:16 +0000 Subject: [PATCH 152/196] snakefmt --- workflows/rnaseq/Snakefile | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 1e4f3ab2..6c377137 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -72,17 +72,19 @@ rule fasta: log=log, ) + rule faidx: input: - "references/genome.fa" + "references/genome.fa", output: - "references/genome.fa.fai" + "references/genome.fa.fai", resources: mem_mb="4g", runtime="2h", shell: "samtools faidx {input}" + rule annotation: output: "references/annotation.gtf.gz", @@ -116,9 +118,9 @@ rule rrna_fasta: silva_release="138.1", run: # SILVA database fasta file with all species - urls=[ - f'https://www.arb-silva.de/fileadmin/silva_databases/release_{params.silva_release.replace('.', '_')}/Exports/SILVA_{params.silva_release}_LSURef_NR99_tax_silva.fasta.gz', - 
f'https://www.arb-silva.de/fileadmin/silva_databases/release_{params.silva_release.replace('.', '_')}/Exports/SILVA_{params.silva_release}_SSURef_NR99_tax_silva.fasta.gz', + urls = [ + f"https://www.arb-silva.de/fileadmin/silva_databases/release_{params.silva_release.replace('.', '_')}/Exports/SILVA_{params.silva_release}_LSURef_NR99_tax_silva.fasta.gz", + f"https://www.arb-silva.de/fileadmin/silva_databases/release_{params.silva_release.replace('.', '_')}/Exports/SILVA_{params.silva_release}_SSURef_NR99_tax_silva.fasta.gz", ] # Keep only sequences for the configured organism From f2ddbe486602a8e432c464c1299002b13abed824 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 24 Oct 2025 00:38:16 +0000 Subject: [PATCH 153/196] reconfigure tests --- .circleci/config.yml | 98 +++++++++++------------ test/test_configs/test_rnaseq_config.yaml | 27 ++----- test/test_configs/test_sra_config.yaml | 5 +- 3 files changed, 54 insertions(+), 76 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index bee8727d..80767ded 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -108,6 +108,8 @@ variables: time conda env create -n $LCDBWF_ENV --file env.yml time conda env create -n $LCDBWF_ENV_R --file env-r.yml + conda env export -n $LCDBWF_ENV > /opt/miniconda/env.yml + conda env export -n $LCDBWF_ENV_R > /opt/miniconda/env.yml fi # -------------------------------------------------------------------------- @@ -135,32 +137,31 @@ variables: tree $ORIG set +x - # Separately copy over some test-specific files + # Separately copy over some test-specific files that are not part of deploying cp $ORIG/workflows/chipseq/run_test.sh $DEPLOY/workflows/chipseq/run_test.sh cp $ORIG/workflows/rnaseq/run_test.sh $DEPLOY/workflows/rnaseq/run_test.sh cp $ORIG/workflows/rnaseq/run_downstream_test.sh $DEPLOY/workflows/rnaseq/run_downstream_test.sh mkdir $DEPLOY/ci mkdir $DEPLOY/test - cp $ORIG/test/lcdb-wf-test 
$DEPLOY/test/lcdb-wf-test - cp $ORIG/test/workflow_test_params.yaml $DEPLOY/test/workflow_test_params.yaml cp $ORIG/ci/get-data.py $DEPLOY/ci/get-data.py # the ./run_test.sh scripts run this cp $ORIG/ci/preprocessor.py $DEPLOY/ci/preprocessor.py - # download example data + # Now we can download example data cd $DEPLOY - test/lcdb-wf-test data --kind=all --verbose + ci/get-data.py # -------------------------------------------------------------------------- # Run the doctests across the included modules pytest-step: &pytest-step run: - name: Run pytest suite and testthat suite + name: Run pytest suite and R testthat suite command: | source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV + # run unit tests and doctests for the modules in lib test/lcdb-wf-test unit_tests --pytest @@ -183,8 +184,9 @@ variables: cd $DEPLOY/workflows/chipseq source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test chipseq --run-workflow --use-conda -j2 -k -p - $DEPLOY/test/lcdb-wf-test chipseq --trackhub + cd $DEPLOY/workflows/chipseq + ./run_test.sh --use-conda -j2 -k -p + python chipseq_trackhub.py config/config.yaml config/hub_config.yaml # -------------------------------------------------------------------------- # Previous versions had an error where chipseq peaks needed to be defined for @@ -194,10 +196,9 @@ variables: run: name: chipseq misc command: | - cd $DEPLOY/workflows/chipseq source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - + cd $DEPLOY/workflows/chipseq ./run_test.sh --use-conda -j2 -k -p \ --configfile $ORIG/test/test_configs/test_chipseq_regression.yaml \ --config sampletable=$ORIG/test/test_configs/chipseq_one_run.tsv \ @@ -227,15 +228,18 @@ variables: cd $DEPLOY source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow -n - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --use-conda -j2 -k -p --orig $ORIG - 
$DEPLOY/test/lcdb-wf-test rnaseq --trackhub --orig $ORIG + cd workflows/rnaseq + + ./run_test.sh -n \ + --configfile $ORIG/test/test_configs/test_rnaseq_config.yaml + + ./run_test.sh --use-conda -j2 -k -p \ + --configfile $ORIG/test/test_configs/test_rnaseq_config.yaml - # This run the preprocessor on the Rmd files and stores them - # in a new download-test directory (see the comments in the script - # for details) - $DEPLOY/test/lcdb-wf-test rnaseq --downstream + python rnaseq_trackhub.py config/config.yaml config/hub_config.yaml + + ./run_downstream_test.sh # bundle up the entire directory to be used as an artifact tar -zcf /tmp/downstream.tar.gz workflows/rnaseq/downstream-test/ @@ -256,13 +260,22 @@ variables: source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - # Check the help for test/lcdb-wf-test to see what args these - # provide; some of them use the --until argument to restrict the - # rules that are run. Note the use of --orig $ORIG to use the test - # configs from the original clone rather than the deployed directory. - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-pe -k -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-se -k -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --pe -k -p -j2 --use-conda --orig $ORIG + cd workflows/rnaseq + + # SRA test + ./run_test.sh -k -p -j2 --use-conda \ + --configfile $ORIG/test/test_configs/test_rnaseq_config.yaml \ + --config sampletable=$ORIG/test/test_configs/test_sra_sampletable.tsv + + # SRA SE only + ./run_test.sh -k -p -j2 --use-conda \ + --configfile $ORIG/test/test_configs/test_rnaseq_config.yaml \ + --config sampletable=$ORIG/test/test_configs/test_sra_sampletable_SE_only.tsv + + # PE + ./run_test.sh -k -p -j2 --use-conda \ + --configfile $ORIG/test/test_configs/test_rnaseq_config.yaml \ + --config sampletable=$ORIG/test/test_configs/test_pe_sampletable.tsv @@ -305,10 +318,13 @@ jobs: # themselves. 
- *save_cache + # These files were created during conda setup, and become part of the + # cache. So we should get them as artifacts regardless of if the conda + # setup ran this time. - store_artifacts: - path: /tmp/lcdb-wf-test/env.yaml + path: /opt/miniforge/env.yml - store_artifacts: - path: /tmp/lcdb-wf-test/env-r.yaml + path: /opt/miniforge/env-r.yml pytest: <<: *defaults resource_class: small @@ -328,7 +344,7 @@ jobs: - *get-data - *chipseq-step - store_artifacts: - path: /tmp/lcdb-wf-test/workflows/chipseq/data/chipseq_aggregation/multiqc.html + path: $DEST/workflows/chipseq/data/chipseq_aggregation/multiqc.html chipseq-misc: <<: *defaults @@ -363,7 +379,6 @@ jobs: path: /tmp/gene-patterns.html destination: gene-patterns.html - rnaseq-misc: <<: *defaults steps: @@ -373,7 +388,6 @@ jobs: - *get-data - *rnaseq-misc-step - build-docs: <<: *defaults resource_class: small @@ -402,24 +416,6 @@ jobs: - store_artifacts: path: /tmp/docs.tar.gz - report-env: - <<: *defaults - resource_class: small - steps: - - checkout - - *restore_cache - - *set-path - - run: - name: Report environment - command: | - source /opt/miniforge/etc/profile.d/conda.sh - conda env export -n lcdb-wf-test > /tmp/env.yaml - conda env export -n lcdb-wf-test-r > /tmp/env-r.yaml - - store_artifacts: - path: /tmp/env.yaml - - store_artifacts: - path: /tmp/env-r.yaml - # ---------------------------------------------------------------------------- # This section configures the dependencies across jobs. 
workflows: @@ -438,6 +434,7 @@ workflows: requires: - initial-setup - pytest + - chipseq - rnaseq: requires: - initial-setup @@ -446,12 +443,7 @@ workflows: requires: - initial-setup - pytest + - rnaseq - build-docs: requires: - initial-setup - - report-env: - requires: - - rnaseq - - rnaseq-misc - - chipseq - - chipseq-misc diff --git a/test/test_configs/test_rnaseq_config.yaml b/test/test_configs/test_rnaseq_config.yaml index 2cbd3d66..ff043f40 100644 --- a/test/test_configs/test_rnaseq_config.yaml +++ b/test/test_configs/test_rnaseq_config.yaml @@ -1,27 +1,16 @@ -fasta: - url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" - postprocess: 'lib.utils.gzipped' - -gtf: - url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/annotation/dm6.small.gtf" - postprocess: 'lib.utils.gzipped' - -rrna: - url: - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' - postprocess: - function: 'lib.utils.filter_fastas' - args: 'Drosophila melanogaster' - +organism: Drosophila melanogaster sampletable: 'config/sampletable.tsv' -patterns: 'config/rnaseq_patterns.yaml' - # See https://rnabio.org/module-09-appendix/0009/12/01/StrandSettings for more info. 
stranded: 'fr-firststrand' # for dUTP libraries # 'fr-secondstrand' # for ligation libraries # 'unstranded' # for libraries without strand specificity -aligner: 'star' +genome: + url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" + postprocess: 'lib.utils.gzipped' + +annotation: + url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/annotation/dm6.small.gtf" + postprocess: 'lib.utils.gzipped' diff --git a/test/test_configs/test_sra_config.yaml b/test/test_configs/test_sra_config.yaml index f3f92cc4..427cae90 100644 --- a/test/test_configs/test_sra_config.yaml +++ b/test/test_configs/test_sra_config.yaml @@ -1,7 +1,4 @@ -patterns: 'config/rnaseq_patterns.yaml' - -# Which key in the `references` dict below to use -organism: 'human' +organism: 'Homo sapiens' # If not specified here, use the environment variable REFERENCES_DIR. references_dir: 'references_data' From 1291a2373de6ad8ee22d72472ee76773f87c8b79 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 24 Oct 2025 00:43:07 +0000 Subject: [PATCH 154/196] rm lcdb-wf-test --- docs/decisions.rst | 23 +- test/lcdb-wf-test | 584 --------------------------------------------- 2 files changed, 15 insertions(+), 592 deletions(-) delete mode 100755 test/lcdb-wf-test diff --git a/docs/decisions.rst b/docs/decisions.rst index 71d2a20b..38e1a2f7 100644 --- a/docs/decisions.rst +++ b/docs/decisions.rst @@ -553,11 +553,18 @@ they would not reach significance) guards against this. Zipping/unzipping references ---------------------------- -STAR requires uncogffread references/annotation.gtf -w -references/transcriptome.fa -g references/genome.fampressed -FASTA and GTF files to build the index. Making uncompressed -temporary means running the risk of another rule needing -uncompressed to trigger costly STAR alignment. 
The extra -storage cost of leaving an uncompressed fasta (~3 GB) around is -minimal compared to the scale of all other data, and guards -against inadvertently re-running all alignment jobs. + +STAR requires uncompressed FASTA and GTF files to build the index. Making +uncompressed files temporary means running the risk of another rule needing +uncompressed to trigger costly STAR alignment. The extra storage cost of +leaving an uncompressed fasta (~3 GB) around is minimal compared to the scale +of all other data, and guards against inadvertently re-running all alignment +jobs. + +Test framework +-------------- + +I had previously thought that the CircleCI tests were annoying to run and +reproduce locally, so the ``tests/lcdb-wf-test`` script was born. Turns out +that got rather complicated, and ended up being just as annoying. In the spirit +of reducing complexity, that test harness script is removed. diff --git a/test/lcdb-wf-test b/test/lcdb-wf-test deleted file mode 100755 index 8e8525fb..00000000 --- a/test/lcdb-wf-test +++ /dev/null @@ -1,584 +0,0 @@ -#!/usr/bin/env python - -""" -This script aims to make it more convenient to run various tests using -different configs. - -Below are configured various tests that are exposed to the commandline as -subcommands. These in turn support other commandline args to run a specific -test under that subcommand. - -The command-line help is the authoritative source for commands. Since it is -partly autogenerated, be sure to check it out by running with -h from the -command line. - -Here is a high-level description of what's going on here, which is not in the -command-line help: - -The Runner class, at creation time, sets up a top-level ArgumentParser with -args used throughout, like which env to use, or which dir to consider as the -"original" directory (for testing cases where we've deployed somewhere but we -want to use the test configs from the originally-cloned repo). - -The Runner class also has `_cmd_` methods. 
At runtime, the -Runner's ArgumentParser inspects the Runner to see what `_cmd_*` methods it -has, and adds subcommands for each one it finds. - -It's the job of each of those methods to make an ArgumentParser, parse the -args, and do the right thing. - -Since there are a lot of RNA-seq tests, and they use different parameters (like -different config files, and restricting the run to a sub-dag), these are -configured in the workflow_test_params.yaml file and the ArgumentParser is -automatically populated with these arguments. - -You can always see the CI tests (currently in .circleci/config.yml at the -top-level of the repo) for how this tool is used. - -""" - -import os -import shlex -from textwrap import dedent -import subprocess as sp -import sys -from pathlib import Path -import argparse -import yaml - -HERE = Path(__file__).resolve().parent -TOPLEVEL = Path(__file__).resolve().parent.parent - -WORKFLOW_ARGS = yaml.safe_load(open(TOPLEVEL / "test" / "workflow_test_params.yaml")) - - -def print_header(name): - print("-" * 80) - print("lcdb-wf-test: ", name) - print("-" * 80) - - -class Runner(object): - """ - To add a new command, create a new method with a name starting with - "_cmd_", create a new ArgumentParser. - """ - - default_env = os.getenv("LCDBWF_ENV", str(TOPLEVEL / "env")) - default_env_r = os.getenv("LCDBWF_ENV_R", str(TOPLEVEL / "env-r")) - global_parser = argparse.ArgumentParser(add_help=False) - global_parser.add_argument( - "--env", default=default_env, - help=f"""Main conda environment to use. Override - by setting $LCDBWF_ENV or override that by explicity setting --env. Currently will use {default_env}""" - ) - global_parser.add_argument( - "--env-r", - default=default_env_r, - help=f"""Main R conda environment to use. Override by setting - $LCDBWF_ENV_R or override that by explicity setting --env-r. 
Currently - will use {default_env_r}""" - ) - global_parser.add_argument( - "--orig", - default=str(TOPLEVEL), - help=f"""If specified, you can use the special string '__ORIG__' in - command line arguments which will be filled in with the value provided - here. Mostly used in CI.""", - ) - - def __init__(self): - parser = argparse.ArgumentParser( - description=""" - Test runner for lcdb-wf. - - For any any tests that use Snakemake, you'll need to provide the - relevant extra arguments for Snakemake as well (-n, -j, - --use-conda, etc). These additional args are passed directly to - Snakemake. - - %(prog)s data --kind all - %(prog)s unit_tests --pytest - %(prog)s unit_tests --r-test - %(prog)s rnaseq --run-workflow - %(prog)s rnaseq --trackhub - %(prog)s rnaseq --downstream - %(prog)s chipseq --run-workflow - %(prog)s references --run-workflow --configfile=config/config.yaml - - DATA - ---- - %(prog)s data --kind all --verbose - - UNIT TESTS - ---------- - # Run the pytest unit tests on the lib/ - %(prog)s unit_tests --pytest - - # Run tests on lcdbwf R package - %(prog)s unit_tests --r-test - - # Ensure URLs in the configs exist - %(prog)s unit_tests --url-check - - # Ensure rnaseq.Rmd has matching sections in the docs - %(prog)s unit_tests --ensure-docs - - RNASEQ - ------ - # Run main workflow - %(prog)s rnaseq --run-workflow - - # Build RNA-seq trackhub from output of main workflow - %(prog)s rnaseq --trackhub - - # Run rnaseq.Rmd - %(prog)s rnaseq --downstream - - # Each of these runs a restricted subset of the workflow with - # customized configs; they should be run one at a time. - %(prog)s rnaseq --run-workflow --sra-pe - %(prog)s rnaseq --run-workflow --sra-se - %(prog)s rnaseq --run-workflow --strandedness-pe - %(prog)s rnaseq --run-workflow --strandedness-se - %(prog)s rnaseq --run-workflow --pe - - # Since there are a lot of parameters here, see - # "workflow_test_params.yaml" for how they are managed. 
- - """, - formatter_class=argparse.RawDescriptionHelpFormatter - ) - - # Introspection to build subcommands based on which `_cmd_*` methods - # are defined - choices = [i.replace("_cmd_", "") for i in dir(self) if i.startswith("_cmd_")] - - parser.add_argument("command", help="Subcommand to run", choices=choices) - - # Second arg is the subcommand; dispatch to the appropriate method - args = parser.parse_args(sys.argv[1:2]) - - if not hasattr(self, "_cmd_" + args.command): - print("Unrecognized command") - parser.print_help() - sys.exit(1) - - # Get it and then immediately call it. - subcommand = getattr(self, "_cmd_" + args.command) - subcommand() - - def _cmd_data(self): - """ - Subcommand for downloading test data - """ - - parser = argparse.ArgumentParser( - description="Download data", - parents=[self.global_parser], - ) - - parser.add_argument( - "--kind", - default="all", - choices=["all", "rnaseq", "chipseq"], - help="Kind of data to download", - ) - parser.add_argument( - "--branch", default="master", help="Branch from lcdb-test-data to use" - ) - parser.add_argument( - "--verbose", - action="store_true", - help="Be verbose about what's being downloaded", - ) - - args = parser.parse_args(sys.argv[2:]) - - repo = "lcdb-test-data" - URL = f"https://github.com/lcdb/{repo}/blob/{args.branch}/data/{{}}?raw=true" - - # This dict maps files in the `data` directory of test-data repo to - # a local path to which it should be downloaded, as expected by the - # various test configs and sampletables. Directories are made as - # needed. First one is commented as an example. 
- data_files = { - "rnaseq": [ - ( - # Path in test data repo on GitHub - "rnaseq_samples/sample1/sample1.small_R1.fastq.gz", - - # Download it to this path locally - "workflows/rnaseq/data/example_data/rnaseq_sample1.fq.gz", - ), - ( - "rnaseq_samples/sample2/sample2.small_R1.fastq.gz", - "workflows/rnaseq/data/example_data/rnaseq_sample2.fq.gz", - ), - ( - "rnaseq_samples/sample3/sample3.small_R1.fastq.gz", - "workflows/rnaseq/data/example_data/rnaseq_sample3.fq.gz", - ), - ( - "rnaseq_samples/sample4/sample4.small_R1.fastq.gz", - "workflows/rnaseq/data/example_data/rnaseq_sample4.fq.gz", - ), - ( - "rnaseq_samples/sample1/sample1.small_R1.fastq.gz", - "workflows/rnaseq/data/example_data/rnaseq_sample1PE_1.fq.gz", - ), - ( - "rnaseq_samples/sample1/sample1.small_R2.fastq.gz", - "workflows/rnaseq/data/example_data/rnaseq_sample1PE_2.fq.gz", - ), - ( - "rnaseq_samples/sample2/sample2.small_R1.fastq.gz", - "workflows/rnaseq/data/example_data/rnaseq_sample2PE_1.fq.gz", - ), - ( - "rnaseq_samples/sample2/sample2.small_R2.fastq.gz", - "workflows/rnaseq/data/example_data/rnaseq_sample2PE_2.fq.gz", - ), - ], - "chipseq": [ - ( - "chipseq_samples/input_1/input_1.tiny_R1.fastq.gz", - "workflows/chipseq/data/example_data/chipseq_input1.fq.gz", - ), - ( - "chipseq_samples/input_2/input_2.tiny_R1.fastq.gz", - "workflows/chipseq/data/example_data/chipseq_input2.fq.gz", - ), - ( - "chipseq_samples/input_3/input_3.tiny_R1.fastq.gz", - "workflows/chipseq/data/example_data/chipseq_input3.fq.gz", - ), - ( - "chipseq_samples/ip_1/ip_1.tiny_R1.fastq.gz", - "workflows/chipseq/data/example_data/chipseq_ip1.fq.gz", - ), - ( - "chipseq_samples/ip_2/ip_2.tiny_R1.fastq.gz", - "workflows/chipseq/data/example_data/chipseq_ip2.fq.gz", - ), - ( - "chipseq_samples/ip_3/ip_3.tiny_R1.fastq.gz", - "workflows/chipseq/data/example_data/chipseq_ip3.fq.gz", - ), - ( - "chipseq_samples/ip_4/ip_4.tiny_R1.fastq.gz", - "workflows/chipseq/data/example_data/chipseq_ip4.fq.gz", - ), - ], - } - - if args.kind 
== "all": - kinds = list(data_files.keys()) - else: - kinds = [args.kind] - for kind in kinds: - for fn, dest in data_files[kind]: - url = URL.format(fn) - if args.verbose: - print(f"downloading {url}") - if dest is None: - dest = fn - dest = Path(dest) - dest.parent.mkdir(parents=True, exist_ok=True) - sp.run( - f"wget -q -O- {url} > {dest}", shell=True, check=True, cwd=TOPLEVEL - ) - - def _cmd_unit_tests(self): - """ - Subcommand for unit tests -- these don't run Snakemake. - """ - parser = argparse.ArgumentParser( - description="Run various unit tests and checks", - parents=[self.global_parser], - ) - parser.add_argument( - "--pytest", - action="store_true", - help="Run pytest unit tests and module doctests on lib/ directory", - ) - parser.add_argument( - "--url-check", - action="store_true", - help="Ensure that URLs found in config files (e.g., to genome references) are still valid", - ) - parser.add_argument( - "--r-test", - action="store_true", - help="""Run devtools::test on the lcdbwf R package. 
Activates the - conda environment specified by --env-r just before running.""", - ) - - parser.add_argument( - "--ensure-docs", - action="store_true", - help="Ensure that all named R chunks are documented in the online help docs", - ) - - args = parser.parse_args(sys.argv[2:]) - - if args.pytest: - print_header("pytest") - sp.run(["pytest", "--doctest-modules", "lib"], check=True, cwd=TOPLEVEL) - - if args.url_check: - print_header("url check") - sys.path.insert(0, str(TOPLEVEL)) - from lib.utils import check_all_urls_found - - check_all_urls_found() - - if args.r_test: - print_header("R test") - p = sp.run( - 'eval "$(conda shell.bash hook)" ' - f"&& conda activate {args.env_r} " - '''&& Rscript -e "devtools::test('lib/lcdbwf', reporter=c('summary', 'fail'), export_all=TRUE)"''', - shell=True, - check=True, - executable="/bin/bash" - ) - if p.returncode: - sys.exit(1) - - if args.ensure_docs: - sp.run(["./ensure_docs.py"], check=True, cwd=TOPLEVEL / "ci") - - def _cmd_rnaseq(self): - """ - Subcommand for RNA-seq. There are many tests here, with different - config files and sampletables etc. So the possibilities are configured - over in workflow_test_params.yaml and auto-generated here. - """ - - parser = argparse.ArgumentParser( - description="Run rnaseq workflow and downstream tests", - parents=[self.global_parser], - ) - parser.add_argument( - "--run-workflow", - action="store_true", - help="""Run rnaseq workflow using the run_tesh.sh harness, which - edits the Snakefile to use test settings before running. Additional - args not specified here are passed to Snakemake, or use other flags - below to easily specify config sets.""", - ) - parser.add_argument( - "--trackhub", action="store_true", help="Build the rnaseq track hub" - ) - parser.add_argument( - "--downstream", - action="store_true", - help="""Run the downstream rnaseq.Rmd, via - workflows/rnaseq/run_downstream_test.sh. 
This runs the preprocessor - on the files to allow the use of # [TEST SETTINGS] comments; see - that script for details. Activates environment configured in - --env-r before running.""", - ) - - # Here we programmatically build the parser from the - # workflow_test_params.yaml file which configures arguments for each - # test. Here, the configured tests are added to a mutually-exclusive - # group to avoid inadvertently overwriting each others' config file - # params (in which case the test would not be the the thing you thought - # you were testing...). They all write their params to the - # args.additional_args attribute, which is passed to run_test.sh, which - # in turn passes them to Snakemake itself. - group = parser.add_mutually_exclusive_group() - workflow_prefix = "bash run_test.sh" - workflow_dir = TOPLEVEL / "workflows/rnaseq" - for key, val in WORKFLOW_ARGS["rnaseq"].items(): - group.add_argument( - "--" + key, - action="store_const", - default="", - dest="additional_args", - const=val["args"], - - # Be really explicit about what's being run, so you can run it - # yourself separately if you want (or for double-checking this - # is doing what you want it to do) - help=dedent( - f""" - {val['desc']} - - Runs the following, as configured in workflow_test_params.yaml: - - cd {workflow_dir} && {workflow_prefix} {val['args']} - """), - ) - - args, extra = parser.parse_known_args(sys.argv[2:]) - - if args.run_workflow: - print(args) - if args.additional_args: - extra.extend(shlex.split(args.additional_args)) - - extra = [i.replace("__ORIG__", args.orig) for i in extra] - strargs = " ".join(extra) - cmd = ( - 'eval "$(conda shell.bash hook)" ' - f"&& conda activate {args.env} " - f"&& (cd {workflow_dir} && {workflow_prefix} {strargs})" - ) - print_header(f"Running the following command:\n{cmd}") - sp.run( - cmd, - check=True, - shell=True, - executable="/bin/bash" - ) - if args.trackhub: - cmd = ( - 'eval "$(conda shell.bash hook)" ' - f"&& conda activate 
{args.env} " - f"&& (cd {workflow_dir} " - "&& python rnaseq_trackhub.py config/config.yaml config/hub_config.yaml)" - ) - print_header(f"Building trackhub with command: {cmd}") - - sp.run( - cmd, - shell=True, - check=True, - executable="/bin/bash" - ) - print("See workflows/rnaseq/staging for the built trackhub") - - if args.downstream: - print_header("running downstream rnaseq.Rmd") - sp.run( - 'eval "$(conda shell.bash hook)" ' - f"&& conda activate {args.env_r} " - "&& (cd workflows/rnaseq && bash run_downstream_test.sh)", - shell=True, - check=True, - executable="/bin/bash" - ) - - def _cmd_chipseq(self): - """ - This function handles the "chipseq" subcommand. - """ - - parser = argparse.ArgumentParser( - description="Run chipseq workflow", - parents=[self.global_parser], - ) - parser.add_argument( - "--run-workflow", - action="store_true", - help="""Run chipseq workflow using the run_tesh.sh harness, which - edits the Snakefile to use test settings before running. Additional - args not specified here are passed to Snakemake, or use other flags - below to easily specify config sets.""", - ) - parser.add_argument( - "--trackhub", action="store_true", help="Build the rnaseq track hub" - ) - args, extra = parser.parse_known_args(sys.argv[2:]) - workflow_prefix = "bash run_test.sh" - workflow_dir = TOPLEVEL / "workflows/chipseq" - - if args.run_workflow: - extra = [i.replace("__ORIG__", args.orig) for i in extra] - strargs = " ".join(extra) - cmd = ( - 'eval "$(conda shell.bash hook)" ' - f"&& conda activate {args.env} " - f"&& (cd {workflow_dir} && {workflow_prefix} {strargs})" - ) - print_header(f"Running the following command:\n{cmd}") - sp.run( - cmd, - shell=True, - check=True, - executable="/bin/bash" - ) - if args.trackhub: - cmd = ( - 'eval "$(conda shell.bash hook)" ' - f"&& conda activate {args.env} " - f"&& (cd {workflow_dir} " - "&& python chipseq_trackhub.py config/config.yaml config/hub_config.yaml)" - ) - print_header(f"Building trackhub with 
command: {cmd}") - - sp.run( - cmd, - shell=True, - check=True, - executable="/bin/bash" - ) - print("See workflows/chipseq/staging for the built trackhub") - - def _cmd_references(self): - parser = argparse.ArgumentParser( - description="Run references workflow", - parents=[self.global_parser], - ) - parser.add_argument( - "--run-workflow", - action="store_true", - help="""Run references workflow using the run_tesh.sh harness, which - edits the Snakefile to use test settings before running.""" - ) - args, extra = parser.parse_known_args(sys.argv[2:]) - - workflow_prefix = "bash run_test.sh" - workflow_dir = TOPLEVEL / "workflows/references" - if args.run_workflow: - extra = [i.replace("__ORIG__", args.orig) for i in extra] - strargs = " ".join(extra) - cmd = ( - 'eval "$(conda shell.bash hook)" ' - f"&& conda activate {args.env} " - f"&& (cd {workflow_dir} && {workflow_prefix} {strargs})" - ) - print_header(f"Running the following command:\n{cmd}") - sp.run( - cmd, - shell=True, - check=True, - executable="/bin/bash" - ) - - def _cmd_colocalization(self): - parser = argparse.ArgumentParser( - description="Run colocalization workflow", - parents=[self.global_parser], - ) - parser.add_argument( - "--run-workflow", - action="store_true", - help="""Run colocalization workflow using the run_test.sh harness""" - ) - args, extra = parser.parse_known_args(sys.argv[2:]) - workflow_prefix = "bash run_test.sh" - workflow_dir = TOPLEVEL / "workflows/colocalization" - if args.run_workflow: - extra = [i.replace("__ORIG__", args.orig) for i in extra] - strargs = " ".join(extra) - cmd = ( - 'eval "$(conda shell.bash hook)" ' - f"&& conda activate {args.env} " - f"&& (cd {workflow_dir} && {workflow_prefix} {strargs})" - ) - print_header(f"Running the following command:\n{cmd}") - sp.run( - cmd, - shell=True, - check=True, - executable="/bin/bash" - ) - -if __name__ == "__main__": - Runner() - -# vim: ft=python From 5372cfcd0741c57352d7c93037017ccfb0571294 Mon Sep 17 00:00:00 2001 
From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 24 Oct 2025 01:46:36 +0000 Subject: [PATCH 155/196] fix some tests --- .circleci/config.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 80767ded..996d6577 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -163,16 +163,17 @@ variables: conda activate $LCDBWF_ENV # run unit tests and doctests for the modules in lib - test/lcdb-wf-test unit_tests --pytest + pytest --doctest-modules lib # Ensure that the chunks in rnaseq.Rmd have matching documentation - test/lcdb-wf-test unit_tests --ensure-docs + (cd ci && ./ensure_docs.py) # find all URLs in reference configs and make sure they exist - test/lcdb-wf-test unit_tests --url-check + python -c "import sys; sys.path.insert(0, '$DEST'); from lib.utils import check_all_urls_found; check_all_urls_found()" # run R package unit tests using the R env - test/lcdb-wf-test unit_tests --r-test + conda activate $LCDBWF_ENV_R + Rscript -e "devtools::test('lib/lcdbwf', reporter=c('summary', 'fail'), export_all=TRUE)" # -------------------------------------------------------------------------- From 0394f850d64123e3df30f504d69e6ed6498dfdbf Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 24 Oct 2025 02:03:33 +0000 Subject: [PATCH 156/196] update decision log --- docs/decisions.rst | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/docs/decisions.rst b/docs/decisions.rst index 38e1a2f7..1ceeb339 100644 --- a/docs/decisions.rst +++ b/docs/decisions.rst @@ -567,4 +567,47 @@ Test framework I had previously thought that the CircleCI tests were annoying to run and reproduce locally, so the ``tests/lcdb-wf-test`` script was born. Turns out that got rather complicated, and ended up being just as annoying. In the spirit -of reducing complexity, that test harness script is removed. 
+of reducing complexity, that test harness script is removed. In part, the new
+reference config simplification allows control over configs from the
+commandline, reducing the need to handle that from the test script.
+
+rRNA
+----
+Assessing ribosomal RNA contamination is an important QC step. Different
+annotation sources have different ways of indicating ribosomal RNA. For example,
+Ensembl GTF files typically have "transcript_biotype" attributes on transcript
+featuretypes and "gene_biotype" attributes on gene features, depending on
+version (older versions have separate rRNA featuretypes). FlyBase uses separate
+rRNA feature types. Dictyostelium does not have anything in the GTF. PomBase
+uses the "biotype" attribute.
+
+One way of handling this is to have post-processing steps that extract the rRNA
+features from a GTF (probably defaulting to assuming an Ensembl-like
+"gene_biotype" attribute) and convert them to `IntervalList format
+`__
+to pass to Picard CollectRnaSeqMetrics.
+
+Another way is to bypass the GTF altogether and align to rRNA directly, which is
+what we have historically done here. Previously, the reference configs would all
+need an rRNA entry that basically did the same thing for each organism, since
+every model organism we've worked with is in the SILVA database. It would
+download the full SILVA fasta (for large and small subunits), grep out the
+records for our species of interest, and build a bowtie2 index out of that. That
+means this method is more general, and arguably more complete, but has its own
+complexity: we need to download and filter the fasta, build the bowtie2 index,
+and aggregate the results into a MultiQC module.
+
+In the 2.0 refactor, rRNA fasta creation now only needs an organism name and the
+Snakefile does what was always in the references config, which is to use the
+post-process mechanism to filter the fasta.
+ + + + +Aligners +-------- + +Previously, HISAT2 and STAR were both supported; salmon and kallisto were both +supported. This created additional complexity in the references workflow and in +the configs. Now, we're just using STAR and salmon (for RNA-seq) and bowtie2 for +ChIP-seq. From 0d009b06356d4448e1530af3002104524ecebaa7 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 24 Oct 2025 02:08:55 +0000 Subject: [PATCH 157/196] another test fix --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 996d6577..5dafe186 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -240,6 +240,7 @@ variables: python rnaseq_trackhub.py config/config.yaml config/hub_config.yaml + conda activate $LCDBWF_ENV_R ./run_downstream_test.sh # bundle up the entire directory to be used as an artifact From d6933363553b4a4badd5be250aed65cc541254fb Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 26 Oct 2025 21:00:55 +0000 Subject: [PATCH 158/196] fix artifacts for rnaseq --- .circleci/config.yml | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5dafe186..3da87834 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -241,14 +241,13 @@ variables: python rnaseq_trackhub.py config/config.yaml config/hub_config.yaml conda activate $LCDBWF_ENV_R + + # This creates files in `workflows/rnaseq/downstream-test`: ./run_downstream_test.sh # bundle up the entire directory to be used as an artifact - tar -zcf /tmp/downstream.tar.gz workflows/rnaseq/downstream-test/ - cp workflows/rnaseq/downstream-test/rnaseq.html /tmp/rnaseq.html - cp workflows/rnaseq/downstream-test/functional-enrichment.html /tmp/functional-enrichment.html - cp workflows/rnaseq/downstream-test/gene-patterns.html /tmp/gene-patterns.html - cp 
workflows/rnaseq/data/rnaseq_aggregation/multiqc.html /tmp/rnaseq.html + tar -zcf /tmp/downstream.tar.gz downstream-test/ + cp data/rnaseq_aggregation/multiqc.html /tmp/rnaseq.html # -------------------------------------------------------------------------- # Various tests on RNA-seq workflow that don't warrant the overhead of a new @@ -368,18 +367,9 @@ jobs: - store_artifacts: path: /tmp/downstream.tar.gz destination: downstream.tar.gz - - store_artifacts: - path: /tmp/rnaseq.html - destination: rnaseq.html - store_artifacts: path: /tmp/multiqc.html destination: multiqc.html - - store_artifacts: - path: /tmp/functional-enrichment.html - destination: functional-enrichment.html - - store_artifacts: - path: /tmp/gene-patterns.html - destination: gene-patterns.html rnaseq-misc: <<: *defaults From da705f46447da93ff70ded3b22a50a920b189691 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Mon, 27 Oct 2025 13:50:41 +0000 Subject: [PATCH 159/196] fix export path --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 3da87834..3899fa03 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -108,8 +108,8 @@ variables: time conda env create -n $LCDBWF_ENV --file env.yml time conda env create -n $LCDBWF_ENV_R --file env-r.yml - conda env export -n $LCDBWF_ENV > /opt/miniconda/env.yml - conda env export -n $LCDBWF_ENV_R > /opt/miniconda/env.yml + conda env export -n $LCDBWF_ENV > /opt/miniforge/env.yml + conda env export -n $LCDBWF_ENV_R > /opt/miniforge/env.yml fi # -------------------------------------------------------------------------- From 828dcea5a35c68399e49d9c7c89b788a283029db Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Mon, 27 Oct 2025 17:32:41 +0000 Subject: [PATCH 160/196] initial round of reference configs --- .../Danio_rerio/GRCz11.yaml | 10 ++++++++++ 
.../Drosophila_melanogaster/r6.65.yaml | 7 +++++++ .../Eschericia_coli.yaml/ASM584v2.yaml | 7 +++++++ .../Eschericia_coli.yaml/K-12_substr.yaml | 1 + .../Mus_musculus/GENCODE_M38.yaml | 12 ++++++++++++ .../Plodia_interpunctella/ilPloInte3.2.yaml | 7 +++++++ .../Rattus_norvegicus/GRCr8.yaml | 13 +++++++++++++ .../Saccharomyces_cerevisiae/R64-1-1.115.yaml | 1 + .../Saccharomyces_cerevisiae/S288C.yaml | 9 +++++++++ .../Schizosaccharomyces_pombe/ASM294v2.yaml | 7 +++++++ 10 files changed, 74 insertions(+) create mode 100644 include/reference_config_templates/Danio_rerio/GRCz11.yaml create mode 100644 include/reference_config_templates/Drosophila_melanogaster/r6.65.yaml create mode 100644 include/reference_config_templates/Eschericia_coli.yaml/ASM584v2.yaml create mode 120000 include/reference_config_templates/Eschericia_coli.yaml/K-12_substr.yaml create mode 100644 include/reference_config_templates/Mus_musculus/GENCODE_M38.yaml create mode 100644 include/reference_config_templates/Plodia_interpunctella/ilPloInte3.2.yaml create mode 100644 include/reference_config_templates/Rattus_norvegicus/GRCr8.yaml create mode 120000 include/reference_config_templates/Saccharomyces_cerevisiae/R64-1-1.115.yaml create mode 100644 include/reference_config_templates/Saccharomyces_cerevisiae/S288C.yaml create mode 100644 include/reference_config_templates/Schizosaccharomyces_pombe/ASM294v2.yaml diff --git a/include/reference_config_templates/Danio_rerio/GRCz11.yaml b/include/reference_config_templates/Danio_rerio/GRCz11.yaml new file mode 100644 index 00000000..909d27b8 --- /dev/null +++ b/include/reference_config_templates/Danio_rerio/GRCz11.yaml @@ -0,0 +1,10 @@ +organism: "Danio rerio" + +# Primary assembly (excludes haplotypes and alt regions) from Ensembl, +# soft-masked +genome: + url: "https://ftp.ensembl.org/pub/release-115/fasta/danio_rerio/dna/Danio_rerio.GRCz11.dna_sm.primary_assembly.fa.gz" + +# Ensembl provides a version with "chr" prefixes, but this one matches the fasta 
above. +annotation: + url: "https://ftp.ensembl.org/pub/release-115/gtf/danio_rerio/Danio_rerio.GRCz11.115.gtf.gz" diff --git a/include/reference_config_templates/Drosophila_melanogaster/r6.65.yaml b/include/reference_config_templates/Drosophila_melanogaster/r6.65.yaml new file mode 100644 index 00000000..92704845 --- /dev/null +++ b/include/reference_config_templates/Drosophila_melanogaster/r6.65.yaml @@ -0,0 +1,7 @@ +organism: "Drosophila melanogaster" + +genome: + url: "https://s3ftp.flybase.org/genomes/Drosophila_melanogaster/dmel_r6.65_FB2025_04/fasta/dmel-all-chromosome-r6.65.fasta.gz" + +annotation: + url: "https://s3ftp.flybase.org/genomes/Drosophila_melanogaster/dmel_r6.65_FB2025_04/gtf/dmel-all-r6.65.gtf.gz" diff --git a/include/reference_config_templates/Eschericia_coli.yaml/ASM584v2.yaml b/include/reference_config_templates/Eschericia_coli.yaml/ASM584v2.yaml new file mode 100644 index 00000000..97ee4c38 --- /dev/null +++ b/include/reference_config_templates/Eschericia_coli.yaml/ASM584v2.yaml @@ -0,0 +1,7 @@ +organism: "Escherichia coli" + +# From NCBI. 
+genome: + url: "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.fna.gz" +annotation: + url: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.gtf.gz" diff --git a/include/reference_config_templates/Eschericia_coli.yaml/K-12_substr.yaml b/include/reference_config_templates/Eschericia_coli.yaml/K-12_substr.yaml new file mode 120000 index 00000000..e7f0926c --- /dev/null +++ b/include/reference_config_templates/Eschericia_coli.yaml/K-12_substr.yaml @@ -0,0 +1 @@ +ASM584v2.yaml \ No newline at end of file diff --git a/include/reference_config_templates/Mus_musculus/GENCODE_M38.yaml b/include/reference_config_templates/Mus_musculus/GENCODE_M38.yaml new file mode 100644 index 00000000..47d4fbb0 --- /dev/null +++ b/include/reference_config_templates/Mus_musculus/GENCODE_M38.yaml @@ -0,0 +1,12 @@ +# This is the latest release for GRCm39 (mm39). + +organism: "Mus musculus" + +genome: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M38/GRCm39.primary_assembly.genome.fa.gz" + +# Although there is a separate lncRNA annotation that does not specify that it +# is a subset, it does appear to be a subset because those features are in +# this primary assembly annotation. 
+annotation: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M38/gencode.vM38.primary_assembly.annotation.gtf.gz" diff --git a/include/reference_config_templates/Plodia_interpunctella/ilPloInte3.2.yaml b/include/reference_config_templates/Plodia_interpunctella/ilPloInte3.2.yaml new file mode 100644 index 00000000..c23f990c --- /dev/null +++ b/include/reference_config_templates/Plodia_interpunctella/ilPloInte3.2.yaml @@ -0,0 +1,7 @@ +organism: "Plodia interpunctella" + +genome: + url: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/027/563/975/GCF_027563975.2_ilPloInte3.2/GCF_027563975.2_ilPloInte3.2_genomic.fna.gz" + +annotation: + url: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/027/563/975/GCF_027563975.2_ilPloInte3.2/GCF_027563975.2_ilPloInte3.2_genomic.gtf.gz" diff --git a/include/reference_config_templates/Rattus_norvegicus/GRCr8.yaml b/include/reference_config_templates/Rattus_norvegicus/GRCr8.yaml new file mode 100644 index 00000000..b913d412 --- /dev/null +++ b/include/reference_config_templates/Rattus_norvegicus/GRCr8.yaml @@ -0,0 +1,13 @@ +organism: "Rattus norvegicus" + +# Although there are individual chromosome fastas with "primary" in the +# filename, there is no corresponding genome-wide fasta file with "primary" in +# the filename. However, the toplevel fasta here has "dna_sm:primary_assembly" +# in all of its record descriptions, so it does not appear to have haplotypes +# or alt regions. 
+genome: + url: "https://ftp.ensembl.org/pub/release-115/fasta/rattus_norvegicus/dna/Rattus_norvegicus.GRCr8.dna_sm.toplevel.fa.gz" + +annotation: + url: "https://ftp.ensembl.org/pub/release-115/gtf/rattus_norvegicus/Rattus_norvegicus.GRCr8.115.gtf.gz" + diff --git a/include/reference_config_templates/Saccharomyces_cerevisiae/R64-1-1.115.yaml b/include/reference_config_templates/Saccharomyces_cerevisiae/R64-1-1.115.yaml new file mode 120000 index 00000000..04eff7f4 --- /dev/null +++ b/include/reference_config_templates/Saccharomyces_cerevisiae/R64-1-1.115.yaml @@ -0,0 +1 @@ +S288C.yaml \ No newline at end of file diff --git a/include/reference_config_templates/Saccharomyces_cerevisiae/S288C.yaml b/include/reference_config_templates/Saccharomyces_cerevisiae/S288C.yaml new file mode 100644 index 00000000..4e0204d0 --- /dev/null +++ b/include/reference_config_templates/Saccharomyces_cerevisiae/S288C.yaml @@ -0,0 +1,9 @@ +# https://www.yeastgenome.org/strain/s288c + +# From Ensembl. According to README in this FTP dir, if there's no primary +# assembly then the toplevel is assumed to be the primary assembly. 
+genome: + url: "https://ftp.ensembl.org/pub/release-115/fasta/saccharomyces_cerevisiae/dna/Saccharomyces_cerevisiae.R64-1-1.dna_sm.toplevel.fa.gz" + +annotation: + url: "https://ftp.ensembl.org/pub/release-115/gtf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae.R64-1-1.115.gtf.gz" diff --git a/include/reference_config_templates/Schizosaccharomyces_pombe/ASM294v2.yaml b/include/reference_config_templates/Schizosaccharomyces_pombe/ASM294v2.yaml new file mode 100644 index 00000000..ff0d37c8 --- /dev/null +++ b/include/reference_config_templates/Schizosaccharomyces_pombe/ASM294v2.yaml @@ -0,0 +1,7 @@ +organism: "Schizosaccharomyces pombe" +genome: + url: "http://ftp.ensemblgenomes.org/pub/fungi/release-62/fasta/schizosaccharomyces_pombe/dna/Schizosaccharomyces_pombe.ASM294v2.dna_sm.toplevel.fa.gz" + +annotation: + url: "https://ftp.ensemblgenomes.ebi.ac.uk/pub/fungi/release-62/gff3/schizosaccharomyces_pombe/Schizosaccharomyces_pombe.ASM294v2.62.gff3.gz" + postprocess: 'lib.postprocess.gff2gtf' From db65bb7e19a60a9af9873dc70ef7ee268d0fb8db Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Mon, 27 Oct 2025 17:32:50 +0000 Subject: [PATCH 161/196] ci/get-data.py: pep8, run from any dir, verbose mode --- ci/get-data.py | 117 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 96 insertions(+), 21 deletions(-) diff --git a/ci/get-data.py b/ci/get-data.py index cd2d356b..984ed9de 100755 --- a/ci/get-data.py +++ b/ci/get-data.py @@ -1,37 +1,112 @@ #!/usr/bin/env python +import argparse import os + from snakemake.shell import shell from snakemake.utils import makedirs -shell.executable('/bin/bash') -BRANCH = 'master' -URL = 'https://github.com/lcdb/lcdb-test-data/blob/{0}/data/{{}}?raw=true'.format(BRANCH) +BRANCH = "master" +URL = "https://github.com/lcdb/lcdb-test-data/blob/{0}/data/{{}}?raw=true".format( + BRANCH +) + +TOPLEVEL = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -def _download_file(fn, 
dest=None): +def _download_file(fn, dest=None, verbose=False): url = URL.format(fn) if dest is None: dest = fn + dest = os.path.join(TOPLEVEL, dest) makedirs(os.path.dirname(dest)) - basename = os.path.basename(fn) - shell('wget -q -O- {url} > {dest}') + if not verbose: + q = "-q" + else: + q = "" + shell(f"wget {q} -O- {url} > {dest}") + if verbose: + print(f"Saved {dest}") return dest -_download_file('rnaseq_samples/sample1/sample1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample1.fq.gz') -_download_file('rnaseq_samples/sample2/sample2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample2.fq.gz') -_download_file('rnaseq_samples/sample3/sample3.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample3.fq.gz') -_download_file('rnaseq_samples/sample4/sample4.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample4.fq.gz') +ap = argparse.ArgumentParser() +ap.add_argument("-v", "--verbose", action="store_true", help="Be verbose when downloading") +args = ap.parse_args() + +_download_file( + "rnaseq_samples/sample1/sample1.small_R1.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample1.fq.gz", + args.verbose, +) +_download_file( + "rnaseq_samples/sample2/sample2.small_R1.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample2.fq.gz", + args.verbose, +) +_download_file( + "rnaseq_samples/sample3/sample3.small_R1.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample3.fq.gz", + args.verbose, +) +_download_file( + "rnaseq_samples/sample4/sample4.small_R1.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample4.fq.gz", + args.verbose, +) -_download_file('rnaseq_samples/sample1/sample1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample1PE_1.fq.gz') -_download_file('rnaseq_samples/sample1/sample1.small_R2.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample1PE_2.fq.gz') -_download_file('rnaseq_samples/sample2/sample2.small_R1.fastq.gz', 
'workflows/rnaseq/data/example_data/rnaseq_sample2PE_1.fq.gz') -_download_file('rnaseq_samples/sample2/sample2.small_R2.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample2PE_2.fq.gz') +_download_file( + "rnaseq_samples/sample1/sample1.small_R1.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample1PE_1.fq.gz", + args.verbose, +) +_download_file( + "rnaseq_samples/sample1/sample1.small_R2.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample1PE_2.fq.gz", + args.verbose, +) +_download_file( + "rnaseq_samples/sample2/sample2.small_R1.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample2PE_1.fq.gz", + args.verbose, +) +_download_file( + "rnaseq_samples/sample2/sample2.small_R2.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample2PE_2.fq.gz", + args.verbose, +) -_download_file('chipseq_samples/input_1/input_1.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_input1.fq.gz') -_download_file('chipseq_samples/ip_1/ip_1.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_ip1.fq.gz') -_download_file('chipseq_samples/input_2/input_2.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_input2.fq.gz') -_download_file('chipseq_samples/ip_2/ip_2.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_ip2.fq.gz') -_download_file('chipseq_samples/ip_3/ip_3.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_ip3.fq.gz') -_download_file('chipseq_samples/ip_4/ip_4.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_ip4.fq.gz') -_download_file('chipseq_samples/input_3/input_3.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_input3.fq.gz') +_download_file( + "chipseq_samples/input_1/input_1.tiny_R1.fastq.gz", + "workflows/chipseq/data/example_data/chipseq_input1.fq.gz", + args.verbose, +) +_download_file( + "chipseq_samples/ip_1/ip_1.tiny_R1.fastq.gz", + "workflows/chipseq/data/example_data/chipseq_ip1.fq.gz", + args.verbose, +) +_download_file( + 
"chipseq_samples/input_2/input_2.tiny_R1.fastq.gz", + "workflows/chipseq/data/example_data/chipseq_input2.fq.gz", + args.verbose, +) +_download_file( + "chipseq_samples/ip_2/ip_2.tiny_R1.fastq.gz", + "workflows/chipseq/data/example_data/chipseq_ip2.fq.gz", + args.verbose, +) +_download_file( + "chipseq_samples/ip_3/ip_3.tiny_R1.fastq.gz", + "workflows/chipseq/data/example_data/chipseq_ip3.fq.gz", + args.verbose, +) +_download_file( + "chipseq_samples/ip_4/ip_4.tiny_R1.fastq.gz", + "workflows/chipseq/data/example_data/chipseq_ip4.fq.gz", + args.verbose, +) +_download_file( + "chipseq_samples/input_3/input_3.tiny_R1.fastq.gz", + "workflows/chipseq/data/example_data/chipseq_input3.fq.gz", + args.verbose, +) From bc787f2e0f3abbcc0b457e8a55bf929421a37acb Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Mon, 27 Oct 2025 17:33:18 +0000 Subject: [PATCH 162/196] .fai as input for transcriptome fasta --- workflows/rnaseq/Snakefile | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 6c377137..d62e2ec9 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -208,6 +208,7 @@ rule transcriptome_fasta: input: fasta="references/genome.fa", gtf="references/annotation.gtf", + fai="references/genome.fa.fai", output: "references/transcriptome.fa", log: From 1d1683ef864e23c57d03fd0724ce06270a68063e Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Mon, 27 Oct 2025 17:33:34 +0000 Subject: [PATCH 163/196] unzipped references are marked temp() --- workflows/rnaseq/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index d62e2ec9..f9e33b1c 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -139,7 +139,7 @@ rule unzip: input: "references/{prefix}.gz", output: - "references/{prefix}", + temporary("references/{prefix}"), resources: 
mem="4g", runtime="2h", From d73109e5fa94b5f8357b6e4fd2ec8a0688c3f971 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Mon, 27 Oct 2025 17:33:53 +0000 Subject: [PATCH 164/196] updates to decision log --- docs/decisions.rst | 167 +++++++++++++++++++++++++++------------------ 1 file changed, 99 insertions(+), 68 deletions(-) diff --git a/docs/decisions.rst b/docs/decisions.rst index 1ceeb339..4de9189f 100644 --- a/docs/decisions.rst +++ b/docs/decisions.rst @@ -9,18 +9,12 @@ Here are use-cases we have that are common enough to warrant supporting: **References should support multiple workflows (ChIP-seq, RNA-seq, etc)** -- This implies that the means the references dir should be in the ``workflows`` - directory or above. -- For example, this may mean a STAR index for RNA-seq, a bowtie2 index for rRNA - contamination, and another bowtie2 index for ChIP-seq. - **References should support different organisms in different workflows. There should be only one organism per workflow though.** - For example, ``workflows/mouse-rnaseq`` and ``workflows/human-rnaseq`` should be supported in the same project. - **References should be re-created for each project.** - Historically we had a central location for the references (shared by multiple @@ -32,13 +26,13 @@ should be only one organism per workflow though.** back what commands were run to generate the reference, including additional patching that may have taken place (as is supported by the references workflow). -- Re-using indexes is space- and time-efficient in the short term, but has - shown to be inefficient in time and reproducibility in the long term. +- Re-using indexes is space- and time-efficient in the short term, but experience has + shown it to be inefficient in time and reproducibility in the long term. - Keeping everything in the same deployment directory also helps with the archiving process. 
- We were hesitant to update the references in the central location due to being unsure of what was depending on them. -- Overall, making the decision that the time and space cost to re-make +- Overall, here we make the decision that the time and space cost to re-make references for each project is worth the gain in simplicity and isolation. Reference nomenclature and directory structure @@ -49,24 +43,28 @@ Options considered: 1. ``references`` (top-level of project, shared by all workflows) 2. ``workflows//references`` (workflow-specific) -The location ``workflows/references`` is functionally similar to top-level -``references`` (in a parent directory of individual workflows) but references -is no longer a workflow so it doesn't make sense to have it right in the -``workflows`` directory. +The possible location ``workflows/references`` is functionally similar to +top-level ``references`` (in a parent directory of individual workflows) but +references is no longer a workflow so it doesn't make sense to have it right in +the ``workflows`` directory. So this was excluded as an option. Recall that in lcdb-wf <2.0, we have organism and then tag. For example, we might have configurations available for different human genome assemblies (hg19, hg38) and in the central location we needed to differentiate between -them (e.g. ``references/human/hg19/``). +them (e.g. ``references/human/hg19/``), which we did with tags. -If we assume a single organism per workflow, and that the references are -workflow-specific, then we don't need any of this. +If we assume a single organism per workflow, which seems reasonable and that +the references are workflow-specific, then we don't need any of this. ``workflows//references/genome.fa`` for example should cover it. This becomes inefficient in the case where there are multiple workflows, all -for the same organism and all the same workflow type. 
However in such cases, -manually creating symlinks can get around this, and I think it's an acceptable -workaround for the benefit of simplified references more generally. +for the same organism and all the same workflow type. For example, a project +with chipseq and a two different RNA-seq experiments would have three copies of +the genome fasta. However in such cases, manually creating symlinks can get +around this if space is a problem, and I think it's an acceptable workaround +for the benefit of simplified references more generally. + +So we might have something like the following: :: @@ -117,8 +115,8 @@ can be quite close to the equivalent command-line call. Since rules in these Snakefiles are intended to be edited, it makes sense to keep them as close to the command-line as is reasonable. -Take the cutadapt rule, for example, where we typically would want to include -the adapters in the call, but it's not uncommon to add other arguments. Here +Take the cutadapt rule for example, where we typically would want to include +the adapters but it's not uncommon to add other arguments. Here we're working with a simplified, single-end version of it: .. code-block:: python @@ -128,8 +126,7 @@ we're working with a simplified, single-end version of it: fastq='{sample}.fastq.gz" output: fastq='{sample}.cutadapt.fastq.gz' - threads: - 8 + threads: 8 shell: "cutadapt " "-o {output[0]} " @@ -248,8 +245,9 @@ cutadapt depend on that. Here's the actual rule: "&> {log}" ) -Notice that we have some shared arguments as well as a PE-specific adapter -argument. Converting this one to params would be something like the following: +Notice that we have some shared arguments (``--nextseq-trim``, ``--overlap``, +``--minimum-length``) as well as a PE-specific adapter argument. Converting +this one to params would be something like the following: .. code-block:: python @@ -350,7 +348,8 @@ specific is handled there? 
Now it becomes a little harder to understand what's going on, and we may have gone too far in pulling everything out into params. So maybe an absolute -principle of "everything in params" is not useful. +principle of "everything must go in params" is not useful because it impacts +clarity. Let's take another example, the featureCounts rule for RNA-seq: @@ -393,13 +392,15 @@ Let's take another example, the featureCounts rule for RNA-seq: ) Here, it is important to have ``strand_arg`` be in the params. To understand -why, imagine if instead we determined that argument inside the ``run:`` block, -and then we changed the config file's stranded entry (``config["stranded"]``). -Then this rule would NOT re-run because the code didn't change -- Snakemake -does not *evaluate* the code in a ``run:`` block to determine if it changed. -However, it *does* evaluate the params. So in this case, it's necessary to keep -the strand argument detection in the params to take advantage of this behavior, -and correctly re-run the rule if the config's strand argument has changed. +why, imagine if we determined that argument inside the ``run:`` block instead +of in params, and then we changed the config file's stranded entry +(``config["stranded"]``). Even though we would want it to re-run (since the +config changed), this rule would NOT re-run because the *code* didn't change -- +Snakemake does not *evaluate* the code in a ``run:`` block to determine if it +changed. However, it *does* evaluate the params. So in this case, it's +necessary to keep the strand argument detection in the params to take advantage +of this behavior, and correctly re-run the rule if the config's strand argument +has changed. Next, we would want to decide whether *all* arguments should go in ``params:``. 
In this case, since we're sort of forced to split out ``strand_arg``, we might @@ -419,29 +420,34 @@ Guidelines: - Stranded arguments must be in params - SE/PE arguments should be handled inside a ``run:`` block - Any other arguments should be written in a ``shell:`` block or a ``shell()`` - call directly, to visually match the equivalent command-line call + call directly, to visually match the equivalent command-line call and to make + it clear what should be edited. Arguments for and against a separate references workflow -------------------------------------------------------- RNA-seq, ChIP-seq, and the upcoming variant calling all need to do something -with references, including possibly patching them. So we have to deal with this -inherent complexity. It initially made sense to put such common rules in the +with references, including possibly patching them. We have to deal with this +inherent complexity. It initially made sense to put common rules in the separate references workflow. However, only a subset of the rules in the references workflow are actually shared across RNA-seq and ChIP-seq -- currently, only the bowtie2 index (genome-wide ChIP-seq alignment; rRNA screening for RNA-seq), the fasta rule, -chromsizes, and the generic unzip rule. The others (gtf, mappings, -conversion_bed12, conversion_refflat, kallisto_index, salmon_index, -transcriptome_fasta, star_index, rrna) are all unique to RNA-seq. So the -current references workflow is actually mostly an RNA-seq-only references -workflow. +chromsizes, and the generic unzip rule. The other rules in the `__ on the subject is a useful guideline. To summarize, we want to exclude alt contigs / haplotypes because they may create multimapping issues, and we want to -include unassembled contigs because excluding them will artificially decrease +include unassembled contigs because excluding them would artificially decrease alignment percentage. 
Since lcdb-wf is intended to be used with arbitrary organisms, the PAR and mitochondrial sequences mentioned there are not relevant in general. -Ideally, we would have a tool that, given the URLs for raw fastq and gtf, - -1. Displays the set of chromosomes -2. Infers if there are any that look like rDNA or mtDNA -3. Ensures the GTF matches the fasta match chromosomes -4. Accepts a template config to assess to process - Annotations ----------- @@ -549,17 +551,30 @@ Erring on the side of too many annotations (i.e., using the comprehensive annotation instead of a curated version) will result in more features, which at face value might make the FDR adjustment more harsh in DESeq2. But DESeq2's independent filtering (not even testing those features with so few reads that -they would not reach significance) guards against this. +they would not reach significance) guards against this. So we stick with the +comprehensive annotations when available. Zipping/unzipping references ---------------------------- -STAR requires uncompressed FASTA and GTF files to build the index. Making -uncompressed files temporary means running the risk of another rule needing -uncompressed to trigger costly STAR alignment. The extra storage cost of -leaving an uncompressed fasta (~3 GB) around is minimal compared to the scale -of all other data, and guards against inadvertently re-running all alignment -jobs. +Some tools need uncompressed files, others are fine with compressed. For example, +STAR requires uncompressed FASTA and GTF files to build the index, but bowtie2 +can use a compressed fasta. gffread nees uncompressed FASTA and GTF to make +a transcriptome fasta. + +Previously, anything using a FASTA or GTF would use the uncompressed version, +and the ``unzip`` rule marked the uncompressed output as temporary. The problem +with this was when we wanted to make a change in featureCounts. 
Since this used +the temp uncompressed GTF file, the ``unzip`` rule needed to run again...but +that would then trigger the STAR rule to rerun, because it too used that temp +file and it was being changed (well, re-created but that's the same to +Snakemake). As a result, we had to spend the time/resource cost to realign +*everything* and all the downstream jobs after alignment, just to run +featureCounts. + +Making the featureCounts rule use the compressed GTF avoids this issue. Now, +just the transcriptome fasta and the STAR index need the uncompressed +references, and these are set in the ``unzip`` rule to be temporary. Test framework -------------- @@ -603,7 +618,6 @@ post-process mechanism to filter the fasta. - Aligners -------- @@ -611,3 +625,20 @@ Previously, HISAT2 and STAR were both supported; salmon and kallisto were both supported. This created additional complexity in the references workflow and in the configs. Now, we're just using STAR and salmon (for RNA-seq) and bowtie2 for ChIP-seq. + +Aligners don't seem to make that much of a difference, and officially +supporting just one (plus a psueodaligner for RNA-seq) makes the workflows and +config simpler. + +Reference genome and annotation sources +--------------------------------------- + +lcdb-wf has always been organism-agnostic. It would be nice to have a single +source of all genomics data such that we could pass an organism name and get +back the referencs. But even Ensembl and NCBI are not uniform in their support. +Sometimes primary assemblies are available; sometimes primary chromosome fastas +are available but the top-level is actually primary (rat, Ensembl); A GTF might +not be available (pombe, Ensembl); or only a toplevel assembly is available and +we need to remove the haplotypes and alt loci out (hg19, Ensembl). 
+ + From 919c45874901889328c935abb9bcfd83da7079b9 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 28 Oct 2025 18:29:13 +0000 Subject: [PATCH 165/196] gzip transcriptome fasta and mapping tsv --- workflows/rnaseq/Snakefile | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index f9e33b1c..f5a6f801 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -31,7 +31,7 @@ localrules: rule all: input: "data/rnaseq_aggregation/multiqc.html", - "references/annotation.mapping.tsv", + "references/annotation.mapping.tsv.gz", rule symlinks: @@ -210,19 +210,21 @@ rule transcriptome_fasta: gtf="references/annotation.gtf", fai="references/genome.fa.fai", output: - "references/transcriptome.fa", + fa=temporary("references/transcriptome.fa"), + gz="references/transcriptome.fa.gz", log: "references/transcriptome.log", resources: mem="4g", runtime="2h", shell: - "gffread {input.gtf} -w {output} -g {input.fasta} &> {log}" + "gffread {input.gtf} -w {output.fa} -g {input.fasta} &> {log} " + "&& gzip -c {output.fa} > {output.gz} " rule salmon_index: input: - "references/transcriptome.fa", + "references/transcriptome.fa.gz", output: "references/salmon/versionInfo.json", log: @@ -297,21 +299,20 @@ rule mappings: input: gtf="references/annotation.gtf.gz", output: - tsv="references/annotation.mapping.tsv", + "references/annotation.mapping.tsv.gz", resources: - mem="2g", + mem="24g", runtime="2h", run: + tsv = output[0].replace(".gz", "") mappings_args = dict( exclude_featuretypes=None, include_featuretypes=None, include_attributes=None, ) - print(config["annotation"].get("mappings", {})) - mappings_args.update(config["annotation"].get("mappings", {})) - - utils.mappings_tsv(input.gtf, output.tsv, **mappings_args) + utils.mappings_tsv(input.gtf, tsv, **mappings_args) + shell("gzip {tsv}") rule symlink_targets: From 
a3b6b33260256fd00eece56f340f33cddba3b805 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 28 Oct 2025 18:30:24 +0000 Subject: [PATCH 166/196] pep8 --- lib/utils.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/lib/utils.py b/lib/utils.py index a0f90a7f..97fed6cb 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -1,31 +1,33 @@ import binascii import collections import contextlib +import csv import gzip import os import re -import sys import subprocess +import sys import warnings from collections.abc import Iterable from itertools import product +import gffutils import pandas import pandas as pd import yaml from Bio import SeqIO from snakemake.io import expand, regex_from_filepattern from snakemake.shell import shell -import gffutils -import csv # Small helper functions + def render_r1_r2(pattern): - return expand(pattern, sample='{sample}', n=c.n) + return expand(pattern, sample="{sample}", n=c.n) + def render_r1_only(pattern): - return expand(pattern, sample='{sample}', n=1) + return expand(pattern, sample="{sample}", n=1) def resolve_name(name): @@ -744,6 +746,7 @@ def filter_rrna_fastas(tmpfiles, outfile, pattern): """ if pattern is None: raise ValueError("Pattern cannot be None") + def gen(): for tmp in tmpfiles: handle = gzip.open(tmp, "rt") @@ -866,7 +869,6 @@ def func(infiles, outfile, *args, **kwargs): """ - if not isinstance(postprocess, list): postprocess = [postprocess] @@ -1187,10 +1189,13 @@ def gff2gtf(gff, gtf): def wrapper_for(path): - return 'file:' + os.path.join('../..','wrappers', 'wrappers', path) + return "file:" + os.path.join("../..", "wrappers", "wrappers", path) + def detect_sra(sampletable): - return 'Run' in sampletable.columns and any(sampletable['Run'].str.startswith('SRR')) + return "Run" in sampletable.columns and any( + sampletable["Run"].str.startswith("SRR") + ) def mappings_tsv(gtf, tsv, exclude_featuretypes=None, 
include_featuretypes=None, include_attributes=None): @@ -1218,7 +1223,7 @@ def mappings_tsv(gtf, tsv, exclude_featuretypes=None, include_featuretypes=None, raise ValueError("Both include_featuretypes and exclude_featuretypes were specified.") res = [] - keys = set(['__featuretype__']) + keys = set(["__featuretype__"]) seen = set() for f in gffutils.DataIterator(gtf): ft = f.featuretype @@ -1245,8 +1250,10 @@ def unlist_dict(d): sorted_keys = sorted(include_attributes) else: sorted_keys = sorted(keys) - with open(tsv, 'w') as fout: - writer = csv.DictWriter(fout, fieldnames=sorted_keys, restval="", delimiter='\t') + with open(tsv, "w") as fout: + writer = csv.DictWriter( + fout, fieldnames=sorted_keys, restval="", delimiter="\t" + ) writer.writeheader() for row in res: writer.writerow(unlist_dict(row)) From 1c698b242bce47d9ed5146b0db9035ac5f84f5ce Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 28 Oct 2025 18:30:38 +0000 Subject: [PATCH 167/196] include/exclude attributes in mappings --- lib/utils.py | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/lib/utils.py b/lib/utils.py index 97fed6cb..1363cdf2 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -1198,7 +1198,14 @@ def detect_sra(sampletable): ) -def mappings_tsv(gtf, tsv, exclude_featuretypes=None, include_featuretypes=None, include_attributes=None): +def mappings_tsv( + gtf, + tsv, + exclude_featuretypes=None, + include_featuretypes=None, + include_attributes=None, + exclude_attributes=None, +): """ Create a TSV file of attributes found in a GTF file. @@ -1213,14 +1220,20 @@ def mappings_tsv(gtf, tsv, exclude_featuretypes=None, include_featuretypes=None, E.g., we likely don't need entries for start_codon if those are in the GTF. - include_attributes : list - Restrict the attributes reported in the TSV. Should at least have + include_attributes, exclude_attributes : list + Mutually exclusive. 
Restrict the attributes reported in the TSV. Should at least have a column for gene ID and transcript ID in order for downstream RNA-seq work. """ if exclude_featuretypes and include_featuretypes: - raise ValueError("Both include_featuretypes and exclude_featuretypes were specified.") + raise ValueError( + "Both include_featuretypes and exclude_featuretypes were specified." + ) + if exclude_attributes and include_attributes: + raise ValueError( + "Both include_attributes and exclude_attributes were specified." + ) res = [] keys = set(["__featuretype__"]) @@ -1231,13 +1244,23 @@ continue if include_featuretypes and ft not in include_featuretypes: continue + d = dict(f.attributes) + + if include_attributes: + d = {k: v for k, v in d.items() if k in include_attributes} + if exclude_attributes: + d = {k: v for k, v in d.items() if k not in exclude_attributes} + keys.update(d.keys()) d["__featuretype__"] = ft + + # Exclude duplicates (rather than sorting and uniq-ing the file later) h = hash(str(d)) if h in seen: continue seen.update([h]) + res.append(d) def unlist_dict(d): From 6b788ed78753fe1620ee2ffcc5f0d2c65efa8e52 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 31 Oct 2025 15:55:38 +0000 Subject: [PATCH 168/196] fix default postprocess --- lib/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/utils.py b/lib/utils.py index 1363cdf2..03dbe62f 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -895,10 +895,13 @@ def func(infiles, outfile, *args, **kwargs): # # ] # + def _default(origfn, newfn): + shell("mv {origfn} {newfn}") + for i, postprocess_i in enumerate(postprocess): if postprocess_i is None: - func = default_postprocess + func = _default args = () kwargs = {} name = None From 2b7084f5d60ea63061e8877c1ae299a976460157 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> 
Date: Fri, 31 Oct 2025 15:57:17 +0000 Subject: [PATCH 169/196] use configurable references dir --- workflows/chipseq/Snakefile | 21 ++++----- workflows/rnaseq/Snakefile | 87 +++++++++++++++++++------------------ 2 files changed, 55 insertions(+), 53 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 4f347eb7..0c9688a0 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -17,6 +17,7 @@ is_paired = utils.detect_layout(sampletable) == "PE" n = ["1", "2"] if is_paired else ["1"] SAMPLES = sampletable.iloc[:, 0].values LABELS = sampletable.label.values +REFERENCES = config.get("references", "references") peaks = chipseq.add_bams_to_peak_calling(config) @@ -43,9 +44,9 @@ rule all: rule fasta: output: - "references/genome.fa.gz", + f"{REFERENCES}/genome.fa.gz", log: - "references/logs/genome.fa.gz.log", + f"{REFERENCES}/logs/genome.fa.gz.log", resources: mem_mb="4g", runtime="2h", @@ -63,11 +64,11 @@ rule fasta: rule chromsizes: input: - "references/genome.fa.gz", + f"{REFERENCES}/genome.fa.gz", output: - "references/genome.chromsizes", + f"{REFERENCES}/genome.chromsizes", log: - "references/logs/genome.chromsizes.log", + f"{REFERENCES}/logs/genome.chromsizes.log", params: # java_args='-Xmx2g' # [enable for test] java_args="-Xmx20g", # [disable for test] @@ -89,12 +90,12 @@ rule chromsizes: rule bowtie2_index: input: - "references/genome.fa.gz", + f"{REFERENCES}/genome.fa.gz", output: - "references/bowtie2/genome.1.bt2", - "references/bowtie2/genome.fa", + f"{REFERENCES}/bowtie2/genome.1.bt2", + f"{REFERENCES}/bowtie2/genome.fa", log: - "references/logs/bowtie2_genome.log", + f"{REFERENCES}/logs/bowtie2_genome.log", resources: mem="32g", disk="50g", @@ -227,7 +228,7 @@ rule bowtie2: n=n, allow_missing=True, ), - index="references/bowtie2/genome.1.bt2", + index=f"{REFERENCES}/bowtie2/genome.1.bt2", output: bam=temporary("data/chipseq_samples/{sample}/{sample}.cutadapt.bam"), log: diff --git 
a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index f5a6f801..9327b0ac 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -15,6 +15,7 @@ sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" n = ["1", "2"] if is_paired else ["1"] SAMPLES = sampletable.index +REFERENCES = config.get("references", "references") sample_dir = "data/rnaseq_samples" @@ -31,7 +32,7 @@ localrules: rule all: input: "data/rnaseq_aggregation/multiqc.html", - "references/annotation.mapping.tsv.gz", + f"{REFERENCES}/annotation.mapping.tsv.gz", rule symlinks: @@ -55,9 +56,9 @@ rule symlinks: rule fasta: output: - "references/genome.fa.gz", + f"{REFERENCES}/genome.fa.gz", log: - "references/logs/genome.fa.gz.log", + f"{REFERENCES}/logs/genome.fa.gz.log", resources: mem_mb="4g", runtime="2h", @@ -75,9 +76,9 @@ rule fasta: rule faidx: input: - "references/genome.fa", + f"{REFERENCES}/genome.fa", output: - "references/genome.fa.fai", + f"{REFERENCES}/genome.fa.fai", resources: mem_mb="4g", runtime="2h", @@ -87,9 +88,9 @@ rule faidx: rule annotation: output: - "references/annotation.gtf.gz", + f"{REFERENCES}/annotation.gtf.gz", log: - "references/logs/annotation.gtf.gz.log", + f"{REFERENCES}/logs/annotation.gtf.gz.log", resources: mem="4g", runtime="2h", @@ -107,9 +108,9 @@ rule annotation: rule rrna_fasta: output: - "references/rrna.fa.gz", + f"{REFERENCES}/rrna.fa.gz", log: - "references/logs/rrna.fa.log", + f"{REFERENCES}/logs/rrna.fa.log", resources: mem="4g", runtime="2h", @@ -137,9 +138,9 @@ rule rrna_fasta: rule unzip: input: - "references/{prefix}.gz", + f"{REFERENCES}/{{prefix}}.gz", output: - temporary("references/{prefix}"), + temporary(f"{REFERENCES}/{{prefix}}"), resources: mem="4g", runtime="2h", @@ -149,12 +150,12 @@ rule unzip: rule rrna_index: input: - "references/rrna.fa.gz", + f"{REFERENCES}/rrna.fa.gz", output: - "references/bowtie2/rrna.1.bt2", - 
"references/bowtie2/rrna.fa.gz", + f"{REFERENCES}/bowtie2/rrna.1.bt2", + f"{REFERENCES}/bowtie2/rrna.fa.gz", log: - "references/logs/bowtie2_rrna.log", + f"{REFERENCES}/logs/bowtie2_rrna.log", resources: mem="32g", disk="50g", @@ -168,12 +169,12 @@ rule rrna_index: rule star_index: input: - fasta="references/genome.fa", - gtf="references/annotation.gtf", + fasta=f"{REFERENCES}/genome.fa", + gtf=f"{REFERENCES}/annotation.gtf", output: - "references/star/Genome", + f"{REFERENCES}/star/Genome", log: - "references/logs/star.log", + f"{REFERENCES}/logs/star.log", threads: 8 resources: mem="64g", @@ -206,14 +207,14 @@ rule star_index: rule transcriptome_fasta: input: - fasta="references/genome.fa", - gtf="references/annotation.gtf", - fai="references/genome.fa.fai", + fasta=f"{REFERENCES}/genome.fa", + gtf=f"{REFERENCES}/annotation.gtf", + fai=f"{REFERENCES}/genome.fa.fai", output: - fa=temporary("references/transcriptome.fa"), - gz="references/transcriptome.fa.gz", + fa=temporary(f"{REFERENCES}/transcriptome.fa"), + gz=f"{REFERENCES}/transcriptome.fa.gz", log: - "references/transcriptome.log", + f"{REFERENCES}/transcriptome.log", resources: mem="4g", runtime="2h", @@ -224,13 +225,13 @@ rule transcriptome_fasta: rule salmon_index: input: - "references/transcriptome.fa.gz", + f"{REFERENCES}/transcriptome.fa.gz", output: - "references/salmon/versionInfo.json", + f"{REFERENCES}/salmon/versionInfo.json", log: - "references/logs/salmon.log", + f"{REFERENCES}/logs/salmon.log", params: - outdir="references/salmon", + outdir=f"{REFERENCES}/salmon", resources: mem="32g", runtime="2h", @@ -241,11 +242,11 @@ rule salmon_index: rule conversion_refflat: input: - "references/annotation.gtf.gz", + f"{REFERENCES}/annotation.gtf.gz", output: - "references/annotation.refflat", + f"{REFERENCES}/annotation.refflat", log: - "references/logs/annotation.refflat.log", + f"{REFERENCES}/logs/annotation.refflat.log", resources: mem="2g", runtime="2h", @@ -257,9 +258,9 @@ rule conversion_refflat: 
rule conversion_bed12: input: - "references/annotation.gtf.gz", + f"{REFERENCES}/annotation.gtf.gz", output: - "references/annotation.bed12", + f"{REFERENCES}/annotation.bed12", resources: mem="2g", runtime="2h", @@ -271,11 +272,11 @@ rule conversion_bed12: rule chromsizes: input: - "references/genome.fa.gz", + f"{REFERENCES}/genome.fa.gz", output: - "references/genome.chromsizes", + f"{REFERENCES}/genome.chromsizes", log: - "references/logs/genome.chromsizes.log", + f"{REFERENCES}/logs/genome.chromsizes.log", params: # java_args='-Xmx2g' # [enable for test] java_args="-Xmx20g", # [disable for test] @@ -297,9 +298,9 @@ rule chromsizes: rule mappings: input: - gtf="references/annotation.gtf.gz", + gtf=f"{REFERENCES}/annotation.gtf.gz", output: - "references/annotation.mapping.tsv.gz", + f"{REFERENCES}/annotation.mapping.tsv.gz", resources: mem="24g", runtime="2h", @@ -405,7 +406,7 @@ rule star: input: fastq=rules.cutadapt.output, index=rules.star_index.output, - annotation="references/annotation.gtf", + annotation=f"{REFERENCES}/annotation.gtf", output: bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), sjout=temporary( @@ -463,7 +464,7 @@ rule star: rule rRNA: input: fastq="data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz", - index="references/bowtie2/rrna.1.bt2", + index=f"{REFERENCES}/bowtie2/rrna.1.bt2", output: bam="data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam", log: @@ -574,7 +575,7 @@ rule namesorted_bam: rule featurecounts: input: - annotation="references/annotation.gtf.gz", + annotation=f"{REFERENCES}/annotation.gtf.gz", bam=expand( ( rules.namesorted_bam.output @@ -691,7 +692,7 @@ rule preseq: rule salmon: input: fastq=rules.cutadapt.output, - index="references/salmon/versionInfo.json", + index=f"{REFERENCES}/salmon/versionInfo.json", output: "data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf", log: From 0ea7cfcbadd5b464a400cec82ad9fc563ebb7005 Mon Sep 17 00:00:00 2001 From: Ryan Dale 
<115406+daler@users.noreply.github.com> Date: Fri, 31 Oct 2025 21:24:58 +0000 Subject: [PATCH 170/196] preflight checks --- lib/utils.py | 30 ++++++++++++++++++++++-------- workflows/chipseq/Snakefile | 1 + workflows/rnaseq/Snakefile | 1 + 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/lib/utils.py b/lib/utils.py index 03dbe62f..202cddd3 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -698,14 +698,28 @@ def preflight(config): check_unique_samplename(sampletable) if "orig_filename" in sampletable.columns: check_unique_fn(sampletable) - - -def rnaseq_preflight(c): - pass - - -def chipseq_preflight(c): - pass + if "genome" not in config: + raise ConfigurationError("Config is missing 'genome' key") + if "url" not in config["genome"]: + raise ConfigurationError("Config is missing 'url' key for 'genome'") + + +def rnaseq_preflight(config): + preflight(config) + if "annotation" not in config: + raise ConfigurationError("Config is missing 'annotation' key") + if "url" not in config["annotation"]: + raise ConfigurationError("Config is missing 'url' key for 'annotation'") + if "stranded" not in config: + raise ConfigurationError("Config is missing 'stranded' key") + if "organism" not in config: + raise ConfigurationError("Config is missing 'organism' key") + + +def chipseq_preflight(config): + preflight(config) + if "peaks" not in config: + config["peaks"] = [] def strand_arg_lookup(config, lookup): diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 0c9688a0..83386e82 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -11,6 +11,7 @@ from lib import chipseq configfile: "config/config.yaml" +utils.chipseq_preflight(config) sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 
9327b0ac..6c776dde 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -10,6 +10,7 @@ from lib import utils configfile: "config/config.yaml" +utils.rnaseq_preflight(config) sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" From 0ead4c3783e302b05bcc28e585dbde7ae5259289 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 4 Nov 2025 10:23:22 -0500 Subject: [PATCH 171/196] rm no-longer used dependencies --- include/requirements.txt | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/include/requirements.txt b/include/requirements.txt index dfcb8601..98b67ee4 100644 --- a/include/requirements.txt +++ b/include/requirements.txt @@ -1,18 +1,13 @@ bedtools biopython -bowtie bowtie2 cutadapt>=3.0 deeptools epic2 -fastq-screen fastqc font-ttf-dejavu-sans-mono gffread gffutils -hisat2 -intervalstats -ipython macs3 multiqc pandas @@ -25,16 +20,14 @@ preseq pybedtools pyfaidx pysam -pytest -pytest-xdist python +pytest rseqc # earlier versions of salmon can segfault on Slurm salmon>=1.10.1 samtools -seaborn snakemake>8 sra-tools star @@ -43,11 +36,6 @@ trackhub ucsc-bedgraphtobigwig ucsc-bedsort ucsc-bedtobigbed -ucsc-bigwigmerge -ucsc-fetchchromsizes ucsc-genepredtobed ucsc-gtftogenepred -ucsc-liftover -ucsc-oligomatch ucsc-twobittofa -ucsc-wigtobigwig From dbb21a9a54fc27a83c0c009b834083c6db96f3d0 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 4 Nov 2025 21:24:15 +0000 Subject: [PATCH 172/196] fill in label column with sample names if missing --- workflows/chipseq/Snakefile | 1 + workflows/chipseq/config/sampletable.tsv | 18 +++++++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 83386e82..f54dddcc 100644 
--- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -14,6 +14,7 @@ configfile: "config/config.yaml" utils.chipseq_preflight(config) sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) +sampletable["label"] = sampletable["label"].fillna(sampletable.iloc[:, 0]) is_paired = utils.detect_layout(sampletable) == "PE" n = ["1", "2"] if is_paired else ["1"] SAMPLES = sampletable.iloc[:, 0].values diff --git a/workflows/chipseq/config/sampletable.tsv b/workflows/chipseq/config/sampletable.tsv index 05212460..bb7e7831 100644 --- a/workflows/chipseq/config/sampletable.tsv +++ b/workflows/chipseq/config/sampletable.tsv @@ -1,11 +1,11 @@ # Samplenames with the same "label" will be considered technical replicates -samplename antibody biological_material replicate label orig_filename -input_1 input wingdisc-1 1 input-wingdisc-1 data/example_data/chipseq_input1.fq.gz -input_2 input wingdisc-2 2 input-wingdisc-2 data/example_data/chipseq_input2.fq.gz -ip_1 gaf wingdisc-1 1 gaf-wingdisc-1 data/example_data/chipseq_ip1.fq.gz -ip_2 gaf wingdisc-2 2 gaf-wingdisc-2 data/example_data/chipseq_ip2.fq.gz - +samplename label antibody biological_material replicate orig_filename +input-wingdisc-1 input wingdisc-1 1 data/example_data/chipseq_input1.fq.gz +input-wingdisc-2 input wingdisc-2 2 data/example_data/chipseq_input2.fq.gz +gaf-wingdisc-1 gaf wingdisc-1 1 data/example_data/chipseq_ip1.fq.gz +gaf-wingdisc-2 gaf wingdisc-2 2 data/example_data/chipseq_ip2.fq.gz + # Note here we are treating ip_3 and ip_4 as technical replicates for the sake of testing -ip_3 gaf embryo-1 1 gaf-embryo-1 data/example_data/chipseq_ip3.fq.gz -ip_4 gaf embryo-1 1 gaf-embryo-1 data/example_data/chipseq_ip4.fq.gz -input_3 input embryo-1 1 input-embryo-1 data/example_data/chipseq_input3.fq.gz +ip_3 gaf-embryo-1 gaf embryo-1 1 data/example_data/chipseq_ip3.fq.gz +ip_4 gaf-embryo-1 gaf embryo-1 1 
data/example_data/chipseq_ip4.fq.gz +input-embryo-1 input embryo-1 1 data/example_data/chipseq_input3.fq.gz From a73eb63e85850a45635042a885334bf9d56bcfd1 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 4 Nov 2025 21:27:18 +0000 Subject: [PATCH 173/196] rm plotfingerprint --- workflows/chipseq/Snakefile | 66 +------------------------------------ 1 file changed, 1 insertion(+), 65 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index f54dddcc..d3568b58 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -421,61 +421,6 @@ rule bigwig: "&> {log}" -rule fingerprint: - input: - bams=lambda wc: expand( - "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", - label=wc.ip_label, - ), - control=lambda wc: expand( - "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", - label=chipseq.merged_input_for_ip(sampletable, wc.ip_label), - ), - bais=lambda wc: expand( - "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", - label=wc.ip_label, - ), - control_bais=lambda wc: expand( - "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", - label=chipseq.merged_input_for_ip(sampletable, wc.ip_label), - ), - output: - plot="data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.png", - raw_counts="data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.tab", - metrics="data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics", - threads: 8 - log: - "data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics.log", - threads: 1 - resources: - mem="32g", - runtime="2h", - run: - if len(input.control) == 0: - jsdsample_arg = "" - else: - jsdsample_arg = "--JSDsample " + str(input.control) - shell( - "plotFingerprint " - "--bamfiles {input.bams} " - "-p {threads} " - # The JSDsample argument is disabled for testing as it 
dramatically - # increases the run time. - "{jsdsample_arg} " # [disable for test] - "--outQualityMetrics {output.metrics} " - "--outRawCounts {output.raw_counts} " - "--plotFile {output.plot} " - # Default is 500k; use fewer to speed up testing: - # '--numberOfSamples 50 ' # [enable for test] - "--smartLabels " - "--extendReads=300 " - "--skipZeros " - "&> {log} " - '&& sed -i "s/NA/0.0/g" {output.metrics} ' - ) - - - rule macs: input: ip=lambda wc: expand( @@ -664,16 +609,7 @@ rule multiqc: expand(rules.samtools_idxstats.output, sample=SAMPLES), expand(rules.bigwig.output, label=sampletable.label), expand(rules.merge_techreps.output, label=sampletable.label), - expand( - "data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics", - ip_label=sampletable.loc[sampletable.antibody != "input", "label"], - ), - expand( - "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics", - sample=SAMPLES, - ) - if is_paired - else [], + expand(rules.collectinsertsizemetrics.output.metric, sample=SAMPLES) if is_paired else [], [v["bigbed"] for v in peaks.values()], config="config/multiqc_config.yaml", output: From 383fb8b521e99a66417398a88069cd8b3cf297e5 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 4 Nov 2025 22:42:34 -0500 Subject: [PATCH 174/196] clean up with prepare_*_sampletable functions --- lib/utils.py | 17 +++++++++++++++++ workflows/chipseq/Snakefile | 11 ++++------- workflows/rnaseq/Snakefile | 5 +---- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/lib/utils.py b/lib/utils.py index 202cddd3..c4225f74 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -1299,4 +1299,21 @@ def unlist_dict(d): writer.writerow(unlist_dict(row)) +def prepare_chipseq_sampletable(config): + chipseq_preflight(config) + sampletable_fn = config.get("sampletable", "config/sampletable.tsv") + sampletable = pd.read_table(sampletable_fn, sep="\t", comment="#") + sampletable 
= sampletable.set_index(sampletable.columns[0], drop=False) + sampletable["label"] = sampletable["label"].fillna(sampletable.iloc[:, 0]) + return sampletable + + +def prepare_rnaseq_sampletable(config): + rnaseq_preflight(config) + sampletable_fn = config.get("sampletable", "config/sampletable.tsv") + sampletable = pd.read_table(sampletable_fn, sep="\t", comment="#") + sampletable = sampletable.set_index(sampletable.columns[0], drop=False) + return sampletable + + # vim: ft=python diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index d3568b58..e6b01bb8 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -11,13 +11,10 @@ from lib import chipseq configfile: "config/config.yaml" -utils.chipseq_preflight(config) -sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") -sampletable = sampletable.set_index(sampletable.columns[0], drop=False) -sampletable["label"] = sampletable["label"].fillna(sampletable.iloc[:, 0]) +sampletable = utils.prepare_chipseq_sampletable(config) is_paired = utils.detect_layout(sampletable) == "PE" n = ["1", "2"] if is_paired else ["1"] -SAMPLES = sampletable.iloc[:, 0].values +SAMPLES = sampletable.index.values LABELS = sampletable.label.values REFERENCES = config.get("references", "references") peaks = chipseq.add_bams_to_peak_calling(config) @@ -607,8 +604,8 @@ rule multiqc: expand(rules.samtools_stats.output, sample=SAMPLES), expand(rules.samtools_flagstat.output, sample=SAMPLES), expand(rules.samtools_idxstats.output, sample=SAMPLES), - expand(rules.bigwig.output, label=sampletable.label), - expand(rules.merge_techreps.output, label=sampletable.label), + expand(rules.bigwig.output, label=LABELS), + expand(rules.merge_techreps.output, label=LABELS), expand(rules.collectinsertsizemetrics.output.metric, sample=SAMPLES) if is_paired else [], [v["bigbed"] for v in peaks.values()], config="config/multiqc_config.yaml", diff --git a/workflows/rnaseq/Snakefile 
b/workflows/rnaseq/Snakefile index 6c776dde..5c22fda8 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -9,10 +9,7 @@ from lib import utils configfile: "config/config.yaml" - -utils.rnaseq_preflight(config) -sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") -sampletable = sampletable.set_index(sampletable.columns[0], drop=False) +sampletable = utils.prepare_rnaseq_sampletable(config) is_paired = utils.detect_layout(sampletable) == "PE" n = ["1", "2"] if is_paired else ["1"] SAMPLES = sampletable.index From 350f338d1015980921f6a5664578551d70bc5485 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 4 Nov 2025 22:42:56 -0500 Subject: [PATCH 175/196] substantial cleanup in utils --- lib/utils.py | 502 +-------------------------------------------------- 1 file changed, 6 insertions(+), 496 deletions(-) diff --git a/lib/utils.py b/lib/utils.py index c4225f74..17614537 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -1,34 +1,16 @@ import binascii -import collections -import contextlib import csv import gzip import os -import re import subprocess -import sys import warnings from collections.abc import Iterable -from itertools import product import gffutils -import pandas import pandas as pd -import yaml from Bio import SeqIO -from snakemake.io import expand, regex_from_filepattern from snakemake.shell import shell -# Small helper functions - - -def render_r1_r2(pattern): - return expand(pattern, sample="{sample}", n=c.n) - - -def render_r1_only(pattern): - return expand(pattern, sample="{sample}", n=1) - def resolve_name(name): """ @@ -54,22 +36,6 @@ def resolve_name(name): return obj -@contextlib.contextmanager -def temp_env(env): - """ - Context manager to temporarily set os.environ. 
- """ - env = dict(env) - orig = os.environ.copy() - _env = {k: str(v) for k, v in env.items()} - os.environ.update(_env) - try: - yield - finally: - os.environ.clear() - os.environ.update(orig) - - def flatten(iter, unlist=False): """ Flatten an arbitrarily nested iterable whose innermost items are strings @@ -121,110 +87,6 @@ def test_flatten(): assert flatten(["a"]) == ["a"] -def updatecopy(orig, update_with, keys=None, override=False): - """ - Update a copy of a dictionary, with a bit more control than the built-in - dict.update. - - Parameters - ----------- - - orig : dict - Dict to update - - update_with : dict - Dict with new values - - keys : list or None - If not None, then only consider these keys in `update_with`. Otherwise - consider all. - - override : bool - If True, then this is similar to `dict.update`, except only those keys - in `keys` will be considered. If False (default), then if a key exists - in both `orig` and `update_with`, no updating will occur so `orig` will - retain its original value. - """ - d = orig.copy() - if keys is None: - keys = update_with.keys() - for k in keys: - if k in update_with: - if k in d and not override: - continue - d[k] = update_with[k] - return d - - -def update_recursive(orig, update_with): - """ - Recursively update one dict with another. 
- - From https://stackoverflow.com/a/3233356 - - >>> orig = {'a': {'b': 1, 'c': 2, 'd': [7, 8, 9]}} - >>> update_with = {'a': {'b': 5}} - >>> expected = {'a': {'b': 5, 'c': 2, 'd': [7, 8, 9]}} - >>> result = update_recursive(orig, update_with) - >>> assert result == expected, result - - >>> update_with = {'a': {'d': 1}} - >>> result = update_recursive(orig, update_with) - >>> expected = {'a': {'b': 5, 'c': 2, 'd': 1}} - >>> result = update_recursive(orig, update_with) - >>> assert result == expected, result - """ - for k, v in update_with.items(): - if isinstance(v, collections.abc.Mapping): - orig[k] = update_recursive(orig.get(k, {}), v) - else: - orig[k] = v - return orig - - -def boolean_labels(names, idx, mapping={True: "AND", False: "NOT"}, strip="AND_"): - """ - Creates labels for boolean lists. - - For example: - - >>> names = ['exp1', 'exp2', 'exp3'] - >>> idx = [True, True, False] - >>> boolean_labels(names, idx) - 'exp1_AND_exp2_NOT_exp3' - - Parameters - ---------- - - names : list - List of names to include in output - - idx : list - List of booleans, same size as `names` - - mapping : dict - Linking words to use for True and False - - strip : str - Strip this text off the beginning of labels. - - given a list of names and a same-size boolean, return strings like - - a_NOT_b_AND_c - - or - - a_AND_b_AND_c_NOT_d_AND_e - """ - s = [] - for n, x in zip(names, idx): - s.append(mapping[x] + "_" + n) - s = "_".join(s) - if s.startswith(strip): - s = s.replace(strip, "", 1) - return s - - def make_relative_symlink(target, linkname): """ Helper function to create a relative symlink. @@ -240,33 +102,6 @@ def make_relative_symlink(target, linkname): shell(f"cd {linkdir}; ln -sf {relative_target} {linkbase}") -def extract_wildcards(pattern, target): - """ - Return a dictionary of wildcards and values identified from `target`. - - Returns None if the regex match failed. - - Parameters - ---------- - pattern : str - Snakemake-style filename pattern, e.g. 
``{output}/{sample}.bam``. - - target : str - Filename from which to extract wildcards, e.g., ``data/a.bam``. - - Examples - -------- - >>> pattern = '{output}/{sample}.bam' - >>> target = 'data/a.bam' - >>> expected = {'output': 'data', 'sample': 'a'} - >>> assert extract_wildcards(pattern, target) == expected - >>> assert extract_wildcards(pattern, 'asdf') is None - """ - m = re.compile(regex_from_filepattern(pattern)).match(target) - if m: - return m.groupdict() - - def is_gzipped(fn): """ Filename-independent method of checking if a file is gzipped or not. Uses @@ -299,16 +134,6 @@ def gzipped(tmpfiles, outfile): fout.write(line) -def cat(tmpfiles, outfile): - """ - Simple concatenation of files. - - Note that gzipped files can be concatenated as-is without un- and re- - compressing. - """ - shell(f"cat {tmpfiles} > {outfile}") - - def is_paired_end(sampletable, sample): """ Inspects the sampletable to see if the sample is paired-end or not @@ -316,9 +141,12 @@ def is_paired_end(sampletable, sample): Parameters ---------- sampletable : pandas.DataFrame - Contains a "layout" or "LibraryLayout" column (but not both). If the - lowercase value is "pe" or "paired", consider the sample paired-end. - Otherwise consider single-end. + If SRA sampletable, contains a "layout" or "LibraryLayout" column (but + not both). If the lowercase value is "pe" or "paired", consider the + sample paired-end. Otherwise consider single-end. + + Otherwise, if there's an "orig_filename_R2" column consider it + paired-end, otherwise single-end. sample : str Assumed to be found in the first column of `sampletable` @@ -358,46 +186,6 @@ def is_paired_end(sampletable, sample): return False -def fill_r1_r2(sampletable, pattern, r1_only=False): - """ - Returns a function intended to be used as a rule's input function. - - The returned function, when provided with wildcards, will return one or two - rendered versions of a pattern depending on SE or PE respectively. 
- Specifically, given a pattern (which is expected to contain a placeholder - for "{sample}" and "{n}"), look up in the sampletable whether or not it is - paired-end. - - Parameters - ---------- - - sampletable : pandas.DataFrame - Contains a "layout" column with either "SE" or "PE", or "LibraryLayout" - column with "SINGLE" or "PAIRED". If column does not exist, assume SE. - - pattern : str - Must contain at least a "{sample}" placeholder. - - r1_only : bool - If True, then only return the file for R1 even if PE is configured. - """ - - def func(wc): - try: - wc.sample - except AttributeError: - raise ValueError( - 'Need "{{sample}}" in pattern ' '"{pattern}"'.format(pattern=pattern) - ) - n = [1] - if is_paired_end(sampletable, wc.sample) and not r1_only: - n = [1, 2] - res = expand(pattern, sample=wc.sample, n=n) - return res - - return func - - def pluck(obj, kv): """ For a given dict or list that somewhere contains keys `kv`, return the @@ -418,136 +206,6 @@ def pluck(obj, kv): yield x -# Functions for conveniently working with resources - - -def autobump(*args, **kwargs): - """ - Used to automatically bump resources depending on how many times the job - was attempted. This will return a function that is appropriate to use for - an entry in Snakemake's `resources:` directive:: - - rule example: - input: "a.txt" - resources: - mem_mb=autobump(gb=10), - runtime=autobump(hours=2, increment_hours=10) - - Values can be specified in multiple ways. - - A single number will be provided as the resource, and will be used to - increment each time. For example, this is the equivalent of 10 GB for the - first attempt, and 20 GB for the second: - - >>> f = autobump(1024 * 10) - >>> f(None, 1) - 10240 - - Adding a second unnamed argument will use it as a value to increment by for - each subsequent attempt. This will use 10 GB for the first attempt, and 110 - GB for the second attempt. 
- - >>> f = autobump(1024 * 10, 1024 * 100) - >>> f(None, 1) - 10240 - - >>> f(None, 2) - 112640 - - Instead of bare numbers, keyword arguments can be used for more convenient - specification of units. The above two examples can also take this form: - - >>> f = autobump(gb=10) - >>> f(None, 1) - 10240 - - >>> f = autobump(gb=10, increment_gb=100) - >>> f(None, 2) - 112640 - - - Units can be minutes, hours, days, mb, gb, or tb. For example: - - >>> f = autobump(hours=2, increment_hours=5) - >>> f(None, 2) - 420 - - """ - multiplier = { - "mb": 1, - "minutes": 1, - "gb": 1024, - "hours": 60, - "days": 1440, - "tb": 1024 * 1024, - } - units = list(multiplier.keys()) - - if args and kwargs: - raise ValueError( - "Mixture of unnamed and keyword arguments not supported with autobump()" - ) - - if len(kwargs) > 2: - raise ValueError("Only 2 kwargs allowed for autobump()") - - elif len(args) == 1 and not kwargs: - baseline_converted = args[0] - increment_converted = baseline_converted - - elif len(args) == 2 and not kwargs: - baseline_converted, increment_converted = args - - elif len(kwargs) <= 2: - baseline_kwargs = [k for k in kwargs.keys() if k in units] - if len(baseline_kwargs) != 1: - raise ValueError( - "Multiple baseline kwargs found. Do you need to change one to have an 'increment_' prefix?" 
- ) - - baseline_kwarg = baseline_kwargs[0] - baseline_value = kwargs[baseline_kwarg] - baseline_unit = baseline_kwarg - - increment_kwargs = [k for k in kwargs if k.startswith("increment_")] - if increment_kwargs: - assert len(increment_kwargs) == 1 - increment_kwarg = increment_kwargs[0] - increment_value = kwargs[increment_kwarg] - increment_unit = increment_kwarg.split("_")[-1] - else: - increment_value = baseline_value - increment_unit = baseline_unit - - if baseline_unit not in multiplier: - raise ValueError( - f"Baseline unit {baseline_unit} not in valid units {units}" - ) - if increment_unit not in multiplier: - raise ValueError( - f"Increment unit {increment_unit} not in valid units {units}" - ) - - baseline_converted = baseline_value * multiplier[baseline_unit] - increment_converted = increment_value * multiplier[increment_unit] - - else: - raise ValueError(f"Unhandled args and kwargs: {args}, {kwargs}") - - def f(wildcards, attempt): - return baseline_converted + (attempt - 1) * increment_converted - - return f - - -def gb(size_in_gb): - return 1024 * size_in_gb - - -def hours(time_in_hours): - return time_in_hours * 60 - - # Config parsing and handling @@ -576,96 +234,6 @@ def detect_layout(sampletable): raise ValueError(f"Only a single layout (SE or PE) is supported. {report_}") -def fill_patterns(patterns, fill, combination=product): - """ - Fills in a dictionary of patterns with the dictionary `fill`. - - >>> patterns = dict(a='{sample}_R{N}.fastq') - >>> fill = dict(sample=['one', 'two', 'three'], N=[1, 2]) - >>> sorted(fill_patterns(patterns, fill)['a']) - ['one_R1.fastq', 'one_R2.fastq', 'three_R1.fastq', 'three_R2.fastq', 'two_R1.fastq', 'two_R2.fastq'] - - If using `zip` as a combination, checks to ensure all values in `fill` are - the same length to avoid truncated output. 
- - This fails: - - >>> patterns = dict(a='{sample}_R{N}.fastq') - >>> fill = dict(sample=['one', 'two', 'three'], N=[1, 2]) - >>> sorted(fill_patterns(patterns, fill, zip)['a']) # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - ValueError: {'sample': ['one', 'two', 'three'], 'N': [1, 2]} does not have the same number of entries for each key - - But this works: - - >>> patterns = dict(a='{sample}_R{N}.fastq') - >>> fill = dict(sample=['one', 'one', 'two', 'two', 'three', 'three'], N=[1, 2, 1, 2, 1, 2]) - >>> sorted(fill_patterns(patterns, fill, zip)['a']) - ['one_R1.fastq', 'one_R2.fastq', 'three_R1.fastq', 'three_R2.fastq', 'two_R1.fastq', 'two_R2.fastq'] - - """ - # In recent Snakemake versions (e.g., this happens in 5.4.5) file patterns - # with no wildcards in them are removed from expand when `zip` is used as - # the combination function. - # - # For example, in 5.4.5: - # - # expand('x', zip, d=[1,2,3]) == [] - # - # But in 4.4.0: - # - # expand('x', zip, d=[1,2,3]) == ['x', 'x', 'x'] - - if combination == zip: - lengths = set([len(v) for v in fill.values()]) - if len(lengths) != 1: - raise ValueError( - f"{fill} does not have the same number of entries for each key" - ) - - def update(d, u, c): - for k, v in u.items(): - if isinstance(v, collections.abc.Mapping): - r = update(d.get(k, {}), v, c) - d[k] = r - else: # not a dictionary, so we're at a leaf - if isinstance(fill, pd.DataFrame): - d[k] = list(set(expand(u[k], zip, **fill.to_dict("list")))) - else: - d[k] = list(set(expand(u[k], c, **fill))) - if not d[k]: - d[k] = [u[k]] - return d - - d = {} - return update(d, patterns, combination) - - -def rscript(string, scriptname, log=None): - """ - Saves the string as `scriptname` and then runs it - - Parameters - ---------- - string : str - Filled-in template to be written as R script - - scriptname : str - File to save script to - - log : str - File to redirect stdout and stderr to. If None, no redirection occurs. 
- """ - with open(scriptname, "w") as fout: - fout.write(string) - if log: - _log = "> {0} 2>&1".format(log) - else: - _log = "" - shell("Rscript {scriptname} {_log}") - - def check_unique_fn(df): """ Raises an error if the fastq filenames are not unique @@ -722,23 +290,6 @@ def chipseq_preflight(config): config["peaks"] = [] -def strand_arg_lookup(config, lookup): - """ - Given a config object and lookup dictionary, confirm that the config has - correctly specified strandedness and then return the value for that key. - """ - if not config.stranded: - raise ConfigurationError( - "Starting in v1.8, 'stranded' is required in the config file. " - "Values can be 'unstranded', 'fr-firststrand' (R1 aligns antisense to original transcript), " - "or 'fr-secondstrand' (R1 aligns sense to original transcript)." - ) - if config.stranded not in lookup: - keys = list(lookup.keys()) - raise KeyError(f"'{config.stranded}' not one of {keys}") - return lookup[config.stranded] - - def filter_rrna_fastas(tmpfiles, outfile, pattern): """ Extract records from fasta file(s) given a search pattern. @@ -1012,24 +563,6 @@ def _default(origfn, newfn): raise ValueError(f"{outfile} does not appear to be gzipped.") -def get_sampletable(config): - """ - Return samples and pandas.DataFrame of parsed sampletable. - - Returns the sample IDs and the parsed sampletable from the file specified - in the config. - - The sample IDs are assumed to be the first column of the sampletable. - - Parameters - ---------- - config : dict - """ - sampletable = pandas.read_csv(config["sampletable"], comment="#", sep="\t") - samples = sampletable.iloc[:, 0] - return samples, sampletable - - def get_techreps(sampletable, label): """ Return all sample IDs for which the "label" column is `label`. @@ -1176,25 +709,6 @@ def check_urls(config, verbose=False): ) -def check_all_urls_found(verbose=True): - """ - Recursively loads all references that can be included and checks them. - Reports out if there are any failures. 
- """ - check_urls( - { - "include_references": [ - "include/reference_configs", - "test/test_configs", - "workflows/rnaseq/config", - "workflows/chipseq/config", - "workflows/references/config", - ] - }, - verbose=verbose, - ) - - def gff2gtf(gff, gtf): """ Converts a gff file to a gtf format using the gffread function from Cufflinks @@ -1205,10 +719,6 @@ def gff2gtf(gff, gtf): shell("gffread {gff} -T -o- | gzip -c > {gtf}") -def wrapper_for(path): - return "file:" + os.path.join("../..", "wrappers", "wrappers", path) - - def detect_sra(sampletable): return "Run" in sampletable.columns and any( sampletable["Run"].str.startswith("SRR") From b2370d2e823025d1908df436726336258a04be46 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 4 Nov 2025 22:45:17 -0500 Subject: [PATCH 176/196] update decision log: - clearer chipseq config - remove autobump - cleanup utils - rm plotfingerprint - techreps - pep consideration --- docs/decisions.rst | 118 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) diff --git a/docs/decisions.rst b/docs/decisions.rst index 4de9189f..86aa71f7 100644 --- a/docs/decisions.rst +++ b/docs/decisions.rst @@ -641,4 +641,122 @@ are available but the top-level is actually primary (rat, Ensembl); A GTF might not be available (pombe, Ensembl); or only a toplevel assembly is available and we need to remove the haplotypes and alt loci out (hg19, Ensembl). +.. _decisions-sample-specific-params: + +Lack of sample-specific parameters +---------------------------------- + +Currently if we have samples with different library preps that need different +arguments for cutadapt, then they need to be split into two separate workflow +directories. Supporting sample-specific parameters would certainly be possible, +but the addtional complexity this would impose would go against the goal of +reducing complexity. 
For example, we'd need a location to store multiple sets +of parameters (probably in the config file) and a mechanism to retrieve them +based on sample names. This could be an additional column in the sampletable +indicating "parameter sets", which could be used as a lookup in a ``params:`` +directive lookup function. + +Again, this would be possible, but it is a deliberate design choice to opt for +a simpler approach, which is to use multiple workflow directories and edit the +respective Snakefiles appropriately. In cases where samples across the split +workflows need to be compared or considered together, an additional workflow +can be introduced to aggregate their output. + +PEP support +----------- +Support for `Portable Encapsulated Projects +`__ is built into Snakemake. Using +a combination of PEP config files, sample tables, and subsample tables, it is +possible to set up the workflows to use PEP in such a way that it can be +backwards-compatible with prior lcdb-wf versions. Specifically, by providing +TSV sampletables, forcing a sample column name, and populating the table with +subsamples. It would be convenient to offload the complexity of handling +technical replicate configuration to a third-party package. + +However, getting technical replicates to work correctly proved to be tricky, +due to the way they come in as lists in the resulting dataframe with PEP. While +it would be possible to fix this, some initial experimentation with this +suggested that it would actually be more complex to do that, so deferring to +another package did not result in a net gain in convenience or in complexity +reduction. + +PEP configs are not ruled out completely, but we might need a rewiring and +possible rewriting of the ChIP-seq (and possibly RNA-seq) workflows to fully +support PEP subsamples. I don't consider that effort to be worth it right now, +especially because the current config system already supports technical +replicates. 
+ +Technical replicates +-------------------- +In practice, it's not uncommon for something to go wrong in library prep or +sequencing such that it makes sense to re-do a library. Typically, if it's just +resequencing the same library (perhaps after rebalancing the multiplexing), we +consider that a technical replicate. + +The conventional method for handling technical replicates in RNA-seq is to sum +the counts. That is, we take the Salmon or featureCounts files, where technical +replicates are quantified separately, and sum them after import into R. This +allows us to check QC on individual tech reps e.g. to see if they worked. If we +merged at an early stage (like cocatting the FASTQs), then we would not be able +to check QC separately. + +For ChIP-seq, the conventional method is to merge BAM files. However, we still +want to keep observability of individual technical replicates where possible, +which includes inspecting duplicates. However, when we merge BAMs of technical +replicates that each had duplicates removed, it's possible that we're +introducing additional duplicates. So we do another round of duplicate removal +after merging. + +The end result of all of this is that we get MultiQC output for all of the +technical replicates separately. For ChIP-seq, the post-merging files are +bigWigs and merged-and-deduped BAMs. Currently these do not have separate +entries in MultiQC. + +Removing built-in support for plotFingerprint +--------------------------------------------- + +deepTools' `plotFingerprint +`__ +needs matched input to each antibody. Previously, we configured this in the +sampletable with the combination of "biological_material" and "antibody" +columns. Samples with exactly "input" as the antibody were the matched control +for the non-input samples with the same biological material. + +This ended up being a little complicated because "biological material" is easily +confused with "biological replicate". 
And now with common CUT&RUN and CUT&Tag +assays that use IgG as control, "IgG" and "control" should probably be aliases +for "input". + +It turns out the "biological_material" column was only ever used for the +plotFingerprint rule. It introduced complexity (in code, configuration, +documentation, and user support) for a single rule. In addition, in practice we +ended up visualizing the bigWigs rather than relying exclusively on the +plotFingerprint metrics. So to reduce complexity, plotFingerprint support is +being removed. + +Clearer ChIP-seq config +----------------------- + +For "label", it was not clear that it was the merged name. And even if there +were no technical replicates in an experiment, it still needed to be filled out +with copies of the sample name. + +Now, ``merged_label`` is an alias for ``label``. If the column is missing +entirely, or if the value is empty for a row, then the samplename will be used +automatically. + +Removal of autobump +------------------- +For several versions, resources were wrapped with the ``autobump()`` function, +which would automatically retry jobs with more resources if they failed. Turns +out this wasn't as helpful as expected, because errors (like syntax errors or +other mistakes) ended up being a lot more frequent than exceeding resources. +This resulted in escalating resource allocations and longer run time with no +need. So the autobump was removed. + +Cleanup of lib/utils.py +----------------------- +We had accumulated a lot of useful functions over time, but things have changed +enough that they haven't been used. To avoid clutter and additional maintenance +burden in supporting otherwise unused code, these functions were removed. 
From 04d5d695f6a5e470c042671a2adb5a1cba2a67c5 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 4 Nov 2025 22:50:04 -0500 Subject: [PATCH 177/196] trackhub scripts use new utils functions --- workflows/chipseq/chipseq_trackhub.py | 4 ++-- workflows/rnaseq/rnaseq_trackhub.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/workflows/chipseq/chipseq_trackhub.py b/workflows/chipseq/chipseq_trackhub.py index 4e520be2..9d7ba3eb 100644 --- a/workflows/chipseq/chipseq_trackhub.py +++ b/workflows/chipseq/chipseq_trackhub.py @@ -24,7 +24,7 @@ from trackhub.helpers import filter_composite_from_subgroups, dimensions_from_subgroups, hex2rgb from trackhub.upload import upload_hub, stage_hub -from lib import chipseq +from lib import chipseq, utils ap = argparse.ArgumentParser() ap.add_argument('config', help='Main config.yaml file') @@ -55,7 +55,7 @@ ) # Set up subgroups based on unique values from columns specified in the config -df = pandas.read_csv(config['sampletable'], comment='#', sep='\t') +df = utils.prepare_chipseq_sampletable(config) cols = hub_config['subgroups']['columns'] subgroups = [] for col in cols: diff --git a/workflows/rnaseq/rnaseq_trackhub.py b/workflows/rnaseq/rnaseq_trackhub.py index 6fe17f80..d6bb8cf4 100644 --- a/workflows/rnaseq/rnaseq_trackhub.py +++ b/workflows/rnaseq/rnaseq_trackhub.py @@ -9,6 +9,7 @@ """ import os +import sys import re from pprint import pprint import pandas @@ -20,6 +21,9 @@ from trackhub.upload import upload_hub, stage_hub import argparse +sys.path.insert(0, os.path.dirname(__file__) + "/../..") +from lib import utils + ap = argparse.ArgumentParser() ap.add_argument('config', help='Main config.yaml file') ap.add_argument('hub_config', help='Track hub config YAML file') @@ -47,7 +51,7 @@ ) # Set up subgroups based on the configured columns -df = pandas.read_csv(config['sampletable'], comment='#', sep='\t') +df = utils.prepare_rnaseq_sampletable(config) 
cols = hub_config['subgroups']['columns'] subgroups = [] for col in cols: From 8951b570dd970a7417ced6ce8ec6fe9d87c435fc Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 5 Nov 2025 13:05:41 +0000 Subject: [PATCH 178/196] rm url check --- .circleci/config.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 3899fa03..60fa6a54 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -168,9 +168,6 @@ variables: # Ensure that the chunks in rnaseq.Rmd have matching documentation (cd ci && ./ensure_docs.py) - # find all URLs in reference configs and make sure they exist - python -c "import sys; sys.path.insert(0, '$DEST'); from lib.utils import check_all_urls_found; check_all_urls_found()" - # run R package unit tests using the R env conda activate $LCDBWF_ENV_R Rscript -e "devtools::test('lib/lcdbwf', reporter=c('summary', 'fail'), export_all=TRUE)" From b8fb369198aa8a58a68306e6a0613c4d315a0eb6 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 5 Nov 2025 13:15:25 +0000 Subject: [PATCH 179/196] typo --- lib/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/utils.py b/lib/utils.py index 17614537..ca07a9fb 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -813,7 +813,7 @@ def prepare_chipseq_sampletable(config): chipseq_preflight(config) sampletable_fn = config.get("sampletable", "config/sampletable.tsv") sampletable = pd.read_table(sampletable_fn, sep="\t", comment="#") - sampletable = sampletable.set_inde(sampletable.columns[0], drop=False) + sampletable = sampletable.set_index(sampletable.columns[0], drop=False) sampletable["label"] = sampletable["label"].fillna(sampletable.iloc[:, 0]) return sampletable @@ -822,7 +822,7 @@ def prepare_rnaseq_sampletable(config): rnaseq_preflight(config) sampletable_fn = config.get("sampletable", "config/sampletable.tsv") sampletable = pd.read_table(sampletable_fn, 
sep="\t", comment="#") - sampletable = sampletable.set_inde(sampletable.columns[0], drop=False) + sampletable = sampletable.set_index(sampletable.columns[0], drop=False) return sampletable From cb97ce1c67bed1ca054b316f4030e91cf2b9a4b1 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 5 Nov 2025 14:15:53 +0000 Subject: [PATCH 180/196] improved sampletable handling --- lib/utils.py | 147 +++++++++++--------- workflows/rnaseq/config/sra_sampletable.csv | 20 +++ workflows/rnaseq/config/sra_sampletable.tsv | 7 - 3 files changed, 98 insertions(+), 76 deletions(-) create mode 100644 workflows/rnaseq/config/sra_sampletable.csv delete mode 100644 workflows/rnaseq/config/sra_sampletable.tsv diff --git a/lib/utils.py b/lib/utils.py index ca07a9fb..b3182567 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -156,14 +156,13 @@ def is_paired_end(sampletable, sample): # # So detect first detect if SRA sampletable based on presence of "Run" # column and all values of that column starting with "SRR", and then raise - # an error if the Layout column does not exist. + # an error if the Layout or LibraryLayout column does not exist. - if "Run" in sampletable.columns: + sra_layout_columns = ["layout", "librarylayout"] + sampletable_columns = [i.lower() for i in sampletable.columns] + if "run" in sampletable_columns: if all(sampletable["Run"].str.startswith("SRR")): - if ( - "Layout" not in sampletable.columns - and "layout" not in sampletable.columns - ): + if len(set(sra_layout_columns).intersection(sampletable_columns)) == 0: raise ValueError( "Sampletable appears to be SRA, but no 'Layout' column " "found. This is required to specify single- or paired-end " @@ -234,62 +233,6 @@ def detect_layout(sampletable): raise ValueError(f"Only a single layout (SE or PE) is supported. 
{report_}") -def check_unique_fn(df): - """ - Raises an error if the fastq filenames are not unique - """ - fns = df["orig_filename"] - if "orig_filename_R2" in df.columns: - fns = pd.concat([fns, df["orig_filename_R2"]]) - if len(fns.unique()) < len(fns): - raise ValueError("Fastq filenames non unique, check the sampletable\n") - - -def check_unique_samplename(df): - """ - Raises an error if the samplenames are not unique - """ - ns = df.index - if len(ns.unique()) < len(ns): - raise ConfigurationError("Samplenames non unique, check the sampletable\n") - - -def preflight(config): - """ - Performs verifications on config and sampletable files - - Parameters - ---------- - config: yaml config object - """ - sampletable = pd.read_table(config["sampletable"], index_col=0, comment="#") - check_unique_samplename(sampletable) - if "orig_filename" in sampletable.columns: - check_unique_fn(sampletable) - if "genome" not in config: - raise ConfigurationError("Config is missing 'genome' key") - if "url" not in config["genome"]: - raise ConfigurationError("Config is missing 'url' key for 'genome'") - - -def rnaseq_preflight(config): - preflight(config) - if "annotation" not in config: - raise ConfigurationError("Config is missing 'annotation' key") - if "url" not in config["annotation"]: - raise ConfigurationError("Config is missing 'url' key for 'annotation'") - if "stranded" not in config: - raise ConfigurationError("Config is missing 'stranded' key") - if "organism" not in config: - raise ConfigurationError("Config is missing 'organism' key") - - -def chipseq_preflight(config): - preflight(config) - if "peaks" not in config: - config["peaks"] = [] - - def filter_rrna_fastas(tmpfiles, outfile, pattern): """ Extract records from fasta file(s) given a search pattern. 
@@ -809,20 +752,86 @@ def unlist_dict(d): writer.writerow(unlist_dict(row)) -def prepare_chipseq_sampletable(config): - chipseq_preflight(config) +def preflight(config, sampletable): + """ + Performs verifications on config and sampletable files + + Parameters + ---------- + config: yaml config object + """ + + if len(sampletable) != len(sampletable.iloc[:, 0].unique()): + raise ConfigurationError("Samplenames non unique, check the sampletable") + + # For non-SRA sampletables + if "orig_filename" in sampletable.columns: + fns = sampletable["orig_filename"] + if "orig_filename_R2" in sampletable.columns: + fns = pd.concat([fns, sampletable["orig_filename_R2"]]) + if len(fns.unique()) < len(fns): + raise ValueError("Fastq filenames non unique, check the sampletable\n") + + if "genome" not in config: + raise ConfigurationError("Config is missing 'genome' key") + if "url" not in config["genome"]: + raise ConfigurationError("Config is missing 'url' key for 'genome'") + + +def rnaseq_preflight(config, sampletable): + preflight(config, sampletable) + if "annotation" not in config: + raise ConfigurationError("Config is missing 'annotation' key") + if "url" not in config["annotation"]: + raise ConfigurationError("Config is missing 'url' key for 'annotation'") + if "stranded" not in config: + raise ConfigurationError("Config is missing 'stranded' key") + if "organism" not in config: + raise ConfigurationError("Config is missing 'organism' key") + + +def chipseq_preflight(config, sampletable): + preflight(config, sampletable) + if "peaks" not in config: + config["peaks"] = [] + + +def read_sampletable(config): + """ + Given a config object, return the sampletable with the first column used as the index. + + Autodetect tsv/csv. 
+ """ sampletable_fn = config.get("sampletable", "config/sampletable.tsv") - sampletable = pd.read_table(sampletable_fn, sep="\t", comment="#") + if sampletable_fn.endswith(".tsv"): + sep = "\t" + elif sampletable_fn.endswith(".csv"): + sep = "," + else: + raise ConfigurationError( + f"Sampletable should end in .csv or .tsv to indicate format, got {sampletable_fn}" + ) + sampletable = pd.read_table(sampletable_fn, sep=sep, comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) + return sampletable + + +def prepare_chipseq_sampletable(config): + """ + Given a config, return the validated and prepared ChIP-seq table. + """ + sampletable = read_sampletable(config) sampletable["label"] = sampletable["label"].fillna(sampletable.iloc[:, 0]) + chipseq_preflight(config, sampletable) return sampletable def prepare_rnaseq_sampletable(config): - rnaseq_preflight(config) - sampletable_fn = config.get("sampletable", "config/sampletable.tsv") - sampletable = pd.read_table(sampletable_fn, sep="\t", comment="#") - sampletable = sampletable.set_index(sampletable.columns[0], drop=False) + """ + Given a config, return the validated and prepared RNA-seq table. 
+ """ + sampletable = read_sampletable(config) + rnaseq_preflight(config, sampletable) return sampletable diff --git a/workflows/rnaseq/config/sra_sampletable.csv b/workflows/rnaseq/config/sra_sampletable.csv new file mode 100644 index 00000000..1ecdc3cf --- /dev/null +++ b/workflows/rnaseq/config/sra_sampletable.csv @@ -0,0 +1,20 @@ +Run,Assay Type,AvgSpotLen,Bases,BioProject,BioSample,Bytes,Center Name,Consent,DATASTORE filetype,DATASTORE provider,DATASTORE region,Experiment,GEO_Accession (exp),Instrument,LibraryLayout,LibrarySelection,LibrarySource,Organism,Platform,ReleaseDate,create_date,version,Sample Name,source_name,SRA Study,treatment,cell_type,Developmental_stage,cell_line +SRR5182696,RNA-Seq,50,720229800,PRJNA362227,SAMN06236711,456024643,GEO,public,"fastq,run.zq,sra","s3,gs,ncbi","gs.us-east1,ncbi.public,s3.us-east-1",SRX2498797,GSM2461336,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461336,"sorted live neurons\, P14 pupal\, dicer-gfp_shep-rnai",SRP096911,dicer-gfp_shep-rnai,neurons,P14 pupal, +SRR5182697,RNA-Seq,50,651467650,PRJNA362227,SAMN06236734,413453724,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3","s3.us-east-1,ncbi.public,gs.us-east1",SRX2498798,GSM2461337,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461337,"sorted live neurons\, P14 pupal\, dicer-gfp_shep-rnai",SRP096911,dicer-gfp_shep-rnai,neurons,P14 pupal, +SRR5182698,RNA-Seq,50,501312400,PRJNA362227,SAMN06236733,318819526,GEO,public,"run.zq,sra,fastq","gs,ncbi,s3","gs.us-east1,ncbi.public,s3.us-east-1",SRX2498799,GSM2461338,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:10:00Z,1,GSM2461338,"sorted live neurons\, P14 pupal\, dicer-gfp_shep-rnai",SRP096911,dicer-gfp_shep-rnai,neurons,P14 pupal, 
+SRR5182699,RNA-Seq,50,744291500,PRJNA362227,SAMN06236732,473503018,GEO,public,"fastq,sra,run.zq","ncbi,gs,s3","gs.us-east1,s3.us-east-1,ncbi.public",SRX2498800,GSM2461339,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461339,"sorted live neurons\, third instar larval\, dicer-gfp_shep-rnai",SRP096911,dicer-gfp_shep-rnai,neurons,third instar larval, +SRR5182700,RNA-Seq,50,607856150,PRJNA362227,SAMN06236731,386029421,GEO,public,"run.zq,fastq,sra","ncbi,gs,s3","gs.us-east1,ncbi.public,s3.us-east-1",SRX2498801,GSM2461340,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:12:00Z,1,GSM2461340,"sorted live neurons\, third instar larval\, dicer-gfp_shep-rnai",SRP096911,dicer-gfp_shep-rnai,neurons,third instar larval, +SRR5182701,RNA-Seq,50,641763000,PRJNA362227,SAMN06236730,407428219,GEO,public,"run.zq,sra,fastq","ncbi,s3,gs","ncbi.public,gs.us-east1,s3.us-east-1",SRX2498802,GSM2461341,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461341,"sorted live neurons\, third instar larval\, dicer-gfp_shep-rnai",SRP096911,dicer-gfp_shep-rnai,neurons,third instar larval, +SRR5182702,RNA-Seq,50,602992350,PRJNA362227,SAMN06236729,383110310,GEO,public,"fastq,sra,run.zq","gs,s3,ncbi","s3.us-east-1,ncbi.public,gs.us-east1",SRX2498803,GSM2461342,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461342,"sorted live neurons\, P14 pupal\, dicer-gfp_x",SRP096911,dicer-gfp_x,neurons,P14 pupal, +SRR5182703,RNA-Seq,50,639787300,PRJNA362227,SAMN06236728,406192647,GEO,public,"fastq,run.zq,sra","gs,s3,ncbi","gs.us-east1,s3.us-east-1,ncbi.public",SRX2498804,GSM2461343,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila 
melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461343,"sorted live neurons\, P14 pupal\, dicer-gfp_x",SRP096911,dicer-gfp_x,neurons,P14 pupal, +SRR5182704,RNA-Seq,50,645383100,PRJNA362227,SAMN06236727,409821107,GEO,public,"fastq,sra,run.zq","s3,gs,ncbi","gs.us-east1,s3.us-east-1,ncbi.public",SRX2498805,GSM2461344,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:13:00Z,1,GSM2461344,"sorted live neurons\, P14 pupal\, dicer-gfp_x",SRP096911,dicer-gfp_x,neurons,P14 pupal, +SRR5182705,RNA-Seq,50,867006750,PRJNA362227,SAMN06236726,550448623,GEO,public,"sra,fastq,run.zq","ncbi,s3,gs","gs.us-east1,s3.us-east-1,ncbi.public",SRX2498806,GSM2461345,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:12:00Z,1,GSM2461345,"sorted live neurons\, third instar larval\, dicer-gfp_x",SRP096911,dicer-gfp_x,neurons,third instar larval, +SRR5182706,RNA-Seq,50,664061850,PRJNA362227,SAMN06236725,421272040,GEO,public,"sra,run.zq,fastq","s3,gs,ncbi","s3.us-east-1,gs.us-east1,ncbi.public",SRX2498807,GSM2461346,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:12:00Z,1,GSM2461346,"sorted live neurons\, third instar larval\, dicer-gfp_x",SRP096911,dicer-gfp_x,neurons,third instar larval, +SRR5182707,RNA-Seq,50,718867500,PRJNA362227,SAMN06236724,455538089,GEO,public,"fastq,sra,run.zq","s3,ncbi,gs","ncbi.public,gs.us-east1,s3.us-east-1",SRX2498808,GSM2461347,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:12:00Z,1,GSM2461347,"sorted live neurons\, third instar larval\, dicer-gfp_x",SRP096911,dicer-gfp_x,neurons,third instar larval, +SRR5182708,RNA-Seq,51,313585740,PRJNA362227,SAMN06236723,192621062,GEO,public,"sra,fastq,run.zq","ncbi,s3,gs","ncbi.public,gs.us-east1,s3.us-east-1",SRX2498809,GSM2461348,Illumina HiSeq 
2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461348,"cell culture from nervous system\, ds-GFP",SRP096911,dsRNA targeting GFP,,,BG3 +SRR5182709,RNA-Seq,51,354164145,PRJNA362227,SAMN06236722,217275323,GEO,public,"sra,run.zq,fastq","s3,gs,ncbi","ncbi.public,s3.us-east-1,gs.us-east1",SRX2498810,GSM2461349,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461349,"cell culture from nervous system\, ds-GFP",SRP096911,dsRNA targeting GFP,,,BG3 +SRR5182710,RNA-Seq,51,331996689,PRJNA362227,SAMN06236721,204104825,GEO,public,"run.zq,fastq,sra","gs,ncbi,s3","gs.us-east1,ncbi.public,s3.us-east-1",SRX2498811,GSM2461350,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461350,"cell culture from nervous system\, ds-GFP",SRP096911,dsRNA targeting GFP,,,BG3 +SRR5182711,RNA-Seq,51,484674828,PRJNA362227,SAMN06236720,298600165,GEO,public,"run.zq,fastq,sra","s3,ncbi,gs","s3.us-east-1,ncbi.public,gs.us-east1",SRX2498812,GSM2461351,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461351,"cell culture from nervous system\, ds-shep",SRP096911,dsRNA targeting shep,,,BG3 +SRR5182712,RNA-Seq,51,379084887,PRJNA362227,SAMN06236719,233433872,GEO,public,"run.zq,fastq,sra","s3,gs,ncbi","ncbi.public,gs.us-east1,s3.us-east-1",SRX2498813,GSM2461352,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461352,"cell culture from nervous system\, ds-shep",SRP096911,dsRNA targeting shep,,,BG3 +SRR5182713,RNA-Seq,51,410430405,PRJNA362227,SAMN06236718,252947684,GEO,public,"fastq,sra,run.zq","gs,s3,ncbi","s3.us-east-1,ncbi.public,gs.us-east1",SRX2498814,GSM2461353,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila 
melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461353,"cell culture from nervous system\, ds-shep",SRP096911,dsRNA targeting shep,,,BG3 + diff --git a/workflows/rnaseq/config/sra_sampletable.tsv b/workflows/rnaseq/config/sra_sampletable.tsv deleted file mode 100644 index 3ed904c6..00000000 --- a/workflows/rnaseq/config/sra_sampletable.tsv +++ /dev/null @@ -1,7 +0,0 @@ -samplename AvgSpotLen BioSample Experiment MBases MBytes Run SRA_Sample Sample_Name developmental_stage source_name treatment Assay_Type BioProject Center_Name Consent DATASTORE_filetype DATASTORE_provider InsertSize Instrument LibraryLayout LibrarySelection LibrarySource LoadDate Organism Platform ReleaseDate SRA_Study cell_line cell_type -gfp1 51 SAMN06236723 SRX2498809 299 183 SRR5182708 SRS1925642 GSM2461348 cell culture from nervous system, ds-GFP dsRNA targeting GFP RNA-Seq PRJNA362227 GEO public sra ncbi 0 Illumina HiSeq 2500 SINGLE cDNA TRANSCRIPTOMIC 2017-01-17 Drosophila melanogaster ILLUMINA 2017-11-22 SRP096911 BG3 -gfp2 51 SAMN06236722 SRX2498810 337 207 SRR5182709 SRS1925643 GSM2461349 cell culture from nervous system, ds-GFP dsRNA targeting GFP RNA-Seq PRJNA362227 GEO public sra ncbi 0 Illumina HiSeq 2500 SINGLE cDNA TRANSCRIPTOMIC 2017-01-17 Drosophila melanogaster ILLUMINA 2017-11-22 SRP096911 BG3 -gfp3 51 SAMN06236721 SRX2498811 316 194 SRR5182710 SRS1925644 GSM2461350 cell culture from nervous system, ds-GFP dsRNA targeting GFP RNA-Seq PRJNA362227 GEO public sra ncbi 0 Illumina HiSeq 2500 SINGLE cDNA TRANSCRIPTOMIC 2017-01-17 Drosophila melanogaster ILLUMINA 2017-11-22 SRP096911 BG3 -shep1 51 SAMN06236720 SRX2498812 462 284 SRR5182711 SRS1925645 GSM2461351 cell culture from nervous system, ds-shep dsRNA targeting shep RNA-Seq PRJNA362227 GEO public sra ncbi 0 Illumina HiSeq 2500 SINGLE cDNA TRANSCRIPTOMIC 2017-01-17 Drosophila melanogaster ILLUMINA 2017-11-22 SRP096911 BG3 -shep2 51 SAMN06236719 SRX2498813 361 222 SRR5182712 SRS1925646 GSM2461352 cell culture 
from nervous system, ds-shep dsRNA targeting shep RNA-Seq PRJNA362227 GEO public sra ncbi 0 Illumina HiSeq 2500 SINGLE cDNA TRANSCRIPTOMIC 2017-01-17 Drosophila melanogaster ILLUMINA 2017-11-22 SRP096911 BG3 -shep3 51 SAMN06236718 SRX2498814 391 241 SRR5182713 SRS1925647 GSM2461353 cell culture from nervous system, ds-shep dsRNA targeting shep RNA-Seq PRJNA362227 GEO public sra ncbi 0 Illumina HiSeq 2500 SINGLE cDNA TRANSCRIPTOMIC 2017-01-17 Drosophila melanogaster ILLUMINA 2017-11-22 SRP096911 BG3 From d9c3f78acb78d7796d672150c24a6023230479ac Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 5 Nov 2025 16:35:28 +0000 Subject: [PATCH 181/196] fix trackhub imports --- workflows/chipseq/chipseq_trackhub.py | 2 +- workflows/rnaseq/rnaseq_trackhub.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/chipseq/chipseq_trackhub.py b/workflows/chipseq/chipseq_trackhub.py index 9d7ba3eb..80637a28 100644 --- a/workflows/chipseq/chipseq_trackhub.py +++ b/workflows/chipseq/chipseq_trackhub.py @@ -11,7 +11,6 @@ import os import sys -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) import re import argparse from pprint import pprint @@ -24,6 +23,7 @@ from trackhub.helpers import filter_composite_from_subgroups, dimensions_from_subgroups, hex2rgb from trackhub.upload import upload_hub, stage_hub +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) from lib import chipseq, utils ap = argparse.ArgumentParser() diff --git a/workflows/rnaseq/rnaseq_trackhub.py b/workflows/rnaseq/rnaseq_trackhub.py index d6bb8cf4..92199cad 100644 --- a/workflows/rnaseq/rnaseq_trackhub.py +++ b/workflows/rnaseq/rnaseq_trackhub.py @@ -21,7 +21,7 @@ from trackhub.upload import upload_hub, stage_hub import argparse -sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) from lib import utils ap = 
argparse.ArgumentParser() From 8f06eac28fa86fb4c535fad822ad8fe32e53f410 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 5 Nov 2025 16:38:51 +0000 Subject: [PATCH 182/196] test SRA csv --- .circleci/config.yml | 2 +- test/test_configs/test_sra_sampletable.csv | 3 +++ test/test_configs/test_sra_sampletable.tsv | 3 --- 3 files changed, 4 insertions(+), 4 deletions(-) create mode 100644 test/test_configs/test_sra_sampletable.csv delete mode 100644 test/test_configs/test_sra_sampletable.tsv diff --git a/.circleci/config.yml b/.circleci/config.yml index 60fa6a54..51d81530 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -263,7 +263,7 @@ variables: # SRA test ./run_test.sh -k -p -j2 --use-conda \ --configfile $ORIG/test/test_configs/test_rnaseq_config.yaml \ - --config sampletable=$ORIG/test/test_configs/test_sra_sampletable.tsv + --config sampletable=$ORIG/test/test_configs/test_sra_sampletable.csv # SRA SE only ./run_test.sh -k -p -j2 --use-conda \ diff --git a/test/test_configs/test_sra_sampletable.csv b/test/test_configs/test_sra_sampletable.csv new file mode 100644 index 00000000..34f57090 --- /dev/null +++ b/test/test_configs/test_sra_sampletable.csv @@ -0,0 +1,3 @@ +Run,LibraryLayout +SRR948304,PAIRED +SRR948305,PAIRED diff --git a/test/test_configs/test_sra_sampletable.tsv b/test/test_configs/test_sra_sampletable.tsv deleted file mode 100644 index 0f55c436..00000000 --- a/test/test_configs/test_sra_sampletable.tsv +++ /dev/null @@ -1,3 +0,0 @@ -samplename Run layout -sra2 SRR948304 PAIRED -sra3 SRR948305 PAIRED From a2cf9d88ceca278f5944ea8874b57380c6d055b6 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 5 Nov 2025 16:47:39 +0000 Subject: [PATCH 183/196] df -> sampletable --- lib/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/utils.py b/lib/utils.py index b3182567..2535286f 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ 
-766,9 +766,9 @@ def preflight(config, sampletable): # For non-SRA sampletables if "orig_filename" in sampletable.columns: - fns = df["orig_filename"] - if "orig_filename_R2" in df.columns: - fns = pd.concat([fns, df["orig_filename_R2"]]) + fns = sampletable["orig_filename"] + if "orig_filename_R2" in sampletable.columns: + fns = pd.concat([fns, sampletable["orig_filename_R2"]]) if len(fns.unique()) < len(fns): raise ValueError("Fastq filenames non unique, check the sampletable\n") From fe999e27eaeee3acc07a99a7e52a204f3a6f042a Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 5 Nov 2025 18:01:02 +0000 Subject: [PATCH 184/196] use additional config for rnaseq trackhub --- .circleci/config.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 51d81530..9014546f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -235,7 +235,9 @@ variables: ./run_test.sh --use-conda -j2 -k -p \ --configfile $ORIG/test/test_configs/test_rnaseq_config.yaml - python rnaseq_trackhub.py config/config.yaml config/hub_config.yaml + python rnaseq_trackhub.py \ + config/config.yaml config/hub_config.yaml \ + --additional-configs $ORIG/test/test_configs/test_rnaseq_config.yaml conda activate $LCDBWF_ENV_R From 23164b1fa1bc3468ed1012868eaac7b3c6a074c9 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 6 Nov 2025 10:24:51 -0500 Subject: [PATCH 185/196] update reference configs --- include/reference_config_templates/Homo_sapiens/GENCODE.yaml | 2 -- .../reference_config_templates/Mus_musculus/GENCODE_M25.yaml | 2 -- .../Saccharomyces_cerevisiae/S288C.yaml | 2 ++ 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/reference_config_templates/Homo_sapiens/GENCODE.yaml b/include/reference_config_templates/Homo_sapiens/GENCODE.yaml index 507877bb..97a48e93 100644 --- 
a/include/reference_config_templates/Homo_sapiens/GENCODE.yaml +++ b/include/reference_config_templates/Homo_sapiens/GENCODE.yaml @@ -7,8 +7,6 @@ organism: "Homo sapiens" genome: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/GRCh38.primary_assembly.genome.fa.gz" - postprocess: lib.postprocess.default annotation: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/gencode.v49.primary_assembly.annotation.gtf.gz" - postprocess: lib.postprocess.default diff --git a/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml b/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml index 99120cbf..7899df9d 100644 --- a/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml +++ b/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml @@ -7,9 +7,7 @@ organism: "Mus musculus" genome: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/GRCm38.primary_assembly.genome.fa.gz" - postprocess: lib.postprocess.default annotation: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.primary_assembly.annotation.gtf.gz" - postprocess: lib.postprocess.default diff --git a/include/reference_config_templates/Saccharomyces_cerevisiae/S288C.yaml b/include/reference_config_templates/Saccharomyces_cerevisiae/S288C.yaml index 4e0204d0..62e68ea5 100644 --- a/include/reference_config_templates/Saccharomyces_cerevisiae/S288C.yaml +++ b/include/reference_config_templates/Saccharomyces_cerevisiae/S288C.yaml @@ -2,6 +2,8 @@ # From Ensembl. According to README in this FTP dir, if there's no primary # assembly then the toplevel is assumed to be the primary assembly. 
+ +organism: "Saccharomyces cerevisiae" genome: url: "https://ftp.ensembl.org/pub/release-115/fasta/saccharomyces_cerevisiae/dna/Saccharomyces_cerevisiae.R64-1-1.dna_sm.toplevel.fa.gz" From 5bc2b4bc6574a1151eb4a250021b0c417c26fbcf Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 6 Nov 2025 10:26:06 -0500 Subject: [PATCH 186/196] initial overhaul of docs --- docs/README.md | 30 - docs/_static/balloon.min.css | 1 - docs/_static/custom.css | 30 - docs/autodoc.rst | 9 - docs/changelog.rst | 912 ----------------------------- docs/chipseq.png | Bin 25746 -> 0 bytes docs/chipseq.rst | 33 -- docs/conda.rst | 209 ------- docs/conf.py | 1 - docs/config-yaml.rst | 587 ------------------- docs/config.rst | 389 ++++++++++-- docs/decisions.rst | 39 +- docs/developers.rst | 116 ---- docs/downstream-rnaseq.rst | 82 --- docs/external.png | Bin 11515 -> 0 bytes docs/faqs.rst | 189 ------ docs/functional-enrichment-rmd.rst | 122 ---- docs/gene-patterns-rmd.rst | 78 --- docs/generate_guide.py | 185 ------ docs/getting-started.rst | 200 ++----- docs/guide-to-files.txt | 122 ---- docs/guide.rst | 257 -------- docs/index.rst | 54 +- docs/integrative.rst | 82 --- docs/lib.chipseq.rst | 23 - docs/lib.common.rst | 35 -- docs/lib.patterns_targets.rst | 22 - docs/patterns-targets.rst | 140 ----- docs/references-config.rst | 603 ------------------- docs/references.png | Bin 10622 -> 0 bytes docs/references.rst | 82 --- docs/rnaseq-rmd.rst | 587 ------------------- docs/rnaseq.png | Bin 34326 -> 0 bytes docs/rnaseq.rst | 53 -- docs/sampletable.rst | 272 --------- docs/tests.rst | 183 ------ docs/toc.rst | 12 - docs/workflows.rst | 284 +++++---- 38 files changed, 600 insertions(+), 5423 deletions(-) delete mode 100644 docs/README.md delete mode 100644 docs/_static/balloon.min.css delete mode 100644 docs/_static/custom.css delete mode 100644 docs/autodoc.rst delete mode 100644 docs/changelog.rst delete mode 100644 docs/chipseq.png delete mode 100644 
docs/chipseq.rst delete mode 100644 docs/conda.rst delete mode 100644 docs/config-yaml.rst delete mode 100644 docs/developers.rst delete mode 100644 docs/downstream-rnaseq.rst delete mode 100644 docs/external.png delete mode 100644 docs/faqs.rst delete mode 100644 docs/functional-enrichment-rmd.rst delete mode 100644 docs/gene-patterns-rmd.rst delete mode 100644 docs/generate_guide.py delete mode 100644 docs/guide-to-files.txt delete mode 100644 docs/guide.rst delete mode 100644 docs/integrative.rst delete mode 100644 docs/lib.chipseq.rst delete mode 100644 docs/lib.common.rst delete mode 100644 docs/lib.patterns_targets.rst delete mode 100644 docs/patterns-targets.rst delete mode 100644 docs/references-config.rst delete mode 100644 docs/references.png delete mode 100644 docs/references.rst delete mode 100644 docs/rnaseq-rmd.rst delete mode 100644 docs/rnaseq.png delete mode 100644 docs/rnaseq.rst delete mode 100644 docs/sampletable.rst delete mode 100644 docs/tests.rst diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index 45ed3871..00000000 --- a/docs/README.md +++ /dev/null @@ -1,30 +0,0 @@ -This documentation uses [sphinx](http://www.sphinx-doc.org) to buid the documentation. - -The built documentation from the master branch can be found at -https://lcdb.github.io/lcdb-wf. If you want to build a local copy of the -documentation: - -- create an environment from the `docs/docs-requirements.txt` file -- activate it -- run the Makefile in `docs` - - -That is: - -```bash -# Create env -conda create -n lcdb-wf-docs \ - --file docs/docs-requirements.txt \ - --channel bioconda \ - --channel conda-forge \ - --channel lcdb - -# activate it -source activate lcdb-wf-docs - -# build the docs -cd docs -make html -``` - -The locally-built docs will be in `docs/_build/html/toc.html`. 
diff --git a/docs/_static/balloon.min.css b/docs/_static/balloon.min.css deleted file mode 100644 index 268c8a8e..00000000 --- a/docs/_static/balloon.min.css +++ /dev/null @@ -1 +0,0 @@ -[data-balloon]{position:relative}[data-balloon]:after,[data-balloon]:before{-ms-filter:"progid:DXImageTransform.Microsoft.Alpha(Opacity=0)";filter:alpha(opacity=0);-khtml-opacity:0;-moz-opacity:0;opacity:0;pointer-events:none;-webkit-transition:all .18s ease-out .18s;transition:all .18s ease-out .18s;bottom:100%;left:50%;position:absolute;z-index:10;-webkit-transform:translate(-50%,10px);-ms-transform:translate(-50%,10px);transform:translate(-50%,10px);-webkit-transform-origin:top;-ms-transform-origin:top;transform-origin:top}[data-balloon]:after{background:rgba(17,17,17,.9);border-radius:4px;color:#fff;content:attr(data-balloon);font-size:12px;padding:.5em 1em;white-space:nowrap;margin-bottom:11px}[data-balloon]:before{background:url('data:image/svg+xml;utf8,') no-repeat;background-size:100% auto;height:6px;width:18px;content:"";margin-bottom:5px}[data-balloon]:hover:after,[data-balloon]:hover:before{-ms-filter:"progid:DXImageTransform.Microsoft.Alpha(Opacity=100)";filter:alpha(opacity=100);-khtml-opacity:1;-moz-opacity:1;opacity:1;pointer-events:auto;-webkit-transform:translate(-50%,0);-ms-transform:translate(-50%,0);transform:translate(-50%,0)}[data-balloon][data-balloon-break]:after{white-space:normal}[data-balloon-pos=down]:after,[data-balloon-pos=down]:before{bottom:auto;left:50%;top:100%;-webkit-transform:translate(-50%,-10px);-ms-transform:translate(-50%,-10px);transform:translate(-50%,-10px)}[data-balloon-pos=down]:after{margin-top:11px}[data-balloon-pos=down]:before{background:url('data:image/svg+xml;utf8,') no-repeat;background-size:100% 
auto;height:6px;width:18px;margin-top:5px;margin-bottom:0}[data-balloon-pos=down]:hover:after,[data-balloon-pos=down]:hover:before{-webkit-transform:translate(-50%,0);-ms-transform:translate(-50%,0);transform:translate(-50%,0)}[data-balloon-pos=left]:after,[data-balloon-pos=left]:before{bottom:auto;left:auto;right:100%;top:50%;-webkit-transform:translate(10px,-50%);-ms-transform:translate(10px,-50%);transform:translate(10px,-50%)}[data-balloon-pos=left]:after{margin-right:11px}[data-balloon-pos=left]:before{background:url('data:image/svg+xml;utf8,') no-repeat;background-size:100% auto;height:18px;width:6px;margin-right:5px;margin-bottom:0}[data-balloon-pos=left]:hover:after,[data-balloon-pos=left]:hover:before{-webkit-transform:translate(0,-50%);-ms-transform:translate(0,-50%);transform:translate(0,-50%)}[data-balloon-pos=right]:after,[data-balloon-pos=right]:before{bottom:auto;left:100%;top:50%;-webkit-transform:translate(-10px,-50%);-ms-transform:translate(-10px,-50%);transform:translate(-10px,-50%)}[data-balloon-pos=right]:after{margin-left:11px}[data-balloon-pos=right]:before{background:url('data:image/svg+xml;utf8,') no-repeat;background-size:100% auto;height:18px;width:6px;margin-bottom:0;margin-left:5px}[data-balloon-pos=right]:hover:after,[data-balloon-pos=right]:hover:before{-webkit-transform:translate(0,-50%);-ms-transform:translate(0,-50%);transform:translate(0,-50%)}[data-balloon-length]:after{white-space:normal}[data-balloon-length=small]:after{width:80px}[data-balloon-length=medium]:after{width:150px}[data-balloon-length=large]:after{width:260px}[data-balloon-length=xlarge]:after{width:90vw}@media screen and (min-width:768px){[data-balloon-length=xlarge]:after{width:380px}}[data-balloon-length=fit]:after{width:100%} \ No newline at end of file diff --git a/docs/_static/custom.css b/docs/_static/custom.css deleted file mode 100644 index b83f5902..00000000 --- a/docs/_static/custom.css +++ /dev/null @@ -1,30 +0,0 @@ -pre { - font-size: 0.7em; -} - - -h3 
{ - font-style: italic; -} - -h2 { - /* text-decoration: underline; */ -} - -code { - background-color: #fff; - font-size: 0.8em; - color: #444; -} - -code.file { - font-style: italic; -} - -/* make fixed sidebar scrollable - from: https://stackoverflow.com/questions/57031848/sphinx-alabaster-theme-scroll-inside-of-fixed-sidebar -*/ -div.sphinxsidebar { - max-height: 90%; - overflow-y: auto; -} diff --git a/docs/autodoc.rst b/docs/autodoc.rst deleted file mode 100644 index 7217f828..00000000 --- a/docs/autodoc.rst +++ /dev/null @@ -1,9 +0,0 @@ -Module documentation -==================== - -.. toctree:: - :maxdepth: 2 - - lib.common - lib.chipseq - lib.patterns_targets diff --git a/docs/changelog.rst b/docs/changelog.rst deleted file mode 100644 index 22039944..00000000 --- a/docs/changelog.rst +++ /dev/null @@ -1,912 +0,0 @@ -Changelog -========= - -v1.10.3 -------- - -- improve the deploy script (thanks @aliciaaevans) -- support the epic2 peak-caller for the ChIP-seq workflow (thanks @Mira0507) -- for later versions of featureCounts, add ``--countReadPairs`` argument to RNA-seq workflow (@therealgenna) - -v1.10.2 -------- - -Minor bugfix release. - -- Fix multiqc configs so that they coorectly ignore any cutadapt fastqc zips when building the raw fastq section -- Fix multiqc config for chipseq so it correctly cleans the ``_R2`` extension to better support PE ChIP-seq-like workflows -- Fix functional enrichment label truncation to ensure that truncated labels are unique - -v1.10.1 -------- -This is a bugfix and minor patch release. - -- Bugfix: the references workflow was missing the ``resources:`` directives; - they have now been added. - -- Bugfix: kallisto strandedness was set incorrectly for libraries using - ligation prep (fr-secondstrand) - -- The new ``utils.autobump`` function can be used to easily specify default and - incremented resources, and the ``utils.gb`` and ``utils.hours`` make it - a little easier to specify when autobump is not required. 
- - In the following example, memory will be set to 8 * 1024 MB and will - increment by that much each retry. The runtime will be set to 2 * 60 minutes, - and will increment by 10 * 60 minutes each retry. The disk will be set to 100 - * 1024 MB, and will not increase each retry. - - .. code-block:: python - - resources: - mem_mb=autobump(gb=8), - runtime=autobump(hours=2, increment_hours=10), - disk_mb=gb(100) - -- WRAPPER_SLURM no longer has the ``--latency-wait=300``, - ``--max-jobs-per-second=1``, and ``--max-status-checks-per-second=0.01`` - which would override any profile settings. - -- In RNA-seq and ChIP-seq, the cutadpt rule now defaults to using - ``--nextseq-trim 20`` instead of ``-q 20``, to better handle the majority of - sequencing data we have recently been working with (NovaSeq). See `this - section of the cutadapt docs - `_ for - details. - -- Updated requirements to use a recent version of salmon to avoid segfaults - -- rnaseq.Rmd, when saving the Rds file at the end, now disables compression. - This can have a dramatic improvement on downstream performance for - a reasonable disk space cost. - -- functional-enrichment.Rmd, now supports KEGG pathways & parallel operation. - -- functional-enrichment.Rmd, gene-patterns.Rmd, now saves Rds file at the - end (without compression) adding the respective object lists. - -- added ``--overlap 6`` to cutadapt to avoid greedy trimming - - -v1.10 ------ -The major change here is refactoring the Snakefiles to use the ``resources:`` -directive in each rule, and removing the ``--clusterconfig`` mechanism which -has long been deprecated. - -For running on a cluster, this requires a `profile -`_. -E.g., on `NIH's Biowulf `_, use the `NIH-HPC -snakemake_profile `_. 
- -General -~~~~~~~ -- No longer using clusterconfig, instead using resources to configure cluster resources -- Migrated to a unified testing script that simplifies local and CI testing -- If sampletable is from SRA, raise an error if a Layout column can't be found - (to prevent incorrect interpretation of samples as single-end) -- Ensure bam indexes are made for the markdups bams, even if bigwigs are not created -- Remove libsizes table, which was largely redundant with fastqc results - -RNA-seq -~~~~~~~ -- Fix R tests -- All ``lcdbwf`` R functions use the ``:::`` namespace lookup syntax -- Fix library loads in rnaseq.Rmd to ensure they come before parallelization configuration -- New function ``lcdbwf:::lfc_scatter`` for comparing multiple DESeq2 contrasts -- Updates and fixes to ``gene-patterns.Rmd`` - - -v1.9 ----- - -This version has substantial changes in the ``rnaseq.Rmd`` file to streamline -its use in a production environment. This involves moving most of the code -complexity into the ``lcdbwf`` R package and using a new config file as much as -possible. See details below. - -General -~~~~~~~ -- environments have been updated with recent versions of all tools -- WRAPPER_SLURM arguments updated with arguments better suited for cluster submission -- PhiX reference configs have been removed -- compatibility with Python 3.10 -- fastq-dump rules have been converted to scripts. This is because sra-tools in - versions earlier than 3.0 have issue with SSL certs, however sra-tools=3 - cannot be installed alongside recent versions of salmon (due to conflicting - pinnings with the ``icu`` package). Therefore, fastq-dump is now run as - a script in its own conda environment. -- new idxstats rule for chipseq and rnaseq - -RNA-seq -~~~~~~~ - -**This version has major changes to** ``rnaseq.Rmd``. Briefly: - -1. This file has been overhauled to be driven by a config file. 
This - dramatically reduces the need to scroll through the RMarkdown file and make - all the customizations for a particular experiment. Now, editing the config - file sets up most of the project-specific components. Note that contrasts - still need to be customized in the Rmd file. -2. The narrative and explanatory text has been moved to ``text.yaml`` and is - included at render time. This reduces the need to scroll through lots of - boilerplate text in the RMarkdown while still retaining the ability to - easily edit it. -3. Most of the complexity has been offloaded to the ``lcdbwf`` R package. -4. Caches are much improved. See the :ref:`downstream-detailed` section for - more information. -5. Functional enrichment is moved into a separate RMarkdown file. - -Downstream RNA-seq config -,,,,,,,,,,,,,,,,,,,,,,,,, - -The file, `workflows/rnaseq/downstream/config.yaml` is heavily commented to -describe the various settings. The sections of the config are designed such -that they can be used as additional chunk options to chunks in which they are -used. This additional chunk option is used by RMarkdown to compute the hash of -the chunk. The result is that making a change in the config file is sufficient -to invalidate the cache of any chunks that specify that section as a chunk -option. - -Complexity moved to ``lib/lcdbwf/R`` -,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, - -Another major change is that most of the complexity in the ``rnaseq.Rmd`` file -has been factored out into the ``lcdbwf`` R package that is stored inn -``lib/lcdbwf``. While this means that all code is no longer included in the -final rendered HTML file, it does make the Rmd much more streamlined to work -with. It also has the side effect of making it easier to write unit tests on -separate functions. - -Many helper functions have been added to the ``lcdbwf`` R package, including -ones to streamline the creation of dds and results objects, composing and saving -them, and generating many of the outputs. 
- -Improved caching of results chunks -,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, - -A somewhat major change is a new strategy for allowing ``results()`` calls to be -split across multiple, independently-cached chunks that are then properly merged -together into a single ``res.list`` object while handling dependencies and -parallelization (thanks to `@njohnso6 `_). This -dramatically speeds up the process of incrementally adding contrasts to complex -experimental designs. - -Other changes -,,,,,,,,,,,,, - -In addition to these major changes, there are also many other improvements -to ``rnaseq.Rmd``: - - - AnnotationHub databases are only retrieved from cache when they are - needed. This dramatically speeds up rendering of the HTML, since before - the OrgDb would always load no matter what. - - Toggle Kallisto or Salmon quantification with a simple true/false; this - automatically sums to gene level using automatically retrieved TxDb. This - also now supports creating dds objects from featureCounts, Salmon, or - Kallisto in such a way that they can be easily compared with each other. - - ``lcdbwf::compose_results()`` to combine res_list and dds_list objects - together by inspecting the global namespace for specially-named objects - - Helper functions for retrieving global config and data structures (e.g., - ``lcdbwf::get_config()``, ``lcdbwf::get_dds()``) - - Helper function ``lcdbwf::match_from_dots`` for working with `...` - arguments and splitting them up to only go to the functions they are - intended for - - Much faster to attach info (e.g., adding SYMBOL to all results) since the - AnnotationDbi calls are only done once instead of for each results - object. - - Refactored functional enrichment to be much more generalized, currently - using Gene Ontology and MSigDB. MSigDb, via the ``msigdbr`` package, is - available for multiple species and so this incorporates Reactome and - KEGG. 
But the generalized method can be applied to any arbitrary gene - sets, allowing for much more customization. - - Fixes to clusterProfiler::emapplot calls in particular corner cases - - Functional enrichment is now a completely separate file, using the - ``combined.Rds`` file as an intermediate between ``rnaseq.Rmd`` and - ``functional_enrichment.Rmd``. - - All-in-one enrichment function that runs either overrepresentation or - GSEA. Makes it much easier to do *ad hoc* tests. - - Helper function ``lcdbwf::enrich_list_lapply()`` to apply arbitrary - functions to the highly-nested `enrich_list` data structure - - Helper function ``lcdbwf::collect_objects`` to help compile discovered - results objects - - ``lcdbwf::get_sig()`` has more options for what to return - - Plotting wrappers for clusterProfiler plot functions, allowing plots to be - configured via the config file. - - New dds diagnostics and results diagnostics functions and sections of the - Rmd, useful for troubleshooting - - Refactored the results tabs: MA plots come first; ensure 10 genes are always plotted in MA - plots, added volcano plots with labeled genes, removed top 3 and bottom - 3 gene plots - - PCA plots using plotly no longer need "unrolled" for-loops; multiple PCA - coloring and clustered heatmap row side colors are now configured in the - YAML config file - - Moved size factor plots and gene version removal to lcdbwf package - - Use datatable to show initial sampletable for cleaner output - - Make original dds_initial object the same way as later dds objects and - always using a design of ``~1`` to be used in PCA and heatmaps - - "Differential expression" header moved so that code is no longer hidden - under the size factors plot - - Option for filling in NA in symbol with Ensembl IDs - - collapseReplicates2 uses ``collapse_by`` rather than ``combine.by`` - - Updated the code style throughout to use the tidyverse/google style guide - - RNA-seq differential expression output is additionally 
included in an - Excel file with one sheet per contrast. - -Tests -~~~~~ - -- ``lcdbwf`` R package now has its own tests via ``devtools`` and ``testthat`` -- recent versions of Snakemake are broken when ``--until`` is used in certain - circumstances; a ChIP-seq test has been disabled temporarily. -- after a successful test, the environment is written as an artifact on circleci - -References -~~~~~~~~~~ - -- Fixed a longstanding issue with *S. cerevisiae*, now the GFF file is properly converted to GTF. - -v1.8 ----- - -General -~~~~~~~ - -- Complete shift to using pinned ``env.yaml`` files to specify conda - environments, and using ``mamba`` for building environments (consistent with - recent versions of Snakemake). This is now reflected in documentation and - the updated-and-improved ``deploy.py``. - -- Reorganization/cleanup of the ``include`` directory - -- Added conda troubleshooting notes to the documentation (see - :ref:`conda-troubleshooting`). - -- The ``lib.helpers.preflight`` function no requires the first column of the - sampletable to be named `samplename` when checking configs. - -- Improvements to the deployment script ``deploy.py``: - - - now requires Python >3.6 - - proper logs (so you can easily see how long it takes to build an env) - - supports downloading and running the script directly, which will clone - a temporary copy and deploy from there - - using Control-C to stop the deployment will also stop mamba/conda - - colored output - - mamba is used by default, but ``--conda-frontend`` will use conda instead - -- fastq-dump log is sent to file rather than printed to stdout - -- Threads: cutadapt single-end now uses specified threads (it was using - 1 thread by default); use 6 threads for fastqc - -- Added new preflight checks for RNA-seq and ChIP-seq specific configs. 
- -- Added a ``run_complex_test.sh`` driver script for testing the workflows on - full-scale publicly available data - -RNA-seq -~~~~~~~ - -- **Configuration change:** The ``stranded:`` field is now required for RNA-seq. - This is used to choose the correct parameters for various rules, and avoids - one of the main reasons to edit the Snakefile. See :ref:`cfg-stranded` for - more details on its use. - -- added ``stranded:`` field to all configs used in testing - -- The ``strand_check`` rule now runs MultiQC for a convenient way of evaluating - strandedness of a library. - -- Kallisto is now supported in both the RNA-seq Snakefile, references - Snakefile, included reference configs, and downstream ``rnaseq.Rmd`` - - -References -~~~~~~~~~~ - -- When checking URLs in reference configs, don't use ``curl`` to check - ``file://`` URIs. - -- There is a new feature for reference configs that allows chaining - post-processing functions together, see :ref:`advanced-postprocessing`. This - means that it is possible, for example, to add ERCC spike-ins (which need - post-processing) onto references that themselves need post-processing. - -- ``lib/postprocess/ercc.py`` has new helper functions for adding ERCC - spike-ins to fasta files and GTF files. - -- added ``'kallisto'`` to included reference configs - -ChIP-seq -~~~~~~~~ - -- symlinks rule is now local -- added collectinsertsizes pattern to support PE ChIP-seq experiments -- merging bigwigs log no longer goes to stdout - - -v1.7 ----- - -Setup -~~~~~ - -Use mamba for installation of environments, consistent with Snakemake recommendations - -Testing -~~~~~~~ - -- We now recommend using `mamba `_ to - create conda environments. This is dramatically faster and solves some - dependency issues. Our automated tests now use this. - -- We have moved from requirements.txt files to env.yaml files. 
We also now - encourage the use of the strictly-pinned environments for a more stable - experience to hopefully avoid transient issues in the packaging ecosystem. - -- ``tbb=2020.2`` as a dependency to fix a recent packaging issue with conda-forge. - -- many documentation improvements - -- symlinks rule is only set to localrule when it exists (it does not exist when - running an analysis exclusively from SRA) - -References -~~~~~~~~~~ - -- updated URLs for those that have changes (e.g., Sanger -> EBI; using https - instead of ftp for UCSC-hosted genomes) - -- new ``gff2gtf`` post-process tool for when an annotation is only available as - GFF. *S. pombe* needs this, for example, and the - `Schizosaccharomyces_pombe.yaml`` reference config has been updated - accordingly. - - -- The references workflow no longer reads the config file in its directory. - This fixes some subtle overwriting issues when providing config files or - items from the command line during as is used during certain test runs. If - running the references workflow alone, it must be called with - ``--configfile`` - -RNA-seq -~~~~~~~ - -- featureCounts now uses BAM files with duplicates marked. Previously if you - wanted to run featureCounts in a mode where it excluded duplicates you would - need to reconfigure rules. - -- improved comments in RNA-seq downstream RMarkdown files - -Testing -~~~~~~~ - -- new test that checks all URLs identified in config files to ensure that the - included reference files remain valid - -- there is now a separate ``run_downstream_test`` script` - -- simplified the CircleCI DAG to optimize testing resources - -v1.6 ----- - -References -~~~~~~~~~~ -- overhaul the way transcriptome fastas are created. Instead of requiring - separate download, they are now created out of the provided GTF and fasta - files. The reference config section now uses keys ``genome:``, - ``transcriptome:``, and ``annotation:`` rather than the ``fasta:`` and - ``gtf:`` keys. 
-- **backwards-incompatible change:** reference config files have been updated - to reflect the changes in the references workflow -- Update PhiX genome fasta to use NCBI rather than Illumina iGenomes - -ChIP-seq workflow -~~~~~~~~~~~~~~~~~ -- ChIP-seq workflow now properly supports paired-end reads -- A ChIP-seq workflow can now be run when the ``chipseq:`` and/or - ``peak_calling:`` sections are omitted. -- added a missing bowtie2 config entry in ``clusterconfig.yaml`` which could - result in out-of-memory errors when submitting to a cluster using that file - - -RNA-seq workflow -~~~~~~~~~~~~~~~~ -- if colData is a tibble this no longer causes issues for importing counts -- dupRadar removed from RNA-seq workflow. We ended up never using it, and it - depends on R which we've since removed from the main environment. -- new ``strand_test`` rule, which can be run explicitly with ``snakemake -j2 - strand_check``. This generates ``strandedness.tsv`` in the current directory, - which is the summarize output of RSeQC's ``infer_experiment.py`` across all - samples. -- implement STAR two-pass alignment. Default is still single-pass. -- Clean up hard-coded STAR indexing Log.out file -- Include ``ashr`` and ``ihw`` Bioconductor packages in the R requirements, for - use with recent versions of DESeq2. - - -RNA-seq downstream -~~~~~~~~~~~~~~~~~~ - -- Functional enrichment and gene patterns are now separate child documents. - This makes it easier to turn them on/off by only needing to adjust the chunk - options of the child chunk -- Created a new documentation method for rnaseq.Rmd. Now there is a separate, - dedicated documentation page with sections that exactly correspond to each - named chunk in the Rmd, as well as a tool for ensuring that chunks and docs - stay synchronized. See :ref:`rnaseqrmd` for the new docs. 
-- New ``counts.df`` and ``counts.plot`` functions to make it much easier to - make custom dotplots of top counts by melting and joining the counts table - with the metadata in colData. -- DEGpatterns cluster IDs are now added as additional columns in the output - TSVs for each contrast -- Many functions in the rnaseq.Rmd now expect a list of :term:`dds` objects. - See :ref:`dds_list` for more info on this. -- Created a new R package, ``lcdbwf`` stored in :file:`lib/lcdbwf`. This can be - edited in place, and it is loaded from disk within ``rnaseq.Rmd``. -- Modified some output keys to support recent versions of Snakemake, for which - ``count`` is a reserved keyword - - -General -~~~~~~~ -- Conda environments are now split into R and non-R. See :ref:`conda-envs` for - details. Updated ``deploy.py`` accordingly -- symlinks rules are now set to be localrules -- updated workflows to work on recent Snakemake versions -- split environments into non-R and R. This, along with a loose pinning of - versions (``>=``), dramatically speeds up environment creation. 
-- updates to support latest Snakemake versions -- improvements to testing: - - environment YAML files, rendered HTML, and docs are stored as artifacts on CircleCI - - consolidations of some RNA-seq tests to reduce total time - - additional comments in the test config yaml to help new users understand the system -- new "preflight check" function is run to hopefully catch errors before running workflows -- updates to support recent Picard versions -- added wildcard constraints to help Snakemake solve DAG - - -v1.5.3 ------- - -General -~~~~~~~ -- default 12-hr wall time in WRAPPER_SLURM -- update .gitignore (`#223 `_) -- remove the FastQC status checks section from the MultiQC report (which shows - up in recent MultiQC versions) (`#246 `_ - -Bugs -~~~~ - -- add bed12 conversion for all species with default reference configs -- presence of an orig_filename_R2 in sampletable is sufficient to consider the - experiment PE -- ensure DEGpattern output only contains unique genes -- bring back featurecounts in multiqc report -- "attach" chunk in rnaseq.Rmd was not properly set to depend on the "results" chunk - -RNA-seq -~~~~~~~ - -- dds objects can now be created from a full featureCounts input file and - a subsetted colData table, if subset.counts=TRUE -- improve the dependencies between rnaseq.Rmd chunks so that cache=TRUE behaves - as expected: (`#232 `_) -- add plots for rnaseq.Rmd size factors (`#222 `_) -- run rseqc instead of CollectRnaSeqMetrics (the multiqc output is nicer for - it, and it's pretty much doing the same thing) (`#218 `_) -- when converting Ensembl to symbol, if there is no symbol then fall back to - the Ensembl ID to avoid NA (`#246 - `_) -- in rnaseq.Rmd, all caches will be invalidated if the sampletable or the - featurecounts table have changed. - -Tests -~~~~~ -- using continuumio/miniconda3 container; finally got en_US.utf8 locale - installed and working correctly in that container so that multiqc works. 
- - -v1.5.2 ------- - -Bug fixes -~~~~~~~~~ - -- When some samples were substrings of other samples (e.g., `WT_1_1` and - `WT_1_10`), DESeqDataSetFromCombinedFeatureCounts was assigning the wrong - names. This has now been fixed in `helpers.Rmd`. - -v1.5.1 ------- - -Bug fixes -~~~~~~~~~ - -- DESeqDataSetFromCombinedFeatureCounts (added in v1.5) was incorrectly - assigning labels to samples when the order of the sampletable did not match - the order of the samples in the featureCounts table columns. This has been - fixed. - -General -~~~~~~~ - -- `deploy.py` deployment script now only pays attention to files checked in to - version control and optionally can create a conda environment in the target - directory. - -- tests now work out of a newly-deployed instance to better reflect real-world - usage - - -ChIP-seq and RNA-seq -~~~~~~~~~~~~~~~~~~~~ -- reorder cutadapt commands to avoid a MultQC parsing bug in the cutadapt - module (see https://github.com/ewels/MultiQC/issues/949) - -RNA-seq -~~~~~~~ -The majority of these changes affect ``rnaseq.Rmd``: - -- modifications to MultiQC config to get back featureCounts output -- `plotMA.label` function (in ``helpers.Rmd``) now defaults to FDR < 0.1 - (instead of 0.01), and additionally supports labeling using different columns - of the results object (e.g., "symbol"). -- remove some now-redundant featureCounts code -- add a comment showing where to collapse replicates -- convert colData's first column to rownames -- implement lower limit for DEGpatterns clustering (default is 0, but can - easily set to higher if you're getting issues) -- expose arbitrary additional function arguments to ``top.plots``. This allows - different `intgroup` arguments to be passed to the `my.counts` function, - enabling different ways of plotting the gene dotplots. 
- - -v1.5 (Sept 2019) ----------------- - -Major change: **it is no longer possible to mix single-end and paired-end -samples within the same run of the workflow.** See `#208 -`_ and the corresponding issue -description at `#175 `_. - -This version also has many improvements to the ``rnaseq.Rmd`` file for RNA-seq, -as described below. - -RNA-seq -~~~~~~~ - -Many changes and improvements to ``rnaseq.Rmd``, including: - -- Differential analysis summaries now include labeled MA plots (`#192 `_) -- PCA plots now use plotly for improved insepction of samples (`#192 `_ -- don't use knitrBootstrap any more (`#192 `_ -- heatmaps use heatmaply package for better interaction (`#192 `_ -- allow ``sel.list`` to be used for UpSet plots and fix some typos `#205 `_ -- workaround for degPatterns for corner cases where there are few clusters because of the ``minc`` parameter (`#205 `_) -- alpha and lfc.thresh are now pulled out into a separate chunk (`#206 `_) -- Support AnnotationHub http proxy handling in new version of AnnotationHub (`#207 `_). - -As well as the following changes to other parts of the RNA-seq workflow, such as: - -- better bigWig file nomenclature (`#194 `_), uses "pos" and "neg". -- featureCounts only runs once on all BAMs rather than individual samples (`#195 `_) -- support `rseqc infer_experiment`, which replaces running featureCounts in multiple stranded modes (`#199 `_, `#203 `_) -- use ``--validateMappings`` for salmon (`#203 `_) - -References -~~~~~~~~~~ -- fix typo in *S. 
pombe* name - -All workflows -~~~~~~~~~~~~~ - -- Documentation now recommends creating an environment for each directory using the `-p` argument (`#195 `_) - - -v1.4.2 (Jul 2019) ------------------ - -Bugfixes -~~~~~~~~ - -- Don't require ChIP-seq configs to have at least one block for each supported - peak-caller - -v1.4.1 (Jul 2019) ------------------ - -RNA-seq -~~~~~~~ - -- KEGG results were not being added to the ``all.enrich`` list in ``rnaseq.Rmd`` -- symlinking bigWigs is now a local rule -- default cutadapt options have changed to reflect current recommendations from - the author, and the cutadapt rule is now explicity using arguments rather - than requiring a separate ``adapters.fa`` file. -- featureCounts now auto-detects whether it should be run with the ``-p`` - argument in paired-end mode (previously it was up to the user to make sure - this was added). The rule does have an override if this behavior is not wanted. - -References -~~~~~~~~~~ - -- The reference config for *Drosophila* is now fixed. Previously it depended on - `chrom_convert`. That script was a fly-specific script in lcdblib, but - lcdblib is no longer a dependency since v1.3. This fix uses the - `convert_fastq_chroms` and `convert_gtf_chroms` used in reference configs for - other species. 
- -v1.4 (May 2019) ---------------- -RNA-seq -~~~~~~~ -Much-improved ``rnaseq.Rmd``: - -- tabbed PCA plot -- improved DEGpatterns chunk -- dramatically improved functional enrichment section, with tabbed clusterprofiler plots and exported data in two flavors (combined and split) -- improved upset plots, with exported files showing sets of genes -- improved comments to highlight where to make changes -- add new helper functions to ``helpers.R``: - - ``fromList.with.names``, for getting UpSet plot output - - ``rownames.first.col``, to make tidier dataframes - - ``nested.lapply``, for convenient 2-level nested list apply - - clusterprofiler helper functions - - -v1.3 (May 2019) ---------------- -Bugfixes -~~~~~~~~ -- Fix broken paired-end support for RNA-seq. Previously, when using data from - elsewhere on disk and using the symlink rules, R2 would be symlinked to the - same file as R1. -- Support for Snakemake 5.4.0 which changes behavior of the ``expand()`` - function. - -Infrastructure -~~~~~~~~~~~~~~ -- new deploy script to copy over only the files necessary for an analysis, - avoiding the clutter of testing infrastructure. -- lcdblib, an external package, is no longer a dependency. In the interest of - better transparency and to make the code here easier to follow, the relevant - code from lcdblib was copied over to the ``lib`` directory in this - repository. - -ChIP-seq and RNA-seq -~~~~~~~~~~~~~~~~~~~~ - -- Bowtie2, HISAT2, and rRNA rules no longer use wrappers. This makes it easier - to track down what parameters are being used in each rule. -- RSeQC is now available in Python 3 so wrappers have been removed. 
-- NextGenMap support removed - -v1.2 (Mar 2019) ---------------- - -RNA-seq -~~~~~~~ -- First-class paired-end support, including mixing PE and SE samples in the - same sampletable - -- Support for STAR aligner - -References -~~~~~~~~~~ -- FASTA files are always symlinked into the directories of indexes that were - created from it - -- Reference configs: - - - updated existing - - added more species - - new post-process for fasta or gtf: you can now use - NICHD-BSPC/chrom-name-mappings to convert chromosome names between UCSC - and Ensembl (see reference configs for examples of use) - -ChIP-seq and RNA-seq -~~~~~~~~~~~~~~~~~~~~ -- Updates to dependencies and MultiQC config - -Infrastructure -~~~~~~~~~~~~~~ - -- Updated requirements in ``requirements.txt`` and in wrappers - -- Changed all ``pd.read_table()`` to ``pd.read_csv(sep="\t")`` to prevent warnings - -- Changed all ``yaml.load()`` to ``yaml.load(Loader=yaml.FullLoader)`` to - prevent warnings - -- Using DeprecationWarning rather than UserWarning in the deprecation handler - so there's less spam in the logs - -- Improved tests: - - - using data from pybedtools repo because modENCODE seems to be down - - append rather than prepend base conda to PATH on circleci - - separate isolated tests for STAR, ngm, and SRA - - updated conda - -- Docs additions: - - - TMPDIR handling - - clusterconfig - - WRAPPER_SLURM - - docs for developers - - symlinking fastqs - - using SRA sampletables - - paired-end data - -Colocalization -~~~~~~~~~~~~~~ -- From colocalization, removed the GAT "fractions" heatmap due to unresolved - pandas index errors - -v1.1 (Aug 2018) ---------------- - -Infrastructure -~~~~~~~~~~~~~~ - -- The default settings in Snakefiles are for real-world use, rather than for - testing. This reduces the amount of editing necessary before running actual - data. See :ref:`test-settings` for the extra step to take when testing - locally. 
- -- new ``run_test.sh`` script in each workflow directory to automatically run - the preprocessor when running test data - -- added extensive comments to Snakefiles with ``NOTE:`` string to make it - obvious where and how to make changes. - -- Documentation overhaul to bring everything up to v1.1. This includes Sphinx - autodocs on the ``lib`` module. - -- pytest test suite is run on the ``lib`` module - -References -~~~~~~~~~~ - -- new `metadata` section in references config, which can be used to store - additional information like mappable bases and genome size. - -- References can now be included from other YAML files into the main config - file. This dramatically simplifies individual configfiles, and allows - multiple workflows to use identical references without having to do - error-prone and hard-to-maintain copy/pastes between workflow configs. See - :ref:`references-config` for details. - -- New GTF conversion, ``mappings``. This is intended to replace the - ``annotation_hub`` conversion, which was problematic because 1) a particular - annotation hub accession is not guaranteed to be found in new versions of - AnnotationHub, resulting in lack of reproducibility, and 2) it was difficult - to synchronize the results with a particular GTF annotation. The - ``annotation_hub`` conversion is still supported, but if it's used then - a DeprecationWarning will be emitted, recommending ``mappings`` instead. - - -Both RNA-seq and ChIP-seq -~~~~~~~~~~~~~~~~~~~~~~~~~ - -- `fastq_screen` is now configured via ``config.yaml``. This reduces the need - to edit the Snakefile and coordinate between the config and the fastq_screen - rule. Now everything is done within the config file. - -- `fastq_screen` wrapper now handles additional output files created when using - the ``--tag`` and ``--filter`` arguments to ``fastq_screen``. - -- In the config file, ``assembly`` has been changed to the more-descriptive - ``organism``. 
The change is backwards compatible, but a DeprecationWarning is - raised if ``assembly:`` is still used, and changed to ``organism`` (though - only in memory, not on disk). - -- Patterns no longer use ``{sample_dir}``, ``{agg_dir}``, etc placeholders that - need to be configured in the config YAML. Instead, these directories are - hard-coded directly into the patterns. This simplifies the config files, - simplifies the patterns, and removes one layer of disconnect between the - filenames and how they are determined. - -- removed 4C workflow since it used 4c-ker - -ChIP-seq -~~~~~~~~ -- macs2 and sicer can accept mappable genome size overrides - -RNA-seq -~~~~~~~ - -- RNA-seq downstream: - - - ``downstream/help_docs.Rmd`` can be included for first-time users to - describe the sections of the RNA-seq analysis - - - ``rnaseq.Rmd`` now uses the same ``NOTE:`` syntax as the Snakefiles for - indicating where/what to change - - - Easy swapping of which strand to use from the three featureCounts runs - performed by the workflow - - - Be explicit about using DESeq2::lfcShrink as is now the default in recent - DESeq2 versions - - - improved the mechanism for keeping together results objects, dds objects, and - labels (list of lists, rather than individual list object; refactored - functions to use this new structure - -v1.0.1 (Jun 2018) ------------------ -Bugfixes, last release before references changes. - -Infrastructure -~~~~~~~~~~~~~~ - -- Transition to CircleCI for testing -- Use production settings by default; see :ref:`test-settings` for - more. -- lots o' docs -- new ``include/references_configs`` to help organize references. These are - currently not used by the workflows directly. -- bugfix: use additional options when uncompressing downloaded reference files - (``--no-same-owner`` for ``tar``, ``-f`` for ``gunzip``) -- additional dependencies in the top-level environment to support the - additional features in rnaseq.Rmd and track hubs. 
-- colocalization workflow, external workflow, figures workflow to demonstrate - vertical integration - -RNA-seq -~~~~~~~ -- remove kallisto indexing, use salmon -- improvements to how chipseq sampletables are parsed (with more informative - error messages) -- run preseq for RNA-seq library complexity QC -- support for merging bigwigs -- featureCounts is now run in all three strandedness modes, and results - incorporated into MultiQC as separate modules. -- RNA-seq now symlinks "pos" and "neg" bigWigs, which describe how reads map to - the *reference*, to "sense" and "antisense" bigWigs, which describe the - *originating RNA*. This makes it easy to swap strands depending on protocol. -- new ``downstream/helpers.Rmd`` which factors out a lot of the work previously - done in ``rnaseq.Rmd`` into separate functions. -- track hub building respects new sense/antisense bigwig symlinks - -``downstream/rnaseq.Rmd`` -~~~~~~~~~~~~~~~~~~~~~~~~~ -- AnnotationHub uses cache dir that will not clobber default home directory cache -- use varianceStabilizingTransform instead of rlog -- print a size factors table -- use multiple cores for computationally expensive DESeq2 operations -- using separate lists for results, dds objects, and nice labels for automated - plots for each contrast -- UpSet plots for comparing gene lists across contrasts -- DEGpattern plots for showing clusters of expression patterns (from the - DEGreport package) -- attach normalized counts per sample and per factor (parsed from the model - used for the contrast) as well as TPM estimates to the results tables -- trim the labels in GO enrichment plots when too long - -ChIP-seq -~~~~~~~~ -- sicer for chipseq domain calling -- pin snakemake <4.5.0 so that subworkflows behave correctly -- chipseq peak-calling rules (and therefore wrappers) now expect a chromsizes - file as input -- bigbed files for narrowPeak and broadPeak files are created correctly - depending on their format -- run multiBigWigSummary and 
plotCorrelation from deepTools for ChIP-seq QC -- ChIP-seq track hub generation script - -Both RNA-seq and ChIP-seq -~~~~~~~~~~~~~~~~~~~~~~~~~ -- update deeptools calls to reflect >v3.0 syntax -- support for SRA run tables so it's trivial to re-run experiments - in SRA -- multiple FastQC runs are shown separately in MultiQC output - -v1.0 (May 2018) ---------------- -First official full release. diff --git a/docs/chipseq.png b/docs/chipseq.png deleted file mode 100644 index 051e0df1210824231d7ecfd9d78cfa3e318a5e3b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 25746 zcmdS>_ghoX7dDL25rjx6QVhLUsj@+&cOfVpMNnzd2~Bzz5=!Vjf=H7hpdf;D34|gb zMg{4F5&;pB4g&Ay^L?J{{o$PR2b}8?$eyxh%{}*8vu5^+s5@{ydRh)z5)u-61AT2% z5)ujo2?_blOXOh5vFX19@Q>>qBXgaz^Km@(mu32ZuU8Kkwq|atA+=`J2McNJs)RM*#4N#z)`AkAwt*z4#%`7KdCT zA>ko0(AF>y!fZY#=4^B$ygHTz?&s7A4$ISIeCC@w=e1(JXUj33-I-$2YCg%Lb7SiPageJqD^Y zbYW&j-#i?(;FMrBU$yHstE_bkQ1UYiCZBb~{Q|HwrrP7mMD`{NSW0Q}<7%D6?ByY( z18vpYtCb(Z>V^5yqGjthd;)0fjX`1h-#j0>`_kq@am=RSC+|Cx>jf_w9PfLFv)kXk zXt1|Ae=OUXlMGa(txtFDIfY5GBaC=C_qR^{u5dJYA8hA62qU?pi+OvY1vr(#i@`vR zGqORMAPNHF7XnAlI^GxK+NgU^_k!Ag&z_72jGZ~{tbM>Ws!jMaAv%d|`#ZrZR)LMH zOG)Ei-9GJ}o%8K3-m@Fq!2$S*=E&2CG;eYe}1 zf8}>|_V2NeoHJJR{{QJbbBq3F*uSa0j=r65O$U04f&9Hoh1U@|J6mV_Yj~-^q|&Yb z1gOlZR(d;o<0lsPUUfFQPA@udob=_tsZzOslQI;ux*f6~KHy{DBs5wtzIH`*TI;eL zw|(-zTcgtxOGY7)-|f-tg+mv!H~pyuH%U6{^9#eY35{+knam?qrg|L;Ru0Do4qz@G zfe~Yf4@XMyo%I4@xcbOJlCXmr0wjheZMfw(tp%A~A@Zv?u=!(Z+{J-Eo%V|@MsTeB zTP_y8QQurFQYSp-lI`Pn=CA)wHtqx0kydp0EIcTpf?9wd{^LRl_~Ur7)&i}OTlh+& z`k%r4qo|7l^ZB#a*O%Nc`c!qE-nZ|c0GI#&7p=Z++MjI5JR81f{NeQegO67KP0dqM z%k#+21jP1Ew&&u9w;eA*;=MS#!{S^0KX!whXZ=5%?=!#RyFR%40aWN^wG;O>AF(uW zJPXb--}QmNf5B41vDQ&rYYek-EGVSq>`%f+MgO&<{Hq}1Jf|gLD@A-v+MvPRX^m#Q z`+H5M2V`%wAPsVQQ)o@<;A`FsP{i9;!tmeBMoM6nQ#zFR@}g(}>30_tVdIExeFX}Z zkR{lfg9?LUrNlt4CnW={AosI$bTLVwEe!GTkLrN%{QqTl!@6n1RzDfR9B@AO6!}M= zffFguff9#6988Cieoi0`gHGM84ImE7AP&|b4k7G5br&|@kqk^+;9#eV*|@-ghkVrO 
z0*5px@h#Y07KG6T?5^b4-CCUt2nC$(+~r5WaDJ5`k4~@Y97^lLGAm$rY|@r)<=$Ky zMp0bj3jmwx1eY)hpk;(#2X$-6;*btsm^gUiF?28Y#LUh3(FV88{n>m?6KStilY>tN zbUti;87O*K?G^AzPr{AOm(hR>0Iq|w6Y~b7mj%)KF=TW_RDOf|p<|*TKFTj7`#nC$|{@AD4^DWI$m6>7b0#{*C=Je7b*wA`vtJ!rzx)88-Kkp2!jMu!1_Y{OF3fX!3w3SQ*qX((HnkjRXH*{w`o$ z=cgx9-}~i(_!+R7{;J7e$HlRrfsFtVxuJ|baQF}y9%SIfj)1pW=Ibk6`*1>>yH&L5 za@cu9=hpAWg?>l>Db>L6zIr=%FJn9RY)F1l>R8SF-_@(-*HHN8#uQu$XlI!%G1#@P zVATqTvB2upZR+jnA!mO)v*F9rYezkmn#K*cHCMjexHA;)I#5xHm+gwTV=Fr`=>X zNUi@FlA$tmkAD5}lJ8HDj{;Jxq9HRXbK(S*&Bp3aQ)GmrL!&Ly)WCm=nwW@xr!HVK z&=m1O3te=ieXpsZ|>L6Ang|>8H=yIUFN-$NaDUuqUn4*td!@ zu1se@P{_(02^zyb0~f?2J4L+%)-ybOyF34*Y4D@&@mwC??xKsQQ#n?p?*xI*T_|4a z-{AOPpjX(bF!GF=kdG-)T$uos{3K@vjTp`Hi+kX{4qAuTzeZBwlF6r*=kYrau`*St zICSKVg)){5Hn@9SE7+wnvO?@BA@@tyn`bCztc>eWkXh{6=De38VDHt|(*hcJW}S-p z+VJ-vvqY~@Bn3?)XG+S2S>LsP*J>smuD~PEz>$zy`wnD0=qb&T`MME|37tXzbC8C2 zv$%0P;AfD>StZu7_+X@#-L3yQO`S_5unAMpBfJ|#n#J&RZ!;u(2>V|*S_V2;oreE1 z+kaknQ0flYF$ZK|^=)0uObi+BHf)NF<9~G$(mpD;L47O)`QdiFA-84+p9f|+b@-6ZsGK!|>`Tu-B9iKe#AgBs7aYIL} z!F3Lw$5@S!|1+Ge_Cj3lh5lN)n6G^2%}3it2=@*#ALkz#9SfyO1wt^FG^`=<7E4Nje$$41kT@W|IL|0r?&VVep|< zmhjW9$f9bEl98GX+_ViP`O(##crnE`9<~3f7oHe~1(sl=pMmS1F^nt~IU0NxeBrFJ zN%c(r`dt=+h?M=e8Nd`on^M*7!A@^v2~P}`s#GvQ8UlQzV)x0)N(Z|C_Zi{uKWqf+ zRtM(_x=Cfz1$u&>Cpg*1PTk{aRNg*VR4NE2cK5nZN*kGs>Opz=_pXex9d)kXpF!;e zDO9d5zwq?6cjOpN((i9-cu*4*hrJs@g2@3g@&@!UKcDcUQ6+SoIGaM)c&GMP z_5z@15sW{ylmgF{Q2hUX334MI_oe_s@yUQG<5`&Kb%f>;`1s-N%p&G*v*9t5r$OZ8 zJO=cmO<68m#aTd~vH*kdL(q=Xv#X{7hs;t9m}u(Hc?ifI|15XnHAF^883WGy5!k>9 z3&mFwd* z&1=6&L38~MGQcP{5F6ZF%q1}U|Hl3iF7m~{_m0A*T(#P{iUa2)dy)0=ezB(x4O?C9 zI7tQi&MP9J@8()6Kj&OIeRd$vC0tsJ+}aISO?7!+P;}I-vW1sU)k7n(+ial2aNI=V zx6(*uJVU$QZ?;%Xaa|qTKlwe4UT8_B*6oP6HL8#4=}fqDvc8!hsc^+h_26LXcSxqx|)e?m4HBRq~y7)^AIPDHmAm-x~ch7COVv?=?3|Jmom(F~Y2QEwnxO z&YT~D8TW$WwVV|Wj}GO}KphLN?A*df0}s@m5AS}WUJX_xqYC=@|WoG?x zO_Oz~os(KOjv1p=-*$va$tie(Fo8%g$-w8O*|lyr!k&XuP^I4qT1Fne{{-^b^~aV4 
z3e`hBHl_hCT4g3w&xk019v;15Ovla^e#By7g43UnQ^~}Lb8xxf>@9R2e80(03qW*XGSAZiFD=lZB>2%;#~rR%nS8#tU0nPn(`8JjdwJQ5<6n>tFNe=G zxvbvn5RWbgKe`PzmO2sdhRs?ka~pDuiF2pwtRRO`c#qLHEV1|-yoM7KwE7w z?2C@9f&zD@l8>!aB`Iuwfg2c0g@DLm?r~J?%aIq6r&dDJE&Fg8RHGc|W)6>Y&48sPWg{kMKS7yapZ%caRrRbGqxbDRIhN98peYts~iXSLX&ReJ2bxL zuK3o0VPZ<@L0)1GvtqQnFoPbglE=jsXj-z-$v`Np zFlH^NCP#eX1rHz9w|^!|z8D--FO&*eY?J$3EARyrTg_PVvyqROlW^k!9Uwr^Z&G4} zL~wx=cDhS@gVp!%`06#z5ZWTnbHhV9&@~G(S3Q*TIBaiP`#51F4B@^&)0ppj=+uMa zYd8ELC7C@F{$o6T@q2TzV0$e|RDlleg7fMA`;qX7(ZGKj= zD?b4Dg93u8`sZ$qM@?%atYYf0kU~&^Md`2Nj27D*okEE0ZAL6Fzg|4hcyXY*`{^;u zs%G%WNLi3FoKU8T#(IE;EsC9+^7RnOzQOsEkCXq^AD&YZKs93`0aQ^q;`I>8%ZD@S z(fnF4H+OC1@(u_CGA zZPV!yZY3i;(moihK@{vwD;PI03$`sni`-rsb^wCwSfvMM^^0G-fgkoTY+(#7P3o!k`f1*Z4{Uw$K(c`3ZC= zA*Q@&yxHrfwY2WLJP(qv7Q+gb+6fPK$q< z&($gapoLdV!gI^ge3B9UU=W$vB>%#PYs!DprE{+5SCu+L0_)EWmwzBc2Db|?SH3h( zI7<`NhDDpCJsfz{zXsB>`i$+H^~;^h&P~&Hd=qIFlQn;?u;!u9&$faepHEl*`JwZ9 z7z`V_6MLO`NB1wu%tgyrszdfUnvmqqw3mRpzd+ttlSc30$bCdUAr2ITc_1ZycX=T& zu}cJsJdi+(+OgDwLy@c&k(sfbG2KCItv}T@$Yj*;PS$b6)#%E z0WG!aXh0>Oc7WY7u1dbeCDM|Rnw|J43-FR%kqvs z8`= z*7l)FAuy9naBif6l$d&cE>GQWNSNEG4B^G|b#l-%5cF&N)>h}E9*#SkKN~1t$tlkW5_N0M})E`h?%<=g^2=i zr-4VLdIdm}_T;0B5^a>iAgwp_vEcd-aJzXBgN5!{TtVc$Pp#bv1<6@n^N{d0 z8um?Z`5OZf+{1gCO0Y|k#H1R-wBF8O$LI6z2{9AI=rJ1Ziu-~fGoC%;gI`moqdX1-v=uKt1U+P=0a7_z14+UDrFscp$lnAF#olXnsC#4zJDsf~22V*TUbOI%pMy zp&mq0iv)3pv{@twRG`oGHpb;-P2sm;;P+GI*C6b=2zn*?QVG$`JHyihhhln{G>1)!lxQM z-Rq50y#I)w9nPw&Q;k43eCqWf_(vp`edtk;$r~=5K*fVpxqmxc)t(>!y90LU8NsrT z1l`nODgX#cK07W`K?-Gp8`7}3NWeSQ0@i;EZvo;Mir+e+}0Ut2Fj z_Pi6vtnhQ(v!?#1Y9lV@a>+qz99`GhS&U$$96~&!ZKrdL3Z5s&bxSRs2hRY!@0k5b z`2(a}EqL}8sD7A=SFsyATlivq{nqF!Vsq~}AG~BBeZZ&~I56q*+A;ofLcWFI0NsR6 z2@wcy-Me12WY!lLHU1A)$&Q9am1m7Xn+SVZ=d|5+>(u~U#~PpW%gfA05o2ErH+HK zr_YDi7Nm$oEwpu#A0uG~ByO&kR5lz#md>9A#U}9~&gT8?e;xKX`sdl_fOl9t!fySt zqI;1w);03W`YV~)01V_2*O7r-4HbcTUqp{CN}a$H1C|9d3y(;ooClcO>L@7r4o1Pg zsbdPkJH?pNRdE}Fe(O{X&+?@TUm?P$c@BgWpS-K{Hw{co8b2+JlCXTZA)|}Aj4~{+ 
z$!4g`(^IOtk74BIq~=;;LyU?`xpiioig(Kv*@~47rW}pyAKgQ&QhwH*M*TRJ z^={FHyJ+d1sJR9WCI*+MSfn4LYo>ELi6kS5aA+p~6__O92_NnUfeo>4Df{&kBQIKC zNMl6@?eG8UroXktGJx@BlqY;G|N81E@kByz-R6#wO*%j4{cON+0P*1l!MsYM%%*xJ z73SQKj2O~~ZByRmM|1d{O>a&h&GP@gSY-`4TuqHEdmQU!aJN%PPgOPskYB!}g(L1b zW25-V;_CNWK!Z&F7NBpoX#62R=8kAP4m4Kr9SLVh%V-_ zIh}jNpohI~f5LN=(^ZT3U_2T61|kQ5hFV5F59NoGiqR<4#0AznT!lu;=4)g{oSBn- zeEv3?L5@ge-6cqP1&hz6#x00`@SP}KBh-T_!0A;S?PNgIp+jUtJW5%Q%^?(2C}*2?H8U;tfiH*>E;40hRgpga%q*@L;O)N`PnSj!%K!<%;0qtY2CIB zPwq#gS*Kf!F>y8JUrNLfN}tj2eP0g)bv9K9=d0W`qyskH>On30GMqV6<=^Ut(ZD!e z$z<$;RkQ}?l$RgK*yXFTkEKEk1h`=b zYmOh`p9ROxoBRxI7)mdVr2I?g@>Ohblyd|bi)Xi%6B)WTdqpmTPY>rFV>0KY=eM5i zxZW&7hzhtmNk?l(y0U)h*4+Y{uQXXap{|5KZaEJ(P%c=~HSLDA$D0akw4%m$I|dBW z+a(Y!PU2RKx|qCC%0j7-n%R0Fm`rgG8{lIeHMKrClaEu zyiq=v_$>kB-wi#E7D?x;V8DfcL^gg?x?gCpefy=o?2%pVjb#BN8|QW-&T9yH|HqU5 zFN)6}+uk^Ju9aFAsG;K}=x59Fa`MCVUA#PXHKdaLRt&cK|MnSJ!DO+}8s|4T;A@lp z@6OCWr>w2Uc;Gt5@Q-BB9|T{PlwBrds3!r_`Hr2kO?i$YEyZ0EKEwg#K|31rKcLjL z&joU$jHy@XpDK>OW}35-As2kjM27If@xSCAM`RqWWnf!%-Q{RQP&2^~p_QLZB4PV9 zw<$S2ym?gWtqDP65759wv@tc2K2lNUmzkut>MJ2svjRi0d2-vsU*b@r=Iap=Y{1)! 
z*qfDA>9O;}?i<-G5~ z&OnT5(%6namthI@W#UWF#_VSI@}Ldh0tAZDqUV5BoN>Yu{Q=Fh0lG>XF2kby>I6;9 z^xu@Qw?4(`e=3DW6lmoF&+I>MY(`gIem^IU$GgDk$=py zmjVSf$w=(@ZDtM3!jT^%@k5iwj_%{iVWZ|Vx4T0GT^UA(6d=)+j8l&Z;6>CS3s>%K zvr0)DYvNybK8_o*Ca(-5JaMKGh6i;IA~S1LW!*dNK4#zkl=U@PSQ|3;l9BLAU?<>; zk*rgv_5aNU;8P(OVOxbfKa_tLnUIpRF>S2t@*N->q=Jbd%zg7`dwr2~O7sY+q-)=s z849E{2wBfC9u_(w8+VKu4Kb5xW(>rSO>5RqGfd@#ZXB$o#j+Qc)II4QG=}NZ?lBRk zS{5I4O=3{Sk%<;qCOswIr;f{iG?7f4W{LJi2NSB8ZKFed8&P5u~I zziz*Q;G282=q@m7@C1Jv6?-F3vbj6`Py-fuz$+fUEKIwc$l-xBrp;Cs0C%nQvnK@+0M*GOw4rm|*{eT8B z8efQFH}Z?`UHp_Ym?piDxXuruONPMvHN13 zqylLde)0)NoYv8&X9MbrC$kCo@ZM{D_^B7l1FIon^#iLU z?4|4+`*Hv7RiBv=7Kafg(n|cVauRMjCEaB{h;zbKBTVBL=R1J}*p*dsbSm)mnF$rf z1H0{c`P(d`5MDyz zKA~tLxqF+UvMy42L8L!%<|D;^CtQ_xzU7%%8sLIUxs#+7xI_-OBOrN&A0=fz1A1Pk zCjplf%b~=7Eo)`>U@wcw?!)YZ#1NBf%U7D?BD zbTXPU=?NDQha3vE{I)1eKemV>0oESl++3iDMk)g+qtD6$+{XVE>*z#A-xuWWa`(WR zBKiu?!nAS%|bL>B2MI=3u@-T|_J)Pt>)6XI{S@lH-^hrjVS z);=_8{a9?;6@`L4ST6k(&j&05w1^54f-Rlm0}S(L<0N79(*h?{Yz10P<%4uHT~N(m zAwk5$`vTjg4j-FGj4>OF6~jN4f`spN;Apf&7>e{T#LI=W@G2zvEI*2ylj-G(;HY45 zW;$?IB~<{t`a3DR5C9=`D;k&*?lgPp5^R|jXQP=@ou@idT~63Lx!+?u&d+YaJXT+y zhJ;Op4w%@U4bFKTiDnX)s}L_A>OMEMk$U(A^EqqR|Ml(W`TF7G!HPJE8EKX=Yei_-!t9F-iHB=9SYmhmi>y$K>3*&@!w`yecapng2s^z|9Ys`V!X@uHI{N*-D3Esn~q8ywc{aJxZej_=rKI z@WY|N!4Mm6Z+Rt2tK?a+HKf+e$10RV%Nhpj8i@U|3#8OY0>>y(`Hcv|jM`}uQ4@3E zllo=@!4a9-Xnh`nsh{Ka4zgNZh*4s2F?fVt|B2|V1YZk?O^@_^#cFJ^^mPP;HEVBY zxTTkL;lNAQkPL*rN(*Jx;S9syG{Xia}(3z-ZuEK{OCIl92`oT&Dq5&VR?>_xH`$ z0$`_S-bHL|mUxqbaA-pHyE?w5hRX$LFmyMV+XjzWPO~cU*9wm~j_WtZek^g1_HL{O zSt(ThCq3L!L0De!^prQsqS-SOyg!)DKmyAtm_#7HF#AQB&w2Odxv^J&mK!7M-N+f5 z2#Y=MVjrq|L^zd}$lFSKc9ro$!o@3j(c8|YwpbdBWD&6N+@FKMplBpZVCX30K`%O2 z!mua_brvkC1w$fm?tKg!+KwGmUt>&oBKrK(xSYey4JMqA-yQV4FH%CI2n( zk#kFp8M%H*pL;JSaseLL+N*fh< zYgI&F!{U6xp@qBY2_y0Q8*XM-C@zS}biFH$+B$It8%&yUWc_t)jdY_nR%x8WK0`o`H*>7l$N=(cDW;9$4d#yz z0Ghg0W>~QAr4x6_9Kb`^7b`ROP^Y8u_-PjJ!@Ci;bMutW*HT!qkUyXTerLD8pKkNz 
zWAFY*kc}t@@!-9bfKExDtoQkSw)OISS4-XYjdjF&RC+1d^Ye_r&B>d!BV~Ucw$hBiNCh?-qB;~ZnC-tRXia3pzFSmRCHX$SuYU9G-Ng%c2rJUyC z+DH!`Bo)%VFWhP&YGWbT!tkQobqygm$*A=)hw9dZg(N>tr&dRSNsz1)^5Yw>&Bc!;f zNGcAlw`|5K>L&M9>ZM<=`GWKlzgSS_$53JrMVphB!xB91a$F*8=1^OQQ$T50a(0rq zEwr4g16wlYNOuW#(Vu#~I8LJXS3&cq|D~{(EAh~zWoVbbdspua2&w>6EU>Y~5<-rY z%a-lAOZ{9kaR}lxnsgm=igB?cEVxf-Uqw_zI?GRC=Pe#WvT0YE&4PI56y(~>ec1_zb%>|m$(yqK*7`>$Qusx(f zE2Y+k;43oGFxNKqNpPAs8U%svgF3=TYLV zH6vXWu~3%ay@Aj+oj)1WfRV$cdW7tjXF0=`k}sD$!`x9Bb$pfefoiyKB;4x@Q^G;N zD?vncZ+f=x!}F*m`i;Cirat1PNUeKBL>*?=VYn<}KtTyX2Pm-=o%A;IU1tf)9_P*A zbCF|V{x#rI1H5~=nv^-ccxhTZ9-zY1#&i^t>*9L5i`mwcaEH6kl;ipz=_y-%tIC!y z4hJHdP@7jZz}0o8Z2pd5g|c3rrg(Ln&_>a(VX6mMeR@X}MvMPw!pbZx=lGF^O(!(p zJATUPn#PnE(~WklZn;ckO2}_bom8pki~&dcr7RU4IsG|De6UVG3Ue6H{fa+NETd_r zn{5C0d&QgyjXMN3AKK4>^f4!bIwu;EG%U*uWF)Wnb6l%iS>7Hv7N&th7@O`rn>>+w z)X`8FbKuH0!nn*Z{fctcRK#retFsV#`DD^g8Wc11+4Icc+R=1$gIYctV+A7NtNT-U zmHx}1g=Ims+b}gRj(6;xi!&u-Ut>Ey1k(G?SQ}gq znV7}+JKlqe&-{%g&sy$#s`dO*ah=xVt8I7XxEP2b>&E<>Clgs=4L5vORUy-vbLet z+wSH2m=p*3z@W8dUbwuI6DYt})O=>!qK7mgjY~{lo}5&1R@O;&6jEmeefV|J8H|!+ z%2%wSKv61*777dDm(GN0|3KYT)e?e z#$pV9J>SKWrhMZEP3GN;mNg!1bSsau^38?3;Sa|Tj9v=4a2_`J@P#IHsXQe7D>ALB zR2VP@nR<_Ro&W3L;hg_0IjU6(0_?x+{0{kwWa=EAz0Jor#<90bZbu;r&eoLHrZReQ zT(eEmt!BErO3SsQp~Q#8{}=tZ7ZNnfwh& z9x}ii&sjlEhA=Ht7`QN5zVRU@T?|rGVI`Zt;<9LDs9E_sWpKx4IqXUXjsL@pCaP9c z@19#?qFpWnA&y@3==ZHR;lDUDRFtLksFfJ%{bU&@7wsqt0QOK=xgrhTu-#XHQ4a#p zdK)&ihc)t~L#H-wunPq+La7T&uOeFD#`l#XY-8W;m11ew4QlH(Q||orwNDNW1v?&`(9#f5 zBl>=&jR!$?^)dEQh|f)(_oQdHvj;ByR=824xcf_(7t$$^h3wsPz2|CoM`PySj3&hE zi%1?|eTO2xhF6&3%DydoJDJ;-;qNR4)mB$_gn7X9?eB66dHh#UpKZB4x_WwCQRKzu zL6Z7V>MZ&pVa3nXhgZsj!Obn0$S_Dp+?Shz)T#f9kEy zI8ACnt(jxyJVCEUJFRsIKQ*KQs~disMRmjLm;OROOtgUosD6k#K+J@1p3yQyHgNgvd=jTlv}&$ zmTI7w`MnOtArdbQgW)QEL<`qtQ`yNDO{F&`Bnl;5!4wxhD0v7o z`>fF71{gu8c*i~8}pQZrDyHJZcO{84Rr-da}0SI`2SvED`i(m=D z)9~nZ=zb*aUd&Ew+`bNr+U4pafy`OQ0=5Z#B23=OqRT%?5>jEO95kCZus0?I(PTcB zX!5T=IG6rZXz_z@zFgc9Zs^yPm_dbJWsQ`>4Fy)-&2nCa8$2%iwUO>?WuyCo+yKC;3#)%imN38kge+lV 
zxoj==)&bMNJ-rMFn6abt_$R}%^~$OsJNWi{!ZN{mTn?G;R< zIK{EhRgib64^?2U9i{$#ft@0PHj7b^Ar9G#j|YtGR|b%!%shy6 ziBJreg~$E6Kf;9T93zETUjx}I=oEm=ly`LLxx_Pi1|I|sDTAS%z`ystC0V)hUz&nn z@aUPgt6tG;P{AtUjkBTDEF^VwIv+cqrVR#X>-J!S&4R!Kxseqm>Ne*4yTKZv(cg(b zvCsj^YZ>h3JZr{`luX1UWr**0Uu-Iv4){J92>oUVi>5C7C*)G7nW3Aii?PzdDt$Bl z=GCTFv`HLS;VwqpvP(4yErxV%ouUqiXE&ACE|SRR-%L0vwI1D*5>Bx4l#_2-Vl|=F z_eAZae2!2}B`*4|W>w}MOe}44)YWlEp3mf#by+P#NUU;jz^^IhCmA=m5(VM*yv0OY zq4`9NWrXPNM6<_CL`yK`J3a{B%wP)YGUHo+68`h5-7VpwP5%b>li1A`hWS$al&#A< zZ<;&i=SLj3!@?w4W4C`2BhKX??Ce%Vea!*P;ufc!TVh65KG8(_!8)5a3xN4Doe<2j z_ieIbbmK_sEz399{7}prqt!&1RfN=TE1WMPFPJibhOoOc7k2DzH|BaR@Kmk0XIlq< z-UvIb4u3lQ<|Hhr*;=Bl3dIKR*iv(_qSfCF!FaX43+Zak-T4L!|EIiqxPhqfE<!-Gt{XJmv zZR|Ghl#gyrskT*k+Hb5dN1*X~LGSUe$B%tG3;Zyh5UckL#6tfRtj%UXMBzv2+3AWW zdq1)H!;W)ox&=QZkf6#7^`w`Y=yTy zZ4?|GTR6!o51#vmYz#5o^3a)$bj^epVs+&>~=v4R2^MyBC4sYM0b|f=f z`Ec3xw{UB}l+ZC`*tH^jg5i=SjB8O%53Zuft)P%va{YNKeet>fO@f3l(In$|qA>N% zidqc^Uq}c>5C^^&U1>xqt*s@_kiHnAfvNgF5eq`r`)f!PIf*ul!*Ld390$`*zN z>2&fx^nP`84*|rNGnR+^Xsi76-juo|jIUZ%Of#=Jar=yBJ19RZuJ$P}p_}>|VrIQw zNZfa_=e!mL%h(d2B#VgG)P;Y#zRKip3b{jkNbpII^?QUx{JdcScP98e5aFwwv`2J@ zVFZ42_gxDofaf-_!9(nijT_>`@NE4j3d_vXgzon$;jao7G?0T%g z_02KDy)kx`E1Q2`%zpbUjkw8`9zi$4tXz8jA~Azw%DLtk#k4T;0^qGEo`o)@{4uKj zv#RLcAkf6Ul=g4fU&wLcCnL3cM!cdfM#NjUFKS6(bg{u@2Cax;+`kl;p(+%kxRD=Q zc8AZ$5$Lm)yhWD@;|}@~YIDXyNP$v+ycTtN3^A#00^d#6f9a)aUH4(4cTF~EVDYuN z4R$*S3jaHWU0f2n(}Y`k6`J;S#~|`5>7d1)8=Iew&gc;4z*$Gp@@beHVZiW3#~Pm+ zBNf+%e_^1m%>mULM~nJr)6SpOzOUA$Pc`X7^*&AXh-+mt>QN~66d*7?6hH1^ga&x?APYD zd^n2sTB}Y2OxtPfn|5GQ9)`>M?e-^MEZS*jRTUihcF}zSC9B_&{DMkzaqi!4!;F^w zKS9?IF-KSFVTK_wTh<(M{J~~U*hhwGis>QSU9+Vzm73_8tI8p!Z85edc5mYZW<-J> zgs~xjjAi56JK-({0|wIXW7y_gYO^B?-b08!{E&(3HY`xw^j>C%&M3c09N(Hroo(`| zynFd|BI}Ali*b&RpO4sKJ`$QBW;2J)RnUgf`ZiTs$$kv1PNIekP53|3?6LT6(bOkH zU=aLtO}*-nn&FFnH|%Jrz|68OfWiOro>2H*U0BmP4?JWcPAD!d!Q;5-5&7jL#(>; zey+X0hOhr4`{bjHk6Yv@ve)0(Y#cjCgBeI>PJkH`{9l^*o1dt>cX3(g3r5-}NA;#$ z=7-pFv};H_@p$-X;I0rF=C>QQw6i(fHXsDvt;rC&v)O@dcHYX0x5V9hm7BzyGtvd! 
zh|q&oiEqmd;g?RPud`1Sk0~0LGrBAqH(xQ}!9L>ATf#qb&DE>_r7%v39**Q$Btw}U z8h;B$WpV>i?4p;I43NDYtzW3n*V4P+FmBWOR0f*`SQ38zp|bfM)w4^%l2-aQX0DC- zo_zSiFU>fnY<<1MjATGbKHK2;EgF7^l1>U0{L6y@(QGA}s789m;kPY9ddC$*Nb=9d zPykGKqE(_n;i23=%S7J_)(6FF8bcLwk%ByKW3{MEY0Ct{$m#=N^ng_^l5Y4PS#s5GH_?{x-a6uB^* zci=7`{Fe%w@>*2vTGx=9uOkUt(BSu0q+Bja)05GbOfpde4k7U5WikSs66L`HYHaw} z7d5AQYzVS~nlieDD3`eWy2)>D9XhTyUU zh0iW5&~+Q^#E%<{h8`(exbTgtOqY{T>tSfJ4|y7Ijr?8E`$#|H;Fv2+4N1)T6{JEP zQ`F>L4cCUTEHo4dwR|%r#ECTC8uusac3+-#h*F}tkD0E{=%$)MN`$r=p3wn{_M4Td-mUbnPS$f8go>011&YN0hX;ZU_dn=qrfQ8nAbcEtRo83&M~I3kfSWcN z=H-)fI+g+zPD!MIth^CrQtDIbSyq}ErS!A&;eDDOl}=dzL$_iId!G%MoQfh#DPXI{ zEc=5pPhGYZD|Q**R(R#3nX%haBrv56fe4nLsf>qZ_TWx+1It>!jv~8{J=c0!{c$%ALZt zL44*iN*np7|7k9n(xbZQ^j^a#F9{-+2FzU2HHhj(Xk5~VDBgTz)A({Yw9Qj6%|asn zL}h?RnEdT)eKOT6v}SgS2unW!yHqkAbko%z7DOSmE~@XeMo~#qPH?LS%!~M-O}2P$ zIR2iDnEUnLA6Eai(<}O<<$5IcO`?2upGb){Ku*d5WSp#xn zJ@p(zUTl6wJ}*FM$?Fxd2UA9F)p{4W5RhcV>o;dw^2zh@?IY!uNe)<>CTcXla;p;b zZ(Lr+Ku5-WcM7cGn&@qMm!gn^_k13d=}dB?Q?`jS=Fw<`g(RZc>=&&R#kF{~KUoV8 znHo#2ikCUoe?`MD-$=hCcmJ9H|IGy$GDeTu@^Z2vVy4K647MBqMeb+O5;B~N;oPr} z*he>kkk8a;;Y$zUa&x0%i?hfbv!=egB#f``6zRLy2&RAafJ0g z7x*2VtDI4dsjqtqAxW^wET15JYr0nWus9O1TkyfeWPcbO&y$FzNCp}M<$ie~{56x9 z5@;}kp~ie**QVi0fg8GTVJK!6m@SnGM3Up9rFEzS;utY*n6gx=zSJ;ULUiSKuvM2 zH#*CkWZXWCOw_3uiKmSh4RK5&o=LHPA$$-y&SF;6Z7?y%3DePsz{T<;Feb@8C;;13 z`hC)xx=OvGsw~VWiN+bf!)}uat07*tN%!tFez=EGMlmV9xC~2;T+5$t<$~}4 zNw^ahVVp{VfUrD;5FFiottHXKPtic{%T3ma6i6`=OJmP<)J*G*Z6F7v2|uRUJM^rnEmKS=kWWY=HC+|66Ml4x+ix1q5L1^%6)jYg>BVNzDmM9 zv80?c2f*wXwJv-7#wxxm$aBL7h~Q2-P&IY_@P^}T?LNAHi~ITZJsg)yu5I!50mM6? 
z^gZQ^;W1_MuWy$H;DD7$Dn2FYrpYKp)}x5kmH(%$^NebudD}QuiVzY)CzQ}a73muU z>4q9Q3Zfu_pcF{}l_LFtgc7ML5IRUzP(e|Ol+cu>zyktOLZqp5M5Mj@{6D{6ww*mY zJ7;J1%-r{N{bu@`nDV}LkX%an=eFiKe7J-J-pYh5bd{V-SS*+Wa0x=bh2NS*F05_s zHB?2Vc5c7KxZxJxwVga$k8mG69zAn}v4d~{Q1AR1t;MYnEgjYM=DLNv{ZitAL0P-d z`cCntP3NhzRbMayZw34ik&4c7yDH18^P6w+}A;4>oI8((QrnZV%1 zFVOC_Kjz;{c>U{tHZaDI#*`ej>sne}Rm;U)L@3MabA}XT?gH_3fV)IxN8FNp zx5vM*ulaVtKJG_TH2m*C4Ih?rhfcdswOb%{5+{$JNVO(Cc3=uJRWqERhrYb_ZLMwvfq4hXQ(M$S9J8#!tU5H z=EyHdUooX!o_kL8rqjZ3bzA1BSoqL=%;@%@ciWx6aUbj^Bb@eJqbfsE2xbC$%ncYl z-Z#J=v{#G8mfBZSP>4>DHV{=@m2g??KDs`x_*n+^wfE}z@a{oZIIgHS6*!cdSCHIX zdba!00!M)z`{e|SJ^PosGV3sIfVf*if(J^>R9wyKQs#_z6vT0Bp&K%igkKVjTvmLH zJ7iFwE3OI$eJzxY?fw4V^b%rI@COd!T$GErE^n@cjnT!H^&51T%p~B5l221@T%Tq@ zSsLG5Lih<$UV1>prz`ZSOxryS$nLnp{LsgIm^2{QS>o`rZOuIe8 z(de&<&{J0?v$iH~{_cz@fo;VJ4Ep0r+TUs8>cwc;yZqZ3>S$YK6FAG21ru%*0bzW8 z*QYhSI%1H~<~9MYlhe>|fVLi`QS#d@92&2A=kRP^j=l!!kOMIB$}DsyBB~IL2vi?TVD-IAK|(QGwvZtecc(vn^k~^^pz>EH2@q%sJ!g;!Y<6IGSOZ^R z$&)<5u&)N%^n*UNK8^9UwGkrXT?t2q2D(ZAI<@l1GInj*;iD2(gM#E3F@XH5;N$e5 zkKV=TN7m)x_exl8+I;6`9%9DE3;YxIQPM9#W>J1l0(~*}P0eyau{LVF6cNm`ujwcW z39g*h_Iq)bEKh(ZzVMD#4>~aqWF_;i<&U3=bJ#;!j?WOIFyr1sZmh@%O8Tjs>^&|a zW)cjwHmPM1Rert`m8-T=>pt>QE!G7YP?Huq;xD&%%+C(1pMAOh&I*`F*ilrc^C)9Cv#14oTgTk(OT;~t~B>n&F;uER`5|b&zMx<djH~ zY^h$0uP{0nEO5ORkvhm{YOq8BUBCWWct)PQt*RN(D79?fg#kUL16HM_0&bYWR%C(r zmBJM#lKf>iJ*CAk5LCMeHr#LA$GM^aGS>=bLiG6FhqjZ*n`EJf6Sxx_;5Ul zs6eGhh|gLBO#7f8eJ@h&POs~k&$rF+SF@9V@SkNa16wUpA(ZGsU*?Cs4D z8=q6V^LZ~JJP=kBinOLY#8YH9`ijlG!)GcO>(Dtnw)!PNvz%BuN2?cNdj2_y2&6FQ zoDF(56T`zmb|FY!#Cw;YtQ%^KZ$5LL^7g^CEu~b%X31a2N@nNXbMt_E+7o9 zYc>Aci34ZukGsX<5hSj$$QEsHmTb?C^I@Y)y|NLeZnU5Sh1kIod8AT(Rq>>IhHB@g z?!$uStItr~mvS0prXCT?Dx^&^CEt84JEO1!!yC3*JrboCc(;x8WPEEmN_8+mly_Va z?#UDFw$x5-WQ;-jy?%9-Tqo>dAd-iOhK0xLiTHrg>*!A=J)iXk*RfWJHD+^bi`eOE zCW>5Zca8`wLc~w%7C9)h;%JgCJ0+#`A3$~e-_AkCmlxH)J;5Lv_w9dL@gj}yOnB1V z)sj+eSg<9073@@5NN;ZCMOiy5kL76HSYJn@;oDe}R>Qr`m 
z>YWwO3b4IMk;jdkS(&m~FLAg>H$fj1-OCF0T)Hhq8>}-pmhd%vQ2Je5^8_Q*14W+^G>UVhJJ+9KDT4%l3;4|k~w&{na zsRJZI>z-0dOfy5TGPNof3$>#9OhXON&#f~#cxJ|h`Pf`e-q{=Q5iYsLF$U8KgNY&J zndMS}DV_FzWIeCjK!VY(HZWg z>q7pjyC*N{!1j**%uKvI4B&+(KK4q2#Bbd~N&K3Y_ouq&cj+qJ73IS-NHSBOE8^$& zF2tE$vXz2-wmg=5xD?L%O9qn(?7aczpoFt~ZR{K~n7C!ltL`!O)4Gm0irID*TjOTD z0|KWy4EFe2&?{NL^gjrGx0FTyM4|Q?0cmc`2YMu5L0Z<`8dD04IAD`I7-vM<=iOwY z2V|4g_Z_IV=91W4Z+1#&X8N{&_BmYIi*9=;D-(N&55A=<4aIO97SI{U$3w)(r2}kh zVnXGe)AFq!pSN}A7!{{4HnJglFQ$EN=3g?sn8b|}AU<2wn-9S}>PWF_nD8C49-#(r z-eVY}Js_dzu;u#>V0QfNcWEox3o2Eb=sOg&NwQjEa__Vjy3IhvzBu@SN~Im4M4v4D z{aLU#*V*DpxuY{Uk3hr3U(ERx$cRat`Mmx#Idoui!~ej0nSw!hs%|bc)3VB;CI!IT zcDr_6OShXyhxDN4ehRMnR_-Sa^vEZBe_Swv#i(<3qV37UT1M~N10iQWhW6VIsCUS&7GE~A(!*F@WKzUa2_n4N zHzT%VV&<1|Cct1tdb+_)C|v}EH2Dthr5(;JJauJ{G>UbG*?yEE^~MNrTnc~0?@z7N z{rxYJBtVK`ik7_p0aN zsUl#ib~%t0p7&Ivzn&l)R_Le_acTcOf`b86rgZt?O_%#h$E$Tvft|i7(va%wKuEU7 z$E=TSqu?Wr$N)`%4wFl{fen?FQ=*EKOFEaOus-R<6kI9`vzMo(x=FVUJKmofpKJ^V zlB^&fRnBuZG7NBnj$i~TzY7TyJm9vnRcVNo<;Jv|@Ssx!nt@`OM8R(1tGz-Nh>jaR z<&ziF5Ji1BaxjF{yXa}rs3dVIhe99a(wzi*&h1Y;R#4{flMmS7iGrocV|lIu);_$xBGmS45)O2M?)U%w?@r@5mDgTR`JslXfL-Ja ziOCFFr#C^NRddw#Qi z#}iFwVgoUdAqi{JQ|GG|>64!TK0oP380a85+AZVR;t<8S*Nom{1whnPC$lkG@3{Y$ zINx$Sf3|Cl zp$@pcVcD;PKy-h>m34W`=Q(qL2hl}JBXkB5iQ@%Lhy|tWT*M8k?D!35r6H%YrVnIF zz)4xX6ekuV1=)MY(MkI2e#Q^QApGL(0Hr_F-ccv;PeMtH83V%PG&zzIlPIajnvkbj zz|>{w6=M|A=i*hTk^rQf*U5(gSEqBMGpjpR`Ka#f(e#K)6>z$F^Y(0TFl|IqMiL@{ zs$>tq3AC&_^}(XzOF0p``@##vt_3R=BS@l_2osvUy%?D|O)Da0p1J;_dx75DVCa_& z0dY-2$}dn`0`~w}3i&S!ACIk}^WIL@_}uh7_Ak@;*Krrbxj-5I@cGRs2xe#5Ci7fR z5>Q_`z{fHl!$l^XRhqX!_^P8>srovSSWqt7EHY~q2xEoSco)Nn;!A6c*s{R@n! 
zpGKSj8jQYus#PMzUZyvr^WK%%qpPrI#)i$N5xPh@g@E5F{smfL{Uvi)ym6a*vVF>v zx?LFsmb)XAbRV1qmMNe*7Rz%Lk#YAxhULnouP)@07)fJ<3eCsru{%6rG#7|o;ch2B zNTu6D>~<#oCuU+r|4-10^qVyPrcafz=PxIODoj!rRv}5 zAi(X{+>j_Z%RPG&izI%{QgewRbm(goRxyWz654Vb8BGa&D5`!cT7D9R2S7{?RC(x# zSo4 z2jvAMpiY(0<%^e{eboSTJG5GRK{0$>CJWClaHGgqqIouYEBwRsb*2X|Hxx4B+DHjr zZ6F)+Fi33UpMv9wB{k7PE|z0O7q#N!h>OD=JJ9bcWzB!W_9lW(d!m#ttF^fZw}f#$ z@4aBSn-S=24#gCkT2zO>)hv4Y7zDC#Y?ybs%pJa8nTT}R4$&ozZ*OaNuFkZ4g`r#$ zBkgANtR{30vsa(oitae#{jqFHt}ERd{2sEwe{fOh;oOnVbULuJP*iA=DC*7B2laZ8 zpmP|u*CaUe$U71&dP<0}3%s%-wJ+P4?>l@s_fMlV_5-EP+KtO7uT^5Gf%YTp=hW}W zD6NYaSu=(bota<=m1*=W%zJ95 zJr5XO<-X-M`aLBj^hL(gHfLd0?!;MIG-;vp+E&5Z$YStK{%@@X0aV3*?(4NQ+7Ts9 z39kxfj~b-0wId8=K2rsM-1-`Zzbi~V+CgJCD%}=v2fgDD+j4)OtUMDry@1$$-DgEc ze%!z4zd0UpNn=j!)40yk+qYl!lYm_%?EB{*N2okkj#xn9L;-OJ5BHX9*%JOB*+gN* zDP(Jbj37LR9y&70ZVwnNjhYLM*tlaHMpo8;ql?vY>#FMo)!VL_k_HGDRpg`Zj;Gge zP}M)`l6WN8d#vt@W6Q>l%6tm;iGWCCcStO+BD z_#FyV--xVphOitdvctW%_97zm^26#f0-+V5rV&o#D!Qj;5*4$iicw8c=qO>1;J1b zVj%X0LXm=0d6MqciY~ox@8A63xM^m~8=#HCG4Og&h{W&LYWo+s6SGX(-K6u7XxwCF zC8t(?9&jS0LZwioA&vb+gnoSbxw+goMlRI8t%P-nRx)MFsTPKkDY);khZrcaWV;|) z2E||glptVdg+ZKCtfpA;i<5Q?|5E@^j(5V%a>-drSf#5Tvu#x;VH}x!jnrp5FjPDu zrsy~Sc0`F5I-PMO^D@(rf4O=T@=;)VD>XY@KbP<6m7S#HNE6w;1bp4{F}d^a^Qnjw8KjSIgj!?$sg|48q%Nq)-o_P{VQZ5G30;9{mbQ5ee#r) zz!b;d(L$ert(=#kac>o`z^H#jn~sznAsYqG#}^#e>3V(tEA7)3xKIte%*w{hX{%GyzD4%sOH@_Cy!oshf2Pg*bIoKY+c|}T(|q( zuVmw@!rLj_aAPY8Oe!!rMQhctzBunU`;!4N#JVH1-6Di+c=FtUpkPQ}Tt(xK)6Cr7 zTB`juSP9^0_iS|%u9H1|WqCn0UK$}TnZHl+&}l(ki|Py=#19ug7N?jwn0z$JsAcl% zw8=#bpCw#=N$hWP{#cZ{)q@hiWi+7k^-&x_bqmw5#9vGzW5*2CU8#0C4y`O}L{P;G zC2`!B?2Lf^GI>mlth^QJUUFtobC#j-qLqz1RVrfNoKb{%f^^=Jt8c4-|8tj}686op z9mKzq*~#+nR4g3j-&^afPvmVM8!9#RTNA6$`+`py(-T07ILpZE>)8}B0xfWDsc?s;2cSPdt@{3XGZ zM13Lg&j_V|N0L9uPf#9gbYYrE~u7CD&x%5GDm02GvM$I=nwTFhA2*aB?UcdVPsh;8Y8=!j));R}R}NNFAm;pY6#WK(j>sFPeU;RQ_d0d=#^@g4g|{{SDZ zNQ>5|o)U1>{>*L1k~LjPV@fSnHn?H8QivY5qF6C4>FZM 
zk^D{Ts#e-f^Qv^$C+`i|aED{^a$uwhEFy-P^`K2#2dSNI#2VG#`+23gcPK|R8P2knDx_4eu6NvA6B8jtSvNV8+zFN+2f$k9lOFvGUHZ&YmpC}z`79qP*ld`}XzGVsy3E6o%wb z!4VZ+Bx%kU#>yWjC8dkv&hWnBzPiTRX8Wo9j3B?rf-3sgZAv?y_FEcYag@v^$1>?Ua7JIP?Jq&kI2 z4s7#Wt!R|z$DI)jp@Vp=8`&CB=*!~=+eW^-e)7q5MibRcJ?<))iE9)*>>F9uG)4qp ztfZX1*4>z*$tuC&+M`k^jK{V(es57}OZedk-a?ZAP%9XW;%;feu)_21}J{c%wzaA=ckB|`VX&muuXF6&*wGGjGqpHa|GP2dKAkd#`#uxPC zSBh?)n^!PJ`9v~bo*+7^*7Ot_rTkJ=Sc>Ygz}(Q-3+|jCaLeT?nn$t@!vPE5;45B*&*6n@T^;z1>Y%{ z1%Oz;iss4KcSUQGLzkK3Sn~P(YDeWDntJ{}4HrJ;&DObRy85NNI*oICe+g}Zi>+;A zn1_1C#NIWTi7@9*;?FKptI2>Hy2IW$rZ|HkWd`C-x9t7XtiGN|X380|v*=J?qpSiG z;6euZ|E&>#pd5VUEI49L>_-8Chnav#qSm(*gm=LAfp3a^C45c0&swo2WV7q*o-g!_ zRSRx3Ke3#*6h%hy9tH!jCY^mDRQ?LFL{>} z8r9N6l&FgAr6@1vIFIHY))@!%-I~dZRY`;K#Y;TzWpR;7uTW;TjGgPl=5*fr*b^m0 z^8=&7oZwLE5tI8Pe14+jy0wEfd4)sx4i+Sp*^B>`9dA>9iv0t6Gp)R-tD5E6FjO@V z>Tu7vR&><&sZ;%qNL+LcBbH# z#z6R}S@``bc;b(7@ta8?DOP&xTg7WD;xMl7GAggNN?Q<@L2O3fX8}`3Q}@-|!uw3g z;%qCS&2?SDpgrT2xgTQ?VR7^0$4_}>{hky#Y}w!u=*#w>0JYxUAji*7ZzKFLv_)@! zfwnk0`@v}Tcz;lve0sk(RaJ&r5ciSa^X4$8<<827P@S;KhPrUB`~sj(RBQO25dJ^q zwyW?fLS_KO(mNeiA@u8B^D=tqP-7Qhyc#yCeyq2sb?tQ6n0z?o`xG!cHxMi>h^zkm znCUbSy8MlFANy@&S-miT!4fn+TN>rQ*?9PG!um2GlL>0X#(h39vHXwGx%>oZ7OgmJ zG}suP+T7Hmt+)|ZL7uupsS@lrB{Wj(i%uH_bP(UZKAN@^dY>EBz2TT+nGdUw@W<6l zVs}>VRumlT1w9(&EOgYxH0e|fS1chP4OY-%^@99VyS*q?4n=eAexjy&APeqQ3Xm;t zP6nmMdeOozz=SNNba MGr=0aFm#XoKUfQ+`Tzg` diff --git a/docs/chipseq.rst b/docs/chipseq.rst deleted file mode 100644 index 5302e973..00000000 --- a/docs/chipseq.rst +++ /dev/null @@ -1,33 +0,0 @@ -.. _chipseq: - -ChIP-seq workflow ------------------ -The ChIP-seq workflow starts with raw FASTQ files and performs various QC steps. It -aligns and prepares BAM and bigWig files, performs peak-calling, and combines -everything together into a track hub for visualization. 
- -Specifically, the workflow does the following: - - - trims reads with cutadapt - - maps reads with Bowtie2 - - runs FastQC on raw, trimmed, and aligned reads - - Removes multimappers (samtools) and duplicates (Picard MarkDuplicates) - - performs fastq_screen on multiple configured genomes to look for evidence of - cross-contamination - - QC aggregation using MultiQC, along with a custom table for library sizes - - merges technical replicates and then re-deduplicates them - - creates bigWigs from unique, no-dups BAM files - - optionally merges bigWigs to create one signal track for all replicates - - runs deepTools plotFingerprint on grouped IP and input for QC and - evaluation of enrichment - - calls peaks using macs, spp, and/or sicer, with support for multiple - peak-calling runs using different parameters to assist with assessing - performance and to help make decisions for downstream analysis - - optionally runs a template diffBind RMarkdown file used for differential binding analysis - - converts BED files into bigBed (or bigNarrowPeak where possible) - - builds and optionally uploads a track hub of bigWigs and bigBeds to - visualize peak-calling in UCSC Genome Browser - -To configure a ChIP-seq experiment, see :ref:`config-yaml`. - -.. image:: chipseq.png diff --git a/docs/conda.rst b/docs/conda.rst deleted file mode 100644 index 1cf44f84..00000000 --- a/docs/conda.rst +++ /dev/null @@ -1,209 +0,0 @@ -.. _conda-envs: - -conda and conda envs in `lcdb-wf` -================================= - -Conda basics ------------- - -If you're not familiar with ``conda``, it is a way of keeping software isolated -on a computer in an "environment" (basically a directory with the executables -for all the software you want to use). When you "activate" the environment, it -places that location at the beginning of your ``$PATH`` variable, so that any -executables there are found first. 
It does not affect any existing installation -of any software on your machine and does not need root privileges. - -If you don't already have conda installed and the Bioconda channel set up, see -the `Bioconda docs `_ for details. - -You'll also probably want `mamba `_. Mamba -is a drop-in replacement for conda that is faster and more robust. In fact, it -is now the default conda front-end for Snakemake. If you don't already have -mamba, you can install it into your base conda environment with: - -.. code-block:: bash - - conda install -n base -c conda-forge mamba - -It's recommended that you install mamba into the base env (just like conda -itself is) so that it behaves like conda. It does *not* need to be installed -into each individual environment. - - -Building the environments -------------------------- - -**It is recommended that you create a separate environment directory for -each project**, rather than a single environment for all projects. That way you -can update packages in each project independently of any others, and yet the -environment will always be close at hand. This is an especially good practice -in shared space as others can easily find and activate the environment specific -to the project. - -.. note:: - - We recommend using mamba rather than conda for the speed increase and - ability to more correctly solve environments. See the `snakemake docs - `_ - for more info. - - -If you use the ``--build-envs`` argument when deploying lcdb-wf to a project -directory (see :ref:`setup-proj`), two conda environments will be built in the -directories: ``env``, which has all of the non-R requirements, and ``env-r`` -which has the R packages used in particular for downstream RNA-seq analysis. -These environments will use the fully-pinned environments in ``env.yml`` and -``env-r.yml``. If you've already deployed but didn't use the ``--build-envs`` -argument, then then the equivalent command to run in the deployed directory is: - -.. 
code-block:: bash - - mamba env create -p ./env --file env.yml - mamba env create -p ./env-r --file env-r.yml - - -.. _conda-troubleshooting: - -Troubleshooting environments ----------------------------- - -Sometimes there is a problem with creating an environment. For example, the -exact package specified in the env yaml might not be available for some reason -(this should not happen, but in practice sometimes it does in corner cases). - -If this happens, you can try a couple things. - -First, some terminology with how packages are specified in the environment -yamls. Here's an example for ``libpng`` version 1.6.37:: - - libpng=1.6.37=hed695b0_2 - |____| |____| |________| - | | | - name | | - version | - build string - -The package name (libpng) and version (1.6.37) are pretty standard and -self-explanatory. The `build` string refers to different built versions of the -*conda package*, but for the same version (1.6.37 in this case) of the package. -For example, if a conda package was built for version 1.1 of a tool, but that -package itself had an error unrelated to the tool, then a fixed build would be -made. The package version would remain the same (1.1) but the build string -would change. - -In this example, the build string contains a hash ``hed695b0`` which is a hash -of all the pinned dependencies for this package at packaging time. The -`conda-forge pinning docs -`_ give more detail -on what this pinning is about, but basically if that pinning changes then this -hash will change. The ``_2`` on the end of the build string hash indicates that -this is the third built package (build numbers start at zero) for this version -of ``libpng`` using the same pinning. In other words, there also likely exists -``libpng=1.6.37=hed695b0_1`` and ``libpng=1.6.37=hed695b0_0``. At the time of -this writing, there is also ``libpng-1.6.37-h21135ba_2`` (notice the different -hash) which is the same libpng version but uses different pinnings. 
- -What does this mean for troubleshooting? - -For any package that seems to be problematic, try editing the respective -environment yaml (e.g., ``env.yml``) to remove the build string (so in the -example above, you would try changing it to just ``libpng=1.6.37``) and try -building the environment again. If that doesn't work, try removing the version -as well (so just ``libpng``). - -Alternatively for very problematic cases or cases where there are multiple -problematic packages, you can try creating an environment with the "loose" -pinning in ``include/requirements.txt`` which effectively does not require any -particular versions with the exception of a few corner cases. Keep in mind that -using that file may cause the environment to take a while to build as conda (or -mamba) solves the dependencies of all the specified packages. - - -Conda envs in lcdb-wf ---------------------- - -Given all of the software used across all of `lcdb-wf`, the environments can -take a lot of time to build because the solver needs to figure out the entire -dependency tree and come up with a solution that works to satisfy the entire -set of specified requirements. - -We chose to split the conda environments in two: the **main** environment and the **R** -environment (see :ref:`conda-design-decisions`). These environments are -described by both "strict" and "loose" files. By default we use the "strict" -version, which pins all versions of all packages exactly. This is preferred -wherever possible. However we also provide a "loose" version that is not -specific about versions. 
The following table describes these files: - -+----------------+--------------------------------+----------------------------------+ -| strict version | loose version | used for | -+================+================================+==================================+ -| ``env.yml`` | ``include/requirements.txt`` | Main Snakefiles | -+----------------+--------------------------------+----------------------------------+ -| ``env-r.yaml`` | ``include/requirements-r.txt`` | Downstream RNA-seq analysis in R | -+----------------+--------------------------------+----------------------------------+ - -When deploying new instances, use the ``--build-envs`` argument which will use -the strict version. Or use the following commands in a deployed directory: - -.. code-block:: bash - - mamba env create -p ./env --file env.yml - mamba env create -p ./env-r --file env-r.yml - -When getting ready to release a new lcdb-wf version, create a new environment -using the loose version to prepare the env and then when tests pass, export it -to yaml. That is: - -.. code-block:: bash - - # use loose version when preparing a new version of lcdb-wf - mamba create -p ./env --file include/requirements.txt - mamba create -p ./env-r --file include/requirements-r.txt - - # then do testing.... - - # when tests pass, export the envs - conda env export -p ./env > env.yml - conda env export -p ./env-r > env-r.yaml - - # commit, push, finalize release - - -.. _conda-design-decisions: - -Design decisions ----------------- - -We made the design decision to split the conda envs into two different -environments -- one for R, one for non-R. We found that by by removing the -entire sub-DAG of R packages from the main environment we can dramatically -reduce the creation time. - -We also made the decision to use large top-level environments rather than -smaller environments created for each rule using the ``conda:`` directive. -There are two reasons for this choice. 
First, it allows us to activate a single -environment to give us access to all the tools used. This streamlines -troubleshooting because we don't have to dig through the ``.snakemake/conda`` -directory to figure out which hash corresponds to which file, but comes with -the up-front cost of creating the environment initially. Second, it simplifies -running the tests on CircleCI, allowing us to cache the env directories as -a whole to be re-used for multiple tests rather than caching the individual -.snakemake directories for each tested workflow. - -Given that the conda and snakemake ecosystem are in flux, this may change in -the future to using small conda environments for each rule separately if it -turns out to be more beneficial to do so. - -.. note:: - - Prior to v1.7, we used requirements.txt files with loose pinning. Moving to - yaml files allows us the option of also installing pip packages if needed. - It also allows us to specify channels directly in the yaml file for - streamlined installation. - - Using strictly-pinned yaml files that are consistently tested will - hopefully result in a more stable experience for users. For example, if you - happen to create an environment around the time of a new R/Bioconductor - release, the environment may not build correctly using a loose pinning. - Other transient issues in the packaging ecosystem can similarly cause - issues. diff --git a/docs/conf.py b/docs/conf.py index a8c11dc9..2f653095 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -34,7 +34,6 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'generate_guide', 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', 'sphinx.ext.doctest', diff --git a/docs/config-yaml.rst b/docs/config-yaml.rst deleted file mode 100644 index ad1d3fb3..00000000 --- a/docs/config-yaml.rst +++ /dev/null @@ -1,587 +0,0 @@ -.. 
_config-yaml: - -Config YAML -=========== - -This page details the various configuration options and describes how to -configure a new workflow. - -Note that the ``references:`` section is detailed separately, at -:ref:`references-config`. - -Config files are expected to be in a ``config`` directory next to the -the Snakefile. For example, the RNA-seq workflow at -``workflows/rnaseq/Snakefile`` expects the config file -``workflows/rnaseq/config/config.yaml``. - -While it is possible to use Snakemake mechanisms such as ``--config`` to -override a particular config value and ``--configfile`` to update the config -with a different file, it is easiest to edit the existing -``config/config.yaml`` in place. This has the additional benefit of reproducibity -because all of the config information is stored in one place. - -The following table summarizes the config fields, which ones are use for which -workflow, and under what conditions, if any, they are required. Each option -links to a section below with more details on how to use it. - -================================================================================== =================== ================ ================= ========= -Field Used for References Used for RNA-seq Used for ChIP-seq Required -================================================================================== =================== ================ ================= ========= -:ref:`references ` and/or :ref:`include_references ` yes yes yes yes -:ref:`references_dir ` yes yes yes if `REFERENCES_DIR` env var not set -:ref:`sampletable ` . yes yes always -:ref:`organism ` . yes yes always -:ref:`aligner ` . yes yes always -:ref:`stranded ` . yes no usually (see :ref:`stranded `) -:ref:`fastq_screen ` . yes yes if using `fastq_screen` -:ref:`merged_bigwigs ` . yes yes if you want to merge bigwigs -:ref:`gtf ` . yes . always for RNA-seq -:ref:`rrna ` . yes . if rRNA screening desired -:ref:`salmon ` . yes . 
if Salmon quantification will be run -:ref:`chipseq ` . . yes always for ChIP-seq -================================================================================== =================== ================ ================= ========= - -Example configs ---------------- - -To provide an overview, here are some example config files. More detail is -provided later; this is just to provide some context: - -RNA-seq -~~~~~~~ - -The config file for RNA-seq is expected to be in -``workflows/rnaseq/config/config.yaml``: - -.. code-block:: yaml - - references_dir: "/data/references" - sampletable: "config/sampletable.tsv" - organism: 'human' - aligner: - tag: 'gencode-v25' - index: 'hisat2' - rrna: - tag: 'rRNA' - index: 'bowtie2' - gtf: - tag: 'gencode-v25' - - fastq_screen: - - label: Human - organism: human - tag: gencode-v25 - - label: rRNA - organism: human - tag: rRNA - - # Portions have been omitted from "references" section below for - # simplicity; see references config section for details. - - references: - human: - gencode-v25: - genome: - url: 'ftp://.../genome.fa.gz' - indexes: - - 'hisat2' - - 'bowtie2' - annotation: - url: 'ftp://.../annotation.gtf.gz' - - transcriptome: - indexes: - - 'salmon' - - rRNA: - genome: - url: 'https://...' - indexes: - - 'bowtie2' - -ChIP-seq -~~~~~~~~ - -The config file for ChIP-seq is expected to be in -``workflows/chipseq/config/config.yaml``. - -The major differences between ChIP-seq and RNA-seq configs are: - -- ChIP-seq has no ``annotation`` or ``rrna`` fields -- ChIP-seq has an addition section ``chipseq: peak_calling:`` - -.. 
code-block:: yaml - - sampletable: 'config/sampletable.tsv' - organism: 'dmel' - genome: 'dm6' - - aligner: - index: 'bowtie2' - tag: 'test' - - chipseq: - peak_calling: - - - label: gaf-embryo-1 - algorithm: macs - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - - - label: gaf-embryo-1 - algorithm: spp - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - - - label: gaf-wingdisc-pooled - algorithm: macs - ip: - - gaf-wingdisc-1 - - gaf-wingdisc-2 - control: - - input-wingdisc-1 - - input-wingdisc-2 - - - label: gaf-wingdisc-pooled - algorithm: spp - ip: - - gaf-wingdisc-1 - - gaf-wingdisc-2 - control: - - input-wingdisc-1 - - input-wingdisc-2 - - - label: gaf-wingdisc-pooled-1 - algorithm: epic2 - ip: - - gaf-wingdisc-1 - control: - - input-wingdisc-1 - extra: '' - - - label: gaf-wingdisc-pooled-2 - algorithm: epic2 - ip: - - gaf-wingdisc-2 - control: - - input-wingdisc-2 - extra: '' - - fastq_screen: - - label: Human - organism: human - tag: gencode-v25 - - merged_bigwigs: - input-wingdisc: - - input-wingdisc-1 - - input-wingdisc-2 - gaf-wingdisc: - - gaf-wingdisc-1 - - gaf-wingdisc-2 - gaf-embryo: - - gaf-embryo-1 - - - # Portions have been omitted from "references" section below for - # simplicity; see references config section for details. - - references: - human: - gencode-v25: - genome: - url: 'ftp://.../genome.fa.gz' - indexes: - - 'hisat2' - - 'bowtie2' - annotation: - url: 'ftp://.../annotation.gtf.gz' - - fly: - test: - genome: - url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" - postprocess: 'lib.common.gzipped' - indexes: - - 'bowtie2' - - 'hisat2' - - - -Field descriptions ------------------- -Required for references, RNA-seq and ChIP-seq -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _cfg-references: - -``references`` -`````````````` - This section defines labels for references, where to get FASTA and GTF - files and (optionally) post-process them, and which indexes to build. 
- - Briefly, the example above has a single organism configured ("human"). That - organism has two tags ("gencode-v25" and "rRNA"). - - This is the most complex section and is documented elsewhere (see - :ref:`references-config`). - - -.. _cfg-inc-refs: - -``include_references`` -`````````````````````` - - This section can be used to supplement the ``references`` section with - other reference sections stored elsewhere in files. It's a convenient way - of managing a large amount of references without cluttering the config - file. - - See :ref:`references-config` for more. - - -.. _cfg-references-dir: - -``references_dir`` -`````````````````` - Top-level directory in which to create references. - - If not specified, uses the environment variable ``REFERENCES_DIR``. - - If specified and ``REFERENCES_DIR`` also exists, ``REFERENCES_DIR`` takes - precedence. - - This is useful when multiple people in a group share the same references to - avoid duplicating commonly-used references. Simply point references_dir to - an existing references directory to avoid having to rebuild references. - -Required for RNA-seq and ChIP-seq -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _cfg-sampletable: - -``sampletable`` field -````````````````````` - Path to sampletable file which, at minimum, list sample names and paths to - FASTQ files. The path of this filename is relative to the Snakefile. See - :ref:`sampletable` for more info on the expected contents of the file. - - Example: - - .. code-block:: yaml - - sampletable: "config/sampletable.tsv" - -.. _cfg-organism: - -``organism`` field -`````````````````` - This field selects the top-level section of the ``references`` section that - will be used for the analysis. In RNA-seq example above, "human" is the - only organism configured. In the ChIP-seq example, there is "human" as well - as "fly". - - Example: - - .. code-block:: yaml - - organism: "human" - -.. 
_cfg-aligner: - -``aligner`` config section -`````````````````````````` - This field has two sub-fields, and automatically uses the configured - ``organism`` to select the top-level entry in the references section. - ``tag`` selects the tag from the organism to use, and ``index`` selects - which aligner index to use. The relevant option from the example above - would be "gencode-v25", which configures both bowtie2 and hisat2 indexes to - be built. For RNA-seq we would likely choose "hisat2"; for ChIP-seq - "bowtie2". - - Currently-configured options are ``hisat2``, ``bowtie2``, and ``star``. - - Example: - - .. code-block:: yaml - - aligner: - tag: "gencode-v25" - index: "hisat2" - -Required for RNA-seq -~~~~~~~~~~~~~~~~~~~~ - -.. _cfg-stranded: - -``stranded`` field -`````````````````` - This field specifies the strandedness of the library. This is used by - various rule to set the parameters correctly. For example, if this is set to ``fr-firststrand`` then - ``featureCounts`` will use ``-s2``; CollectRnaSeqMetrics will use - ``STRAND=SECOND_READ_TRANSCRIPTION_STRAND``, and deepTools bamCoverage will - use ``-filterRNAstrand reverse``. - - This field can take the following options: - - =================== =========== - value description - =================== =========== - ``unstranded`` The strand that R1 reads align to has no information about the strand of the gene. - ``fr-firststrand`` R1 reads from plus-strand genes align to the *minus* strand. Also called reverse stranded, dUTP-based - ``fr-secondstrand`` R1 reads from plus-strand genes align to the *plus* strand. Also called forward stranded. - =================== =========== - - Example: - - .. code-block:: yaml - - stranded: "fr-firststrand" - - Rules that require information about strand will check the config file at - run time and raise an error if this field doesn't exist. - - -Optional fields -~~~~~~~~~~~~~~~ - -.. 
_cfg-fastq-screen: - -``fastq_screen`` config section -``````````````````````````````` - - This section configures which Bowtie2 indexes should be used with - `fastq_screen`. It takes the form of a list of dictionaries. Each - dictionary has the keys: - - - `label`: how to label the genome in the output - - `organism`: a configured organism. In the example above, there is only a single configured organism, "human". - - `tag`: a configured tag for that organism. - - Each entry in the list must have a Bowtie2 index configured to be built. - - Example: - - .. code-block:: yaml - - fastq_screen: - - label: Human - organism: human - tag: gencode-v25 - - label: rRNA - organism: human - tag: rRNA - - The above example configures two different indexes to use for fastq_screen: - the human gencode-v25 reference, and the human rRNA reference. - -.. _cfg-merged-bigwigs: - -``merged_bigwigs`` config section -````````````````````````````````` - This section controls optional merging of signal files in bigWig format. - Its format differs depending on RNA-seq or ChIP-seq, due to how strands are - handled in those workflows. - - Here is an RNA-seq example: - - .. code-block:: yaml - - merged_bigwigs: - arbitrary_label_to_use: - pos: - - 'sample1' - - 'sample2' - neg: - - 'sample1' - - 'sample2' - - This will result in a single bigWig file called - `arbitrary_label_to_use.bigwig` in the directory - `data/rnaseq_aggregation/merged_bigwigs` (by default; this is configured - using ``config/rnaseq_patterns.yaml``). That file merges together both the - positive and negative signal strands of two samples, `sample1` and `sample2`. The - names "sample1" and "sample2" are sample names defined in the :ref:`sample - table `. - - In other words, if samples 1 and 2 are replicates for a condition, this - gets us a single merged (averaged) track for that condition. - - Here's another RNA-seq example, where we merge the samples again but keep - the strands separate. 
This will result in two output bigwigs. - - .. code-block:: yaml - - merged_bigwigs: - merged_sense: - sense: - - 'sample1' - - 'sample2' - merged_antisense: - antisense: - - 'sample1' - - 'sample2' - - Here is a ChIP-seq example: - - .. code-block:: yaml - - merged_bigwigs: - arbitrary_label_to_use: - - 'label1' - - 'label2' - - This will result in a single bigWig file called - `arbitrary_label_to_use.bigwig` in the directory - `data/chipseq_aggregation/merged_bigwigs` (by default; this is configured - using ``config/chipseq_patterns.yaml``) that merges together the "label1" - and "label2" bigwigs. - - See :ref:`sampletable` for more info on the relationship between a *sample* - and a *label* when working with ChIP-seq. - - -RNA-seq-only fields -~~~~~~~~~~~~~~~~~~~ -.. _cfg-rrna: - -``rrna`` field -``````````````` - - This field selects the reference tag to use for screening rRNA reads. - Similar to the ``aligner`` field, it takes both a ``tag`` and ``index`` - key. The specified index must have been configured to be built for the - specified tag. It uses the already configured ``organism``. - - Example: - - .. code-block:: yaml - - rrna: - tag: 'rRNA' - index: 'bowtie2' - - -.. _cfg-gtf: - -``gtf`` field -````````````` - - This field selects the reference tag to use for counting reads in features. - The tag must have had a ``gtf:`` section specified; see - :ref:`references-config` for details. - - The organism is inherited from the ``organism:`` field. - - Example: - - .. code-block:: yaml - - gtf: - tag: "gencode-v25" - -.. _cfg-salmon: - -``salmon`` field -```````````````` - This field selects the reference tag to use for the Salmon index (if used). - The tag must have had a FASTA configured, and an index for "salmon" must - have been configured to be built for the organism selected with the - ``organism`` config option. - - -ChIP-seq-only fields -~~~~~~~~~~~~~~~~~~~~ - -.. 
_cfg-chipseq: - -``chipseq`` config section -`````````````````````````` - This section configures the peak-calling stage of the ChIP-seq workflow. It - currently expects a single key, ``peak_calling``, which is a list of - peak-calling runs. - - A peak-calling run is a dictionary configuring a single execution of - a peak-caller which results in a single BED file of called peaks. - A peak-calling run is uniquely described by its ``label`` and - ``algorithm``. This way, we can use the same label (e.g., `gaf-embryo-1`) - across multiple peak-callers to help organize the output. - - The currently-supported peak-callers are ``macs``, ``spp``, and ``sicer``. - They each have corresponding wrappers in the ``wrappers`` directory. To add - other peak-callers, see :ref:`new-peak-caller`. - - The track hubs will include all of these called peaks which helps with - assessing the peak-calling performance. - - Here is a minimal example of a peak-calling config section. It defines - a single peak-calling run using the `macs` algorithm. Note that the - ``ip:`` and ``control:`` keys are lists of **labels** from the ChIP-seq - sample table's ``label`` column, **not sample IDs** from the first column. - - .. code-block:: yaml - - chipseq: - peak_calling: - - - label: gaf-embryo-1 - algorithm: macs - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - - The above peak-calling config will result in a file - ``data/chipseq_peaks/macs/gaf-embryo-1/peaks.bed`` (that pattern is - defined in ``chipseq_patterns.yaml`` if you need to change it). - - We can specify additional command-line arguments that are passed verbatim - to `macs` with the ``extra:`` section, for example: - - .. code-block:: yaml - - chipseq: - peak_calling: - - - label: gaf-embryo-1 - algorithm: macs - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - extra: '--nomodel --extsize 147' - - - `macs` supports multiple IP and input files, which internally are merged - by `macs`. 
We can supply multiple IP and input labels for biological - replicates to get a set of peaks called on pooled samples. Note that we - give it a different label so it doesn't overwrite the other peak-calling - run we already have configured. - - .. code-block:: yaml - - chipseq: - peak_calling: - - - label: gaf-embryo-1 - algorithm: macs - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - extra: '--nomodel --extsize 147' - - - - label: gaf-embryo-pooled - algorithm: macs - ip: - - gaf-embryo-1 - - gaf-embryo-2 - control: - - input-embryo-1 - - input-embryo-2 - - - diff --git a/docs/config.rst b/docs/config.rst index 649a3cab..6275af7d 100644 --- a/docs/config.rst +++ b/docs/config.rst @@ -5,75 +5,352 @@ Configuration ============= -General configuration -~~~~~~~~~~~~~~~~~~~~~ +Configuration happens in two places: + +**Config file:** + +- :ref:`rnaseq-config` +- :ref:`chipseq-config` + +**Sampletable:** + +- :ref:`rnaseq-sampletable` +- :ref:`chipseq-sampletable` + + +.. _configfiles: + +Config file +----------- + +Config files, at a minimum, specify which reference FASTA to use (:ref:`reference-config`). + +For RNA-seq (:ref:`rnaseq-config`) the config file also specifies strandedness. + +For ChIP-seq (:ref:`chipseq-config`) the config file specifies peak-calling runs. + +Config files are in YAML format. By default, they are expected to be at +:file:`config/config.yaml`, but you can override from the command line like this:: + + snakemake --configfile="otherdir/myconfig.yaml" ... + +Snakemake will merge the config file(s) given on the command line with the +default config file (:file:`config/config.yaml`). + +.. _reference-config: + +Configuring genome fasta (RNA-seq & ChIP-seq) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Both RNA-seq and ChIP-seq need a reference fasta configured, like this: + +.. code-block:: yaml + + genome: + url: + +The value of ``url`` can be a file, like +``file:///data/references/Homo_sapiens/gencode.fa.gz``, or any FTP or HTTP URL. 
+ + +You could optionally use the included reference configs to fill in the genome +and annotation from the commandline, and Snakemake would be called like this:: + + snakemake --configfile=../../include/reference_config_templates/Homo_sapiens/GENCODE.yaml ... + +Or you could copy the contents of the reference config templates and paste in +your own :file:`config/config.yaml`. + + +- url can be file +- postprocessing +- overrides +- included reference configs + + +RNA-seq config +~~~~~~~~~~~~~~ + +For RNA-seq, in addition to the genome fasta file described above, you also need: + +- ``annotation``, structured similar to ``genome``, which specifies a gzipped + GTF file. A transcriptome fasta is automatically built from the genome fasta + and this GTF. +- ``organism`` which will be used to screen ribosomal RNA. Technically, this is + searching for the string in the SILVA rRNA database's fasta records. +- ``stranded`` of the libraries, which is used for automatically + configuring strand-specific tools. The options are: + - ``fr-firststrand`` for dUTP libraries + - ``fr-secondstrand`` for ligation libraries + - ``unstranded`` for libraries without strand specificity. + +See https://rnabio.org/module-09-appendix/0009/12/01/StrandSettings for more +info on strandedness. If you don't know ahead of time, you can use +``fr-firststrand`` and inspect the results for RSeQC's infer_experiment in the +MultiQC output. Correct the strandedness in the config, and re-run. Only the +jobs affected by strandedness will be re-run. + +Here is an example for human: + +.. 
code-block:: yaml + + organism: "Homo sapiens" + genome: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/GRCh38.primary_assembly.genome.fa.gz" + annotation: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/gencode.v49.primary_assembly.annotation.gtf.gz" + stranded: "fr-firststrand" + +In :file:`include/reference_configs` you can find configs for common model +organisms. These have both genome and annotation, so you can point Snakemake to +them on the command line. You would still need to specify strandedness, which +can be the only config entry in :file:`config/config.yaml`. Or it could be +specified directly on the command line, like this: + +.. code-block:: bash + + snakemake \ + --configfile=../../include/reference_configs/Homo_sapiens/GENCODE.yaml \ + --config stranded=fr-firststrand + +(in this case no separate :file:`config/config.yaml` would be needed, as long +as you use the default :file:`config/sampletable.tsv` as your sampletable) + + +ChIP-seq config +~~~~~~~~~~~~~~~ + +For ChIP-seq, in addition to the genome fasta file described above, you also +need a peak-calling section if you want to run peak-calling. + +The idea is that the ``peak_calling:`` entry in the config is a list. Each item +in the list is a dictionary with the following keys: -The majority of the work in setting up a new project is in the configuration -- -which samples to run, where the data files are located, which references are -needed, etc. +- ``label`` for the peak-calling run. This is intentionally free-form since you + may want to run the same samples through multiple algorithms or different + parameters. Output will be in :file:`data/peak_calling//