From 4d1222c72a25a6d84f8f27543742d4fddbf51e3b Mon Sep 17 00:00:00 2001 From: fridellsa Date: Thu, 1 Jun 2023 18:15:22 -0400 Subject: [PATCH 01/59] Initial variant calling workflow --- env.yml | 204 ++-- .../reference_configs/variant-calling.yaml | 30 + include/requirements.txt | 11 +- lib/aligners.py | 25 + lib/common.py | 34 +- lib/helpers.smk | 335 +++++++ workflows/references/Snakefile | 185 +++- workflows/references/config/config.yaml | 6 +- workflows/variant-calling/Snakefile | 911 ++++++++++++++++++ workflows/variant-calling/config/config.yaml | 74 ++ workflows/variant-calling/config/samples.tsv | 3 + workflows/variant-calling/config/units.tsv | 4 + 12 files changed, 1707 insertions(+), 115 deletions(-) create mode 100644 include/reference_configs/variant-calling.yaml create mode 100644 lib/helpers.smk create mode 100644 workflows/variant-calling/Snakefile create mode 100644 workflows/variant-calling/config/config.yaml create mode 100644 workflows/variant-calling/config/samples.tsv create mode 100644 workflows/variant-calling/config/units.tsv diff --git a/env.yml b/env.yml index 10696a3a6..ee82a16ed 100644 --- a/env.yml +++ b/env.yml @@ -1,4 +1,4 @@ -name: null +name: /gpfs/gsfs10/users/NICHD-core0/test/fridellsa/v1.10-lcdbwf-vc/lcdb-wf/env channels: - conda-forge - bioconda @@ -6,7 +6,7 @@ dependencies: - _libgcc_mutex=0.1 - _openmp_mutex=4.5 - _r-mutex=1.0.1 - - alsa-lib=1.2.3.2 + - alsa-lib=1.2.8 - amply=0.1.5 - appdirs=1.4.4 - argcomplete=3.0.8 @@ -17,20 +17,20 @@ dependencies: - backcall=0.2.0 - backports=1.0 - backports.functools_lru_cache=1.6.4 + - bcftools=1.17 - bedtools=2.31.0 - - binutils_impl_linux-64=2.39 - - binutils_linux-64=2.39 + - binutils_impl_linux-64=2.40 - biopython=1.81 - - boost-cpp=1.74.0 + - boost-cpp=1.78.0 - bowtie=1.3.1 - bowtie2=2.5.1 - brotli=1.0.9 - brotli-bin=1.0.9 - - brotlipy=0.7.0 + - bwa=0.7.17 - bwidget=1.9.14 - bx-python=0.9.0 - bzip2=1.0.8 - - c-ares=1.18.1 + - c-ares=1.19.1 - ca-certificates=2023.5.7 - cairo=1.16.0 - 
certifi=2023.5.7 @@ -49,8 +49,7 @@ dependencies: - configargparse=1.5.3 - connection_pool=0.0.3 - contourpy=1.0.7 - - cryptography=39.0.0 - - curl=7.86.0 + - curl=7.87.0 - cutadapt=4.4 - cycler=0.11.0 - datrie=0.8.2 @@ -59,8 +58,8 @@ dependencies: - deeptools=3.5.2 - deeptoolsintervals=0.1.9 - dnaio=0.10.0 - - docutils=0.20 - - dpath=2.1.5 + - docutils=0.20.1 + - dpath=2.1.6 - exceptiongroup=1.1.1 - execnet=1.9.0 - executing=1.2.0 @@ -81,30 +80,30 @@ dependencies: - fribidi=1.0.10 - future=0.18.3 - gat=1.3.6 - - gcc_impl_linux-64=10.4.0 - - gcc_linux-64=10.4.0 + - gatk4=4.4.0.0 + - gcc_impl_linux-64=12.2.0 - gettext=0.21.1 - gffread=0.12.7 - gffutils=0.11.1 - - gfortran_impl_linux-64=10.4.0 - - gfortran_linux-64=10.4.0 + - gfortran_impl_linux-64=12.2.0 - giflib=5.2.1 - gitdb=4.0.10 - gitpython=3.1.31 - - glib=2.74.1 - - glib-tools=2.74.1 + - glib=2.76.3 + - glib-tools=2.76.3 + - gmp=6.2.1 - graphite2=1.3.13 - gsl=2.7 - - gst-plugins-base=1.18.5 - - gstreamer=1.20.3 - - gxx_impl_linux-64=10.4.0 - - gxx_linux-64=10.4.0 - - harfbuzz=4.2.0 + - gst-plugins-base=1.21.3 + - gstreamer=1.21.3 + - gstreamer-orc=0.4.33 + - gxx_impl_linux-64=12.2.0 + - harfbuzz=6.0.0 - hdf5=1.12.1 - hisat2=2.2.1 - - htslib=1.16 + - htslib=1.17 - humanfriendly=10.0 - - icu=69.1 + - icu=70.1 - idna=3.4 - importlib-metadata=6.6.0 - importlib_resources=5.12.0 @@ -112,7 +111,7 @@ dependencies: - intervalstats=1.01 - ipython=8.13.2 - isa-l=2.30.0 - - jack=1.9.18 + - jack=1.9.22 - jedi=0.18.2 - jinja2=3.1.2 - jpeg=9e @@ -122,19 +121,21 @@ dependencies: - kernel-headers_linux-64=2.6.32 - keyutils=1.6.1 - kiwisolver=1.4.4 - - krb5=1.19.3 + - krb5=1.20.1 + - lame=3.100 - lcms2=2.14 - - ld_impl_linux-64=2.39 + - ld_impl_linux-64=2.40 - lerc=4.0.0 - libblas=3.9.0 - libbrotlicommon=1.0.9 - libbrotlidec=1.0.9 - libbrotlienc=1.0.9 - - libcap=2.64 + - libcap=2.66 - libcblas=3.9.0 - - libclang=13.0.1 + - libclang=15.0.7 + - libclang13=15.0.7 - libcups=2.3.3 - - libcurl=7.86.0 + - libcurl=7.87.0 - libdb=6.2.32 
- libdeflate=1.13 - libedit=3.1.20191231 @@ -142,33 +143,36 @@ dependencies: - libevent=2.1.10 - libexpat=2.5.0 - libffi=3.4.2 - - libflac=1.3.4 - - libgcc-devel_linux-64=10.4.0 + - libflac=1.4.2 + - libgcc-devel_linux-64=12.2.0 - libgcc-ng=12.2.0 + - libgcrypt=1.10.1 - libgd=2.3.3 - libgfortran-ng=12.2.0 - libgfortran5=12.2.0 - - libglib=2.74.1 + - libglib=2.76.3 - libgomp=12.2.0 - - libhwloc=2.8.0 + - libgpg-error=1.46 + - libhwloc=2.9.1 - libiconv=1.17 - libjemalloc=5.3.0 - liblapack=3.9.0 - liblapacke=3.9.0 - - libllvm13=13.0.1 + - libllvm15=15.0.7 - libnghttp2=1.51.0 - libnsl=2.0.0 - libogg=1.3.4 - libopenblas=0.3.21 - libopus=1.3.1 - libpng=1.6.39 - - libpq=14.5 - - libsanitizer=10.4.0 - - libsndfile=1.0.31 - - libsqlite=3.41.2 + - libpq=15.1 + - libsanitizer=12.2.0 + - libsndfile=1.2.0 + - libsqlite=3.42.0 - libssh2=1.10.0 - - libstdcxx-devel_linux-64=10.4.0 + - libstdcxx-devel_linux-64=12.2.0 - libstdcxx-ng=12.2.0 + - libsystemd0=252 - libtiff=4.4.0 - libtool=2.4.7 - libudev1=253 @@ -177,9 +181,10 @@ dependencies: - libwebp=1.2.4 - libwebp-base=1.2.4 - libxcb=1.13 - - libxkbcommon=1.0.3 - - libxml2=2.9.14 + - libxkbcommon=1.5.0 + - libxml2=2.10.3 - libzlib=1.2.13 + - lz4-c=1.9.4 - lzo=2.10 - lzstring=1.0.4 - make=4.3 @@ -190,65 +195,33 @@ dependencies: - matplotlib-base=3.7.1 - matplotlib-inline=0.1.6 - mdurl=0.1.0 + - mpg123=1.31.3 - multiqc=1.14 - munkres=1.1.4 - mysql-common=8.0.32 - mysql-connector-c=6.1.11 - mysql-libs=8.0.32 - nbformat=5.8.0 - - ncbi-vdb=3.0.2 - ncurses=6.3 - networkx=3.1 - nspr=4.35 - nss=3.89 - numpy=1.23.5 - - openjdk=11.0.1 + - openjdk=17.0.3 - openjpeg=2.5.0 - openssl=1.1.1t - - ossuuid=1.6.2 - packaging=23.1 - pandas=2.0.1 - pandoc=3.1.2 - - pango=1.50.7 + - pango=1.50.14 - parso=0.8.3 - patsy=0.5.3 - pbzip2=1.1.13 - - pcre2=10.37 + - pcre2=10.40 - perl=5.32.1 - - perl-alien-build=2.48 - - perl-alien-libxml2=0.17 - - perl-business-isbn=3.007 - - perl-business-isbn-data=20210112.006 - - perl-capture-tiny=0.48 - - perl-carp=1.50 - 
- perl-constant=1.33 - - perl-data-dumper=2.183 - - perl-encode=3.19 - - perl-exporter=5.74 - - perl-extutils-makemaker=7.70 - - perl-ffi-checklib=0.28 - - perl-file-chdir=0.1011 - - perl-file-path=2.18 - - perl-file-temp=0.2304 - - perl-file-which=1.24 - perl-gd=2.76 - perl-gdgraph=1.54 - perl-gdtextutil=0.86 - - perl-importer=0.026 - - perl-mime-base64=3.16 - - perl-parent=0.241 - - perl-path-tiny=0.124 - - perl-pathtools=3.75 - - perl-scope-guard=0.21 - - perl-storable=3.15 - - perl-sub-info=0.002 - - perl-term-table=0.016 - - perl-test2-suite=0.000145 - - perl-uri=5.12 - - perl-xml-libxml=2.0207 - - perl-xml-namespacesupport=1.12 - - perl-xml-sax=1.02 - - perl-xml-sax-base=1.09 - pexpect=4.8.0 - picard=2.27.5 - pickleshare=0.7.5 @@ -261,6 +234,7 @@ dependencies: - platformdirs=3.5.1 - plotly=5.14.1 - pluggy=1.0.0 + - ply=3.11 - pooch=1.7.0 - preseq=3.2.0 - prompt-toolkit=3.0.38 @@ -269,7 +243,7 @@ dependencies: - pthread-stubs=0.4 - ptyprocess=0.7.0 - pulp=2.7.0 - - pulseaudio=14.0 + - pulseaudio=16.1 - pure_eval=0.2.2 - py2bit=0.3.0 - pybedtools=0.9.0 @@ -277,57 +251,59 @@ dependencies: - pycparser=2.21 - pyfaidx=0.7.2.1 - pygments=2.15.1 - - pyopenssl=23.1.1 - pyparsing=3.0.9 - - pyqt=5.15.4 - - pyqt5-sip=12.9.0 + - pyqt=5.15.7 + - pyqt5-sip=12.11.0 - pyrsistent=0.19.3 - - pysam=0.20.0 + - pysam=0.21.0 - pysocks=1.7.1 - pytest=7.3.1 - - pytest-xdist=3.2.1 + - pytest-xdist=3.3.1 - python=3.10.8 - python-dateutil=2.8.2 - - python-fastjsonschema=2.16.3 + - python-fastjsonschema=2.17.1 - python-isal=1.1.0 - - python-lzo=1.14 + - python-lzo=1.15 - python-tzdata=2023.3 - python_abi=3.10 - pytz=2023.3 - pyvcf3=1.0.3 - pyyaml=6.0 - - qt-main=5.15.2 - - r-base=4.1.3 + - qt-main=5.15.6 + - r-base=4.2.2 - readline=8.2 - - requests=2.29.0 + - requests=2.31.0 - reretry=0.11.8 - rich=13.3.5 - rich-click=1.6.1 - rseqc=5.0.1 + - rust-bio-tools=0.42.0 - salmon=1.10.1 - - samtools=1.16.1 + - samtools=1.17 - scipy=1.10.1 - seaborn=0.12.2 - seaborn-base=0.12.2 - sed=4.8 - 
setuptools=67.7.2 - simplejson=3.19.1 - - sip=6.5.1 + - sip=6.7.9 - six=1.16.0 - smart_open=6.3.0 - smmap=3.0.5 - - snakemake-minimal=7.25.3 + - snakemake-minimal=7.26.0 + - snpeff=5.1 + - snpsift=5.1 - spectra=0.0.11 - - sqlite=3.41.2 - - sra-tools=3.0.3 + - sra-tools=2.9.6 - stack_data=0.6.2 - star=2.7.10b + - starcode=1.4 - statsmodels=0.14.0 - stopit=1.1.2 - subread=2.0.3 - sysroot_linux-64=2.12 - tabulate=0.9.0 - - tbb=2021.7.0 + - tbb=2021.9.0 - tenacity=8.2.2 - throttler=1.2.1 - tk=8.6.12 @@ -335,38 +311,50 @@ dependencies: - toml=0.10.2 - tomli=2.0.1 - toposort=1.10 - - tornado=6.3 + - tornado=6.3.2 - trackhub=0.2.4 - traitlets=5.9.0 - - typing-extensions=4.5.0 - - typing_extensions=4.5.0 + - typing-extensions=4.6.1 + - typing_extensions=4.6.1 - tzdata=2023c - - ucsc-bedgraphtobigwig=377 + - ucsc-bedgraphtobigwig=445 - ucsc-bedsort=377 - - ucsc-bedtobigbed=377 + - ucsc-bedtobigbed=447 - ucsc-bigwigmerge=377 - ucsc-fetchchromsizes=377 - - ucsc-genepredtobed=377 - - ucsc-gtftogenepred=377 - - ucsc-liftover=377 + - ucsc-genepredtobed=447 + - ucsc-gtftogenepred=447 + - ucsc-liftover=447 - ucsc-oligomatch=377 - - ucsc-twobittofa=377 - - ucsc-wigtobigwig=377 + - ucsc-twobittofa=447 + - ucsc-wigtobigwig=447 - unicodedata2=15.0.0 - - urllib3=1.26.15 + - urllib3=2.0.2 - wcwidth=0.2.6 - wheel=0.40.0 - wrapt=1.15.0 + - xcb-util=0.4.0 + - xcb-util-image=0.4.0 + - xcb-util-keysyms=0.4.0 + - xcb-util-renderutil=0.3.9 + - xcb-util-wm=0.4.1 + - xkeyboard-config=2.38 - xopen=1.7.0 + - xorg-fixesproto=5.0 + - xorg-inputproto=2.3.2 - xorg-kbproto=1.0.7 - xorg-libice=1.0.10 - xorg-libsm=1.2.3 - xorg-libx11=1.8.4 - - xorg-libxau=1.0.9 + - xorg-libxau=1.0.11 - xorg-libxdmcp=1.1.3 - xorg-libxext=1.3.4 + - xorg-libxfixes=5.0.3 + - xorg-libxi=1.7.10 - xorg-libxrender=0.9.10 - xorg-libxt=1.2.1 + - xorg-libxtst=1.2.3 + - xorg-recordproto=1.14.2 - xorg-renderproto=0.11.1 - xorg-xextproto=7.3.0 - xorg-xproto=7.0.31 @@ -377,4 +365,4 @@ dependencies: - zlib=1.2.13 - zstandard=0.19.0 - 
zstd=1.5.2 -prefix: /gpfs/gsfs10/users/NICHD-core0/test/dalerr/lcdb-wf/env +prefix: /gpfs/gsfs10/users/NICHD-core0/test/fridellsa/v1.10-lcdbwf-vc/lcdb-wf/env diff --git a/include/reference_configs/variant-calling.yaml b/include/reference_configs/variant-calling.yaml new file mode 100644 index 000000000..6ad0f9b48 --- /dev/null +++ b/include/reference_configs/variant-calling.yaml @@ -0,0 +1,30 @@ +references: + human: + ensembl-104: + metadata: + build: 'GRCh38' + release: 104 + species: 'homo_sapiens' + genome: + url: 'ftp://ftp.ensembl.org/pub/release-104/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz' + # URL format is 'ftp://ftp.ensembl.org/pub/{branch}release-{release}/fasta/{species}/{datatype}/{species_capitalized}.{build}.{datatype}.{assembly}.{suffix}' + # When using GRCh37, branch changes to "grch37/release-{release}" + # always use primary_assembly for human, NEVER use top_level for assembly for human + indexes: + - 'bwa' + - 'faidx' + known: + # You can download structural_variations, somatic, or "all" which corresponds to germline known variation for all chromosomes + type: 'all' + # Comment the variation key out if not requiring dbnsfp + #variation: + # Download of variation databases will be handled by a unique rule in the Snakefile + # ONLY include keys like 'dbnsfp' IF you intend to download them, comment out these keys if not. 
+ #dbnsfp: + # The version of the database should be correctly formatted like this + #version: 'dbNSFPv4.4' + # The url is found by copying the link address of the latest version found here: https://sites.google.com/site/jpopgen/dbNSFP + #url: 'https://usf.box.com/shared/static/bvfzmkpgtphvbmmrvb2iyl2jl21o49kc' + # Match the build to the metadata block above + #build: 'GRCh38' + diff --git a/include/requirements.txt b/include/requirements.txt index 5f1b4a8e3..188dbe289 100644 --- a/include/requirements.txt +++ b/include/requirements.txt @@ -1,13 +1,17 @@ +bcftools>=1.15.1 bedtools biopython bowtie bowtie2 -cutadapt +bwa +curl +cutadapt>=3.0 deeptools fastq-screen fastqc font-ttf-dejavu-sans-mono gat +gatk4 gffread gffutils hisat2 @@ -27,14 +31,17 @@ pyfaidx pysam pytest pytest-xdist +python>=3.10 rseqc +rust-bio-tools # earlier versions of salmon can segfault on Slurm salmon>=1.10.1 - samtools seaborn snakemake-minimal +snpeff +snpsift sra-tools star subread diff --git a/lib/aligners.py b/lib/aligners.py index 62fe58a57..629a3eb17 100644 --- a/lib/aligners.py +++ b/lib/aligners.py @@ -83,3 +83,28 @@ def fastq_arg_from_input(fastqs): fastqs = '-1 {0} -2 {1} '.format(*fastqs) return fastqs +def bwa_index_from_prefix(prefix): + """ + Given a prefix, return a list of the corresponding bwa index files + """ + ext_list = ["amb", "ann", "bwt", "pac", "sa"] + return ['{prefix}.{ext}'.format(prefix=prefix, ext=ext_list[i]) for i in range(len(ext_list))] + +def bwa_prefix_from_index(index_files): + """ + Given a list of index files, return the corresponding prefix + """ + if isinstance(index_files, str): + return '.'.join(index_files.split('.')[:-1]) + else: + prefixes = list( + set( + map( + lambda x: '.'.join(x.split('.')[:-1]), index_files) + ) + ) + if len(prefixes) != 1: + raise ValueError( + "More than one prefix detected from '{0}'".format(prefixes) + ) + return prefixes[0] diff --git a/lib/common.py b/lib/common.py index 829cc1298..653dc2f6c 100644 --- 
a/lib/common.py +++ b/lib/common.py @@ -419,6 +419,9 @@ def references_dict(config): 'bowtie2': aligners.bowtie2_index_from_prefix('')[0], 'hisat2': aligners.hisat2_index_from_prefix('')[0], 'star': '/Genome', + # Add BWA and samtools faidx indices + 'bwa': aligners.bwa_index_from_prefix('')[0], + 'faidx': '.fai', # Notes on salmon indexing: # - pre-1.0 versions had hash.bin @@ -451,13 +454,40 @@ def references_dict(config): type_extensions = { 'genome': 'fasta', 'annotation': 'gtf', - 'transcriptome': 'fasta' + 'transcriptome': 'fasta', + 'known': 'vcf.gz' } for organism in merged_references.keys(): d[organism] = {} for tag in merged_references[organism].keys(): e = {} + # add support for variation databases + if tag == 'variation': + # get the variation databases + # they should be the the keys of a dictionary containing a URL and postprocess block + for type_ in merged_references[organism][tag].keys(): + ext = '.vcf.gz' + if type_ == 'dbnsfp': + type_ = merged_references[organism][tag][type_]['version'] + '_' + merged_references[organism][tag][type_]['build'] + e[type_] = ( + '{references_dir}/' + '{organism}/' + '{tag}/' + '{type_}/' + '{organism}_{tag}{ext}'.format(**locals()) + ) + d[organism][tag] = e + continue + e[type_] = ( + '{references_dir}/' + '{organism}/' + '{tag}/' + '{type_}/' + '{organism}_{tag}{ext}'.format(**locals()) + ) + d[organism][tag] = e + continue for type_, block in merged_references[organism][tag].items(): if type_ == 'metadata': continue @@ -546,6 +576,7 @@ def references_dict(config): '{type_}/' '{organism}_{tag}.chromsizes'.format(**locals()) ) + d[organism][tag] = e return d, conversion_kwargs @@ -912,3 +943,4 @@ def gff2gtf(gff, gtf): shell('gzip -d -S .gz.0.tmp {gff} -c | gffread - -T -o- | gzip -c > {gtf}') else: shell('gffread {gff} -T -o- | gzip -c > {gtf}') + diff --git a/lib/helpers.smk b/lib/helpers.smk new file mode 100644 index 000000000..63e717932 --- /dev/null +++ b/lib/helpers.smk @@ -0,0 +1,335 @@ +import pandas as 
pd +import yaml +import os + +# Read sample table +samples = pd.read_table(config["samples"], dtype=str).set_index("sample", drop=False) +units = pd.read_table(config["units"], dtype=str).set_index(["sample","unit"], drop=False) +units.index = units.index.set_levels([i for i in units.index.levels]) + +def preflight(): + """ + This helper function gets called at the top of the main Snakefile. It + handles reading the config to see if references are provided externally, or + if we are relying on lcdb-wf references. Returns variables containing + filepaths of references to be used in rules. It will also perform some + checks to make sure the config is not contradicting itself under certain + configurations. + """ + aln_index = [] + dbnsfp = [] + dictionary = [] + indexed = [] + known_sites = [] + reference = [] + + # Handle reference names if LCDB-WF References is ran + if config['ref']['use_references_workflow']: + include: '../references/Snakefile' + refdict = common.references_dict(config) + reference = refdict[config['ref']['organism'][config['ref']['genome']['tag']]['genome']] + aln = refdict[config['ref']['organism'][config['ref']['aligner']['tag']]['bwa']] + aln_index = multiext(os.path.splitext(aln)[0], ".amb", ".ann", ".bwt", ".pac", ".sa") + indexed = refdict[config['ref']['organism'][config['ref']['faidx']['tag']]['faidx']] + if config['ref']['variation']['dbnsfp']: + dbnsfp = refdict[config['ref']['organism']]['variation'][str(config['ref']['variation']['dbnsfp'] + '_' + config['ref']['genome']['build'])] + else: + dbnsfp = [] + if config['ref']['variation']['known']: + known_sites = refdict[config['ref']['organism']][config['ref']['genome']['tag']][config['ref']['variation']['known']] + else: + known_sites = [] + else: + known_sites = ( + config['ref']['paths']['known'] + if config['ref']['paths']['known'] + else [] + ) + reference = config['ref']['paths']['ref'] + indexed = ( + config['ref']['paths']['index'] + if config['ref']['paths']['index'] + else 
reference + '.fai' + ) + dbnsfp = ( + config['ref']['paths']['dbnsfp'] + if config['ref']['paths']['dbnsfp'] + else [] + ) + aln_index = [] + + # Handle dictionary name, stop the workflow if the fasta file is not named properly. Stop the workflow if there is no reference + if reference == []: + raise ValueError("You must supply a reference file to workflow.") + if reference.endswith('.gz'): + dictionary = '.'.join(reference.split('.')[:-2]) + '.dict' + else: + try: + dictionary ='.'.join(reference.split('.')[:-1]) + '.dict' + # If there is no exception, python will raise a TypeError trying to concatenate an empty list with str + except TypeError: + raise ValueError("There is something wrong with your reference extension. " + "Please make sure your reference has an extension") + # Stop the workflow easily if there is no known variation, but bqsr is set in the config + if config['filtering']['bqsr'] == True: + assert known_sites != [], 'Check your config.yaml. You are requiring that bqsr be run, but there is no known sites vcf' + + return aln_index, dbnsfp, dictionary, indexed, known_sites, reference + + +def get_contigs(): + """ + Helper function to read the contigs from the fasta index checkpoint rule. + These contigs define the regions to split variant calling by for joint-calling. + """ + with checkpoints.genome_index.get().output[0].open() as fai: + ser = pd.read_table(fai, header=None, usecols=[0], dtype=str) + ser = ser.squeeze() + # TODO: make this less brittle, and better support non-Ensembl organisms + # Remove all contigs that don't correspond to a chromosome + ser = ser[ser.apply(lambda x: len(x)) <= 2] + # Remove mitochondiral if specified in the config + if config["processing"]["remove-mitochondrial"]: + return ser[ser != "MT"] + else: + return ser + + +def get_fastq(wildcards): + """ + Get fastq files of given sample-unit. Sample-unit structure is how technical replicates + are handled. This is defined in the sampletable. 
+ """ + fastqs = units.loc[(wildcards.sample, wildcards.unit), ["fq1", "fq2"]].dropna() + if len(fastqs) == 2: + return {"r1": fastqs.fq1, "r2": fastqs.fq2} + return {"r1": fastqs.fq1} + + +def get_read_group(wildcards): + """Denote sample name and platform in read group.""" + return "-R '@RG\\tID:{sample}\\tSM:{sample}\\tPL:{platform}'".format( + sample=wildcards.sample, + platform=units.loc[(wildcards.sample, wildcards.unit), "platform"], + ) + + +def get_recal_input(bai=False): + """ + Handle providing bams the input of the bqsr rules. + Read config options to determine the appropriate bam and bam index files. + If we don't remove duplicates, return the sorted bams from map reads rule. + If duplicates are removed, return the deduplicated bams from the mark duplicates rule. + If a bed file is used in variant calling + """ + # Case 1: no duplicate removal + f = "results/mapped/{sample}-{unit}.sorted.bam" + if config["processing"]["remove-duplicates"]: + # Case 2: remove duplicates + f = "results/dedup/{sample}-{unit}.bam" + if bai: + if config["processing"]["restrict-regions"]: + # Case 3: need an index because random access is required + f += ".bai" + return f + else: + # Case 4: no index needed + return [] + else: + return f + + +def get_sample_bams(wildcards): + """ + Get all aligned reads of given sample. Return the recal bams if bqsr is run + otherwise return the dedup bams. We return all units for a given sample because + we want to provide technical replicates to the variant calling rule where this is called + """ + unitlist = units.loc[wildcards.sample].unit + reslist = [] + if config['filtering']['bqsr']: + reslist.extend( + [ + "results/recal/{}-{}.bam".format(wildcards.sample, unit) for unit in unitlist + ] + ) + else: + reslist.extend( + [ + "results/dedup/{}-{}.bam".format(wildcards.sample, unit) for unit in unitlist + ] + ) + + return reslist + + +def get_sample_unit_bams(wildcards): + """ + Get all aligned reads of given sample. 
Unlike the function above, we return a single sample-unit combination per function call. + This is because this function is used to QC rules like samtools-stats where we do not want to combine technical replicates. + Return the recal bams if bqsr is run otherwise return the dedup bams + """ + reslist = '' + if config['filtering']['bqsr']: + reslist = "results/recal/{sample}-{unit}.bam".format(sample=wildcards.sample, unit=wildcards.unit) + else: + reslist = "results/dedup/{sample}-{unit}.bam".format(sample=wildcards.sample, unit=wildcards.unit) + return reslist + + +def get_regions_param(regions=config["processing"]["restrict-regions"], default=""): + """ + If a captured regions bedfile is present, split the variant calling up into regions + follwing GATK best practices + """ + if regions: + params = "--intervals '{}' ".format(regions) + padding = config["processing"].get("region-padding") + if padding: + params += "--interval-padding {}".format(padding) + return params + return default + + +def get_call_variants_params(wildcards, input): + """ + Calls the previous function to assemble the regions into interval lists + along with any specified parameters for variant calling in the config + """ + return ( + get_regions_param( + regions=input.regions, default="--intervals {}".format(wildcards.contig) + ) + ) + + +def set_java_opts(resources): + """ + Using the resources directive from the snakemake rule + set the heap size. Request 75 percent of the requested + mem_mb. 
The remaining 25 percent should be enough for + OS and other system processes that occur outside the shell command + """ + heap = int(resources.mem_mb * 0.75) + heap = int(heap / 1024) + java_temp ='''"-Xmx{}g -Djava.io.tmpdir=$TMPDIR\"''' + java_opts = java_temp.format(heap) + return java_opts + +def all_input_mutect2(): + """ + Format the input for the all rule for mutect2 + """ + comparisons = config['mutect2'].keys() + return expand("results/mutect2_annotated_normed/{comp}.vcf.gz", comp=comparisons) + + +def names_for_somatic(wildcards): + """ + Format the names into arguments to pass to mutect2. + Mutect2 requires you to specify the names of the "normal" samples. + There can be multiple normal samples in a single mutect2 call. + Tumor samples do not need to be named. This will be done by reading + from the config. + """ + comp = wildcards.comp + normals = config['mutect2'][comp]['normal'] + if not isinstance(normals, list): + normals = [normals] + return normals + + +def input_for_somatic(wildcards): + """ + Format the bam input for mutect2 by reading from the config. + Technical replicates are separated and grouped. 
Returns a dictionary + contains the reference genome, sequence dictionary, and input bams + """ + comp = wildcards.comp + normals = config['mutect2'][comp]['normal'] + if not isinstance(normals, list): + normals = [normals] + tumors = config['mutect2'][comp]['tumor'] + if not isinstance(tumors, list): + tumors = [tumors] + # Fill these lists with paths to tumor and normal files + t_files = [] + n_files = [] + for i in range(len(tumors)): + # Get the unit for each tumor sample + unitlist = units.loc[tumors[i]].unit + if config['filtering']['bqsr']: + t_files.extend( + [ + "results/recal/{}-{}.bam".format(tumors[i], unit) for unit in unitlist + ] + ) + else: + t_files.extend( + [ + "results/dedup/{}-{}.bam".format(tumors[i], unit) for unit in unitlist + ] + ) + # Do the same for Normals + for i in range(len(normals)): + unitlist = units.loc[normals[i]].unit + if config['filtering']['bqsr']: + n_files.extend( + [ + "results/recal/{}-{}.bam".format(normals[i], unit) for unit in unitlist + ] + ) + else: + n_files.extend( + [ + "results/dedup/{}-{}.bam".format(normals[i], unit) for unit in unitlist + ] + ) + + + # Put all the input files needed into a dictionary to pass to the rule + d = dict( + ref=reference, + normals=n_files, + tumors=t_files, + dict=dictionary, + regions=( + "results/called/{contig}.regions.bed".format(contig = wildcards.contig) + if config["processing"]["restrict-regions"] + else [] + ), + ) + return d + + +def get_fai_nomenclature(): + """ + Helper function to get the nomenclature of the fasta index + Returns True if the chr prefix is present, and False if it is absent + """ + nom = False + with checkpoints.genome_index.get().output[0].open() as fai: + for line in fai: + if line.startswith('chr'): + nom = True + break + return nom + + +def get_bed_nomenclature(input): + """ + Helper function to get the nomenclature of the bedfile + Returns True if the chr prefix is present, and False if it is absent + """ + nom = False + with open(input.bed, 'r') 
as f: + for line in f: + if line.startswith('browser') or line.startswith('track'): + continue + if f.startswith('chr'): + nom = True + break + return nom + + +# vim: ft=python diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile index d6bc9d0f6..c1438675d 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -5,7 +5,8 @@ import gzip import yaml import importlib import tempfile -import pandas +from tempfile import TemporaryDirectory +import pandas as pd from snakemake.utils import makedirs from lib.imports import resolve_name from lib import utils @@ -61,6 +62,29 @@ rule unzip: '{references_dir}/logs/{organism}/{tag}/{_type}/{organism}_{tag}.{_ext}.log' shell: 'gunzip -c {input} > {output}' +rule bwa_index: + """ + Build bwa index + """ + input: + '{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta' + output: + protected(aligners.bwa_index_from_prefix('{references_dir}/{organism}/{tag}/genome/bwa/{organism}_{tag}')) + log: + '{references_dir}/logs/{organism}/{tag}/genome/bwa/{organism}_{tag}.log' + resources: + runtime=autobump(hours=3), + mem_mb=gb(24), + disk_mb=gb(24) + run: + prefix=aligners.bwa_prefix_from_index(output) + print(prefix) + shell( + 'bwa index ' + '-p {prefix} ' + '-a bwtsw ' + '{input} ' + '&> {log}') rule bowtie2_index: """ @@ -353,7 +377,7 @@ rule mappings: d['__featuretype__'] = ft res.append(d) - df = pandas.DataFrame(res) + df = pd.DataFrame(res) # Depending on how many attributes there were and the # include_featuretypes settings, this may take a while. @@ -364,4 +388,161 @@ rule mappings: # Restore original setting gffutils.constants.always_return_list = orig_setting + +checkpoint genome_index: + """ + Build fasta index. 
GATK uses this file for rapid fasta accession as well as + the fasta index is used to identify chromosomes and contigs in the genome + to download the appropriate known variation file + """ + input: + '{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta' + output: + protected('{references_dir}/{organism}/{tag}/genome/faidx/{organism}_{tag}.fai') + log: + '{references_dir}/logs/{organism}/{tag}/genome/faidx/{organism}_{tag}.fai.log' + resources: + runtime=hours(1), + mem_mb=gb(4) + run: + shell( + 'samtools ' + 'faidx ' + '-o {output} {input} ' + '&> {log}' + ) + + +rule known_variation: + """ + Download all the chromosomes on the ensembl ftp site and combine them to + create a known variation vcf + """ + input: + #fai='{references_dir}/{organism}/{tag}/genome/faidx/{organism}_{tag}.fai' + # can't do it this way since the {tag} wildcard is not congruous between input and output + fai = lambda w: checkpoints.genome_index.get(**w).output[0] + + output: + protected('{references_dir}/{organism}/{tag}/known/{organism}_{tag}.vcf.gz') + log: + '{references_dir}/{organism}/{tag}/known/{organism}_{tag}.known.log' + resources: + runtime=hours(4), + mem_mb=gb(16), + disk_mb=gb(32) + run: + # Get the configuration options in the metadata chunk using wildcards + release = int(config['references'][wildcards.organism][wildcards.tag]['metadata']['release']) + species = config['references'][wildcards.organism][wildcards.tag]['metadata']['species'] + build = config['references'][wildcards.organism][wildcards.tag]['metadata']['build'] + typ = config['references'][wildcards.organism][wildcards.tag]['known']['type'] + def get_contigs(): + with open(input[0], 'r') as fai: + ser = pd.read_table(fai, header=None, usecols=[0], dtype=str) + ser = ser.squeeze() + ser = ser[ser.apply(lambda x: len(x)) <= 2] + return ser + contigs = get_contigs() + branch="" + if release >= 81 and build == "GRCh37": + branch="grch37/" + if typ == "all": + if species == "homo_sapiens" and release >= 
93: + suffixes = [ + "-chr{}".format(chrom) for chrom in contigs + ] + else: + suffixes = [""] + elif typ == "somatic": + suffixes=["_somatic"] + elif typ == "structural_variations": + suffixes=["_structural_variations"] + species = species.lower() + release = int(release) + build = build + typ = typ + species_filename=species if release >= 91 else species.capitalize() + urls=[ + "ftp://ftp.ensembl.org/pub/{branch}release-{release}/variation/vcf/{species}/{species_filename}{suffix}.{ext}".format( + release=release, + species=species, + suffix=suffix, + species_filename=species_filename, + branch=branch, + ext=ext, + ) + for suffix in suffixes + for ext in ["vcf.gz", "vcf.gz.csi"] + ] + names=[os.path.basename(url) for url in urls if url.endswith(".gz")] + gather = "curl {urls}".format(urls=" ".join(map("-O {}".format, urls))) + with tempfile.TemporaryDirectory() as tmpdir: + if input.get("fai"): + shell( + "(cd {tmpdir}; {gather} && " + "bcftools concat -Oz --naive-force {names} > concat.vcf.gz && " + "bcftools reheader --fai {input.fai} concat.vcf.gz " + "> {output}) && " + "tabix -p vcf {output} 2> {log} " + ) + +if config['references']['human'].get('variation'): + rule dbnsfp: + """ + Download and process dbNSFP database. This involves downloading and + extracting the zip file, then combining the chromosomes to create + a single file. For genome builds like hg19 and GRCh37, some processing + needs to be done to make them compatible with dbNSFP version > 3.X + dbNSFP is only for human genomes. 
+ """ + output: + protected( + '{{references_dir}}/{{organism}}/{{tag}}/{dbnsfp_version}_{build}/{{organism}}_{{tag}}.vcf.gz'.format( + dbnsfp_version=config['references']['human']['variation']['dbnsfp']['version'], + build=config['references']['human']['variation']['dbnsfp']['build'] + ) + ) + log: + '{{references_dir}}/{{organism}}/{{tag}}/{dbnsfp_version}_{build}/{{organism}}_{{tag}}.log'.format( + dbnsfp_version=config['references']['human']['variation']['dbnsfp']['version'], + build=config['references']['human']['variation']['dbnsfp']['build'] + ) + resources: + disk_mb=gb(500), + mem_mb=gb(500), + runtime=hours(8) + threads: 16 + run: + version = config['references']['human'][wildcards.tag]['dbnsfp']['version'] + URL = config['references']['human'][wildcards.tag]['dbnsfp']['url'] + build = config['references']['human'][wildcards.tag]['dbnsfp']['build'] + workdir = wildcards.references_dir + if build == 'GRCh37': + # We need to process the dbNSFP file to make it compatible with older genomes + with tempfile.TemporaryDirectory() as tmpdir: + shell( + '''(cd {tmpdir}; wget -O- {URL} > dbnsfp.zip && ''' + '''unzip dbnsfp.zip && zcat dbNSFP*_variant.chr1* | awk "NR<=1" > h && ''' + '''zgrep -v "^#" dbNSFP*_variant.chr* > all_chrs && ''' + '''awk '$8 != "." 
' all_chrs > all_chrs_filtered && ''' + '''sort -S 50% --parallel=12 all_chrs_filtered -k8,8 -k9,9n > all_chrs_filtered_sorted && ''' + '''cat h all_chrs_filtered_sorted > all_chrs_filtered_sorted_header && ''' + '''bgzip -c all_chrs_filtered_sorted_header > {output}) && ''' + '''tabix -s 1 -b 2 -e 2 {output} ''' + ) + if build == 'GRCh38': + with tempfile.TemporaryDirectory() as tmpdir: + # No need for processing and we can use the first 2 columns for coordinates + shell( + '''(cd {tmpdir}; wget -O- {URL} > dbnsfp.zip && ''' + '''unzip dbnsfp.zip && zcat dbNSFP*_variant.chr1* | awk "NR<=1" > h && ''' + '''zgrep -v "^#" dbNSFP*_variant.chr* > all_chrs && ''' + '''sort -S 50% --parallel=24 all_chrs -k1,1 -k2,2n > all_chrs_sorted && ''' + '''cat h all_chrs_sorted > all_chrs_sorted_header && ''' + '''bgzip -c all_chrs_sorted_header > {output}) && ''' + '''tabix -s 1 -b 2 -e 2 {output} ''' + ) + + # vim: ft=python diff --git a/workflows/references/config/config.yaml b/workflows/references/config/config.yaml index 49618dcd0..5f8f2c664 100644 --- a/workflows/references/config/config.yaml +++ b/workflows/references/config/config.yaml @@ -1,6 +1,8 @@ -references_dir: 'references_dir' +references_dir: '/data/NICHD-core0/references' + # See the reference config files in the top level of the repo, # include/reference_configs, for inspiration for more species. 
include_references: - - '../../include/reference_configs/test.yaml' + # - '../../include/reference_configs/test.yaml' + - '../../include/reference_configs/variant-calling.yaml' diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile new file mode 100644 index 000000000..2496fcfe7 --- /dev/null +++ b/workflows/variant-calling/Snakefile @@ -0,0 +1,911 @@ +import sys +sys.path.insert(0, srcdir('.')) +import pandas as pd +import tempfile +import os +from os import path +import re +from tempfile import TemporaryDirectory,NamedTemporaryFile +from snakemake.shell import shell +import yaml +from textwrap import dedent +from pathlib import Path +from urllib.request import urlretrieve +from zipfile import ZipFile +sys.path.append('../..') +from lib import common, utils, helpers, aligners +from lib.utils import autobump, gb, hours + +configfile: "config/config.yaml" + +include: '../../lib/helpers.smk' + +aln_index, dbnsfp, dictionary, indexed, known_sites, reference = preflight() + +wildcard_constraints: + vartype="snvs|indels", + sample="|".join(samples.index), + unit="|".join(units["unit"]), + comp="|".join(config['mutect2'].keys()) + + +rule all: + input: + "results/annotated/ann.vcf.gz", + "results/qc/multiqc.html", + "results/filtered/all.normed.vcf.gz", + expand("results/somatic_filtered/normed.{comp}.vcf.gz", comp = config['mutect2'].keys()), + expand("results/mutect2_annotated/snpeff.{comp}.vcf.gz", comp = config['mutect2'].keys()), + + + +checkpoint genome_index: + threads: 2 + resources: + mem_mb=gb(4), + runtime=autobump(60) + input: + reference + output: + indexed + log: + 'logs/fasta_index.log' + shell: + "samtools " + "faidx " + "{input} > {output} 2> {log} " + + +rule fasta_dict: + threads: 2 + resources: + mem_mb=gb(4), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + disk_mb=gb(4), + runtime=autobump(60) + input: + ref=reference + output: dictionary + log: "logs/sequence_dictionary.log" + run: + java_opts = set_java_opts(resources) 
+ shell( + 'picard CreateSequenceDictionary {java_opts} \ + -R {input.ref} \ + -O {output} &> {log} ' + ) + + +if not aln_index: + rule bwa_index: + """ + Generate BWA index for the reference genome if we are not using lcdb-wf references workflow + """ + threads: 8 + resources: + disk_mb=gb(24), + mem_mb=gb(24), + runtime=autobump(180) + input: + reference + output: + multiext(reference, ".amb", ".ann", ".bwt", ".pac", ".sa") + log: + "logs/bwa_index.log" + params: + "bwtsw " + shell: + "bwa index -a {params}" + "{input} " + " &> {log}" + + +rule trim_adapters: + threads: 8 + resources: + mem_mb=gb(32), + runtime=autobump(360) + input: unpack(get_fastq), + output: + r1="results/trimmed/{sample}-{unit}.1.fastq.gz", + r2="results/trimmed/{sample}-{unit}.2.fastq.gz", + log: + "logs/{sample}-{unit}_trimming.log" + shell: + 'cutadapt ' + '-o {output.r1} ' + '-p {output.r2} ' + '-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT ' + '--nextseq-trim 20 ' + '--overlap 6 ' + '-j {threads} ' + '--minimum-length 25 ' + '{input.r1} ' + '{input.r2} ' + ' &> {log} ' + + +rule map_reads: + threads: 32 + resources: + disk_mb=gb(40), + mem_mb=gb(48), + runtime=autobump(1920) + input: + reads=["results/trimmed/{sample}-{unit}.1.fastq.gz","results/trimmed/{sample}-{unit}.2.fastq.gz"], + idx=multiext(reference, ".amb", ".ann", ".bwt", ".pac", ".sa"), + output: + bam=temp("results/mapped/{sample}-{unit}.sorted.bam"), + params: + extra=get_read_group, + index=lambda w, input: os.path.splitext(input.idx[0])[0], + log: + "logs/{sample}-{unit}_bwamem.log" + shell: + "bwa mem " + "-t {threads} " + "{params.extra} " + "{params.index} " + "{input.reads} | " + "samtools view -bh | samtools sort -o {output} -O BAM " + "2> {log}" + + +rule mark_duplicates: + """ + If we run bqsr, then we do not need to save the output of mark duplicates, since those bams will + be recalibrated. 
However, if we don't recalibrate, then we need to save the bams from mark duplicates + and we don't want to mark them as temporary + """ + threads: 4 + resources: + disk_mb=gb(40), + mem_mb=gb(32), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(720) + input: + bam = "results/mapped/{sample}-{unit}.sorted.bam", + output: + metrics="results/qc/picard/markdups/{sample}-{unit}_marked_dup_metrics.txt", + bam=( + temp("results/dedup/{sample}-{unit}.bam") if config['filtering']['bqsr'] + else "results/dedup/{sample}-{unit}.bam" + ) + + log: + "logs/{sample}-{unit}_mark_dup.log" + params: + rm = ('-REMOVE_DUPLICATES true ' + if config['processing']['remove-duplicates'] + else '') + run: + java_opts = set_java_opts(resources) + shell( + 'picard MarkDuplicates ' + '{java_opts} ' + '-INPUT {input.bam} ' + '-OUTPUT {output.bam} ' + '-ASSUME_SORT_ORDER coordinate ' + '{params.rm} ' + '-METRICS_FILE {output.metrics} ' + ' 2> {log} ' + ) + + +if config["filtering"]['bqsr']: + rule base_recalibrator: + threads: 4 + resources: + mem_mb=gb(8), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + disk_mb=gb(40), + runtime=autobump(960) + input: + bam=get_recal_input(bai=False), + bai=get_recal_input(bai=True), + ref=reference, + dict=dictionary, + known=known_sites, + known_idx=known_sites + '.tbi' + output: + recal_table="results/recal/{sample}-{unit}.grp" + log: + "logs/{sample}-{unit}_base_recalibrator.log" + run: + java_opts = set_java_opts(resources) + shell( + 'gatk --java-options {java_opts} BaseRecalibrator \ + -R {input.ref} \ + -I {input.bam} \ + -O {output.recal_table} \ + --known-sites {input.known} 2> {log}' + ) + + + rule apply_bqsr: + threads: 8 + resources: + mem_mb=gb(32), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + disk_mb=gb(40), + runtime=autobump(960) + input: + bam=get_recal_input(bai=False), + bai=get_recal_input(bai=True), + ref=reference, + dict=dictionary, + recal_table="results/recal/{sample}-{unit}.grp", + output: + 
bam=protected("results/recal/{sample}-{unit}.bam") + log: + "logs/{sample}-{unit}_apply_bsqr.log" + run: + java_opts = set_java_opts(resources) + shell( + 'gatk --java-options {java_opts} ApplyBQSR \ + -R {input.ref} \ + -I {input.bam} \ + --bqsr-recal-file {input.recal_table} \ + -O {output.bam} 2> {log}' + ) + + +rule build_bam_index: + resources: + mem_mb=gb(2), + disk_mb=gb(2), + runtime=autobump(30) + input: + bam ="{prefix}.bam" + output: + "{prefix}.bam.bai" + run: + basename = os.path.basename(input.bam) + log = 'logs/' + os.path.splitext(basename)[0] + '_buildbamindex.log' + shell("samtools index {input.bam} > {output} 2> {log}") + + +if config["processing"]["restrict-regions"]: + rule compose_regions: + """ + This command will ONLY work if the chromosome nomeclature matches the format in the reference genome + That is, chromosomes DO NOT have the 'chr' prefix. + + .bed files are formatted like so: + + Some bed files have header lines that can start with the word 'browser' or 'track' per UCSC + + To check this, we will read the lines of the .bed file and compare them to what is in the fasta index. + If we encounter any mismatches, then we exit with division by zero error and a print + statement that explains the bed file needs to be edited to make the nomenclature match. + + The awk command in the shell statement prints the entire lines of the input bed + into distinct files that are each named by the first column (chromosome) + Basically, we are splitting the provided .bed file up into contigs. + """ + resources: + disk_mb=1024, + mem_mb=1024, + runtime=20 + input: + bed = config["processing"]["restrict-regions"], + output: + "results/called/{contig}.regions.bed" + log: + "logs/{contig}_compose_regions.log" + run: + # Check for nomenclature mismatch using helper function in helpers.smk + chr_fai = get_fai_nomenclature + chr_bed = get_bed_nomenclature + if chr_fai != chr_bed: + raise ValueError("Nomenclature mismatch detected. 
Please review the fasta index file and the .bed files being used. The chromosome format MUST match between the .bed file and the reference. Please edit the bed file. For GRCh38 genomes, there should be no 'chr' prefix.") + shell(''' awk '$1 == "{wildcards.contig}" {{print $0 >> (t "/" $1 ".regions.bed" )}}' t=results/called {input} ''') + + +rule call_variants: + input: + bam=get_sample_bams, + ref=reference, + dict=dictionary, + known=known_sites, + tbi=( + known_sites + '.tbi' if known_sites else [] + ), + regions=( + "results/called/{contig}.regions.bed" + if config["processing"]["restrict-regions"] + else [] + ), + output: + gvcf=protected("results/called/{sample}.{contig}.g.vcf.gz"), + log: "logs/{sample}_{contig}_call_variants.log" + resources: + disk_mb=gb(16), + mem_mb=gb(40), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(hours=8) + threads: 8 + params: + extra=get_call_variants_params, + pcr='--pcr-indel-model ' + config['processing']['pcr'] + run: + java_opts = set_java_opts(resources) + known = input.known + if known: + known = "--dbsnp " + str(known) + regions = params.extra + bams = input.bam + if isinstance(bams, str): + bams = [bams] + bams = list(map("-I {}".format, bams)) + shell( + 'gatk --java-options {java_opts} HaplotypeCaller {regions} \ + -R {input.ref} \ + {bams} \ + -ERC GVCF \ + {params.pcr} \ + -O {output.gvcf} {known} 2> {log}' + ) + + +rule combine_calls: + threads: 4 + resources: + disk_mb=gb(10), + mem_mb=gb(4), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(720) + input: + ref=reference, + gvcfs=expand("results/called/{sample}.{{contig}}.g.vcf.gz", sample=samples.index), + output: + gvcf="results/called/all.{contig}.g.vcf.gz", + log: "logs/{contig}_combine_calls.log" + run: + java_opts = set_java_opts(resources) + gvcfs=list(map("-V {}".format, input.gvcfs)) + shell( + 'gatk --java-options {java_opts} CombineGVCFs \ + {gvcfs} \ + -R {input.ref} \ + -O {output.gvcf} 2> {log} ' + ) + + +rule 
genotype_variants: + threads: 4 + resources: + disk_mb=gb(10), + mem_mb=gb(8), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(480) + input: + ref=reference, + idx="results/called/all.{contig}.g.vcf.gz.tbi", + gvcf="results/called/all.{contig}.g.vcf.gz", + output: + vcf=temp("results/genotyped/all.{contig}.vcf.gz"), + log: + "logs/genotypegvcfs.{contig}.log", + run: + java_opts = set_java_opts(resources) + shell( + 'gatk --java-options {java_opts} GenotypeGVCFs ' + '-V {input.gvcf} ' + '-R {input.ref} ' + '-O {output.vcf} 2> {log}' + ) + + +rule merge_variants: + threads: 4 + resources: + disk_mb=gb(10), + mem_mb=gb(8), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(480) + input: + vcfs=lambda w: expand( + "results/genotyped/all.{contig}.vcf.gz", contig=get_contigs() + ), + output: + vcf="results/genotyped/all.vcf.gz", + log: + "logs/merge-genotyped.log", + run: + inputs = " ".join("-INPUT {}".format(f) for f in input.vcfs) + java_opts = set_java_opts(resources) + shell( + 'picard' + ' MergeVcfs' + ' {java_opts}' + ' {inputs}' + ' -OUTPUT {output}' + ' 2> {log}' + ) + + +rule tabix_variants: + threads: 2 + resources: + disk_mb = gb(2), + mem_mb = gb(2), + runtime=autobump(30) + input: + vcf="{prefix}.vcf.gz", + output: + "{prefix}.vcf.gz.tbi", + run: + basename = os.path.basename(input.vcf) + log = 'logs/' + os.path.splitext(basename)[0] + '_tabix.log' + shell("tabix -p vcf {input.vcf} 2> {log} ") + + +rule select_calls: + threads: 4 + resources: + disk_mb=gb(10), + mem_mb=gb(4), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(480) + input: + ref=reference, + vcf="results/genotyped/all.vcf.gz", + output: + vcf=temp("results/filtered/all.{vartype}.vcf.gz"), + log: + "logs/selectvariants_{vartype}.log", + run: + java_opts = set_java_opts(resources) + vartype_arg="--select-type-to-include {}".format( + "SNP" if wildcards.vartype == "snvs" else "INDEL" + ), + shell( + 'gatk --java-options {java_opts} SelectVariants ' + '-R 
{input.ref} ' + '-V {input.vcf} ' + '{vartype_arg} ' + '-O {output.vcf} 2> {log}' + ) + + +rule hard_filter_calls: + threads: 4 + resources: + disk_mb=gb(10), + mem_mb=gb(4), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(480) + input: + ref=reference, + vcf="results/filtered/all.{vartype}.vcf.gz", + output: + vcf=temp("results/filtered/all.{vartype}.hardfiltered.vcf.gz"), + log: + "logs/variantfiltration_{vartype}.log", + run: + java_opts = set_java_opts(resources) + filter_arg = {'snv_hard_filer' : config['filtering']['hard'][wildcards.vartype]} + filters = [ + "--filter-name {} --filter-expression '{}'".format(name, expr.replace("'", "\\'")) + for name, expr in filter_arg.items() + ] + shell( + 'gatk --java-options {java_opts} VariantFiltration ' + '-R {input.ref} ' + '-V {input.vcf} ' + '{filters} ' + '-O {output.vcf} 2> {log}' + ) + + +rule merge_calls: + threads: 2 + resources: + disk_mb=gb(10), + mem_mb=gb(8), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(480) + input: + vcfs=expand( + "results/filtered/all.{vartype}.{filtertype}.vcf.gz", + vartype=["snvs", "indels"], filtertype='hardfiltered', + ), + output: + vcf=temp("results/filtered/all.final.vcf.gz"), + log: + "logs/merge-filtered.log", + run: + inputs = " ".join("-INPUT {}".format(f) for f in input.vcfs) + java_opts = set_java_opts(resources) + shell( + 'picard ' + ' MergeVcfs' + ' {java_opts}' + ' {inputs}' + ' -OUTPUT {output}' + ' 2> {log}' + ) + + +rule norm: + """ + Split multiallielic variants into multiple biallelic ones. 
+ """ + resources: + mem_mb = gb(16), + runtime = autobump(120) + input: + ref=reference, + vcf="results/filtered/all.final.vcf.gz" + output: + "results/filtered/all.normed.vcf.gz" + log: + "logs/norm-vcf.log" + shell: + "bcftools norm -f {input.ref} " + "-m- " + "{input.vcf} " + "--output-type z " + "--output {output} 2> {log}" + + +rule fastqc: + resources: + mem_mb=gb(12), + runtime=autobump(120), + threads: 8 + input: + unpack(get_fastq), + output: + html="results/qc/fastqc/data/{sample}-{unit}_fastqc.html", + zip="results/qc/fastqc/zip/{sample}-{unit}_fastqc.zip" + log: + "logs/{sample}-{unit}_fastqc.log" + run: + def base_file(file_path): + baseName = Path(path.basename(file_path)) + while baseName.suffix in {'.gz','.bz2','.txt','.fastq','.fq','.sam','.bam'}: + baseName = baseName.with_suffix('') + return str(baseName) + with TemporaryDirectory() as tempdir: + shell( + "fastqc " + "--threads {threads} " + "--noextract " + "--quiet " + "--outdir {tempdir:q} " + "{input:q} " + "&> {log} " + ) + output_base = base_file(input[0]) + html_path = path.join(tempdir, output_base + "_fastqc.html") + zip_path = path.join(tempdir, output_base + "_fastqc.zip") + if output.html != html_path: + shell("mv {html_path:q} {output.html:q}") + if output.zip != zip_path: + shell("mv {zip_path:q} {output.zip:q}") + + +rule samtools_stats: + """ + Run samtools stats + """ + resources: + mem_mb = gb(16), + runtime = autobump(120) + input: + get_sample_unit_bams + output: + "results/qc/samtools-stats/{sample}-{unit}.txt" + log: + "logs/samtools-stats_{sample}-{unit}.log" + shell: + "samtools stats {input} 1> {output} 2> {log} " + + +snpeff_input_for_multiqc = [] +if config['snpeff']['germline']: + snpeff_input_for_multiqc.append('results/qc/snpEff_summary.csv') +if config['snpeff']['somatic']: + soms = expand('results/qc/snpEff_{comp}_summary.csv', comp = config['mutect2'].keys()) + snpeff_input_for_multiqc.extend(soms) + + + +rule multiqc: + """ + Gather qc metrics and run MultiQC + 
Get the html output from somatic and germline VCF annotation if specified in the config. + """ + resources: + mem_mb=gb(4), + runtime=autobump(60) + input: + fastqc=expand("results/qc/fastqc/zip/{u.sample}-{u.unit}_fastqc.zip", u=units.itertuples()), + markdup=expand("results/qc/picard/markdups/{u.sample}-{u.unit}_marked_dup_metrics.txt", u=units.itertuples()), + samstats=expand("results/qc/samtools-stats/{u.sample}-{u.unit}.txt", u=units.itertuples()), + snpeff=snpeff_input_for_multiqc + output: + "results/qc/multiqc.html", + params: + dirname="results/qc/", + name="multiqc.html", + log: + "logs/multiqc.log", + run: + input_dirs=params.dirname + shell( + "multiqc " + "--force " + "-o {params.dirname} " + "-n {params.name} " + "{input_dirs} " + " &> {log} " + ) + + +rule snpeff: + """ + Annotate variants with SnpEff + """ + resources: + disk_mb=gb(20), + mem_mb=gb(16), + # mem_mb = gb(5.5) # [ TEST SETTINGS -1 ] + runtime=autobump(120) + input: + vcf='results/filtered/all.normed.vcf.gz', + log: 'logs/snpeff.log' + output: + ann='results/annotated/ann.vcf.gz', + stats='results/qc/snpEff_summary.csv', + html='results/qc/snpEff_summary.html' + params: + annotations = config['snpeff']['annotations'], + gen = config['snpeff']['genome'] + run: + java_opts = '''"-Xmx{}g"'''.format(int(resources.mem_mb * 0.75 /1024)) + shell( + "snpEff {java_opts} " + "-o vcf " + "-csvStats {output.stats} " + "-stats {output.html} " + "{params.gen} {input.vcf} " + "| bcftools view -Oz > {output.ann} 2> {log} " + ) + dbnsfp_arg = [] + if dbnsfp: + dbnsfp_arg = "DbNsfp -db {}".format(dbnsfp) + if dbnsfp_arg: + sift_output = 'results/annotated/dbnsfp.ann.vcf.gz' + field_arg = ( + "-f '{}'".format(params.annotations) + if params.annotations + else '' + ) + + shell( + "SnpSift {java_opts} " + "{dbnsfp_arg} " + "{field_arg} {output.ann} " + "| bcftools view -Oz > {sift_output} 2>> {log} " + ) + + +rule mutect2: + """ + Use Mutect2 to call variants on individual samples, one per contig + """ + 
resources: + disk_mb = gb(40), + mem_mb = gb(32), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + runtime = autobump(720) + input: + unpack(input_for_somatic) + output: + vcf="results/mutect2_called/raw.{comp}.{contig}.vcf.gz", + stats='results/mutect2_called/raw.{comp}.{contig}.vcf.gz.stats', + orientation='results/lrom/{contig}_{comp}.tar.gz' + log: + "logs/{comp}_{contig}_mutect2_call_variants.log" + params: + pon = ( + '--panel-of-normals ' + config['mutect2']['PON'] + if config['PON'] + else [] + ), + extra=get_call_variants_params, + pcr='--pcr-indel-model ' + config['processing']['pcr'] + run: + java_opts = set_java_opts(resources) + normals = " ".join("-I {} ".format(n) for n in input.normals) + tumors = " ".join("-I {} ".format(t) for t in input.tumors) + names = names_for_somatic(wildcards) + formatted_names = " ".join('-normal {} '.format(name) for name in names) + shell( + "gatk Mutect2 " + "--java-options {java_opts} " + "-R {input.ref} " + "{normals} " + "{tumors} " + "{params.extra} " + "{formatted_names} " + "{params.pcr} " + "--f1r2-tar-gz {output.orientation} " + "{params.pon} " + "-O {output.vcf} 2> {log}" + ) + + +rule lrom: + """ + Run LearnReadOrientationModel to get the maximum likelihood estimates of artifact prior probabilities + in the orientation bias mixture model filter + """ + resources: + disk_mb = gb(20), + mem_mb = gb(32), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + runtime = autobump(120) + input: + orientation=lambda w: expand('results/lrom/{contig}_{{comp}}.tar.gz', contig=get_contigs()) + output: + lrom='results/lrom/artifact-prior-{comp}.tar.gz' + log: + 'logs/lrom_{comp}.log' + run: + java_opts = set_java_opts(resources) + def get_format_lrom(): + names= ['-I {} '.format(i) for i in input.orientation] + names = ' '.join(names) + return names + lrom_names = get_format_lrom() + + shell( + 'gatk --java-options {java_opts} LearnReadOrientationModel {lrom_names} ' + '-O {output.lrom} &> {log}' + ) + + +rule merge_mutect2_variants: + 
""" + After individual contigs are called via mutect2, we merge them together here. + """ + resources: + disk_mb = gb(20), + mem_mb = gb(32), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + runtime = autobump(120) + input: + vcfs=lambda w: expand( + "results/mutect2_called/raw.{{comp}}.{contig}.vcf.gz", contig=get_contigs() + ), + output: + temp("results/somatic/merged.{comp}.vcf.gz") + log: + "logs/merge_mutect2.{comp}.log", + run: + inputs = " ".join("-INPUT {}".format(f) for f in input.vcfs) + java_opts = set_java_opts(resources) + shell( + 'picard' + ' MergeVcfs' + ' {java_opts}' + ' {inputs}' + ' -OUTPUT {output}' + ' &> {log}' + ) + + +rule merge_mutect2_stats: + """ + Just like merging VCFs for Mutect2, we also need to merge stats for filtering. + """ + resources: + disk_mb = gb(20), + mem_mb = gb(16), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + runtime = autobump(120) + input: + stats=lambda w: expand( + "results/mutect2_called/raw.{{comp}}.{contig}.vcf.gz.stats", contig=get_contigs() + ), + output: + temp("results/somatic/merged.{comp}.vcf.gz.stats") + log: + "logs/merge_mutect2_stats.{comp}.log" + run: + java_opts = set_java_opts(resources) + inputs = " ".join(" -stats {} ".format(f) for f in input.stats) + shell( + "gatk MergeMutectStats " + "--java-options {java_opts} " + "{inputs} " + "-O {output} " + "&> {log}" + ) + + +rule filter_mutect2_calls: + """ + New versions of Mutect2 have optimized defaults for filtering; we can just use those. 
+ """ + resources: + disk_mb = gb(20), + mem_mb = gb(16), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + runtime = autobump(120) + input: + ref=reference, + unfiltered="results/somatic/merged.{comp}.vcf.gz", + stats="results/somatic/merged.{comp}.vcf.gz.stats", + lrom='results/lrom/artifact-prior-{comp}.tar.gz' + output: + "results/somatic_filtered/filtered.{comp}.vcf.gz" + log: + "logs/{comp}.vcf.gz.log" + run: + java_opts = set_java_opts(resources) + shell( + "gatk FilterMutectCalls " + "--java-options {java_opts} " + "-stats {input.stats} " + "--orientation-bias-artifact-priors {input.lrom} " + "-R {input.ref} " + "-V {input.unfiltered} " + "-O {output}" + ) + + +rule mutect2_norm: + """ + Split multiallielic variants into multiple biallelic ones. + """ + resources: + mem_mb = gb(16), + runtime = autobump(120) + input: + ref=reference, + vcf="results/somatic_filtered/filtered.{comp}.vcf.gz" + output: + "results/somatic_filtered/normed.{comp}.vcf.gz" + log: + "logs/norm-{comp}-vcf.log" + shell: + "bcftools norm -f {input.ref} " + "-m- " + "{input.vcf} " + "--output-type z " + "--output {output} 2> {log}" + + +rule snpeff_cancer: + """ + Annotate somatic variants with SnpEff Cancer + """ + resources: + disk_mb=gb(20), + mem_mb=gb(16), + # mem_mb = gb(5.5) # [ TEST SETTINGS -1 ] + runtime=autobump(120) + input: + vcf='results/somatic_filtered/normed.{comp}.vcf.gz', + output: + vcf='results/mutect2_annotated/snpeff.{comp}.vcf.gz', + stats='results/qc/snpEff_{comp}_summary.csv', + html='results/qc/snpEff_{comp}.html' + log: + 'logs/cancer_snpeff_{comp}.log' + params: + snpeff_genome = config['snpeff']['genome'] + run: + java_opts = '''"-Xmx{}g"'''.format(int(resources.mem_mb * 0.75 /1024)) + shell( + 'snpEff {java_opts} -v -o vcf -cancer -csvStats {output.stats} \ + -stats {output.html} {params.snpeff_genome} {input.vcf} | bcftools view -Oz > {output.vcf}' + ) + + +# vim: ft=python diff --git a/workflows/variant-calling/config/config.yaml 
b/workflows/variant-calling/config/config.yaml new file mode 100644 index 000000000..4b3e71bd2 --- /dev/null +++ b/workflows/variant-calling/config/config.yaml @@ -0,0 +1,74 @@ +samples: config/samples.tsv +units: config/units.tsv +ref: + # Set to true only if you want your references to come from lcdb-wf references + use_references_workflow: false + # Match these to the reference config that is included at the bottom of this file + # Only configure this section if use_references_workflow is set to true + organism: 'human' + genome: + tag: 'ensembl-104' + build: 'GRCh38' + aligner: + index: 'bwa' + tag: 'ensembl-104' + faidx: + index: 'faidx' + tag: 'ensembl-104' + variation: + known: 'known' + dbnsfp: 'dbNSFPv4.4' + # If you are providing your own references, include their paths here. + paths: + # When using BWA, you should not use a top level genome assembly for human, see http://lh3.github.io/2017/11/13/which-human-reference-genome-to-use + ref: 'references/GRCh38.6.20.fa.gz' + known: 'references/known_variation_noiupac.vcf.gz' + index: + dbnsfp: 'references/dbnsfp_6_20.vcf.gz' +processing: + remove-duplicates: true + remove-mitochondrial: true + # See https://gatk.broadinstitute.org/hc/en-us/articles/360036465912-HaplotypeCaller#--pcr-indel-model for pcr + pcr: "NONE" + # Point to a bed file, e.g. captured regions + restrict-regions: 'references/exons_subset.bed' + # If regions are restricted, optionally enlarge them by a given value + region-padding: +filtering: + # Set to true in order to apply machine learning based recalibration of + # quality scores instead of hard filtering. 
+ bqsr: true + hard: + # hard filtering as outlined in GATK docs + # (https://gatkforums.broadinstitute.org/gatk/discussion/2806/howto-apply-hard-filters-to-a-call-set) + snvs: + "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" + indels: + "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0" +snpeff: + # MultiQC rule needs these set to collect summary files from the snpeff and snpeff_cancer rule + # Set each to true respectively if you plan on generating annotations for somatic or germline data + somatic: true + germline: true + # Run snpEff databases to see available databases (https://pcingola.github.io/SnpEff/se_commandline/) + # See https://pcingola.github.io/SnpEff/se_build_db/ for docs on building your own database + genome: 'GRCh38.p14' + # Add annotations in the form of a comma-separated string to attach from dbnsfp for snpsift. + # These annotations should be column names in the dbnsfp file. + # Leave this blank if you are not using dbnsfp. + annotations: 'FATHMM_pred,SIFT_pred' +# Supple a panel of normals file if you have one for your genome. It is OK to leave this blank. +# See https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON- for details on the PON file. 
+PON: +mutect2: + # See the docs on how to configure this section + tumor-normal: + tumor: + - 'tumor' + normal: + - 'normal' + + + +include_references: + - '../../include/reference_configs/variant-calling.yaml' diff --git a/workflows/variant-calling/config/samples.tsv b/workflows/variant-calling/config/samples.tsv new file mode 100644 index 000000000..21c6b82e8 --- /dev/null +++ b/workflows/variant-calling/config/samples.tsv @@ -0,0 +1,3 @@ +sample +tumor +normal diff --git a/workflows/variant-calling/config/units.tsv b/workflows/variant-calling/config/units.tsv new file mode 100644 index 000000000..3d6d261d8 --- /dev/null +++ b/workflows/variant-calling/config/units.tsv @@ -0,0 +1,4 @@ +sample unit platform fq1 fq2 +tumor 1 Illumina data/example_data/tumor_R1.fq.gz data/example_data/tumor_R2.fq.gz +tumor 2 Illumina data/example_data/test_unit_R1.fq.gz data/example_data/test_unit_R2.fq.gz +normal 1 Illumina data/example_data/normal_R1.fq.gz data/example_data/normal_R2.fq.gz From a34c9d3f81fe427bdf4a9d1e54b8a33493ea446e Mon Sep 17 00:00:00 2001 From: fridellsa Date: Thu, 1 Jun 2023 18:18:11 -0400 Subject: [PATCH 02/59] initial testing infrastructure for VC --- .circleci/config.yml | 27 ++++++++ test/lcdb-wf-test | 97 +++++++++++++++++++++++++-- workflows/variant-calling/run_test.sh | 3 + 3 files changed, 123 insertions(+), 4 deletions(-) create mode 100755 workflows/variant-calling/run_test.sh diff --git a/.circleci/config.yml b/.circleci/config.yml index e0e743334..667077b22 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -289,6 +289,22 @@ variables: conda activate $LCDBWF_ENV $DEPLOY/test/lcdb-wf-test colocalization --run-workflow -k -r -p -j2 --use-conda --orig $ORIG + + # -------------------------------------------------------------------------- + # REVIEW: not RNA-seq + # Standard RNA-seq workflow + variantcalling-step: &variantcalling-step + run: + name: variantcalling workflow + command: | + cd $DEPLOY + source 
/opt/mambaforge/etc/profile.d/conda.sh + conda activate $LCDBWF_ENV + $DEPLOY/test/lcdb-wf-test variantcalling --run-workflow -n + $DEPLOY/test/lcdb-wf-test variantcalling --run-workflow --use-conda -j2 + + tar -zcf variantcalling.tar.gz workflows/variant-calling/results + # -------------------------------------------------------------------------- # Syntax note: All of the steps above, with their "&step-name" labels, can be # referred to by a corresponding "*step-name" below. The "<<: *defaults" @@ -387,6 +403,17 @@ jobs: destination: gene-patterns.html + variantcalling: + <<: *defaults + steps: + - checkout + - *restore_cache + - *set-path + - *get-data + - *variantcalling-step + - store_artifacts: + path: /tmp/lcdb-wf-test/workflows/variant-calling/variantcalling.tar.gz + rnaseq-misc: <<: *defaults steps: diff --git a/test/lcdb-wf-test b/test/lcdb-wf-test index 7f92cb2b4..f215433d5 100755 --- a/test/lcdb-wf-test +++ b/test/lcdb-wf-test @@ -67,6 +67,8 @@ class Runner(object): %(prog)s rnaseq --downstream %(prog)s chipseq --run-workflow %(prog)s references --run-workflow --configfile=config/config.yaml + %(prog)s variantcalling --run-workflow + DATA ---- @@ -130,7 +132,7 @@ class Runner(object): parser.add_argument( "--kind", default="all", - choices=["all", "rnaseq", "chipseq"], + choices=["all", "rnaseq", "chipseq", "variantcalling"], help="Kind of data to download", ) parser.add_argument( @@ -144,8 +146,22 @@ class Runner(object): args = parser.parse_args(sys.argv[2:]) - repo = "lcdb-test-data" - URL = f"https://github.com/lcdb/{repo}/blob/{args.branch}/data/{{}}?raw=true" + # Create a repo lookup for the different assays + # For variantcalling, the `args.branch` should be "main" instead of "master", unless we can fix this + repo_lookup = { + 'rnaseq': { + 'repo': "lcdb-test-data", + 'URL': f"https://github.com/lcdb/{{repo}}/blob/{args.branch}/data/{{}}?raw=true" + }, + 'chipseq': { + 'repo': "lcdb-test-data", + 'URL': 
f"https://github.com/lcdb/{{repo}}/blob/{args.branch}/data/{{}}?raw=true" + }, + 'variantcalling': { + 'repo': 'lcdb-wf-variant-calling-test-data', + 'URL': f"https://github.com/lcdb/{{repo}}/blob/{args.branch}/data/{{}}?raw=true" + } + } # This dict maps files in the `data` directory of the repo to a local # path to which it should be downloaded. @@ -214,6 +230,41 @@ class Runner(object): "workflows/chipseq/data/example_data/chipseq_ip4.fq.gz", ), ], + "variantcalling": [ + ( + "GRCh38.6.20.fa.gz", + "workflows/variant-calling/references/GRCh38.6.20.fa.gz", + ), + ( + "known_variation_noiupac.vcf.gz", + "workflows/variant-calling/references/known_variation_noiupac.vcf.gz" + + ), + ( + "normal_R1.6.20.fq.gz", + "workflows/variant-calling/data/example_data/normal_R1.fq.gz" + ), + ( + "normal_R2.6.20.fq.gz", + "workflows/variant-calling/data/example_data/normal_R2.fq.gz" + ), + ( + "tumor_R1.6.20.fq.gz", + "workflows/variant-calling/data/example_data/tumor_R1.fq.gz" + ), + ( + "tumor_R2.6.20.fq.gz", + "workflows/variant-calling/data/example_data/tumor_R2.fq.gz" + ), + ( + "dbnsfp_6_20.vcf.gz", + "workflows/variant-calling/references/dbnsfp_6_20.vcf.gz" + ), + ( + "dbnsfp_6_20.vcf.gz.tbi", + "workflows/variant-calling/references/dbnsfp_6_20.vcf.gz.tbi" + ), + ] } if args.kind == "all": @@ -222,7 +273,7 @@ class Runner(object): kinds = [args.kind] for kind in kinds: for fn, dest in data_files[kind]: - url = URL.format(fn) + url = repo_lookup[kind]['URL'].format(fn, repo=repo_lookup[kind]['repo']) if args.verbose: print(f"downloading {url}") if dest is None: @@ -504,6 +555,44 @@ class Runner(object): executable="/bin/bash" ) + def _cmd_variantcalling(self): + """ + This function handles the "variantcalling" subcommand. 
+ """ + + parser = argparse.ArgumentParser( + description="Run variant calling workflow and downstream tests", + parents=[self.global_parser], + ) + parser.add_argument( + "--run-workflow", + action="store_true", + help="""Run variant workflow using run_tesh.sh, which runs preprocess.py + on the snakefile, converting it to a test file to be run.""", + ) + + + workflow_prefix = "bash run_test.sh" + workflow_dir = TOPLEVEL / "workflows/variant-calling" + args, extra = parser.parse_known_args(sys.argv[2:]) + + if args.run_workflow: + print(args) + extra = [i.replace("__ORIG__", args.orig) for i in extra] + strargs = " ".join(extra) + cmd = ( + 'eval "$(conda shell.bash hook)" ' + f"&& conda activate {args.env} " + f"&& (cd {workflow_dir} && {workflow_prefix} {strargs})" + ) + print_header(f"Running the following command:\n{cmd}") + sp.run( + cmd, + check=True, + shell=True, + executable="/bin/bash" + ) + if __name__ == "__main__": Runner() diff --git a/workflows/variant-calling/run_test.sh b/workflows/variant-calling/run_test.sh new file mode 100755 index 000000000..7aacb413c --- /dev/null +++ b/workflows/variant-calling/run_test.sh @@ -0,0 +1,3 @@ +set -e +python -m doctest ../../ci/preprocessor.py +python ../../ci/preprocessor.py Snakefile > Snakefile.test && snakemake -s Snakefile.test "$@" From 3fa096ca2a05ba324d380bb4b5db584d11bd69fe Mon Sep 17 00:00:00 2001 From: fridellsa Date: Thu, 1 Jun 2023 18:35:13 -0400 Subject: [PATCH 03/59] 'typo' --- workflows/variant-calling/Snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile index 2496fcfe7..a2d5b04c6 100644 --- a/workflows/variant-calling/Snakefile +++ b/workflows/variant-calling/Snakefile @@ -652,7 +652,7 @@ rule snpeff: resources: disk_mb=gb(20), mem_mb=gb(16), - # mem_mb = gb(5.5) # [ TEST SETTINGS -1 ] + # mem_mb = gb(5.5), # [ TEST SETTINGS -1 ] runtime=autobump(120) input: 
vcf='results/filtered/all.normed.vcf.gz', @@ -888,7 +888,7 @@ rule snpeff_cancer: resources: disk_mb=gb(20), mem_mb=gb(16), - # mem_mb = gb(5.5) # [ TEST SETTINGS -1 ] + # mem_mb = gb(5.5), # [ TEST SETTINGS -1 ] runtime=autobump(120) input: vcf='results/somatic_filtered/normed.{comp}.vcf.gz', From 3f23aa07b44bcae7f918516207fdff34ccfd939f Mon Sep 17 00:00:00 2001 From: fridellsa Date: Thu, 1 Jun 2023 18:36:40 -0400 Subject: [PATCH 04/59] 'typo in test setting' --- workflows/variant-calling/Snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile index a2d5b04c6..e608dc7a8 100644 --- a/workflows/variant-calling/Snakefile +++ b/workflows/variant-calling/Snakefile @@ -652,7 +652,7 @@ rule snpeff: resources: disk_mb=gb(20), mem_mb=gb(16), - # mem_mb = gb(5.5), # [ TEST SETTINGS -1 ] + # mem_mb = gb(5), # [ TEST SETTINGS -1 ] runtime=autobump(120) input: vcf='results/filtered/all.normed.vcf.gz', @@ -888,7 +888,7 @@ rule snpeff_cancer: resources: disk_mb=gb(20), mem_mb=gb(16), - # mem_mb = gb(5.5), # [ TEST SETTINGS -1 ] + # mem_mb = gb(5), # [ TEST SETTINGS -1 ] runtime=autobump(120) input: vcf='results/somatic_filtered/normed.{comp}.vcf.gz', From 880a8a084005d41aa13d4bfc36acf01b87899ffb Mon Sep 17 00:00:00 2001 From: fridellsa Date: Thu, 1 Jun 2023 20:07:35 -0400 Subject: [PATCH 05/59] removed bed from config --- workflows/variant-calling/config/config.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/variant-calling/config/config.yaml b/workflows/variant-calling/config/config.yaml index 4b3e71bd2..50505c627 100644 --- a/workflows/variant-calling/config/config.yaml +++ b/workflows/variant-calling/config/config.yaml @@ -31,7 +31,8 @@ processing: # See https://gatk.broadinstitute.org/hc/en-us/articles/360036465912-HaplotypeCaller#--pcr-indel-model for pcr pcr: "NONE" # Point to a bed file, e.g. 
captured regions - restrict-regions: 'references/exons_subset.bed' + restrict-regions: + #'references/exons_subset.bed' # If regions are restricted, optionally enlarge them by a given value region-padding: filtering: From e0793b505905f6713c3b1d1be0d94924ec4191f6 Mon Sep 17 00:00:00 2001 From: fridellsa Date: Thu, 1 Jun 2023 20:25:16 -0400 Subject: [PATCH 06/59] fixed typo in helpers checking chr nomenclature --- lib/helpers.smk | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/helpers.smk b/lib/helpers.smk index 63e717932..4188b0654 100644 --- a/lib/helpers.smk +++ b/lib/helpers.smk @@ -324,9 +324,11 @@ def get_bed_nomenclature(input): nom = False with open(input.bed, 'r') as f: for line in f: - if line.startswith('browser') or line.startswith('track'): + if line.startswith('browser'): continue - if f.startswith('chr'): + if line.startswith('track'): + continue + if line.startswith('chr'): nom = True break return nom From 5bb3726ec16802dc588cd4d8d0221c66c316f404 Mon Sep 17 00:00:00 2001 From: fridellsa Date: Thu, 1 Jun 2023 20:29:00 -0400 Subject: [PATCH 07/59] commenting out dbnsfp rule --- workflows/references/Snakefile | 114 ++++++++++++++++----------------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile index c1438675d..35583251c 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -487,62 +487,62 @@ rule known_variation: "tabix -p vcf {output} 2> {log} " ) -if config['references']['human'].get('variation'): - rule dbnsfp: - """ - Download and process dbNSFP database. This involves downloading and - extracting the zip file, then combining the chromosomes to create - a single file. For genome builds like hg19 and GRCh37, some processing - needs to be done to make them compatible with dbNSFP version > 3.X - dbNSFP is only for human genomes. 
- """ - output: - protected( - '{{references_dir}}/{{organism}}/{{tag}}/{dbnsfp_version}_{build}/{{organism}}_{{tag}}.vcf.gz'.format( - dbnsfp_version=config['references']['human']['variation']['dbnsfp']['version'], - build=config['references']['human']['variation']['dbnsfp']['build'] - ) - ) - log: - '{{references_dir}}/{{organism}}/{{tag}}/{dbnsfp_version}_{build}/{{organism}}_{{tag}}.log'.format( - dbnsfp_version=config['references']['human']['variation']['dbnsfp']['version'], - build=config['references']['human']['variation']['dbnsfp']['build'] - ) - resources: - disk_mb=gb(500), - mem_mb=gb(500), - runtime=hours(8) - threads: 16 - run: - version = config['references']['human'][wildcards.tag]['dbnsfp']['version'] - URL = config['references']['human'][wildcards.tag]['dbnsfp']['url'] - build = config['references']['human'][wildcards.tag]['dbnsfp']['build'] - workdir = wildcards.references_dir - if build == 'GRCh37': - # We need to process the dbNSFP file to make it compatible with older genomes - with tempfile.TemporaryDirectory() as tmpdir: - shell( - '''(cd {tmpdir}; wget -O- {URL} > dbnsfp.zip && ''' - '''unzip dbnsfp.zip && zcat dbNSFP*_variant.chr1* | awk "NR<=1" > h && ''' - '''zgrep -v "^#" dbNSFP*_variant.chr* > all_chrs && ''' - '''awk '$8 != "." 
' all_chrs > all_chrs_filtered && ''' - '''sort -S 50% --parallel=12 all_chrs_filtered -k8,8 -k9,9n > all_chrs_filtered_sorted && ''' - '''cat h all_chrs_filtered_sorted > all_chrs_filtered_sorted_header && ''' - '''bgzip -c all_chrs_filtered_sorted_header > {output}) && ''' - '''tabix -s 1 -b 2 -e 2 {output} ''' - ) - if build == 'GRCh38': - with tempfile.TemporaryDirectory() as tmpdir: - # No need for processing and we can use the first 2 columns for coordinates - shell( - '''(cd {tmpdir}; wget -O- {URL} > dbnsfp.zip && ''' - '''unzip dbnsfp.zip && zcat dbNSFP*_variant.chr1* | awk "NR<=1" > h && ''' - '''zgrep -v "^#" dbNSFP*_variant.chr* > all_chrs && ''' - '''sort -S 50% --parallel=24 all_chrs -k1,1 -k2,2n > all_chrs_sorted && ''' - '''cat h all_chrs_sorted > all_chrs_sorted_header && ''' - '''bgzip -c all_chrs_sorted_header > {output}) && ''' - '''tabix -s 1 -b 2 -e 2 {output} ''' - ) - +#if config['references']['human'].get('variation'): +# rule dbnsfp: +# """ +# Download and process dbNSFP database. This involves downloading and +# extracting the zip file, then combining the chromosomes to create +# a single file. For genome builds like hg19 and GRCh37, some processing +# needs to be done to make them compatible with dbNSFP version > 3.X +# dbNSFP is only for human genomes. 
+# """ +# output: +# protected( +# '{{references_dir}}/{{organism}}/{{tag}}/{dbnsfp_version}_{build}/{{organism}}_{{tag}}.vcf.gz'.format( +# dbnsfp_version=config['references']['human']['variation']['dbnsfp']['version'], +# build=config['references']['human']['variation']['dbnsfp']['build'] +# ) +# ) +# log: +# '{{references_dir}}/{{organism}}/{{tag}}/{dbnsfp_version}_{build}/{{organism}}_{{tag}}.log'.format( +# dbnsfp_version=config['references']['human']['variation']['dbnsfp']['version'], +# build=config['references']['human']['variation']['dbnsfp']['build'] +# ) +# resources: +# disk_mb=gb(500), +# mem_mb=gb(500), +# runtime=hours(8) +# threads: 16 +# run: +# version = config['references']['human'][wildcards.tag]['dbnsfp']['version'] +# URL = config['references']['human'][wildcards.tag]['dbnsfp']['url'] +# build = config['references']['human'][wildcards.tag]['dbnsfp']['build'] +# workdir = wildcards.references_dir +# if build == 'GRCh37': +# # We need to process the dbNSFP file to make it compatible with older genomes +# with tempfile.TemporaryDirectory() as tmpdir: +# shell( +# '''(cd {tmpdir}; wget -O- {URL} > dbnsfp.zip && ''' +# '''unzip dbnsfp.zip && zcat dbNSFP*_variant.chr1* | awk "NR<=1" > h && ''' +# '''zgrep -v "^#" dbNSFP*_variant.chr* > all_chrs && ''' +# '''awk '$8 != "." 
' all_chrs > all_chrs_filtered && ''' +# '''sort -S 50% --parallel=12 all_chrs_filtered -k8,8 -k9,9n > all_chrs_filtered_sorted && ''' +# '''cat h all_chrs_filtered_sorted > all_chrs_filtered_sorted_header && ''' +# '''bgzip -c all_chrs_filtered_sorted_header > {output}) && ''' +# '''tabix -s 1 -b 2 -e 2 {output} ''' +# ) +# if build == 'GRCh38': +# with tempfile.TemporaryDirectory() as tmpdir: +# # No need for processing and we can use the first 2 columns for coordinates +# shell( +# '''(cd {tmpdir}; wget -O- {URL} > dbnsfp.zip && ''' +# '''unzip dbnsfp.zip && zcat dbNSFP*_variant.chr1* | awk "NR<=1" > h && ''' +# '''zgrep -v "^#" dbNSFP*_variant.chr* > all_chrs && ''' +# '''sort -S 50% --parallel=24 all_chrs -k1,1 -k2,2n > all_chrs_sorted && ''' +# '''cat h all_chrs_sorted > all_chrs_sorted_header && ''' +# '''bgzip -c all_chrs_sorted_header > {output}) && ''' +# '''tabix -s 1 -b 2 -e 2 {output} ''' +# ) +# # vim: ft=python From a1f69ed9c1836d6d6f5bb73ea633b5d4533d17ae Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 10:16:37 -0400 Subject: [PATCH 08/59] disable references test for now --- .circleci/config.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 667077b22..47fe05359 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -513,10 +513,10 @@ workflows: requires: - initial-setup - pytest - - references: - requires: - - initial-setup - - pytest + # - references: + # requires: + # - initial-setup + # - pytest - colocalization: requires: - initial-setup From ac1d3dd5c1a50c1b6e66a9b23b84e7430028379b Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 10:32:15 -0400 Subject: [PATCH 09/59] temporarily disable all but initial setup & variant calling tests --- .circleci/config.yml | 65 +++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 47fe05359..389b43ea1 
100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -497,38 +497,41 @@ workflows: - pytest: requires: - initial-setup - - chipseq: - requires: - - initial-setup - - pytest - - chipseq-misc: - requires: - - initial-setup - - pytest - - rnaseq: - requires: - - initial-setup - - pytest - - rnaseq-misc: - requires: - - initial-setup - - pytest + # - chipseq: + # requires: + # - initial-setup + # - pytest + # - chipseq-misc: + # requires: + # - initial-setup + # - pytest + # - rnaseq: + # requires: + # - initial-setup + # - pytest + # - rnaseq-misc: + # requires: + # - initial-setup + # - pytest # - references: + # requires: + # - initial-setup + # - pytest + # - colocalization: # requires: # - initial-setup # - pytest - - colocalization: - requires: - - initial-setup - - pytest - - build-docs: - requires: - - initial-setup - - report-env: - requires: - - rnaseq - - rnaseq-misc - - chipseq - - chipseq-misc - - references - - colocalization + # - build-docs: + # requires: + # - initial-setup + # - report-env: + # requires: + # - rnaseq + # - rnaseq-misc + # - chipseq + # - chipseq-misc + # - references + # - colocalization + - variantcalling: + requires: + - initial-setup From 9a8d8effbf201bd9dee09a50d687cc7a5cbe2628 Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 10:33:19 -0400 Subject: [PATCH 10/59] fix circleci yaml config syntax --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 389b43ea1..906a7bd00 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -533,5 +533,5 @@ workflows: # - references # - colocalization - variantcalling: - requires: - - initial-setup + requires: + - initial-setup From 4d4a261a6a25c32f1fd106d27bb9bc7060cb9881 Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 10:44:30 -0400 Subject: [PATCH 11/59] more syntax fixes --- .circleci/config.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/.circleci/config.yml b/.circleci/config.yml index 906a7bd00..064323883 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -412,7 +412,7 @@ jobs: - *get-data - *variantcalling-step - store_artifacts: - path: /tmp/lcdb-wf-test/workflows/variant-calling/variantcalling.tar.gz + path: /tmp/lcdb-wf-test/workflows/variant-calling/variantcalling.tar.gz rnaseq-misc: <<: *defaults @@ -494,9 +494,9 @@ workflows: test-suite: jobs: - initial-setup - - pytest: - requires: - - initial-setup + # - pytest: + # requires: + # - initial-setup # - chipseq: # requires: # - initial-setup From 26aaf19b72df14742abfd724a693e51e61b7e4dc Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 10:51:04 -0400 Subject: [PATCH 12/59] ensure run_test.sh is copied over to deploy location for variant-calling --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 064323883..82bf4e068 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -139,6 +139,7 @@ variables: cp workflows/rnaseq/run_downstream_test.sh $DEPLOY/workflows/rnaseq cp workflows/colocalization/run_test.sh $DEPLOY/workflows/references cp workflows/colocalization/run_test.sh $DEPLOY/workflows/colocalization + cp workflows/variant-calling/run_test.sh $DEPLOY/workflows/variant-calling mkdir $DEPLOY/ci mkdir $DEPLOY/test cp test/lcdb-wf-test $DEPLOY/test From 25e6a611d4d850dea6e49eb008ada2cd94a9ac96 Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 10:55:55 -0400 Subject: [PATCH 13/59] fix comments in ci config --- .circleci/config.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 82bf4e068..68e2ed78a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -292,8 +292,7 @@ variables: # -------------------------------------------------------------------------- - # REVIEW: not RNA-seq - # Standard RNA-seq workflow + # Variant-calling workflow 
variantcalling-step: &variantcalling-step run: name: variantcalling workflow From 29c3c82f0d04122aaa1661fa8d61e18ea6f6452c Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 11:04:54 -0400 Subject: [PATCH 14/59] try resolving path --- test/lcdb-wf-test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/lcdb-wf-test b/test/lcdb-wf-test index f215433d5..d7ce97dc8 100755 --- a/test/lcdb-wf-test +++ b/test/lcdb-wf-test @@ -278,7 +278,7 @@ class Runner(object): print(f"downloading {url}") if dest is None: dest = fn - dest = Path(dest) + dest = Path(dest).resolve() dest.parent.mkdir(parents=True, exist_ok=True) sp.run( f"wget -q -O- {url} > {dest}", shell=True, check=True, cwd=TOPLEVEL From 6a1da2bfe3ce43de1c5b8755894367347f5a4cfa Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 11:20:33 -0400 Subject: [PATCH 15/59] support deploying variant-calling workflow --- deploy.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/deploy.py b/deploy.py index 51b9e0b59..cdeec6d11 100755 --- a/deploy.py +++ b/deploy.py @@ -87,6 +87,10 @@ def write_include_file(flavor=None): 'recursive-include workflows/chipseq/config *', 'include workflows/chipseq/chipseq_trackhub.py', ], + 'variant-calling': [ + 'include workflows/variant-calling/Snakefile', + 'recursive-include workflows/variant-calling/config *', + ], 'all': [ 'recursive-include wrappers *', 'recursive-include include *', @@ -104,6 +108,7 @@ def write_include_file(flavor=None): 'recursive-include workflows/external *', ] + } patterns = [] @@ -111,6 +116,8 @@ def write_include_file(flavor=None): patterns.extend(PATTERN_DICT['rnaseq']) if flavor is None or 'chipseq': patterns.extend(PATTERN_DICT['chipseq']) + if flavor is None or 'variant-calling': + patterns.extend(PATTERN_DICT['variant-calling']) if flavor is None or 'full': patterns.extend(PATTERN_DICT['full']) patterns.extend(PATTERN_DICT['all']) @@ -324,7 +331,7 @@ def build_envs(dest, conda_frontend="mamba"): 
ap.add_argument( "--flavor", default="full", - help="""Options are {0}. Default is full.""".format(['full', 'rnaseq', 'chipseq']), + help="""Options are {0}. Default is full.""".format(['full', 'rnaseq', 'chipseq', 'variant-calling']), ) ap.add_argument( "--dest", help="""Destination directory in which to copy files""", required=True From 79ee0f539d91c1653725680d6bc9ba2b0aee2494 Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 12:14:46 -0400 Subject: [PATCH 16/59] remove windows line-endings and unused unit --- workflows/variant-calling/config/units.tsv | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/workflows/variant-calling/config/units.tsv b/workflows/variant-calling/config/units.tsv index 3d6d261d8..d336d4be9 100644 --- a/workflows/variant-calling/config/units.tsv +++ b/workflows/variant-calling/config/units.tsv @@ -1,4 +1,3 @@ -sample unit platform fq1 fq2 -tumor 1 Illumina data/example_data/tumor_R1.fq.gz data/example_data/tumor_R2.fq.gz -tumor 2 Illumina data/example_data/test_unit_R1.fq.gz data/example_data/test_unit_R2.fq.gz -normal 1 Illumina data/example_data/normal_R1.fq.gz data/example_data/normal_R2.fq.gz +sample unit platform fq1 fq2 +tumor 1 Illumina data/example_data/tumor_R1.fq.gz data/example_data/tumor_R2.fq.gz +normal 1 Illumina data/example_data/normal_R1.fq.gz data/example_data/normal_R2.fq.gz From 8ad4d36831d4dbf6c674819226f75f17af34360e Mon Sep 17 00:00:00 2001 From: fridellsa Date: Fri, 2 Jun 2023 13:06:55 -0400 Subject: [PATCH 17/59] 'resource formatting, threads test settings, snpeff adjustments' --- workflows/variant-calling/Snakefile | 112 +++++++++++++++++----------- 1 file changed, 67 insertions(+), 45 deletions(-) diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile index e608dc7a8..4466406b6 100644 --- a/workflows/variant-calling/Snakefile +++ b/workflows/variant-calling/Snakefile @@ -57,10 +57,10 @@ checkpoint genome_index: rule fasta_dict: - threads: 2 + 
threads: 1 resources: mem_mb=gb(4), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] disk_mb=gb(4), runtime=autobump(60) input: @@ -158,10 +158,11 @@ rule mark_duplicates: and we don't want to mark them as temporary """ threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] resources: disk_mb=gb(40), mem_mb=gb(32), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(720) input: bam = "results/mapped/{sample}-{unit}.sorted.bam", @@ -195,9 +196,10 @@ rule mark_duplicates: if config["filtering"]['bqsr']: rule base_recalibrator: threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] resources: mem_mb=gb(8), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] disk_mb=gb(40), runtime=autobump(960) input: @@ -224,9 +226,10 @@ if config["filtering"]['bqsr']: rule apply_bqsr: threads: 8 + # threads: 1 # [ TEST SETTINGS -1 ] resources: mem_mb=gb(32), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] disk_mb=gb(40), runtime=autobump(960) input: @@ -322,9 +325,10 @@ rule call_variants: resources: disk_mb=gb(16), mem_mb=gb(40), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(hours=8) threads: 8 + # threads: 1 # [ TEST SETTINGS -1 ] params: extra=get_call_variants_params, pcr='--pcr-indel-model ' + config['processing']['pcr'] @@ -350,10 +354,11 @@ rule call_variants: rule combine_calls: threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] resources: disk_mb=gb(10), mem_mb=gb(4), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(720) input: ref=reference, @@ -374,10 +379,11 @@ rule combine_calls: rule genotype_variants: threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] resources: disk_mb=gb(10), mem_mb=gb(8), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(480) input: ref=reference, @@ -399,10 
+405,11 @@ rule genotype_variants: rule merge_variants: threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] resources: disk_mb=gb(10), mem_mb=gb(8), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(480) input: vcfs=lambda w: expand( @@ -428,8 +435,8 @@ rule merge_variants: rule tabix_variants: threads: 2 resources: - disk_mb = gb(2), - mem_mb = gb(2), + disk_mb=gb(2), + mem_mb=gb(2), runtime=autobump(30) input: vcf="{prefix}.vcf.gz", @@ -443,10 +450,11 @@ rule tabix_variants: rule select_calls: threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] resources: disk_mb=gb(10), mem_mb=gb(4), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(480) input: ref=reference, @@ -471,10 +479,11 @@ rule select_calls: rule hard_filter_calls: threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] resources: disk_mb=gb(10), mem_mb=gb(4), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(480) input: ref=reference, @@ -504,7 +513,7 @@ rule merge_calls: resources: disk_mb=gb(10), mem_mb=gb(8), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(480) input: vcfs=expand( @@ -514,7 +523,7 @@ rule merge_calls: output: vcf=temp("results/filtered/all.final.vcf.gz"), log: - "logs/merge-filtered.log", + "logs/merge-filtered.log", run: inputs = " ".join("-INPUT {}".format(f) for f in input.vcfs) java_opts = set_java_opts(resources) @@ -533,8 +542,8 @@ rule norm: Split multiallielic variants into multiple biallelic ones. 
""" resources: - mem_mb = gb(16), - runtime = autobump(120) + mem_mb=gb(16), + runtime=autobump(120) input: ref=reference, vcf="results/filtered/all.final.vcf.gz" @@ -592,8 +601,8 @@ rule samtools_stats: Run samtools stats """ resources: - mem_mb = gb(16), - runtime = autobump(120) + mem_mb=gb(16), + runtime=autobump(120) input: get_sample_unit_bams output: @@ -652,7 +661,7 @@ rule snpeff: resources: disk_mb=gb(20), mem_mb=gb(16), - # mem_mb = gb(5), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(120) input: vcf='results/filtered/all.normed.vcf.gz', @@ -664,9 +673,11 @@ rule snpeff: params: annotations = config['snpeff']['annotations'], gen = config['snpeff']['genome'] + # threads: 2 # [ TEST SETTINGS ] run: java_opts = '''"-Xmx{}g"'''.format(int(resources.mem_mb * 0.75 /1024)) shell( + # 'mkdir -p $(dirname {output.ann}) && ' # [ TEST SETTINGS ] "snpEff {java_opts} " "-o vcf " "-csvStats {output.stats} " @@ -698,10 +709,10 @@ rule mutect2: Use Mutect2 to call variants on individual samples, one per contig """ resources: - disk_mb = gb(40), - mem_mb = gb(32), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] - runtime = autobump(720) + disk_mb=gb(40), + mem_mb=gb(32), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(720) input: unpack(input_for_somatic) output: @@ -718,6 +729,7 @@ rule mutect2: ), extra=get_call_variants_params, pcr='--pcr-indel-model ' + config['processing']['pcr'] + # threads: 1 # [ TEST SETTINGS ] run: java_opts = set_java_opts(resources) normals = " ".join("-I {} ".format(n) for n in input.normals) @@ -745,16 +757,17 @@ rule lrom: in the orientation bias mixture model filter """ resources: - disk_mb = gb(20), - mem_mb = gb(32), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] - runtime = autobump(120) + disk_mb=gb(20), + mem_mb=gb(32), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(120) input: orientation=lambda w: expand('results/lrom/{contig}_{{comp}}.tar.gz', contig=get_contigs()) output: 
lrom='results/lrom/artifact-prior-{comp}.tar.gz' log: 'logs/lrom_{comp}.log' + # threads: 1 # [ TEST SETTINGS ] run: java_opts = set_java_opts(resources) def get_format_lrom(): @@ -774,10 +787,10 @@ rule merge_mutect2_variants: After individual contigs are called via mutect2, we merge them together here. """ resources: - disk_mb = gb(20), - mem_mb = gb(32), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] - runtime = autobump(120) + disk_mb=gb(20), + mem_mb=gb(32), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(120) input: vcfs=lambda w: expand( "results/mutect2_called/raw.{{comp}}.{contig}.vcf.gz", contig=get_contigs() @@ -786,6 +799,7 @@ rule merge_mutect2_variants: temp("results/somatic/merged.{comp}.vcf.gz") log: "logs/merge_mutect2.{comp}.log", + # threads: 1 # [ TEST SETTINGS ] run: inputs = " ".join("-INPUT {}".format(f) for f in input.vcfs) java_opts = set_java_opts(resources) @@ -804,10 +818,10 @@ rule merge_mutect2_stats: Just like merging VCFs for Mutect2, we also need to merge stats for filtering. """ resources: - disk_mb = gb(20), - mem_mb = gb(16), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] - runtime = autobump(120) + disk_mb=gb(20), + mem_mb=gb(16), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(120) input: stats=lambda w: expand( "results/mutect2_called/raw.{{comp}}.{contig}.vcf.gz.stats", contig=get_contigs() @@ -816,6 +830,7 @@ rule merge_mutect2_stats: temp("results/somatic/merged.{comp}.vcf.gz.stats") log: "logs/merge_mutect2_stats.{comp}.log" + # threads: 1 # [ TEST SETTINGS ] run: java_opts = set_java_opts(resources) inputs = " ".join(" -stats {} ".format(f) for f in input.stats) @@ -833,10 +848,10 @@ rule filter_mutect2_calls: New versions of Mutect2 have optimized defaults for filtering; we can just use those. 
""" resources: - disk_mb = gb(20), - mem_mb = gb(16), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] - runtime = autobump(120) + disk_mb=gb(20), + mem_mb=gb(16), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(120) input: ref=reference, unfiltered="results/somatic/merged.{comp}.vcf.gz", @@ -846,6 +861,7 @@ rule filter_mutect2_calls: "results/somatic_filtered/filtered.{comp}.vcf.gz" log: "logs/{comp}.vcf.gz.log" + # threads: 1 # [ TEST SETTINGS ] run: java_opts = set_java_opts(resources) shell( @@ -864,8 +880,8 @@ rule mutect2_norm: Split multiallielic variants into multiple biallelic ones. """ resources: - mem_mb = gb(16), - runtime = autobump(120) + mem_mb=gb(16), + runtime=autobump(120) input: ref=reference, vcf="results/somatic_filtered/filtered.{comp}.vcf.gz" @@ -888,7 +904,7 @@ rule snpeff_cancer: resources: disk_mb=gb(20), mem_mb=gb(16), - # mem_mb = gb(5), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(120) input: vcf='results/somatic_filtered/normed.{comp}.vcf.gz', @@ -900,11 +916,17 @@ rule snpeff_cancer: 'logs/cancer_snpeff_{comp}.log' params: snpeff_genome = config['snpeff']['genome'] + # threads: 2 # [ TEST SETTINGS ] run: java_opts = '''"-Xmx{}g"'''.format(int(resources.mem_mb * 0.75 /1024)) shell( - 'snpEff {java_opts} -v -o vcf -cancer -csvStats {output.stats} \ - -stats {output.html} {params.snpeff_genome} {input.vcf} | bcftools view -Oz > {output.vcf}' + # 'mkdir -p $(dirname {output.vcf}) && ' # [ TEST SETTINGS ] + 'snpEff {java_opts} ' + '-v -o vcf -cancer ' + '-csvStats {output.stats} ' + '-stats {output.html} ' + '{params.snpeff_genome} {input.vcf} ' + '| bcftools view -Oz > {output.vcf} 2> {log}' ) From c151dde17191bbf0331c452e8dc5ff0d3f2cba3b Mon Sep 17 00:00:00 2001 From: fridellsa Date: Fri, 2 Jun 2023 14:10:49 -0400 Subject: [PATCH 18/59] add option to provide path for variation dbs when using ref wf --- lib/helpers.smk | 13 +++++++++++-- workflows/variant-calling/config/config.yaml | 3 +++ 2 files 
changed, 14 insertions(+), 2 deletions(-) diff --git a/lib/helpers.smk b/lib/helpers.smk index 4188b0654..de902d6e7 100644 --- a/lib/helpers.smk +++ b/lib/helpers.smk @@ -32,11 +32,20 @@ def preflight(): aln_index = multiext(os.path.splitext(aln)[0], ".amb", ".ann", ".bwt", ".pac", ".sa") indexed = refdict[config['ref']['organism'][config['ref']['faidx']['tag']]['faidx']] if config['ref']['variation']['dbnsfp']: - dbnsfp = refdict[config['ref']['organism']]['variation'][str(config['ref']['variation']['dbnsfp'] + '_' + config['ref']['genome']['build'])] + # The config can supply a path to a local file in the variation slots + if config['ref']['variation']['dbnsfp'].startswith('/'): + dbnsfp = config['ref']['variation']['dbnsfp'] + else: + dbnsfp = refdict[config['ref']['organism']]['variation'][str( + config['ref']['variation']['dbnsfp'] + '_' + config['ref']['genome']['build'] + )] else: dbnsfp = [] if config['ref']['variation']['known']: - known_sites = refdict[config['ref']['organism']][config['ref']['genome']['tag']][config['ref']['variation']['known']] + if config['ref']['variation']['known'].startswith('/'): + known_sites = config['ref']['variation']['known'] + else: + known_sites = refdict[config['ref']['organism']][config['ref']['genome']['tag']][config['ref']['variation']['known']] else: known_sites = [] else: diff --git a/workflows/variant-calling/config/config.yaml b/workflows/variant-calling/config/config.yaml index 50505c627..c59bda5f0 100644 --- a/workflows/variant-calling/config/config.yaml +++ b/workflows/variant-calling/config/config.yaml @@ -16,6 +16,9 @@ ref: index: 'faidx' tag: 'ensembl-104' variation: + # Fill these keys in with the name of the variation database that matches the value in the reference config + # Or alternatively, you can provide an ABSOLUTE path to these files locally (paths MUST start with '/') + # If this is the case, you should go edit the lcdb-wf references config to make sure these jobs are not run for no reason. 
known: 'known' dbnsfp: 'dbNSFPv4.4' # If you are providing your own references, include their paths here. From cd4a49bd92df9dc9fee02c730cc6574ec39ccd52 Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 14:11:23 -0400 Subject: [PATCH 19/59] create tarball artifact in the right place --- .circleci/config.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 68e2ed78a..91a68ea40 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -303,7 +303,7 @@ variables: $DEPLOY/test/lcdb-wf-test variantcalling --run-workflow -n $DEPLOY/test/lcdb-wf-test variantcalling --run-workflow --use-conda -j2 - tar -zcf variantcalling.tar.gz workflows/variant-calling/results + tar -zcf /tmp/variantcalling.tar.gz workflows/variant-calling/results # -------------------------------------------------------------------------- # Syntax note: All of the steps above, with their "&step-name" labels, can be @@ -412,7 +412,8 @@ jobs: - *get-data - *variantcalling-step - store_artifacts: - path: /tmp/lcdb-wf-test/workflows/variant-calling/variantcalling.tar.gz + path: /tmp/variantcalling.tar.gz + destination: variantcalling.tar.gz rnaseq-misc: <<: *defaults From da845917a5d65892dda61d0b123784164497f8c4 Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 14:12:12 -0400 Subject: [PATCH 20/59] re-enable all tests (except for references) --- .circleci/config.yml | 69 ++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 91a68ea40..4c41a50d4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -495,44 +495,45 @@ workflows: test-suite: jobs: - initial-setup - # - pytest: - # requires: - # - initial-setup - # - chipseq: - # requires: - # - initial-setup - # - pytest - # - chipseq-misc: - # requires: - # - initial-setup - # - pytest - # - rnaseq: - # requires: - # - initial-setup - # - pytest - # - 
rnaseq-misc: - # requires: - # - initial-setup - # - pytest + - pytest: + requires: + - initial-setup + - chipseq: + requires: + - initial-setup + - pytest + - chipseq-misc: + requires: + - initial-setup + - pytest + - rnaseq: + requires: + - initial-setup + - pytest + - rnaseq-misc: + requires: + - initial-setup + - pytest # - references: # requires: # - initial-setup # - pytest - # - colocalization: - # requires: - # - initial-setup - # - pytest - # - build-docs: - # requires: - # - initial-setup - # - report-env: - # requires: - # - rnaseq - # - rnaseq-misc - # - chipseq - # - chipseq-misc - # - references - # - colocalization + - colocalization: + requires: + - initial-setup + - pytest - variantcalling: requires: - initial-setup + - build-docs: + requires: + - initial-setup + - report-env: + requires: + - rnaseq + - rnaseq-misc + - chipseq + - chipseq-misc + # - references + - colocalization + - variantcalling From 0192e0253e947040b921d964d98bd807ec5200cb Mon Sep 17 00:00:00 2001 From: fridellsa Date: Fri, 2 Jun 2023 15:35:29 -0400 Subject: [PATCH 21/59] hide government addresses and minor fixes --- docs/toc.rst | 1 + env.yml | 2 -- workflows/references/Snakefile | 2 +- workflows/references/config/config.yaml | 2 +- workflows/variant-calling/Snakefile | 2 -- 5 files changed, 3 insertions(+), 6 deletions(-) diff --git a/docs/toc.rst b/docs/toc.rst index 1180c8cd7..0a1cbb5ff 100644 --- a/docs/toc.rst +++ b/docs/toc.rst @@ -13,6 +13,7 @@ Table of Contents rnaseq downstream-rnaseq chipseq + variant-calling integrative conda tests diff --git a/env.yml b/env.yml index ee82a16ed..5f31f28fa 100644 --- a/env.yml +++ b/env.yml @@ -1,4 +1,3 @@ -name: /gpfs/gsfs10/users/NICHD-core0/test/fridellsa/v1.10-lcdbwf-vc/lcdb-wf/env channels: - conda-forge - bioconda @@ -365,4 +364,3 @@ dependencies: - zlib=1.2.13 - zstandard=0.19.0 - zstd=1.5.2 -prefix: /gpfs/gsfs10/users/NICHD-core0/test/fridellsa/v1.10-lcdbwf-vc/lcdb-wf/env diff --git a/workflows/references/Snakefile 
b/workflows/references/Snakefile index 35583251c..de7328f1f 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -529,7 +529,7 @@ rule known_variation: # '''sort -S 50% --parallel=12 all_chrs_filtered -k8,8 -k9,9n > all_chrs_filtered_sorted && ''' # '''cat h all_chrs_filtered_sorted > all_chrs_filtered_sorted_header && ''' # '''bgzip -c all_chrs_filtered_sorted_header > {output}) && ''' -# '''tabix -s 1 -b 2 -e 2 {output} ''' +# '''tabix -s 8 -b 9 -e 9 {output} ''' # ) # if build == 'GRCh38': # with tempfile.TemporaryDirectory() as tmpdir: diff --git a/workflows/references/config/config.yaml b/workflows/references/config/config.yaml index 5f8f2c664..bd9bd9392 100644 --- a/workflows/references/config/config.yaml +++ b/workflows/references/config/config.yaml @@ -1,4 +1,4 @@ -references_dir: '/data/NICHD-core0/references' +references_dir: 'references' # See the reference config files in the top level of the repo, diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile index 4466406b6..8b6a61c8e 100644 --- a/workflows/variant-calling/Snakefile +++ b/workflows/variant-calling/Snakefile @@ -677,7 +677,6 @@ rule snpeff: run: java_opts = '''"-Xmx{}g"'''.format(int(resources.mem_mb * 0.75 /1024)) shell( - # 'mkdir -p $(dirname {output.ann}) && ' # [ TEST SETTINGS ] "snpEff {java_opts} " "-o vcf " "-csvStats {output.stats} " @@ -920,7 +919,6 @@ rule snpeff_cancer: run: java_opts = '''"-Xmx{}g"'''.format(int(resources.mem_mb * 0.75 /1024)) shell( - # 'mkdir -p $(dirname {output.vcf}) && ' # [ TEST SETTINGS ] 'snpEff {java_opts} ' '-v -o vcf -cancer ' '-csvStats {output.stats} ' From 7a9fb721cc78fbfddf486c0cd764022810a3d921 Mon Sep 17 00:00:00 2001 From: fridellsa Date: Mon, 5 Jun 2023 11:00:01 -0400 Subject: [PATCH 22/59] Adding docs and tweaking pcr-indel-model arg in workflow --- workflows/variant-calling/Snakefile | 14 +++++++++++--- workflows/variant-calling/config/config.yaml | 3 ++- 2 files changed, 13 
insertions(+), 4 deletions(-) diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile index 8b6a61c8e..f570c1f5f 100644 --- a/workflows/variant-calling/Snakefile +++ b/workflows/variant-calling/Snakefile @@ -331,8 +331,12 @@ rule call_variants: # threads: 1 # [ TEST SETTINGS -1 ] params: extra=get_call_variants_params, - pcr='--pcr-indel-model ' + config['processing']['pcr'] - run: + pcr=( + '--pcr-indel-model ' + config['processing']['pcr'] + if config['processing']['pcr'] + else '' + ) + run: java_opts = set_java_opts(resources) known = input.known if known: @@ -727,7 +731,11 @@ rule mutect2: else [] ), extra=get_call_variants_params, - pcr='--pcr-indel-model ' + config['processing']['pcr'] + pcr=( + '--pcr-indel-model ' + config['processing']['pcr'] + if config['processing']['pcr'] + else '' + ) # threads: 1 # [ TEST SETTINGS ] run: java_opts = set_java_opts(resources) diff --git a/workflows/variant-calling/config/config.yaml b/workflows/variant-calling/config/config.yaml index c59bda5f0..3b2ce5305 100644 --- a/workflows/variant-calling/config/config.yaml +++ b/workflows/variant-calling/config/config.yaml @@ -32,7 +32,8 @@ processing: remove-duplicates: true remove-mitochondrial: true # See https://gatk.broadinstitute.org/hc/en-us/articles/360036465912-HaplotypeCaller#--pcr-indel-model for pcr - pcr: "NONE" + # If you know there was no PCR used to generate your sequencing data, set this to NONE + pcr: # Point to a bed file, e.g. 
captured regions restrict-regions: #'references/exons_subset.bed' From 34f34869acb1811ad909c6ca17bbefe9053ef55c Mon Sep 17 00:00:00 2001 From: fridellsa Date: Mon, 5 Jun 2023 11:56:07 -0400 Subject: [PATCH 23/59] fixed syntax err --- workflows/variant-calling/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile index f570c1f5f..0cc4be94f 100644 --- a/workflows/variant-calling/Snakefile +++ b/workflows/variant-calling/Snakefile @@ -336,7 +336,7 @@ rule call_variants: if config['processing']['pcr'] else '' ) - run: + run: java_opts = set_java_opts(resources) known = input.known if known: From 3574c69ab0b48673d22c18de30b97a98c8b0183a Mon Sep 17 00:00:00 2001 From: daler Date: Tue, 13 Jun 2023 20:17:30 -0400 Subject: [PATCH 24/59] fix known_variation --- workflows/references/Snakefile | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile index de7328f1f..0bb6f00a0 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -422,7 +422,6 @@ rule known_variation: #fai='{references_dir}/{organism}/{tag}/genome/faidx/{organism}_{tag}.fai' # can't do it this way since the {tag} wildcard is not congruous between input and output fai = lambda w: checkpoints.genome_index.get(**w).output[0] - output: protected('{references_dir}/{organism}/{tag}/known/{organism}_{tag}.vcf.gz') log: @@ -476,15 +475,25 @@ rule known_variation: for ext in ["vcf.gz", "vcf.gz.csi"] ] names=[os.path.basename(url) for url in urls if url.endswith(".gz")] + + # Compose a command that downloads all vcf.gz files for all contigs in use gather = "curl {urls}".format(urls=" ".join(map("-O {}".format, urls))) + + # Absolute paths needed because we'll be downloading in a temp dir + fai = os.path.abspath(input.fai) + log = os.path.abspath(str(log)) with tempfile.TemporaryDirectory() as tmpdir: if 
input.get("fai"): shell( - "(cd {tmpdir}; {gather} && " - "bcftools concat -Oz --naive-force {names} > concat.vcf.gz && " - "bcftools reheader --fai {input.fai} concat.vcf.gz " - "> {output}) && " - "tabix -p vcf {output} 2> {log} " + "( cd {tmpdir}; " + "{gather} && " + "bcftools concat -Oz --naive-force {names} " + "> concat.vcf.gz 2> {log} )" + ) + shell( + "bcftools reheader --fai {fai} {tmpdir}/concat.vcf.gz " + "> {output} 2>> {log} && " + "tabix -p vcf {output} 2>> {log} " ) #if config['references']['human'].get('variation'): From d02110eba8de107ce2af734474fc78b48bca414d Mon Sep 17 00:00:00 2001 From: daler Date: Tue, 13 Jun 2023 20:18:09 -0400 Subject: [PATCH 25/59] re-enable references test --- .circleci/config.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4c41a50d4..5d423b029 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -514,10 +514,10 @@ workflows: requires: - initial-setup - pytest - # - references: - # requires: - # - initial-setup - # - pytest + - references: + requires: + - initial-setup + - pytest - colocalization: requires: - initial-setup From 51b458653e1fa6a65a072296bf4ef7b07fc1a3cd Mon Sep 17 00:00:00 2001 From: daler Date: Thu, 15 Jun 2023 15:31:55 -0400 Subject: [PATCH 26/59] reorganize test config for variant calling --- .../test_configs}/variant-calling.yaml | 4 ++-- workflows/references/config/config.yaml | 8 -------- 2 files changed, 2 insertions(+), 10 deletions(-) rename {include/reference_configs => test/test_configs}/variant-calling.yaml (90%) delete mode 100644 workflows/references/config/config.yaml diff --git a/include/reference_configs/variant-calling.yaml b/test/test_configs/variant-calling.yaml similarity index 90% rename from include/reference_configs/variant-calling.yaml rename to test/test_configs/variant-calling.yaml index 6ad0f9b48..d768eb05e 100644 --- a/include/reference_configs/variant-calling.yaml +++ 
b/test/test_configs/variant-calling.yaml @@ -1,3 +1,4 @@ +references_dir: "references" references: human: ensembl-104: @@ -6,13 +7,12 @@ references: release: 104 species: 'homo_sapiens' genome: - url: 'ftp://ftp.ensembl.org/pub/release-104/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz' + url: 'https://github.com/lcdb/lcdb-wf-variant-calling-test-data/raw/master/data/GRCh38.6.20.fa.gz' # URL format is 'ftp://ftp.ensembl.org/pub/{branch}release-{release}/fasta/{species}/{datatype}/{species_capitalized}.{build}.{datatype}.{assembly}.{suffix}' # When using GRCh37, branch changes to "grch37/release-{release}" # always use primary_assembly for human, NEVER use top_level for assembly for human indexes: - 'bwa' - - 'faidx' known: # You can download structural_variations, somatic, or "all" which corresponds to germline known variation for all chromosomes type: 'all' diff --git a/workflows/references/config/config.yaml b/workflows/references/config/config.yaml deleted file mode 100644 index bd9bd9392..000000000 --- a/workflows/references/config/config.yaml +++ /dev/null @@ -1,8 +0,0 @@ -references_dir: 'references' - - -# See the reference config files in the top level of the repo, -# include/reference_configs, for inspiration for more species. 
# variation databases should be the keys of a dictionary
if type_ == 'genome': e['chromsizes'] = ( '{references_dir}/' @@ -576,6 +574,15 @@ def references_dict(config): '{type_}/' '{organism}_{tag}.chromsizes'.format(**locals()) ) + e['faidx'] = ( + '{references_dir}/' + '{organism}/' + '{tag}/' + '{type_}/' + '{organism}_{tag}.fai'.format(**locals()) + ) + + d[organism][tag] = e return d, conversion_kwargs diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile index 0bb6f00a0..291587ffd 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -398,9 +398,9 @@ checkpoint genome_index: input: '{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta' output: - protected('{references_dir}/{organism}/{tag}/genome/faidx/{organism}_{tag}.fai') + protected('{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fai') log: - '{references_dir}/logs/{organism}/{tag}/genome/faidx/{organism}_{tag}.fai.log' + '{references_dir}/logs/{organism}/{tag}/genome/{organism}_{tag}.fai.log' resources: runtime=hours(1), mem_mb=gb(4) @@ -419,9 +419,7 @@ rule known_variation: create a known variation vcf """ input: - #fai='{references_dir}/{organism}/{tag}/genome/faidx/{organism}_{tag}.fai' - # can't do it this way since the {tag} wildcard is not congruous between input and output - fai = lambda w: checkpoints.genome_index.get(**w).output[0] + fai='{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fai' output: protected('{references_dir}/{organism}/{tag}/known/{organism}_{tag}.vcf.gz') log: From b9e7a6fcf817045dbc3fc5f13171549e6c097a6c Mon Sep 17 00:00:00 2001 From: daler Date: Thu, 15 Jun 2023 15:33:14 -0400 Subject: [PATCH 28/59] clean up known variation rule - add info about other species - point out human-specific parts - improve comments - more canonical code --- workflows/references/Snakefile | 84 +++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 38 deletions(-) diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile index 
291587ffd..0f956d855 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -429,28 +429,38 @@ rule known_variation: mem_mb=gb(16), disk_mb=gb(32) run: - # Get the configuration options in the metadata chunk using wildcards release = int(config['references'][wildcards.organism][wildcards.tag]['metadata']['release']) species = config['references'][wildcards.organism][wildcards.tag]['metadata']['species'] build = config['references'][wildcards.organism][wildcards.tag]['metadata']['build'] typ = config['references'][wildcards.organism][wildcards.tag]['known']['type'] - def get_contigs(): - with open(input[0], 'r') as fai: - ser = pd.read_table(fai, header=None, usecols=[0], dtype=str) - ser = ser.squeeze() - ser = ser[ser.apply(lambda x: len(x)) <= 2] - return ser - contigs = get_contigs() - branch="" + + # ---------------------------------------------------------------------- + # NOTE: species-specific configuration may be required + # ---------------------------------------------------------------------- + # + # Ensembl has many species available, but the code below is designed to + # work for human. 
+ # + # For available species, see https://ftp.ensembl.org/pub/release-109/variation/vcf/ + # + + # Human-specific patching to deal with GRCh37 filenames and directory + # structure on Ensembl FTP + branch = "" if release >= 81 and build == "GRCh37": branch="grch37/" if typ == "all": + suffixes = [""] + + # Starting in release 93, human germline VCFs are split by chrom if species == "homo_sapiens" and release >= 93: - suffixes = [ - "-chr{}".format(chrom) for chrom in contigs - ] - else: - suffixes = [""] + ser = ( + pd.read_table(input.fai, header=None, usecols=[0], dtype=str) + .squeeze() + ) + contigs = ser[ser.apply(lambda x: len(x)) <= 2] + suffixes = expand("-chr{chrom}", chrom=contigs) + elif typ == "somatic": suffixes=["_somatic"] elif typ == "structural_variations": @@ -459,39 +469,37 @@ rule known_variation: release = int(release) build = build typ = typ - species_filename=species if release >= 91 else species.capitalize() - urls=[ - "ftp://ftp.ensembl.org/pub/{branch}release-{release}/variation/vcf/{species}/{species_filename}{suffix}.{ext}".format( - release=release, - species=species, - suffix=suffix, - species_filename=species_filename, - branch=branch, - ext=ext, - ) - for suffix in suffixes - for ext in ["vcf.gz", "vcf.gz.csi"] - ] - names=[os.path.basename(url) for url in urls if url.endswith(".gz")] - - # Compose a command that downloads all vcf.gz files for all contigs in use + + # Prior to release 91, species names started with a capital letter + species_filename = species if release >= 91 else species.capitalize() + + urls = expand( + "ftp://ftp.ensembl.org/pub/{branch}release-{release}/variation/vcf/{species}/{species_filename}{suffix}.{ext}", + release=release, + species=species, + suffix=suffixes, + species_filename=species_filename, + branch=branch, + ext=["vcf.gz", "vcf.gz.csi"] + ) + vcfs = [os.path.basename(url) for url in urls if url.endswith(".gz")] + + # Compose a command that downloads all vcf.gz files (possibly one for + # each 
contig in human Ensembl releases >=93) gather = "curl {urls}".format(urls=" ".join(map("-O {}".format, urls))) - # Absolute paths needed because we'll be downloading in a temp dir + # We'll be downloading in a temp dir, so get absolute paths to make things easier fai = os.path.abspath(input.fai) log = os.path.abspath(str(log)) with tempfile.TemporaryDirectory() as tmpdir: if input.get("fai"): shell( - "( cd {tmpdir}; " - "{gather} && " - "bcftools concat -Oz --naive-force {names} " - "> concat.vcf.gz 2> {log} )" + "cd {tmpdir}; {gather} 2> {log} " + "&& bcftools concat -Oz --naive-force {vcfs} > concat.vcf.gz 2>> {log}" ) shell( - "bcftools reheader --fai {fai} {tmpdir}/concat.vcf.gz " - "> {output} 2>> {log} && " - "tabix -p vcf {output} 2>> {log} " + "bcftools reheader --fai {fai} {tmpdir}/concat.vcf.gz> {output} 2>> {log} " + "&& tabix -p vcf {output} 2>> {log} " ) #if config['references']['human'].get('variation'): From 0b5a9b5d160a06ceba4b4d7b8f0c2d92a0d43586 Mon Sep 17 00:00:00 2001 From: daler Date: Thu, 15 Jun 2023 16:18:05 -0400 Subject: [PATCH 29/59] add back in the original references config --- workflows/references/config/config.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 workflows/references/config/config.yaml diff --git a/workflows/references/config/config.yaml b/workflows/references/config/config.yaml new file mode 100644 index 000000000..49618dcd0 --- /dev/null +++ b/workflows/references/config/config.yaml @@ -0,0 +1,6 @@ +references_dir: 'references_dir' + +# See the reference config files in the top level of the repo, +# include/reference_configs, for inspiration for more species. 
+include_references: + - '../../include/reference_configs/test.yaml' From fd3c58e5e2441b719b35e3e0fd5a6fee7f908914 Mon Sep 17 00:00:00 2001 From: daler Date: Thu, 15 Jun 2023 16:18:22 -0400 Subject: [PATCH 30/59] split variant-calling and non-variant-calling configs --- .circleci/config.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5d423b029..e7ca4c5e5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -226,8 +226,14 @@ variables: command: | source /opt/mambaforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV + # RNA-seq/ChIP-seq references $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=config/config.yaml -j2 -p -r -k --orig $ORIG + # Variant-calling references + $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=$ORIG/test/test_configs/variant-calling.yaml -j2 -p -r -k --orig $ORIG + + + # -------------------------------------------------------------------------- # Standard RNA-seq workflow rnaseq-step: &rnaseq-step From f090acb4905d89166597e46f9b59e4397edbfe4c Mon Sep 17 00:00:00 2001 From: fridells51 Date: Sat, 17 Jun 2023 11:31:09 -0400 Subject: [PATCH 31/59] copying vc docs rst --- docs/variant-calling.rst | 647 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 647 insertions(+) create mode 100644 docs/variant-calling.rst diff --git a/docs/variant-calling.rst b/docs/variant-calling.rst new file mode 100644 index 000000000..a80cb705b --- /dev/null +++ b/docs/variant-calling.rst @@ -0,0 +1,647 @@ +.. _Overview: + +LCDB-WF Variant Calling Overview +================================ + +This Snakemake workflow handles detection and annotation of germline and +somatic variants in DNA sequencing reads. Analysis-ready VCFs are generated by +following a GATK best practices workflow. Annotations are attached from +variation databases like dbNSFP using SnpEff. 
support other organisms. However, due to limited reference availability in other
organisms, some of the features of the workflow may not be available.
+- BWA Index Files + * These index files are generated by ``bwa index`` + * The references workflow will build these files, but if the references + workflow is not used (see `configuring references `__), these + files will be generated by the Snakefile at workflow execution. +- Sequence Dictionary + * Generated by ``picard CreateSequenceDictionary`` and used by several GATK + commands. +- Fasta Index + * Generated by ``samtools faidx`` and used by the workflow to establish + contigs for joint-calling. +- Variation Databases + * These are (usually) VCFs that contain variant annotations that are used + by SnpEff for annotating VCFs in the workflow. + * the dbNSFP database may be downloaded by the references workflow or you + can download it yourself (see `Downloading dbNSFP `__). It + contains a comprehensive set of annotations for humans. This file can be + provided externally if you are not using the references workflow (see the + section on `Configuring Annotation Databases `__) + * These are also not required and we can still run annotation without the + file present (we just won’t have all the fancy annotations that these + databases provide) + +.. _Mapping: + + +Mapping +------- + +First, adapters are trimmed from the reads using ``cutadapt``. After that, +reads are mapped to the genome using ``bwa mem``. + +After bams are generated, we need to mark and delete duplicates using ``picard +MarkDuplicates``. Duplicate reads are uninformative for variant calling, and +deleting them will speed up the workflow. + +Once duplicates are removed, we recalibrate base quality score using GATK +``BaseRecalibrator`` and ``ApplyBQSR`` provided that a known variation file is +present and the config option for base recalibration is set (see `filtering and +processing `__). + + + +.. _Calling: + +Calling Variants +---------------- + +The workflow contains support for both germline and somatic variant calling. 
+ +For germline variants, this workflow takes advantage of Snakemake +parallelization by splitting up variant calling into regions. By default, these +regions are defined by the contigs identified in the fasta index file. + +However, you can provide a .bed file with regions that you want to restrict +variant calling to such as in a targeted exon sequencing project (see +`Processing `__). + +Once regions are established, variants are called with GATK ``HaplotypeCaller`` +to produce a `GVCF +`_ +for every sample for every config. The GVCFs from each sample are combined for +each contig to produce GVCFs that contain variants from ALL samples for each +contig. Finally, GVCFs are genotyped and merged to produce a single VCF +containing mutations in all samples for all contigs. + +Somatic variant calling roughly follows the same procedures as above, however +a bit more configuration is involved. See the section on `Somatic Variant +Calling with Mutect2 `__. Additionally, for filtering somatic variants +we run `LROM +`_ +to help estimate artifacts produced by read orientation bias. + + + +.. _Annotation: + +Annotation +---------- + +Annotating variants is the process of looking up a variant in a database and +adding a description or metric about that variant to a section in the ``INFO`` +column of a VCF file. `dbNSFP `_ +is a popular database that contains annotations from hundreds of sources +compiled into a single VCF. See the section on `Configuring Annotation +Databases `__ for information on how to use dbNSFP and other +variation databases. + +Annotations are attached using `SnpEff and SnpSift +`_. These are also useful tools for +downstream analysis, which is not included in the workflow, but discussed +`here `__ + +Finally, we run ``bcftools norm`` on all final VCFs in order to split +multi-allelic variant sites into a series of SNPs. + + + +.. _QC: + +QC +-- + +The workflow generates QC metrics at several points and aggregates them using +`MultiQC `_. 
These include metrics from: + +- ``samtools stats`` +- ``picard MarkDuplicates`` +- ``FastQC`` +- ``SnpEff`` Variant annotation summary statistics + +MultiQC reads from all of these sources and generates an interactive html +document that allows you to visualize and explore QC data. + + + +.. _Configuration: + +Configuration and config.yaml +============================= + +The config file, ``config.yaml``, is the main point of communication between +the user and the Snakefile that the workflow executes. Users should always +approach this file first before editing the Snakefile. This is how we pass +paths to reference files, arguments to rules in the Snakefile, and configure +patterns for somatic variant calling. Each section in the ``config.yaml`` is +documented here. + + + +.. _samples: + +Input +----- + +In order to pass sequencing data to the workflow, samples need to be configured +into two tables, ``units.tsv`` and ``samples.tsv`` + +``units.tsv`` is configured like this: + + +========= ========================== =================== ================ ============================================ +sample unit platform fq1 fq2 +========= ========================== =================== ================ ============================================ +Sample ID Technical replicate number Sequencing platform Path to R1 fastq Path to R2 fastq (if using paired-end reads) +========= ========================== =================== ================ ============================================ + + +Note: + +- sample names needn’t be unique, but no sample-unit combination can be the + same. Increment the unit starting from 1 for identical sample names to + represent technical replicates. A sampletable with no technical replicates + should have the value for unit set to 1 for every sample. +- platform is used to attach read groups. It should be the sequencing platform + of your data, for example, "Illumina." 
+- fq1 and fq2 represent the R1 reads and R2 reads from a paired-end sample. If
+  your samples are not paired-end, then leave the value in the fq2 column empty.
_confref: + +Configuring References in config.yaml +------------------------------------- + +The ``ref:`` section of the config.yaml is how we determine the patterns of the +reference files generated by the references workflow or pass the paths of +external references to the workflow. + +If you intend on using the LCDB-WF references workflow, set +``use_references_workflow: true``. The Snakefile reads this argument and +determines which references to use. At the bottom of the config file is an +``include_references:`` key that must point to the reference config from the +LCDB-WF references workflow that you wish to use. The structure of the +reference config that you include is like so: + +.. code-block:: yaml + + references: + organism: + tag: + type: + +It is crucial that the values for organism and tag in the ``ref:`` block of +``config.yaml`` match exactly to the reference config. The ``aligner:`` and +``faidx`` sub-fields must also match what is found in the ``indexes:`` subfield +of the ``genome:`` “type” from the above yaml structure in the reference +config. + +For example, if the reference config looks like this: + +.. code-block:: yaml + + references: + human: + ensembl-104: + genome: + url: ‘dummy.fasta.download.url’ + indexes: + - ‘bwa’ + - ‘faidx’ + +Then we will configure the ``config.yaml`` like this: + +.. code-block:: yaml + + ref: + use_references_workflow: true + organism: ‘human’ + genome: + tag: ‘ensembl-104’ + build: 'GRCh38' + aligner: + index: ‘bwa’ + tag: ‘ensembl-104’ + faidx: + index: ‘faidx’ + tag: ‘ensembl-104’ + +The ``build:`` key in the ``genome:`` block must match what is provided in the +metadata of the reference config. + +For variation databases, the versioning on the database used is controlled by +a key-value provided to the reference config under the ``variation:`` “tag”. +This value must match what is given in the ``config.yaml``. 
You must also
+supply the build of your genome to the reference config and it has to match the
+build you are using for your reference genome. This is because we need to
+process the dbNSFP file in the references workflow to make it compatible with
+older genomes. To mirror the example above, if this is what is in our reference
+config:
+
+.. code-block:: yaml
+
+    references:
+      human:
+        known:
+          type: 'all'
+
+        variation:
+          dbnsfp:
+            version: 'dbNSFPv4.4'
+            url: ‘dummy.download.dbnsfp.url’
+            build: 'GRCh38'
+
+Then we will configure the ``config.yaml`` like this:
+
+.. code-block:: yaml
+
+    ref:
+      use_references_workflow: true
+      organism: ‘human’
+
+      variation:
+        known: ‘known’
+        dbnsfp: ‘dbNSFPv4.4’
+
+The workflow supports the option to supply the ``known:`` and ``dbnsfp:`` keys
+with an ABSOLUTE path to a file you have locally. The path MUST start with
+``"/"``. Make sure to read on below to see what sort of processing and
+modifications must be made to these files in order for the workflow to use
+them.
+
+Note that in the reference config, the ``type:`` field under ``known:``
+corresponds to the type of known variation file you wish to download. Available
+options are ‘somatic’, ‘structural_variation’, or ‘all’. At the moment, this
+workflow does not support structural variant calling. Providing ‘all’ to this
+field will download germline known variation for all chromosomes. If your
+organism does not have a variation database, then simply leave these fields
+empty in the variant calling workflow config, and the workflow will run without
+them.
+
+If you are providing your own references to the workflow, set them in the paths
+section of the ref block:
+
+.. 
code-block:: yaml + + ref: + paths: + ref: path/to/known/genome/fasta + known: path/to/known/variation/file + index: path/to/fasta/index + dbnsfp: path/to/dbnsfp/file + +If you are providing a known variation file that contains `IUPAC-coded alleles +`_, then you must run +``rust-bio-tools`` on your variation file and provide this output as your known +variation file: + +.. code-block:: bash + + rbt vcf-fix-iupac-alleles < input | bcftools view -Oz > known-variation.vcf.gz + + +Of these, the only file that you absolutely must provide always is the genome +fasta. NEVER provide the workflow with a "top level" assembly of the genome +when working with human data. This contains a lot of unaligned contigs and +haplotypes and BWA will not be able to map your reads to the genome. The +workflow accepts both compressed and uncompressed fasta files. Some notes on +references such as the known variation, dbNSFP, BWA index, and fasta index: + +- If you cannot provide the files in the config above, just leave their values + empty in the ``config.yaml`` field. + + * If the known variation file is absent, we will not be able to recalibrate + bases and our bam file processing ends at marking and deleting + duplicates. + * If the dbNSFP file is absent, we will not able to run SnpSift to attach + fields from the dbNSFP file (see `Configuring Annotation Databases + `__), but we will still be able to run SnpEff. + * If the BWA index is not used from the LCDB-WF references workflow, then + it will be created in the Snakefile in the same directory as the provided + reference + * If the fasta index is not provided, it will be created in the same + directory as the provided reference during the workflow. + + +.. _dldbnsfp: + +Downloading dbNSFP +------------------ + +To download the dbNSFP file grab the Box link from the `dbNSFP website +`_. Alternatively, you can use +the `ftp server `_. 
You can grab +the file from here, but the zips are really big, and if you click the link you +will open the ftp server on your machine in finder. You can get the file name +of the file you want and fill in the string like this: + +.. code-block:: bash + + version= + wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFP${version}.zip + # Alternatively, wget + +The most recent version may not be on the ftp server. Watch your disk space +when downloading and processing this file, it will use hundreds of gigabytes. +You may want to do this from a shell script writing to a tmp dir. Do not +attempt to download this on a node that is not suitable for heavy computation + +Pay attention to the comments in the code block. The process differs for +GRCh38/hg38 genomes and GRCh37/hg19 genomes. Recent versions of dbNSFP (v3.X +and later) switched to GRCh38 coordinates. You can still use these later +versions for GRCh37 coordinates, but you have to process the files differently. + +.. code-block:: bash + + unzip + zcat dbNSFP*_variant.chr1* | head -n1 > h + # head works if you run this command line by line, but if you are setting "set -euo pipefail" head may cause the pipe to exit. Alternatively, you can use pipe to awk "NR <= 1" > h + + # For GRCh38/hg38 data (include the genome version somewhere in your output file name for clarity): + zgrep -v "^#chr" dbNSFP*_variant.chr* | sort -k1,1 -k2,2n - | cat h - | bgzip -c > + tabix -s 1 -b 2 -e 2 + + # For GRCh37/hg19 data: + zgrep -h -v "^#chr" dbNSFP*_variant.chr* | awk '$8 != "." ' | sort -k8,8 -k9,9n - | cat h - | bgzip -c > + tabix -s 8 -b 9 -e 9 + +It is highly recommended to run ``sort`` with the ``--parallel=N`` option on +a node with multiple CPUs as this will take a very long time. See the `sort man +page `_ for details on how +to properly parallelize. + + +.. 
_conforg: + +Support for Non-Human Organisms +------------------------------- + +If you wish to call variants on a non-human organism, then you will have to +supply the references yourself following the directions above, or edit the +reference config for your organism in the top level of the cloned LCDB-WF +directory in ``include/reference_configs/``. You can add to the config: +``variant-calling.yaml`` reference config if you are using an Ensembl genome. +The tag should be configured as “ensembl-release”. ONLY include the variation +block in the reference config if your organism has support for an annotation +variation database like dbNSFP Downloading the known variation file in the +references workflow relies on the build, release, and species values specified +in the ``metadata:`` block. + +WARNING: The workflow relies heavily on references downloaded from ensembl, +mainly due to Ensembl housing known variation files. You will run into issues +if your references have a "chr" prefix for chromosome nomenclature. Check the +``get_contigs()`` method in ``lib/helpers.smk`` (from the top-level directory +of LCDB-WF) and see if this will cause any issues with your organism. + + + +.. _fp: + +Filtering and Processing +------------------------ + +There are several filtering and processing configurations that can be provided +to the workflow in the ``processing:`` and ``filtering:`` sections of +``config.yaml``. + +In ``processing:`` there are several options: + +- ``remove-duplicates:`` if ``true`` is set for this option, we remove + duplicates with ``picard MarkDuplicates`` (you should do this unless you are + interested in duplicated reads in your data) +- ``remove-mitochondrial:`` if ``true`` is set for this option, then + mitochondrial contigs will be ignored for variant calling (default) +- ``pcr:`` This option takes a string according to `GATK PCR INDEL model docs + `_. 
+ The default option in the workflow is to not specify this argument, but if + you know that your data is PCR-free sequencing data then you should set this + option to ‘NONE’ for the most specific results. +- ``restrict-regions:`` Supply a capture file in .bed format. Commonly used for + whole exome sequencing (WES) and targeted exome sequencing. + + * ``region-padding:`` Increase the size of the regions specified in the + .bed file by a flat amount of base pairs. Leave both of these options + blank if you do not have a .bed file. + +In ``filtering:`` we have these options: + +- ``bqsr:`` This option controls whether or not we run `GATK base recalibrator + `_. + Basically, if we are able to, we should. What defines if we are able to is + whether or not the known variation file is available for the organism we are + calling variants on. Turning this option to false means that our bam + processing ends after deleting duplicates. +- ``hard:`` This is the hard-filtering option as outlined in `GATK filtering + docs + `_. + These are basic, quality-based filters that remove low-confidence or + low-quality variants from our callset. + + * ``snvs:`` and ``indels:`` We split the variants into SNPs and INDELS + because they require different hard-filtering options. The filter values + are read from these options and can be adjusted. What is present by + default is GATK’s standard recommendation. + + + +.. _confdbnsfp: + +Configuring Annotation Databases in config.yaml +----------------------------------------------- + +The `SnpEff and SnpSift docs `_ are a very +helpful resource for running SnpEff and SnpSift. + +In the ``snpeff:`` section of ``config.yaml`` you can configure arguments that +are passed to the annotation rules. + + +- ``somatic:`` This is how we configure SnpEff summary output for MultiQC to + aggregate the correct output files for its report. 
Set this value to ``true`` + if you are calling somatic variants +- ``germline:`` Same as ``somatic:`` but set to ``true`` if you are calling + germline variants. +- ``genome:`` For help with this value, see SnpEff’s `docs + `_ for which database to use. +- ``annotations:`` This key is an optional configuration that allows you to + specify which fields from dbNSFP you would like to attach to your VCF. + Leaving this field blank will attach ALL fields from dbNSFP (of which there + are many, so it will save you time and computation to specify your fields). + The value given to this key is a comma-separated string (WITHOUT WHITESPACE!) + of the names of the columns in the dbNSFP file you would like to attach. For + example, to get FATHMM and SIFT pathogenic predictions, you would configure + like this: ``annotations: ‘FATHMM_pred,SIFT_pred’`` + + + +.. _mutect: + +Somatic Variant Calling with Mutect2 +------------------------------------- + +When talking about somatic calling, we say “tumor” and “normal” as being the two +components of a Mutect2 contrast. In reality, the relationship does not have to +be tumor and normal tissue. However, the “tumor” and “normal” samples should +come from the same organism or from very genetically similar organisms. In +somatic calling, we are trying to identify regions where the “tumor” sample +differs from the “normal” sample. + +Directly above the ``mutect2:`` section is ``PON:``. PON stands for panel of +normals, and is a file that is very similar in use to a known variation file. +It is a file made from “normal” samples believed to be free from somatic +alterations. Its purpose is to help identify technical artifacts. You can read +more about panel of normals `here +`_. +This file is optional and leaving it blank will not affect our ability to call +somatic variants. + +In the ``mutect2:`` section of the ``config.yaml`` there is (importantly) only +one option. 
In the default config, this single option is ``tumor-normal:`` This +is how we will tell Mutect2 which samples should be analyzed together and which +of those are “normals” or “tumors”. However, this key need not be named +“tumor-normal”. Below is an example configuration: + +.. code-block:: yaml + + mutect2: + patient-1: + tumor: + - ‘p1_tumor’ + normal: + - ‘p1_normal’ + patient-2: + tumor: + - ‘p2_tumor’ + - ‘p2_tumor_metastasis’ + normal: + - ‘p2_normal’ + +The keys one level below ``mutect2:`` correspond to the name of the comparisons +we are making. This is how we establish wildcards in the Snakefile for the +Mutect2 rule. However, the keys inside of the comparison name MUST be named +``tumor:`` and ``normal:``. Their values should be formatted as a list in yaml. +Each value under ``tumor:`` or ``normal:`` corresponds to the sample name found +in the `sampletable `__. The workflow handles combining technical +replicates, so you only have to provide the sample name. + + + +.. _down: + +Downstream Analysis +=================== + + +.. _sliv: + +Slivar for Rare Disease or Pedigrees +------------------------------------ + +Several projects in the past have focused on calling variants on pedigree or +trio data. This is a common archetype in investigating inheritance patterns of +rare diseases. `Slivar `_ is a set of +command-line tools that query and filter VCF file in order to investigate +non-Mendelian and Mendelian inheritance patterns in pedigree data. One of the +most powerful aspects of this tool is the ability to create custom filtering +patterns using basic boolean logic. + + +.. _patho: + +Filtering Annotations +--------------------- + +One of the benefits of SnpSift is the ability to filter your VCF files based on +what annotations are attached. The `docs page +`_ on SnpSift filter has great +documentation on using this. 
Similar to Slivar, you can use arbitrary +expressions and access fields in the VCF such as quality (``QUAL``) and +genotype (``GEN[]``) to enhance your annotation +filters. + +For example, to get all variants marked as “Damaging” by the FATHMM pathogenic +prediction annotation from dbNSFP, you can run this on your annotated vcf: + +``SnpSift filter “( dbNSFP_FATHMM_pred = 'D' )" input.vcf > output.vcf`` + From 44db9d437c4fa53f6ab44c5f2f9c2db317ae99ef Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Fri, 8 Nov 2024 11:40:26 -0500 Subject: [PATCH 32/59] Add function to return top level directory Added a helper function that recursively searches the directory structure and returns the path to the top level directory in an lcdb-wf repo --- lib/helpers.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/lib/helpers.py b/lib/helpers.py index 053bca2b1..651a9c139 100644 --- a/lib/helpers.py +++ b/lib/helpers.py @@ -5,6 +5,7 @@ from snakemake.shell import shell from snakemake.io import expand, regex from lib import common +import os class ConfigurationError(Exception): @@ -203,3 +204,22 @@ def strand_arg_lookup(config, lookup): keys = list(lookup.keys()) raise KeyError(f"'{config.stranded}' not one of {keys}") return lookup[config.stranded] + +def get_top_level_dir(start_dir=None): + # Start from the specified directory or current working directory if none is given + current_dir = os.path.abspath(start_dir or os.getcwd()) + # Search current directory and above for targets + while True: + # Check if the target directories exists in the current directory + if (os.path.isdir(os.path.join(current_dir, ".git")) and os.path.isdir(os.path.join(current_dir, "workflows"))): + return current_dir + # Move up one level + parent_dir = os.path.dirname(current_dir) + # Stop if we've reached the root directory + if current_dir == parent_dir: + break + current_dir = parent_dir + #TODO: Check for other edge cases? 
+ + return None + From 2751abf87610d951d705d43cd39e8f061dd7a1b0 Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Fri, 8 Nov 2024 11:42:48 -0500 Subject: [PATCH 33/59] Account for different 'layout's in sampletable -Implemented changes to account for 'LibraryLayout', 'Layout' and 'layout' formats in sampletable. -Refactored the test suite and added pytest tests to check for correctness --- lib/common.py | 15 +++++++++++---- {lib => test/tests}/test_suite.py | 23 ++++++++++++++++++++--- 2 files changed, 31 insertions(+), 7 deletions(-) rename {lib => test/tests}/test_suite.py (68%) diff --git a/lib/common.py b/lib/common.py index 829cc1298..822f5640a 100644 --- a/lib/common.py +++ b/lib/common.py @@ -15,7 +15,7 @@ from lib import utils from snakemake.shell import shell from snakemake.io import expand - +from . import helpers # List of possible keys in config that are to be interpreted as paths PATH_KEYS = [ 'references_dir', @@ -616,13 +616,16 @@ def get_techreps(sampletable, label): return result -def load_config(config, missing_references_ok=False): +def load_config(config, missing_references_ok=False, test=False): """ Loads the config. Resolves any included references directories/files and runs the deprecation handler. """ + # If this is a test, then change the current working directory to the rnaseq workflow level + if test: + os.chdir(os.path.join(helpers.get_top_level_dir(), "workflows","rnaseq")) if isinstance(config, str): config = yaml.load(open(config), Loader=yaml.FullLoader) @@ -728,7 +731,7 @@ def is_paired_end(sampletable, sample): if "Run" in sampletable.columns: if all(sampletable["Run"].str.startswith("SRR")): - if "Layout" not in sampletable.columns and "layout" not in sampletable.columns: + if "Layout" not in sampletable.columns and "layout" not in sampletable.columns and "LibraryLayout" not in sampletable.columns: raise ValueError( "Sampletable appears to be SRA, but no 'Layout' column " "found. 
This is required to specify single- or paired-end " @@ -737,7 +740,7 @@ def is_paired_end(sampletable, sample): row = sampletable.set_index(sampletable.columns[0]).loc[sample] if 'orig_filename_R2' in row: return True - if 'layout' in row and 'LibraryLayout' in row: + if 'layout' in row and 'LibraryLayout' in row or 'Layout' in row and 'LibraryLayout' in row: raise ValueError("Expecting column 'layout' or 'LibraryLayout', " "not both") try: @@ -748,6 +751,10 @@ def is_paired_end(sampletable, sample): return row['LibraryLayout'].lower() in ['pe', 'paired'] except KeyError: pass + try: + return row['Layout'].lower() in ['pe', 'paired'] + except KeyError: + pass return False diff --git a/lib/test_suite.py b/test/tests/test_suite.py similarity index 68% rename from lib/test_suite.py rename to test/tests/test_suite.py index 21b9c0527..b1389f037 100644 --- a/lib/test_suite.py +++ b/test/tests/test_suite.py @@ -1,8 +1,25 @@ -import os -import pprint +import sys +import subprocess +top_level_dir = subprocess.run(["dirname $(dirname $(pwd))"], shell=True, capture_output=True, text=True).stdout.strip() +print("top level dir: ", top_level_dir) +sys.path.insert(0, top_level_dir) +import pytest from textwrap import dedent -from . 
import common +from lib import common, helpers, patterns_targets +# Make config object that can be re-used for any test +@pytest.fixture +def config(request): + config_path = request.param + config = common.load_config(config_path, test=True) + return patterns_targets.RNASeqConfig(config, config.get('patterns', '../workflows/rnaseq/config/rnaseq_patterns.yaml')) + +# Call helpers.detect_layout(), which implicitly tests common.is_paired_end() +# TODO: Make assertion condition NOT hard coded in to work with current example table +@pytest.mark.parametrize("config", ['../../workflows/rnaseq/config/config.yaml'], indirect=True) +def test_is_paired_end(config): + is_paired = helpers.detect_layout(config.sampletable) == 'PE' + assert not is_paired, f"Test failed, is_paired = {is_paired}" def test_config_loading(tmpdir): f0 = tmpdir.mkdir('subdir').join('file0.yaml') From 385ac92bbb5da309fd70cf072e05386e42ad194a Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Fri, 8 Nov 2024 12:20:33 -0500 Subject: [PATCH 34/59] Fix import bug Changed import statement for helpers package that was causing an error --- lib/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/common.py b/lib/common.py index 822f5640a..a625086b6 100644 --- a/lib/common.py +++ b/lib/common.py @@ -15,7 +15,7 @@ from lib import utils from snakemake.shell import shell from snakemake.io import expand -from . 
import helpers +from lib import helpers # List of possible keys in config that are to be interpreted as paths PATH_KEYS = [ 'references_dir', From c88cb6dd8c2fb4acb419ab81aa4234c4fce9149c Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Wed, 13 Nov 2024 10:36:10 -0500 Subject: [PATCH 35/59] Improve sampletable layout column detection Improve the way the layout column is checked and remove the test parameter and functionality in load_config as it has become clear that implementing unit tests will require its own issues and resolutions --- lib/common.py | 39 ++++++++++----------------------------- 1 file changed, 10 insertions(+), 29 deletions(-) diff --git a/lib/common.py b/lib/common.py index a625086b6..447321be3 100644 --- a/lib/common.py +++ b/lib/common.py @@ -16,6 +16,8 @@ from snakemake.shell import shell from snakemake.io import expand from lib import helpers +from pathlib import Path + # List of possible keys in config that are to be interpreted as paths PATH_KEYS = [ 'references_dir', @@ -616,18 +618,13 @@ def get_techreps(sampletable, label): return result -def load_config(config, missing_references_ok=False, test=False): +def load_config(config, missing_references_ok=False): """ Loads the config. Resolves any included references directories/files and runs the deprecation handler. """ - # If this is a test, then change the current working directory to the rnaseq workflow level - if test: - os.chdir(os.path.join(helpers.get_top_level_dir(), "workflows","rnaseq")) - if isinstance(config, str): - config = yaml.load(open(config), Loader=yaml.FullLoader) # Here we populate a list of reference sections. Items later on the list # will have higher priority @@ -725,34 +722,18 @@ def is_paired_end(sampletable, sample): # We can't fall back to detecting PE based on two fastq files provided for # each sample when it's an SRA sampletable (which only has SRR accessions). 
# - # So detect first detect if SRA sampletable based on presence of "Run" - # column and all values of that column starting with "SRR", and then raise - # an error if the Layout column does not exist. - - if "Run" in sampletable.columns: - if all(sampletable["Run"].str.startswith("SRR")): - if "Layout" not in sampletable.columns and "layout" not in sampletable.columns and "LibraryLayout" not in sampletable.columns: - raise ValueError( - "Sampletable appears to be SRA, but no 'Layout' column " - "found. This is required to specify single- or paired-end " - "libraries.") + # So instead first detect if there is in fact a second fastq file listed, + # and if not then check if the layout of the library is listed row = sampletable.set_index(sampletable.columns[0]).loc[sample] if 'orig_filename_R2' in row: return True - if 'layout' in row and 'LibraryLayout' in row or 'Layout' in row and 'LibraryLayout' in row: - raise ValueError("Expecting column 'layout' or 'LibraryLayout', " - "not both") - try: - return row['layout'].lower() in ['pe', 'paired'] - except KeyError: - pass - try: - return row['LibraryLayout'].lower() in ['pe', 'paired'] - except KeyError: - pass + layout_columns = set(sampletable.columns).intersection(['layout', 'LibraryLayout', 'Layout']) + if len(layout_columns) != 1: + raise ValueError("Expected exactly one of ['layout', 'LibraryLayout', 'Layout'] in sample table") + layout_column = list(layout_columns)[0] try: - return row['Layout'].lower() in ['pe', 'paired'] + return row[layout_column].lower() in ['pe', 'paired'] except KeyError: pass return False From 6c81cc15f9ee304122e34102d54765827ef31c64 Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Wed, 13 Nov 2024 10:51:23 -0500 Subject: [PATCH 36/59] Improve SRA table checking Make sure the table is in fact an SRA sampletable before checking the layout --- lib/common.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/lib/common.py b/lib/common.py index 
447321be3..08959dc1c 100644 --- a/lib/common.py +++ b/lib/common.py @@ -728,14 +728,16 @@ def is_paired_end(sampletable, sample): row = sampletable.set_index(sampletable.columns[0]).loc[sample] if 'orig_filename_R2' in row: return True - layout_columns = set(sampletable.columns).intersection(['layout', 'LibraryLayout', 'Layout']) - if len(layout_columns) != 1: - raise ValueError("Expected exactly one of ['layout', 'LibraryLayout', 'Layout'] in sample table") - layout_column = list(layout_columns)[0] - try: - return row[layout_column].lower() in ['pe', 'paired'] - except KeyError: - pass + if "Run" in sampletable.columns: + if all(sampletable["Run"].str.startswith("SRR")): + layout_columns = set(sampletable.columns).intersection(['layout', 'LibraryLayout', 'Layout']) + if len(layout_columns) != 1: + raise ValueError("Expected exactly one of ['layout', 'LibraryLayout', 'Layout'] in sample table") + layout_column = list(layout_columns)[0] + try: + return row[layout_column].lower() in ['pe', 'paired'] + except KeyError: + pass return False From c519f408ce3bb01978df736ff86811520d5a89b8 Mon Sep 17 00:00:00 2001 From: Emma Smith Date: Wed, 22 Jan 2025 09:31:09 -0500 Subject: [PATCH 37/59] match string format across rules --- workflows/variant-calling/Snakefile | 49 ++++++++++++++--------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile index 0cc4be94f..9a85c2d37 100644 --- a/workflows/variant-calling/Snakefile +++ b/workflows/variant-calling/Snakefile @@ -70,9 +70,9 @@ rule fasta_dict: run: java_opts = set_java_opts(resources) shell( - 'picard CreateSequenceDictionary {java_opts} \ - -R {input.ref} \ - -O {output} &> {log} ' + 'picard CreateSequenceDictionary {java_opts} ' + '-R {input.ref} ' + '-O {output} &> {log} ' ) @@ -95,7 +95,7 @@ if not aln_index: params: "bwtsw " shell: - "bwa index -a {params}" + "bwa index -a {params} " "{input} " " &> {log}" @@ -216,11 +216,11 @@ if 
config["filtering"]['bqsr']: run: java_opts = set_java_opts(resources) shell( - 'gatk --java-options {java_opts} BaseRecalibrator \ - -R {input.ref} \ - -I {input.bam} \ - -O {output.recal_table} \ - --known-sites {input.known} 2> {log}' + 'gatk --java-options {java_opts} BaseRecalibrator ' + '-R {input.ref} ' + '-I {input.bam} ' + '-O {output.recal_table} ' + '--known-sites {input.known} 2> {log}' ) @@ -245,11 +245,11 @@ if config["filtering"]['bqsr']: run: java_opts = set_java_opts(resources) shell( - 'gatk --java-options {java_opts} ApplyBQSR \ - -R {input.ref} \ - -I {input.bam} \ - --bqsr-recal-file {input.recal_table} \ - -O {output.bam} 2> {log}' + 'gatk --java-options {java_opts} ApplyBQSR ' + '-R {input.ref} ' + '-I {input.bam} ' + '--bqsr-recal-file {input.recal_table} ' + '-O {output.bam} 2> {log}' ) @@ -347,12 +347,12 @@ rule call_variants: bams = [bams] bams = list(map("-I {}".format, bams)) shell( - 'gatk --java-options {java_opts} HaplotypeCaller {regions} \ - -R {input.ref} \ - {bams} \ - -ERC GVCF \ - {params.pcr} \ - -O {output.gvcf} {known} 2> {log}' + 'gatk --java-options {java_opts} HaplotypeCaller {regions} ' + '-R {input.ref} ' + '{bams} ' + '-ERC GVCF ' + '{params.pcr} ' + '-O {output.gvcf} {known} 2> {log} ' ) @@ -374,10 +374,10 @@ rule combine_calls: java_opts = set_java_opts(resources) gvcfs=list(map("-V {}".format, input.gvcfs)) shell( - 'gatk --java-options {java_opts} CombineGVCFs \ - {gvcfs} \ - -R {input.ref} \ - -O {output.gvcf} 2> {log} ' + 'gatk --java-options {java_opts} CombineGVCFs ' + '{gvcfs} ' + '-R {input.ref} ' + '-O {output.gvcf} 2> {log} ' ) @@ -935,5 +935,4 @@ rule snpeff_cancer: '| bcftools view -Oz > {output.vcf} 2> {log}' ) - # vim: ft=python From 7e952ec9e7aa803c45fc292854856910053402e2 Mon Sep 17 00:00:00 2001 From: Emma Smith Date: Wed, 22 Jan 2025 11:58:00 -0500 Subject: [PATCH 38/59] make snpeff rules require dictionary of input file --- lib/helpers.smk | 41 +++++++++++++++++++++++++++++ 
workflows/variant-calling/Snakefile | 4 +-- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/lib/helpers.smk b/lib/helpers.smk index de902d6e7..8c146ba80 100644 --- a/lib/helpers.smk +++ b/lib/helpers.smk @@ -343,4 +343,45 @@ def get_bed_nomenclature(input): return nom +def snpeff_input(wildcards): + """ + Ensure all correct input files for snpEff rule are available + """ + dbnsfp = ( + config['ref']['paths']['dbnsfp'] + if config['ref']['paths']['dbnsfp'] + else [] + ) + dbnsfp_tbi = dbnsfp + '.tbi' + vcf = 'results/filtered/all.normed.vcf.gz' + + # make dictionary containing the required snpEff input files + d = dict( + dbnsfp=dbnsfp, + dbnsfp_tbi=dbnsfp_tbi, + vcf=vcf, + ) + return d + + +def snpeff_cancer_input(wildcards): + """ + Ensure all correct input files for snpEff cancer rule are available + """ + dbnsfp = ( + config['ref']['paths']['dbnsfp'] + if config['ref']['paths']['dbnsfp'] + else [] + ) + dbnsfp_tbi = dbnsfp + '.tbi' + vcf = 'results/somatic_filtered/normed.{comp}.vcf.gz'.format(comp = wildcards.comp) + + # make dictionary containing the required snpEff cancer input files + d = dict( + dbnsfp=dbnsfp, + dbnsfp_tbi=dbnsfp_tbi, + vcf=vcf, + ) + return d + # vim: ft=python diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile index 9a85c2d37..c6850a37a 100644 --- a/workflows/variant-calling/Snakefile +++ b/workflows/variant-calling/Snakefile @@ -668,7 +668,7 @@ rule snpeff: # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(120) input: - vcf='results/filtered/all.normed.vcf.gz', + unpack(snpeff_input) log: 'logs/snpeff.log' output: ann='results/annotated/ann.vcf.gz', @@ -914,7 +914,7 @@ rule snpeff_cancer: # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(120) input: - vcf='results/somatic_filtered/normed.{comp}.vcf.gz', + unpack(snpeff_cancer_input), output: vcf='results/mutect2_annotated/snpeff.{comp}.vcf.gz', stats='results/qc/snpEff_{comp}_summary.csv', From 
b739523e47eb8c0d2b78de4b2c3559824e1a46a6 Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Tue, 19 Aug 2025 12:28:27 -0400 Subject: [PATCH 39/59] Update Snakefile and add script Updated the Snakefile to annotate and filter the VCF using the gnomAD database of variants and their associated allele frequencies, which for now has a hardcoded cutoff of .1 --- workflows/variant-calling/Snakefile | 85 +++++-- workflows/variant-calling/analyze_variants.py | 225 ++++++++++++++++++ 2 files changed, 293 insertions(+), 17 deletions(-) create mode 100644 workflows/variant-calling/analyze_variants.py diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile index c6850a37a..cf33eb93b 100644 --- a/workflows/variant-calling/Snakefile +++ b/workflows/variant-calling/Snakefile @@ -32,10 +32,14 @@ wildcard_constraints: rule all: input: "results/annotated/ann.vcf.gz", + 'results/annotated/filtered_ann.vcf.gz', + 'results/annotated/filtered_ann.vcf.gz.tbi', "results/qc/multiqc.html", "results/filtered/all.normed.vcf.gz", expand("results/somatic_filtered/normed.{comp}.vcf.gz", comp = config['mutect2'].keys()), expand("results/mutect2_annotated/snpeff.{comp}.vcf.gz", comp = config['mutect2'].keys()), + expand("results/merged/{sample}.bam", sample=samples.index), + expand("results/merged/{sample}.bam.bai", sample=samples.index), @@ -666,7 +670,7 @@ rule snpeff: disk_mb=gb(20), mem_mb=gb(16), # mem_mb=gb(4), # [ TEST SETTINGS -1 ] - runtime=autobump(120) + runtime=autobump(480) input: unpack(snpeff_input) log: 'logs/snpeff.log' @@ -688,23 +692,53 @@ rule snpeff: "{params.gen} {input.vcf} " "| bcftools view -Oz > {output.ann} 2> {log} " ) - dbnsfp_arg = [] - if dbnsfp: - dbnsfp_arg = "DbNsfp -db {}".format(dbnsfp) - if dbnsfp_arg: - sift_output = 'results/annotated/dbnsfp.ann.vcf.gz' - field_arg = ( - "-f '{}'".format(params.annotations) - if params.annotations - else '' - ) - shell( - "SnpSift {java_opts} " - "{dbnsfp_arg} " - "{field_arg} {output.ann} " - "| 
bcftools view -Oz > {sift_output} 2>> {log} " - ) + +rule snpsift_ann: + """ + Annotate variants with the gnomAD database + """ + resources: + disk_mb=gb(20), + mem_mb=gb(16), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(960) + threads: 6 + input: + ann="results/annotated/ann.vcf.gz", + gnomad="/data/NICHD-core0/references/human/variation/gnomAD/trimmed_gnomad.joint.v4.1.sites.all_chrom.vcf.bgz" + log: 'logs/snpsift_ann.log' + output: 'results/annotated/gnomad_ann.vcf.gz' + run: + java_opts = '''"-Xmx{}g"'''.format(int(resources.mem_mb * 0.75 /1024)) + + shell( + "SnpSift {java_opts} " + "annotate -noId " + "{input.gnomad} {input.ann}" + "| bcftools view -Oz > {output}" + ) + +rule snpsift_filter: + """ + Filter out common variants with SnpSift filter + """ + resources: + disk_mb=gb(20), + mem_mb=gb(16), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(960) + input: 'results/annotated/gnomad_ann.vcf.gz' + output: 'results/annotated/filtered_ann.vcf.gz' + log: 'logs/snpsift_filter.log' + + + run: + shell( + "SnpSift filter \"!(AF_joint > 0.1)\" " + "{input} " + "| bcftools view -Oz > {output} 2>> {log} " + ) rule mutect2: @@ -935,4 +969,21 @@ rule snpeff_cancer: '| bcftools view -Oz > {output.vcf} 2> {log}' ) +rule merge_bams: + """ + Merge technical replicates together to get single bam files used for validation. 
+ """ + input: + bam=get_sample_bams + output: + bam=protected("results/merged/{sample}.bam"), + resources: + disk_mb=gb(20), + mem_mb=gb(16), + runtime=autobump(120) + threads: + 6 + shell: + """samtools merge -@ {threads} -o {output.bam} {input}""" + # vim: ft=python diff --git a/workflows/variant-calling/analyze_variants.py b/workflows/variant-calling/analyze_variants.py new file mode 100644 index 000000000..90b6231a6 --- /dev/null +++ b/workflows/variant-calling/analyze_variants.py @@ -0,0 +1,225 @@ +# Import libraries +from cyvcf2 import VCF, Writer +from collections import defaultdict +import pickle +import re +import pandas as pd +import numpy as np +import re +import sys +import argparse +import yaml +import os +import csv +import math + +def main(): + parser = argparse.ArgumentParser(description="A script to compute metrics for variants") + + parser.add_argument("--vcf", "-v", default="results/annotated/filtered_ann.vcf.gz", help="Path to vcf file containing variant information") + parser.add_argument("--output", "-o", default="results/analyzed", help="Path to the folder that the output files will be written to") + parser.add_argument("--gnomad", "-g", action='store_true', default=True, help="Flag to set if vcf is annotated with gnomad fields AF and AC") + + + args = parser.parse_args() + return args + + +def process_vcf(args): + # Load data + # vcf_path = "/data/NICHD-core0/analysis/stratakis/poi-wes/workflows/variant-calling/results/annotated/ann.vcf.gz" + # shortened_vcf_path = "results/annotated/subset.ann.vcf" + # output_path = "/data/NICHD-core0/analysis/stratakis/poi-wes/workflows/variant-calling/results/analyzed" + + testing = False # Boolean to turn on if doing testing runs (runs with a subsetted vcf file) + vcf = VCF(args.vcf, strict_gt=True) + output_path = args.output + if not os.path.exists(output_path): + os.makedirs(output_path) + + + print("Reading in data...") + # W = Writer(output_path + '.vcf', vcf) + samples = vcf.samples + + records = 
[] + for variant in vcf: + ann_raw = variant.INFO.get("ANN") + # These fields are assigned via SnpEff's documentation here: (https://pcingola.github.io/SnpEff/snpeff/inputoutput/) + if ann_raw: + ann_fields = ann_raw.split(",")[0].split("|") + if ann_fields[2] not in ["MODERATE", "HIGH"]: + continue + allele = ann_fields[0] if len(ann_fields) > 0 else None + var_type = ann_fields[1] if len(ann_fields) > 1 else None + impact = ann_fields[2] if len(ann_fields) > 2 else None + gene = ann_fields[3] if len(ann_fields) > 3 else None + gene_id = ann_fields[4] if len(ann_fields) > 4 else None + feature_type = ann_fields[5] if len(ann_fields) > 5 else None + feature_id = ann_fields[6] if len(ann_fields) > 6 else None + tscript_biotype = ann_fields[7] if len(ann_fields) > 7 else None + ex_int_rank_total = ann_fields[8] if len(ann_fields) > 8 else None + HGVS_c = ann_fields[9] if len(ann_fields) > 9 else None + HGVS_p = ann_fields[10] if len(ann_fields) > 10 else None + cDNA_pos_rank = ann_fields[11] if len(ann_fields) > 11 else None + CDS_pos_rank = ann_fields[12] if len(ann_fields) > 12 else None + prot_pos_len = ann_fields[13] if len(ann_fields) > 13 else None + feat_dist = ann_fields[14] if len(ann_fields) > 14 else None + other = ann_fields[15] if len(ann_fields) > 15 else None + + record = { + "CHROM": variant.CHROM, + "POS": variant.POS, + "REF": variant.REF, + "ALT": variant.ALT, + "DEPTH": variant.INFO.get("DP"), + "GENOTYPES": variant.genotypes, + "INFO_ANN": variant.INFO, + "ANN": ann_raw, + "ALLELE": allele, + "VAR_TYPE": var_type, + "IMPACT": impact, + "GENE": gene, + "GENE_ID": gene_id, + "FEATURE_TYPE": feature_type, + "FEATURE_ID": feature_id, + "TSCRIPT_BIOTYPE": tscript_biotype, + "EX_INT_RANK_TOTAL": ex_int_rank_total, + "HGVS.C": HGVS_c, + "HGVS.P": HGVS_p, + "CDNA_POS_RANK": cDNA_pos_rank, + "CDS_POS_RANK": CDS_pos_rank, + "PROT_POS_LEN": prot_pos_len, + "FEAT_DIST": feat_dist, + "OTHER": other + } + + # Add genotype info for each sample + for i, sample 
in enumerate(samples): + record[sample] = variant.gt_types[i] + + if args.gnomad: + record["gnomad_AF"] = variant.INFO.get("AF_joint") + record["gnomad_AC"] = variant.INFO.get("AC_joint") + record["gnomad_AN"] = variant.INFO.get("AN_joint") + records.append(record) + + + vcf_df = pd.DataFrame(records) + + + # Compute metrics for each gene and variant + + print("Computing variant metrics...") + # Get list of unique genes in vcf + genes = vcf_df['GENE'].unique() + genes_df = pd.DataFrame(genes) + genes_df.columns = ['gene'] + genes_df['num_vars'] = pd.Series([pd.NA] * len(genes_df), dtype='Int64') + genes_df['sample_AF'] = np.NAN + genes_df['sample_AC'] = pd.Series([pd.NA] * len(genes_df), dtype='Int64') + genes_df['sample_AN'] = pd.Series([pd.NA] * len(genes_df), dtype='Int64') + genes_df['vars'] = pd.Series([None] * len(genes_df), dtype='object') + + #TODO: Change this to an enumerate? + index = 0 + for gene in genes: + # Grab all rows/variants associated with the current gene + cur_gene_df = vcf_df[vcf_df['GENE'] == gene] + + # Create list of dictionaries (each index corresponds to a variant for that gene, + # each index's dictionary will hold the info for that variant) + vars = [] + total_alleles = 0 + total_alt_alleles = 0 + + for _,var in cur_gene_df.iterrows(): + # In cyvcf2, the genotype numbers correspond as follows (even though their API says something else): + # 0: homozygous reference + # 1: heterozygous for alternate + # 2: unknown (great ordering here by whoever made this up) + # 3: homozygous for alternate + var_alleles = 0 + # samples_gt = {sample: var[sample] for sample in samples} + genotypes = var['GENOTYPES'] + samples_gt = { + sample: "./." 
if genotypes[i][0] is None or genotypes[i][1] is None or genotypes[i][0] == -1 or genotypes[i][1] == -1 + else f"{genotypes[i][0]}/{genotypes[i][1]}" + for i, sample in enumerate(samples) + } + all_gt = var[samples] # Grab genotype info from all samples + called_gt = all_gt[all_gt != 2] # Filter out 'unknown' variant calls + var_samples = len(called_gt) # Number of samples with a genotype called for this variant + var_alt_samples = len(called_gt[(called_gt==1) | (called_gt==3)]) # Number of samples with at least one called alternate allele for this variant + var_alleles = len(called_gt) * 2 # Total alleles called for this variant + alt_alleles = 0 # Total alternate alleles called for this variant + for gt in called_gt: + if gt == 1: + alt_alleles += 1 + elif gt == 3: + alt_alleles += 2 + total_alleles += var_alleles # Add to the total alleles called for the current gene + total_alt_alleles += alt_alleles # Add to the total alt alleles called for the current gene + var_AF = alt_alleles / var_alleles if var_alleles > 0 else 0 # Calculate the alternate allele frequency for this variant across all samples + + # Store info in list of variant info for current gene + var_info = {"var_loc": f"{var['CHROM']}:{var['POS']}", "variant-id": f"{var['REF']}-{'/'.join(var['ALT'])}", "sample_AF": var_AF, "sample_AC": alt_alleles,"sample_AN": var_alleles, "num_alt_samples": var_alt_samples, "total_called_samples": var_samples, "depth": var['DEPTH'], "var_impact": var['IMPACT'], "var_type": var["VAR_TYPE"], "genotypes": samples_gt} + if args.gnomad: + var_info["gnomad_AF"] = round(var['gnomad_AF'], 3) + var_info["gnomad_AC"] = int(var['gnomad_AC']) if pd.notna(var['gnomad_AC']) else 0 + var_info["gnomad_AN"] = int(var['gnomad_AN']) if pd.notna(var['gnomad_AN']) else 0 + vars.append(var_info) + + # Compute alternate allele frequency across all samples + gene_alt_freq = round(total_alt_alleles / total_alleles, 3) if total_alleles > 0 else 0 + + genes_df.at[index, 'num_vars'] = 
len(vars) + genes_df.at[index, 'sample_AF'] = gene_alt_freq + genes_df.at[index, 'sample_AC'] = total_alt_alleles + genes_df.at[index, 'sample_AN'] = total_alleles + genes_df.at[index, 'vars'] = vars + index += 1 + + genes_df = genes_df.applymap(lambda x: float(f"{x:.4g}") if isinstance(x, float) and not pd.isnull(x) else x) + + # Expand variant column to create new files where each variant gets its own row + vars_df = genes_df.drop(['num_vars', 'sample_AC','sample_AF', 'sample_AN'], axis=1) + vars_df = vars_df.explode('vars') + variant_details = pd.json_normalize(vars_df['vars']) + vars_df= pd.concat([vars_df.drop(columns='vars').reset_index(drop=True), variant_details], axis=1) + vars_df = vars_df.applymap(lambda x: float(f"{x:.4g}") if isinstance(x, float) and not pd.isnull(x) else x) + vars_df.rename(columns={ + col: f"POF{col.split('POF')[-1]}_gt" + for col in vars_df.columns + if col.startswith('genotypes.POF') + }, inplace=True) + + + # Print the variants dataframe + vars_df = vars_df.sort_values(by="sample_AC", ascending=False) + with open(f'{output_path}/var_metrics.pkl', 'wb') as file: + pickle.dump(vars_df, file) + vars_df.to_csv(f'{output_path}/var_metrics.tsv', sep="\t", index=False) + + # Print the genes dataframe + genes_final_df = genes_df.drop('vars', axis=1) + genes_final_df = genes_final_df.sort_values(by="sample_AC", ascending=False) + print("Saving to .pkl and .tsv...") + if testing: + with open(f'{output_path}/shortened_gene_var_metrics.pkl', 'wb') as file: + pickle.dump(genes_final_df, file) + + genes_df.to_csv(f'{output_path}/shortened_gene_var_metrics.tsv', sep="\t", index=False, quoting=csv.QUOTE_MINIMAL) + else: + with open(f'{output_path}/gene_var_metrics.pkl', 'wb') as file: + pickle.dump(genes_final_df, file) + + genes_df.to_csv(f'{output_path}/gene_var_metrics.tsv', sep="\t", index=False) + + print("All done!") + + +if __name__ == "__main__": + args = main() + process_vcf(args) + From 842242f2c3c197a1f63b73431227241453218601 Mon Sep 
17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 13 Sep 2025 11:04:31 -0400 Subject: [PATCH 40/59] remove deprecated -r arg --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ad3e2f7b5..3b73c2d5a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -232,7 +232,7 @@ variables: $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=config/config.yaml -j2 -p -k --orig $ORIG # Variant-calling references - $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=$ORIG/test/test_configs/variant-calling.yaml -j2 -p -r -k --orig $ORIG + $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=$ORIG/test/test_configs/variant-calling.yaml -j2 -p -k --orig $ORIG From 663c3c7c2b89fb75480d9818585f52de78e28b18 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 14 Sep 2025 08:25:49 -0400 Subject: [PATCH 41/59] mamba -> conda --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 3b73c2d5a..09ffa6963 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -306,7 +306,7 @@ variables: name: variantcalling workflow command: | cd $DEPLOY - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV $DEPLOY/test/lcdb-wf-test variantcalling --run-workflow -n $DEPLOY/test/lcdb-wf-test variantcalling --run-workflow --use-conda -j2 From 8a26ef98b62b19bf5b4890ab0962ee6fdf3a4319 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 14 Sep 2025 09:42:25 -0400 Subject: [PATCH 42/59] avoid using srcdir --- workflows/variant-calling/Snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile index 
cf33eb93b..9566303e5 100644 --- a/workflows/variant-calling/Snakefile +++ b/workflows/variant-calling/Snakefile @@ -1,5 +1,4 @@ import sys -sys.path.insert(0, srcdir('.')) import pandas as pd import tempfile import os @@ -12,7 +11,8 @@ from textwrap import dedent from pathlib import Path from urllib.request import urlretrieve from zipfile import ZipFile -sys.path.append('../..') +HERE = str(Path(workflow.snakefile).parent) +sys.path.insert(0, HERE + "/../..") from lib import common, utils, helpers, aligners from lib.utils import autobump, gb, hours From 237cb951a81a90ae37a6cdab74dedfb49b1deb6a Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Mon, 29 Sep 2025 15:02:03 -0400 Subject: [PATCH 43/59] Get vc-wf to merge-ready state Make a few changes to the tests and clean up some things in the Snakefile to get the variant calling workflow ready to be merged into the master lcdb-wf branch --- env.yml | 4 +--- test/lcdb-wf-test | 4 ++++ workflows/variant-calling/Snakefile | 11 +++++------ 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/env.yml b/env.yml index 0945e2926..e59e17858 100644 --- a/env.yml +++ b/env.yml @@ -1,4 +1,3 @@ -name: /home/dalerr/proj/lcdb-wf/env channels: - conda-forge - bioconda @@ -347,7 +346,7 @@ dependencies: - snpsift=5.2 - soupsieve=2.8 - spectra=0.0.11 - - sphinx=8.3.0 + - sphinx - sphinxcontrib-applehelp=2.0.0 - sphinxcontrib-devhelp=2.0.0 - sphinxcontrib-htmlhelp=2.1.0 @@ -438,4 +437,3 @@ dependencies: - zlib-ng=2.2.5 - zstandard=0.23.0 - zstd=1.5.6 -prefix: /home/dalerr/proj/lcdb-wf/env diff --git a/test/lcdb-wf-test b/test/lcdb-wf-test index 7f9935529..f7d90c383 100755 --- a/test/lcdb-wf-test +++ b/test/lcdb-wf-test @@ -319,6 +319,10 @@ class Runner(object): "dbnsfp_6_20.vcf.gz.tbi", "workflows/variant-calling/references/dbnsfp_6_20.vcf.gz.tbi" ), + ( + "trimmed_gnomad.joint.v4.1.sites.all_chrom.vcf.bgz", + "workflows/variant-calling/references/trimmed_gnomad.joint.v4.1.sites.all_chrom.vcf.bgz" + ), ] } diff --git 
a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile index 9566303e5..7aab57249 100644 --- a/workflows/variant-calling/Snakefile +++ b/workflows/variant-calling/Snakefile @@ -705,17 +705,17 @@ rule snpsift_ann: runtime=autobump(960) threads: 6 input: - ann="results/annotated/ann.vcf.gz", - gnomad="/data/NICHD-core0/references/human/variation/gnomAD/trimmed_gnomad.joint.v4.1.sites.all_chrom.vcf.bgz" + vcf="results/annotated/ann.vcf.gz", + ann="/data/NICHD-core0/references/human/variation/gnomAD/trimmed_gnomad.joint.v4.1.sites.all_chrom.vcf.bgz" log: 'logs/snpsift_ann.log' - output: 'results/annotated/gnomad_ann.vcf.gz' + output: 'results/annotated/snpsift_ann.vcf.gz' run: java_opts = '''"-Xmx{}g"'''.format(int(resources.mem_mb * 0.75 /1024)) shell( "SnpSift {java_opts} " "annotate -noId " - "{input.gnomad} {input.ann}" + "{input.ann} {input.vcf}" "| bcftools view -Oz > {output}" ) @@ -728,11 +728,10 @@ rule snpsift_filter: mem_mb=gb(16), # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(960) - input: 'results/annotated/gnomad_ann.vcf.gz' + input: 'results/annotated/snpsift_ann.vcf.gz' output: 'results/annotated/filtered_ann.vcf.gz' log: 'logs/snpsift_filter.log' - run: shell( "SnpSift filter \"!(AF_joint > 0.1)\" " From e77e2179cda97f490fb16cb0a98fc93416786281 Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Tue, 30 Sep 2025 11:09:36 -0400 Subject: [PATCH 44/59] Make references path relative to workflow --- workflows/variant-calling/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile index 7aab57249..9806b2103 100644 --- a/workflows/variant-calling/Snakefile +++ b/workflows/variant-calling/Snakefile @@ -706,7 +706,7 @@ rule snpsift_ann: threads: 6 input: vcf="results/annotated/ann.vcf.gz", - ann="/data/NICHD-core0/references/human/variation/gnomAD/trimmed_gnomad.joint.v4.1.sites.all_chrom.vcf.bgz" + 
ann="references/trimmed_gnomad.joint.v4.1.sites.all_chrom.vcf.bgz" log: 'logs/snpsift_ann.log' output: 'results/annotated/snpsift_ann.vcf.gz' run: From 72d6b021826f08901ae985d59fec3461e7ea5538 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 12 Nov 2025 18:03:14 +0000 Subject: [PATCH 45/59] minor: filetype --- workflows/variant-calling/Snakefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile index 9806b2103..4255a5bdc 100644 --- a/workflows/variant-calling/Snakefile +++ b/workflows/variant-calling/Snakefile @@ -984,5 +984,3 @@ rule merge_bams: 6 shell: """samtools merge -@ {threads} -o {output.bam} {input}""" - -# vim: ft=python From 62ad23df10cbf81011cff5c6fd02b93e0e948f07 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 12 Nov 2025 18:27:21 +0000 Subject: [PATCH 46/59] fix ERCC url --- include/reference_configs/ERCC.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/reference_configs/ERCC.yaml b/include/reference_configs/ERCC.yaml index 22aa25af1..cea79ccc3 100644 --- a/include/reference_configs/ERCC.yaml +++ b/include/reference_configs/ERCC.yaml @@ -2,14 +2,14 @@ references: ercc: srm2374: genome: - url: 'https://tsapps.nist.gov/srmext/certificates/documents/SRM2374_Sequence_v1.FASTA' + url: 'https://www-s.nist.gov/srmors/certificates/documents/SRM2374_Sequence_v1.FASTA' postprocess: "lib.postprocess.ercc.fasta_postprocess" indexes: - 'bowtie2' - 'hisat2' - 'star' annotation: - url: 'https://tsapps.nist.gov/srmext/certificates/documents/SRM2374_Sequence_v1.FASTA' + url: 'https://www-s.nist.gov/srmors/certificates/documents/SRM2374_Sequence_v1.FASTA' postprocess: "lib.postprocess.ercc.gtf_postprocess" fisher92: From 92fd6d91ac9388599bc34e4eae75294cbb6e2222 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 12 Nov 2025 18:27:36 
+0000 Subject: [PATCH 47/59] move variant-calling docs to be in workflow dir --- workflows/variant-calling/README.md | 3 +++ {docs => workflows/variant-calling}/variant-calling.rst | 0 2 files changed, 3 insertions(+) create mode 100644 workflows/variant-calling/README.md rename {docs => workflows/variant-calling}/variant-calling.rst (100%) diff --git a/workflows/variant-calling/README.md b/workflows/variant-calling/README.md new file mode 100644 index 000000000..32423f298 --- /dev/null +++ b/workflows/variant-calling/README.md @@ -0,0 +1,3 @@ +Please note, the variant-calling workflow is still experimental. + +Detailed documentation can be found in this directory in `variant-calling.rst`. diff --git a/docs/variant-calling.rst b/workflows/variant-calling/variant-calling.rst similarity index 100% rename from docs/variant-calling.rst rename to workflows/variant-calling/variant-calling.rst From 2f699467ceb9d0edf6821dff76756acbd47060bb Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 12 Nov 2025 21:06:47 +0000 Subject: [PATCH 48/59] disable url check unclear if this is a CI-specific issue; the only problematic URL is ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M12/gencode.vM12.annotation.gtf.gz which seems to work fine locally. 
--- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 09ffa6963..406e39500 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -171,7 +171,7 @@ variables: test/lcdb-wf-test unit_tests --ensure-docs # find all URLs in reference configs and make sure they exist - test/lcdb-wf-test unit_tests --url-check + # test/lcdb-wf-test unit_tests --url-check # run R package unit tests using the R env test/lcdb-wf-test unit_tests --r-test From 8c687ce78116bfb80f9060b130f5639b074db68b Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Wed, 19 Nov 2025 11:09:58 -0500 Subject: [PATCH 49/59] Take out annotation steps in vc-wf We've decided that the annotation steps are best left as a 'downstream' kind of task since 1) There are always so many problems with downloading databases and 2) The existing annotations were only built for human samples, which we want to avoid in lcdb-wf. Therefore we've taken out the annotation rules and code from `workflows/variant-calling/Snakefile` --- .gitignore | 3 + test/lcdb-wf-test | 14 +-- workflows/variant-calling/Snakefile | 148 +++------------------------- 3 files changed, 15 insertions(+), 150 deletions(-) diff --git a/.gitignore b/.gitignore index ab3fd51ea..0a3984dd2 100644 --- a/.gitignore +++ b/.gitignore @@ -66,3 +66,6 @@ workflows/rnaseq/downstream/rnaseq.html ._* Rplots.pdf /lib/include/* +WORKLOG.rst +workflows/variant-calling/references/ +workflows/variant-calling/results/ diff --git a/test/lcdb-wf-test b/test/lcdb-wf-test index f7d90c383..b12f3bc50 100755 --- a/test/lcdb-wf-test +++ b/test/lcdb-wf-test @@ -310,19 +310,7 @@ class Runner(object): ( "tumor_R2.6.20.fq.gz", "workflows/variant-calling/data/example_data/tumor_R2.fq.gz" - ), - ( - "dbnsfp_6_20.vcf.gz", - "workflows/variant-calling/references/dbnsfp_6_20.vcf.gz" - ), - ( - "dbnsfp_6_20.vcf.gz.tbi", - "workflows/variant-calling/references/dbnsfp_6_20.vcf.gz.tbi" - ), - ( - 
"trimmed_gnomad.joint.v4.1.sites.all_chrom.vcf.bgz", - "workflows/variant-calling/references/trimmed_gnomad.joint.v4.1.sites.all_chrom.vcf.bgz" - ), + ) ] } diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile index 4255a5bdc..4e90817e4 100644 --- a/workflows/variant-calling/Snakefile +++ b/workflows/variant-calling/Snakefile @@ -31,13 +31,9 @@ wildcard_constraints: rule all: input: - "results/annotated/ann.vcf.gz", - 'results/annotated/filtered_ann.vcf.gz', - 'results/annotated/filtered_ann.vcf.gz.tbi', "results/qc/multiqc.html", "results/filtered/all.normed.vcf.gz", expand("results/somatic_filtered/normed.{comp}.vcf.gz", comp = config['mutect2'].keys()), - expand("results/mutect2_annotated/snpeff.{comp}.vcf.gz", comp = config['mutect2'].keys()), expand("results/merged/{sample}.bam", sample=samples.index), expand("results/merged/{sample}.bam.bai", sample=samples.index), @@ -166,7 +162,7 @@ rule mark_duplicates: resources: disk_mb=gb(40), mem_mb=gb(32), - # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(2), # [ TEST SETTINGS -1 ] runtime=autobump(720) input: bam = "results/mapped/{sample}-{unit}.sorted.bam", @@ -203,7 +199,7 @@ if config["filtering"]['bqsr']: # threads: 1 # [ TEST SETTINGS -1 ] resources: mem_mb=gb(8), - # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(2), # [ TEST SETTINGS -1 ] disk_mb=gb(40), runtime=autobump(960) input: @@ -342,9 +338,6 @@ rule call_variants: ) run: java_opts = set_java_opts(resources) - known = input.known - if known: - known = "--dbsnp " + str(known) regions = params.extra bams = input.bam if isinstance(bams, str): @@ -356,7 +349,7 @@ rule call_variants: '{bams} ' '-ERC GVCF ' '{params.pcr} ' - '-O {output.gvcf} {known} 2> {log} ' + '-O {output.gvcf} 2> {log} ' ) @@ -391,7 +384,7 @@ rule genotype_variants: resources: disk_mb=gb(10), mem_mb=gb(8), - # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(2), # [ TEST SETTINGS -1 ] runtime=autobump(480) input: ref=reference, @@ -417,7 
+410,7 @@ rule merge_variants: resources: disk_mb=gb(10), mem_mb=gb(8), - # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(2), # [ TEST SETTINGS -1 ] runtime=autobump(480) input: vcfs=lambda w: expand( @@ -462,7 +455,7 @@ rule select_calls: resources: disk_mb=gb(10), mem_mb=gb(4), - # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(2), # [ TEST SETTINGS -1 ] runtime=autobump(480) input: ref=reference, @@ -491,7 +484,7 @@ rule hard_filter_calls: resources: disk_mb=gb(10), mem_mb=gb(4), - # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(2), # [ TEST SETTINGS -1 ] runtime=autobump(480) input: ref=reference, @@ -521,7 +514,7 @@ rule merge_calls: resources: disk_mb=gb(10), mem_mb=gb(8), - # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(2), # [ TEST SETTINGS -1 ] runtime=autobump(480) input: vcfs=expand( @@ -621,15 +614,6 @@ rule samtools_stats: "samtools stats {input} 1> {output} 2> {log} " -snpeff_input_for_multiqc = [] -if config['snpeff']['germline']: - snpeff_input_for_multiqc.append('results/qc/snpEff_summary.csv') -if config['snpeff']['somatic']: - soms = expand('results/qc/snpEff_{comp}_summary.csv', comp = config['mutect2'].keys()) - snpeff_input_for_multiqc.extend(soms) - - - rule multiqc: """ Gather qc metrics and run MultiQC @@ -642,7 +626,6 @@ rule multiqc: fastqc=expand("results/qc/fastqc/zip/{u.sample}-{u.unit}_fastqc.zip", u=units.itertuples()), markdup=expand("results/qc/picard/markdups/{u.sample}-{u.unit}_marked_dup_metrics.txt", u=units.itertuples()), samstats=expand("results/qc/samtools-stats/{u.sample}-{u.unit}.txt", u=units.itertuples()), - snpeff=snpeff_input_for_multiqc output: "results/qc/multiqc.html", params: @@ -662,83 +645,6 @@ rule multiqc: ) -rule snpeff: - """ - Annotate variants with SnpEff - """ - resources: - disk_mb=gb(20), - mem_mb=gb(16), - # mem_mb=gb(4), # [ TEST SETTINGS -1 ] - runtime=autobump(480) - input: - unpack(snpeff_input) - log: 'logs/snpeff.log' - output: - ann='results/annotated/ann.vcf.gz', - 
stats='results/qc/snpEff_summary.csv', - html='results/qc/snpEff_summary.html' - params: - annotations = config['snpeff']['annotations'], - gen = config['snpeff']['genome'] - # threads: 2 # [ TEST SETTINGS ] - run: - java_opts = '''"-Xmx{}g"'''.format(int(resources.mem_mb * 0.75 /1024)) - shell( - "snpEff {java_opts} " - "-o vcf " - "-csvStats {output.stats} " - "-stats {output.html} " - "{params.gen} {input.vcf} " - "| bcftools view -Oz > {output.ann} 2> {log} " - ) - - -rule snpsift_ann: - """ - Annotate variants with the gnomAD database - """ - resources: - disk_mb=gb(20), - mem_mb=gb(16), - # mem_mb=gb(4), # [ TEST SETTINGS -1 ] - runtime=autobump(960) - threads: 6 - input: - vcf="results/annotated/ann.vcf.gz", - ann="references/trimmed_gnomad.joint.v4.1.sites.all_chrom.vcf.bgz" - log: 'logs/snpsift_ann.log' - output: 'results/annotated/snpsift_ann.vcf.gz' - run: - java_opts = '''"-Xmx{}g"'''.format(int(resources.mem_mb * 0.75 /1024)) - - shell( - "SnpSift {java_opts} " - "annotate -noId " - "{input.ann} {input.vcf}" - "| bcftools view -Oz > {output}" - ) - -rule snpsift_filter: - """ - Filter out common variants with SnpSift filter - """ - resources: - disk_mb=gb(20), - mem_mb=gb(16), - # mem_mb=gb(4), # [ TEST SETTINGS -1 ] - runtime=autobump(960) - input: 'results/annotated/snpsift_ann.vcf.gz' - output: 'results/annotated/filtered_ann.vcf.gz' - log: 'logs/snpsift_filter.log' - - run: - shell( - "SnpSift filter \"!(AF_joint > 0.1)\" " - "{input} " - "| bcftools view -Oz > {output} 2>> {log} " - ) - rule mutect2: """ @@ -829,7 +735,7 @@ rule merge_mutect2_variants: resources: disk_mb=gb(20), mem_mb=gb(32), - # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(2), # [ TEST SETTINGS -1 ] runtime=autobump(120) input: vcfs=lambda w: expand( @@ -860,7 +766,7 @@ rule merge_mutect2_stats: resources: disk_mb=gb(20), mem_mb=gb(16), - # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(2), # [ TEST SETTINGS -1 ] runtime=autobump(120) input: stats=lambda w: expand( @@ 
-890,7 +796,7 @@ rule filter_mutect2_calls: resources: disk_mb=gb(20), mem_mb=gb(16), - # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(2), # [ TEST SETTINGS -1 ] runtime=autobump(120) input: ref=reference, @@ -936,38 +842,6 @@ rule mutect2_norm: "--output-type z " "--output {output} 2> {log}" - -rule snpeff_cancer: - """ - Annotate somatic variants with SnpEff Cancer - """ - resources: - disk_mb=gb(20), - mem_mb=gb(16), - # mem_mb=gb(4), # [ TEST SETTINGS -1 ] - runtime=autobump(120) - input: - unpack(snpeff_cancer_input), - output: - vcf='results/mutect2_annotated/snpeff.{comp}.vcf.gz', - stats='results/qc/snpEff_{comp}_summary.csv', - html='results/qc/snpEff_{comp}.html' - log: - 'logs/cancer_snpeff_{comp}.log' - params: - snpeff_genome = config['snpeff']['genome'] - # threads: 2 # [ TEST SETTINGS ] - run: - java_opts = '''"-Xmx{}g"'''.format(int(resources.mem_mb * 0.75 /1024)) - shell( - 'snpEff {java_opts} ' - '-v -o vcf -cancer ' - '-csvStats {output.stats} ' - '-stats {output.html} ' - '{params.snpeff_genome} {input.vcf} ' - '| bcftools view -Oz > {output.vcf} 2> {log}' - ) - rule merge_bams: """ Merge technical replicates together to get single bam files used for validation. 
From ba59f2173ad0502cf826c724b0b509d991e6b64a Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Wed, 19 Nov 2025 17:38:58 -0500 Subject: [PATCH 50/59] Delete log redirect to view test errors --- workflows/variant-calling/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile index 4e90817e4..783e87491 100644 --- a/workflows/variant-calling/Snakefile +++ b/workflows/variant-calling/Snakefile @@ -349,7 +349,7 @@ rule call_variants: '{bams} ' '-ERC GVCF ' '{params.pcr} ' - '-O {output.gvcf} 2> {log} ' + '-O {output.gvcf} ' ) From 68cc55508c035d5a50344b0a61b214d6658047fa Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 22 Nov 2025 10:49:01 -0500 Subject: [PATCH 51/59] debug tests --- lib/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/common.py b/lib/common.py index 7ed03cc73..7c0a8e96b 100644 --- a/lib/common.py +++ b/lib/common.py @@ -322,7 +322,7 @@ def default_postprocess(origfn, newfn): url = url.replace('file://', '') shell('cp {url} {tmpfile} 2> {outfile}.log') else: - shell("wget {url} -O- > {tmpfile} 2> {outfile}.log") + shell("wget {url} -O- > {tmpfile}") for func, args, kwargs, outfile in funcs: func(tmpinputfiles, outfile, *args, **kwargs) From 2ac3e9d12c19cb63ca46f2701ff306d88dea3df5 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 22 Nov 2025 11:07:31 -0500 Subject: [PATCH 52/59] try updating silva urls --- include/reference_configs/test.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/reference_configs/test.yaml b/include/reference_configs/test.yaml index a8f80b770..ae3e31522 100644 --- a/include/reference_configs/test.yaml +++ b/include/reference_configs/test.yaml @@ -70,8 +70,8 @@ references: rRNA: genome: url: - - 
'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_138_1/Exports/SILVA_138.1_LSURef_NR99_tax_silva.fasta.gz' + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_138_1/Exports/SILVA_138.1_SSURef_NR99_tax_silva.fasta.gz' postprocess: function: 'lib.common.filter_fastas' args: 'Drosophila melanogaster' From 79938b2d8ce42fe6eb7c15c6aab36b3528782064 Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Fri, 5 Dec 2025 13:37:53 -0500 Subject: [PATCH 53/59] Change silva to ftp download --- include/reference_configs/test.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/reference_configs/test.yaml b/include/reference_configs/test.yaml index ae3e31522..93af49b42 100644 --- a/include/reference_configs/test.yaml +++ b/include/reference_configs/test.yaml @@ -70,8 +70,8 @@ references: rRNA: genome: url: - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_138_1/Exports/SILVA_138.1_LSURef_NR99_tax_silva.fasta.gz' - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_138_1/Exports/SILVA_138.1_SSURef_NR99_tax_silva.fasta.gz' + - 'ftp://ftp.arb-silva.de/release_138_1/Exports/SILVA_138.1_LSURef_NR99_tax_silva.fasta.gz' + - 'ftp://ftp.arb-silva.de/release_138_1/Exports/SILVA_138.1_SSURef_NR99_tax_silva.fasta.gz' postprocess: function: 'lib.common.filter_fastas' args: 'Drosophila melanogaster' From f16b0bfd188e3f11764e348e23cbd26f3484c0f2 Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Mon, 8 Dec 2025 14:08:43 -0500 Subject: [PATCH 54/59] Update FTP download to use lftp --- env.yml | 272 +++++++++++++++++++-------------------- include/requirements.txt | 1 + lib/common.py | 7 + 3 files changed, 137 insertions(+), 143 deletions(-) diff --git a/env.yml b/env.yml index 
e59e17858..9ab3f5e79 100644 --- a/env.yml +++ b/env.yml @@ -1,7 +1,6 @@ channels: - conda-forge - bioconda - - defaults dependencies: - _libgcc_mutex=0.1 - _openmp_mutex=4.5 @@ -10,43 +9,39 @@ dependencies: - alabaster=1.0.0 - alsa-lib=1.2.11 - amply=0.1.6 - - anndata=0.12.2 - annotated-types=0.7.0 - - anyio=4.10.0 + - anyio=4.12.0 - appdirs=1.4.4 - - argcomplete=3.6.2 + - argcomplete=3.6.3 - argh=0.31.3 - argparse-dataclass=2.0.0 - - array-api-compat=1.12.0 - - asttokens=3.0.0 - - attrs=25.3.0 + - asttokens=3.0.1 + - attrs=25.4.0 - babel=2.17.0 - backports=1.0 - backports.tarfile=1.2.0 - bcftools=1.21 - - beautifulsoup4=4.13.5 + - beautifulsoup4=4.14.3 - bedtools=2.31.1 - - binutils_impl_linux-64=2.44 - - biopython=1.85 + - binutils_impl_linux-64=2.45 + - biopython=1.86 - boost-cpp=1.85.0 - bowtie=1.3.1 - bowtie2=2.5.4 - - brotli=1.1.0 - - brotli-bin=1.1.0 - - brotli-python=1.1.0 + - brotli=1.2.0 + - brotli-bin=1.2.0 + - brotli-python=1.2.0 - bwa=0.7.18 - bwidget=1.10.1 - bx-python=0.11.0 - bzip2=1.0.8 - c-ares=1.34.5 - - ca-certificates=2025.8.3 - - cached-property=1.5.2 - - cached_property=1.5.2 + - ca-certificates=2025.11.12 - cairo=1.18.0 - - certifi=2025.8.3 - - cffi=1.17.1 - - charset-normalizer=3.4.3 - - click=8.2.1 + - certifi=2025.11.12 + - cffi=2.0.0 + - charset-normalizer=3.4.4 + - click=8.3.1 - coin-or-cbc=2.10.12 - coin-or-cgl=0.60.7 - coin-or-clp=1.17.8 @@ -59,32 +54,29 @@ dependencies: - configargparse=1.7.1 - connection_pool=0.0.3 - contourpy=1.3.3 - - cpython=3.12.11 - - crc32c=2.7.1 - - cryptography=45.0.7 + - cpython=3.12.12 + - cryptography=46.0.3 - curl=8.8.0 - - cutadapt=5.1 + - cutadapt=5.2 - cycler=0.12.1 - dbus=1.13.6 - decorator=5.2.1 - deeptools=3.5.5 - deeptoolsintervals=0.1.9 - - deprecated=1.2.18 - distlib=0.4.0 - dnaio=1.2.2 - - docutils=0.21.2 - - donfig=0.8.1.post1 + - docutils=0.22.3 - dpath=2.2.0 - editables=0.5 - eido=0.2.4 - et_xmlfile=2.0.0 - - exceptiongroup=1.3.0 - - execnet=2.1.1 + - exceptiongroup=1.3.1 + - execnet=2.1.2 - 
executing=2.2.1 - - expat=2.7.1 + - expat=2.7.3 - fastq-screen=0.16.0 - fastqc=0.12.1 - - filelock=3.19.1 + - filelock=3.20.0 - font-ttf-dejavu-sans-mono=2.37 - font-ttf-inconsolata=3.000 - font-ttf-source-code-pro=2.038 @@ -92,23 +84,24 @@ dependencies: - fontconfig=2.14.2 - fonts-conda-ecosystem=1 - fonts-conda-forge=1 - - fonttools=4.59.2 + - fonttools=4.61.0 - freetype=2.12.1 - fribidi=1.0.16 - gatk4=4.6.2.0 - - gcc_impl_linux-64=15.1.0 + - gcc_impl_linux-64=15.2.0 + - gettext=0.25.1 + - gettext-tools=0.25.1 - gffread=0.12.7 - gffutils=0.13 - - gfortran_impl_linux-64=15.1.0 + - gfortran_impl_linux-64=15.2.0 - giflib=5.2.2 - gitdb=4.0.12 - gitpython=3.1.45 - graphite2=1.3.14 - gsl=2.7 - - gxx_impl_linux-64=15.1.0 + - gxx_impl_linux-64=15.2.0 - h11=0.16.0 - h2=4.3.0 - - h5py=3.13.0 - harfbuzz=9.0.0 - hatch=1.14.1 - hatchling=1.27.0 @@ -120,18 +113,18 @@ dependencies: - httpcore=1.0.9 - httpx=0.28.1 - humanfriendly=10.0 - - humanize=4.13.0 + - humanize=4.14.0 - hyperframe=6.1.0 - hyperlink=21.0.0 - icu=73.2 - - idna=3.10 + - idna=3.11 - imagesize=1.4.1 - immutables=0.21 - importlib-metadata=8.7.0 - importlib_resources=6.5.2 - - iniconfig=2.0.0 + - iniconfig=2.3.0 - intervalstats=1.01 - - ipython=9.5.0 + - ipython=9.8.0 - ipython_pygments_lexers=1.1.1 - isa-l=2.31.1 - jaraco.classes=3.4.0 @@ -140,135 +133,134 @@ dependencies: - jedi=0.19.2 - jeepney=0.9.0 - jinja2=3.1.6 - - joblib=1.5.2 - jsonschema=4.25.1 - jsonschema-specifications=2025.9.1 - - jupyter_core=5.8.1 + - jupyter_core=5.9.1 - kaleido-core=0.2.1 - kallisto=0.51.1 - kernel-headers_linux-64=4.18.0 - - keyring=25.6.0 + - keyring=25.7.0 - keyutils=1.6.3 - kiwisolver=1.4.9 - krb5=1.21.3 - lcms2=2.16 - - ld_impl_linux-64=2.44 - - legacy-api-wrap=1.4.1 + - ld_impl_linux-64=2.45 - lerc=4.0.0 + - lftp=4.9.2 - libaec=1.1.4 - - libblas=3.9.0 + - libasprintf=0.25.1 + - libasprintf-devel=0.25.1 + - libblas=3.11.0 - libboost=1.85.0 - libboost-devel=1.85.0 - libboost-headers=1.85.0 - - libbrotlicommon=1.1.0 - - 
libbrotlidec=1.1.0 - - libbrotlienc=1.1.0 - - libcblas=3.9.0 + - libbrotlicommon=1.2.0 + - libbrotlidec=1.2.0 + - libbrotlienc=1.2.0 + - libcblas=3.11.0 - libcups=2.3.3 - libcurl=8.8.0 - libdeflate=1.20 - libedit=3.1.20250104 - libev=4.33 - - libexpat=2.7.1 - - libffi=3.4.6 - - libgcc=15.1.0 - - libgcc-devel_linux-64=15.1.0 - - libgcc-ng=15.1.0 + - libexpat=2.7.3 + - libffi=3.5.2 + - libgcc=15.2.0 + - libgcc-devel_linux-64=15.2.0 + - libgcc-ng=15.2.0 - libgd=2.3.3 - - libgfortran=15.1.0 - - libgfortran-ng=15.1.0 - - libgfortran5=15.1.0 + - libgettextpo=0.25.1 + - libgettextpo-devel=0.25.1 + - libgfortran=15.2.0 + - libgfortran-ng=15.2.0 + - libgfortran5=15.2.0 - libglib=2.80.2 - - libgomp=15.1.0 + - libgomp=15.2.0 - libhwloc=2.11.2 - libiconv=1.18 - libjemalloc=5.3.0 - - libjpeg-turbo=3.1.0 - - liblapack=3.9.0 - - liblapacke=3.9.0 - - libllvm14=14.0.6 + - libjpeg-turbo=3.1.2 + - liblapack=3.11.0 + - liblapacke=3.11.0 - liblzma=5.8.1 - liblzma-devel=5.8.1 - libnghttp2=1.58.0 - libnsl=2.0.1 - libopenblas=0.3.30 - - libopenssl-static=3.5.2 + - libopenssl-static=3.6.0 - libpng=1.6.43 - - libsanitizer=15.1.0 + - libsanitizer=15.2.0 - libsqlite=3.46.0 - libssh2=1.11.0 - - libstdcxx=15.1.0 - - libstdcxx-devel_linux-64=15.1.0 - - libstdcxx-ng=15.1.0 + - libstdcxx=15.2.0 + - libstdcxx-devel_linux-64=15.2.0 + - libstdcxx-ng=15.2.0 - libtiff=4.6.0 - - libuuid=2.41.1 + - libuuid=2.41.2 - libwebp=1.4.0 - libwebp-base=1.4.0 - libxcb=1.15 - libxcrypt=4.4.36 - libxml2=2.12.7 - libzlib=1.2.13 - - llvmlite=0.42.0 - logmuse=0.2.8 - logomaker=0.8.6 - make=4.4.1 - - markdown=3.9 + - markdown=3.10 - markdown-it-py=4.0.0 - - markupsafe=3.0.2 + - markupsafe=3.0.3 - mathjax=2.7.7 - matplotlib-base=3.10.1 - - matplotlib-inline=0.1.7 + - matplotlib-inline=0.2.1 - mdurl=0.1.2 - more-itertools=10.8.0 - - msgpack-python=1.1.1 - - multiqc=1.31 + - multiqc=1.32 - munkres=1.1.4 - mysql-connector-c=6.1.11 - - narwhals=2.4.0 + - narwhals=2.13.0 - natsort=8.4.0 - nbformat=5.10.4 - ncurses=6.5 - - 
networkx=3.5 - - nspr=4.37 + - networkx=3.6 + - nspr=4.38 - nss=3.100 - - numba=0.59.1 - - numcodecs=0.16.1 - numpy=1.26.4 - - numpydoc=1.9.0 + - numpydoc=1.10.0 - openjdk=17.0.11 - openjpeg=2.5.2 - openpyxl=3.1.5 - - openssl=3.5.2 + - openssl=3.6.0 - packaging=25.0 - - pandas=2.3.2 - - pandoc=3.8 + - pandas=2.3.3 + - pandoc=3.8.3 - pango=1.54.0 - parso=0.8.5 - pathspec=0.12.1 - - patsy=1.0.1 + - patsy=1.0.2 - pbzip2=1.1.13 - pcre2=10.43 - pephubclient=0.4.4 - - peppy=0.40.7 + - peppy=0.40.8 - perl=5.32.1 - perl-gd=2.76 - perl-gdgraph=1.56 - perl-gdtextutil=0.86 - pexpect=4.9.0 - picard=2.27.5 - - pickleshare=0.7.5 - pigz=2.8 - pillow=10.3.0 - - pip=25.2 + - pip=25.3 - pixman=0.46.4 - - plac=1.4.5 - - platformdirs=4.4.0 - - plotly=6.3.0 + - platformdirs=4.5.1 + - plotly=6.5.0 - pluggy=1.6.0 - - polars-lts-cpu=1.33.1 + - polars=1.35.2 + - polars-lts-cpu=1.34.0.deprecated + - polars-runtime-32=1.35.2 + - polars-runtime-compat=1.35.2 - preseq=3.2.0 - prompt-toolkit=3.0.52 - - psutil=7.0.0 + - psutil=7.1.3 - pthread-stubs=0.4 - ptyprocess=0.7.0 - pulp=2.8.0 @@ -278,21 +270,20 @@ dependencies: - pybedtools=0.10.0 - pybigwig=0.3.22 - pycparser=2.22 - - pydantic=2.11.7 - - pydantic-core=2.33.2 + - pydantic=2.12.5 + - pydantic-core=2.41.5 - pyfaidx=0.9.0.3 - pygments=2.19.2 - - pynndescent=0.5.13 - - pyparsing=3.2.4 + - pyparsing=3.2.5 - pysam=0.22.1 - pysocks=1.7.1 - - pytest=8.4.2 + - pytest=9.0.2 - pytest-xdist=3.8.0 - python=3.12.3 - python-dateutil=2.9.0.post0 - - python-dotenv=1.1.1 + - python-dotenv=1.2.1 - python-fastjsonschema=2.21.2 - - python-gil=3.12.11 + - python-gil=3.12.12 - python-isal=1.8.0 - python-kaleido=0.2.1 - python-tzdata=2025.2 @@ -300,53 +291,50 @@ dependencies: - python_abi=3.12 - pytz=2025.2 - pyvcf3=1.0.4 - - pyyaml=6.0.2 + - pyyaml=6.0.3 - qhull=2020.2 - r-base=4.4.0 - readline=8.2 - - referencing=0.36.2 - - regex=2025.7.34 + - referencing=0.37.0 + - regex=2025.11.3 - requests=2.32.5 - reretry=0.11.8 - - rich=14.1.0 - - rich-click=1.8.9 - - 
roman-numerals-py=3.1.0 - - rpds-py=0.27.1 + - rich=14.2.0 + - rich-click=1.9.4 + - roman-numerals=3.1.0 + - rpds-py=0.30.0 - rseqc=5.0.4 - rust-bio-tools=0.42.2 - salmon=1.10.3 - samtools=1.21 - - scanpy=1.11.4 - - scikit-learn=1.7.2 - - scipy=1.16.2 + - scipy=1.16.3 - seaborn=0.13.2 - seaborn-base=0.13.2 - - secretstorage=3.4.0 + - secretstorage=3.4.1 - sed=4.9 - - session-info2=0.2.2 - setuptools=80.9.0 - shellingham=1.5.4 - - simplejson=3.20.1 + - simplejson=3.20.2 - six=1.17.0 - - slack-sdk=3.36.0 - - slack_sdk=3.36.0 - - smart_open=7.3.1 + - slack-sdk=3.39.0 + - slack_sdk=3.39.0 + - smart_open=7.5.0 - smmap=5.0.2 - - snakemake=9.11.2 - - snakemake-interface-common=1.21.0 + - snakemake=9.14.3 + - snakemake-interface-common=1.22.0 - snakemake-interface-executor-plugins=9.3.9 - - snakemake-interface-logger-plugins=1.2.4 - - snakemake-interface-report-plugins=1.2.0 - - snakemake-interface-scheduler-plugins=2.0.1 - - snakemake-interface-storage-plugins=4.2.2 - - snakemake-minimal=9.11.2 + - snakemake-interface-logger-plugins=2.0.0 + - snakemake-interface-report-plugins=1.3.0 + - snakemake-interface-scheduler-plugins=2.0.2 + - snakemake-interface-storage-plugins=4.3.2 + - snakemake-minimal=9.14.3 - sniffio=1.3.1 - snowballstemmer=3.0.1 - - snpeff=5.2 - - snpsift=5.2 + - snpeff=5.3.0a + - snpsift=5.3.0a - soupsieve=2.8 - spectra=0.0.11 - - sphinx + - sphinx=9.0.4 - sphinxcontrib-applehelp=2.0.0 - sphinxcontrib-devhelp=2.0.0 - sphinxcontrib-htmlhelp=2.1.0 @@ -358,29 +346,28 @@ dependencies: - stack_data=0.6.3 - star=2.7.11b - starcode=1.4 - - statsmodels=0.14.5 + - statsmodels=0.14.6 - subread=2.0.6 - sysroot_linux-64=2.28 - tabulate=0.9.0 - tbb=2022.1.0 - - threadpoolctl=3.6.0 - throttler=1.2.2 - - tiktoken=0.11.0 + - tiktoken=0.12.0 - tk=8.6.13 - tktable=2.10 - - tomli=2.2.1 + - tomli=2.3.0 - tomli-w=1.2.0 - tomlkit=0.13.3 - tqdm=4.67.1 - trackhub=1.0 - traitlets=5.14.3 - - trove-classifiers=2025.9.11.17 + - trove-classifiers=2025.12.1.14 - typeguard=4.4.4 - - 
typer=0.17.4 - - typer-slim=0.17.4 - - typer-slim-standard=0.17.4 + - typer=0.20.0 + - typer-slim=0.20.0 + - typer-slim-standard=0.20.0 - typing-extensions=4.15.0 - - typing-inspection=0.4.1 + - typing-inspection=0.4.2 - typing_extensions=4.15.0 - tzdata=2025b - ubiquerg=0.8.0 @@ -396,14 +383,13 @@ dependencies: - ucsc-stringify=472 - ucsc-twobittofa=472 - ucsc-wigtobigwig=472 - - umap-learn=0.5.9.post2 - - unicodedata2=16.0.0 + - unicodedata2=17.0.0 - urllib3=2.5.0 - userpath=1.9.2 - - uv=0.8.17 + - uv=0.9.16 - veracitools=0.1.3 - - virtualenv=20.34.0 - - wcwidth=0.2.13 + - virtualenv=20.35.4 + - wcwidth=0.2.14 - webencodings=0.5.1 - wheel=0.45.1 - wrapt=1.17.3 @@ -430,10 +416,10 @@ dependencies: - xz-gpl-tools=5.8.1 - xz-tools=5.8.1 - yaml=0.2.5 - - yte=1.8.1 - - zarr=3.1.2 + - yte=1.9.4 - zipp=3.23.0 - zlib=1.2.13 - - zlib-ng=2.2.5 + - zlib-ng=2.3.2 - zstandard=0.23.0 - zstd=1.5.6 +prefix: /vf/users/NICHD-core0/test/fullerbl/vc-wf/env diff --git a/include/requirements.txt b/include/requirements.txt index 58dd5b80c..43352f1a3 100644 --- a/include/requirements.txt +++ b/include/requirements.txt @@ -17,6 +17,7 @@ hisat2 intervalstats ipython kallisto +lftp multiqc pandas pandoc diff --git a/lib/common.py b/lib/common.py index 7c0a8e96b..03f7be4bc 100644 --- a/lib/common.py +++ b/lib/common.py @@ -13,6 +13,7 @@ from lib.imports import resolve_name from lib import aligners from lib import utils +from urllib.parse import urlparse from snakemake.shell import shell from snakemake.io import expand @@ -313,6 +314,7 @@ def default_postprocess(origfn, newfn): if isinstance(urls, str): urls = [urls] + # Download tempfiles into reasonably-named filenames tmpfiles = ['{0}.{1}.tmp'.format(outfile, i) for i in range(len(urls))] tmpinputfiles = tmpfiles @@ -321,6 +323,11 @@ def default_postprocess(origfn, newfn): if url.startswith('file:'): url = url.replace('file://', '') shell('cp {url} {tmpfile} 2> {outfile}.log') + elif url.lower().startswith('ftp'): + parsed_url = 
urlparse(url) + ftp_url = f"{parsed_url.scheme}://{parsed_url.netloc}" # scheme ~ protocol; netloc ~ domain name + ftp_path = parsed_url.path # File path on server + shell('lftp -e "get {ftp_path}; bye" {ftp_url} -O {tmpfile}') # lftp more resilient to certain kinds of ftp errors than wget else: shell("wget {url} -O- > {tmpfile}") From a07e16084e3b636b0624dbfa9d848e89d8fcfe13 Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Thu, 11 Dec 2025 11:37:30 -0500 Subject: [PATCH 55/59] Update fly reference config with ftp urls --- include/reference_configs/Drosophila_melanogaster.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/reference_configs/Drosophila_melanogaster.yaml b/include/reference_configs/Drosophila_melanogaster.yaml index 625c405d3..8385f023e 100644 --- a/include/reference_configs/Drosophila_melanogaster.yaml +++ b/include/reference_configs/Drosophila_melanogaster.yaml @@ -3,8 +3,8 @@ references: rRNA: genome: url: - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' + - 'ftp://ftp.arb-silva.de/release_138_1/Exports/SILVA_138.1_LSURef_NR99_tax_silva.fasta.gz' + - 'ftp://ftp.arb-silva.de/release_138_1/Exports/SILVA_138.1_SSURef_NR99_tax_silva.fasta.gz' postprocess: function: 'lib.common.filter_fastas' args: 'Drosophila melanogaster' From 9dacff2285fe28383b4a3259ccb8d60d198016ba Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Mon, 15 Dec 2025 16:25:52 -0500 Subject: [PATCH 56/59] Try updating silva urls --- include/reference_configs/Drosophila_melanogaster.yaml | 4 ++-- include/reference_configs/test.yaml | 4 ++-- lib/common.py | 5 ----- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/include/reference_configs/Drosophila_melanogaster.yaml b/include/reference_configs/Drosophila_melanogaster.yaml index 8385f023e..ba7ecc7c7 
100644 --- a/include/reference_configs/Drosophila_melanogaster.yaml +++ b/include/reference_configs/Drosophila_melanogaster.yaml @@ -3,8 +3,8 @@ references: rRNA: genome: url: - - 'ftp://ftp.arb-silva.de/release_138_1/Exports/SILVA_138.1_LSURef_NR99_tax_silva.fasta.gz' - - 'ftp://ftp.arb-silva.de/release_138_1/Exports/SILVA_138.1_SSURef_NR99_tax_silva.fasta.gz' + - 'https://ftp.arb-silva.de/release_138_1/Exports/SILVA_138.1_LSURef_NR99_tax_silva.fasta.gz' + - 'https://ftp.arb-silva.de/release_138_1/Exports/SILVA_138.1_SSURef_NR99_tax_silva.fasta.gz' postprocess: function: 'lib.common.filter_fastas' args: 'Drosophila melanogaster' diff --git a/include/reference_configs/test.yaml b/include/reference_configs/test.yaml index 93af49b42..b1f415eed 100644 --- a/include/reference_configs/test.yaml +++ b/include/reference_configs/test.yaml @@ -70,8 +70,8 @@ references: rRNA: genome: url: - - 'ftp://ftp.arb-silva.de/release_138_1/Exports/SILVA_138.1_LSURef_NR99_tax_silva.fasta.gz' - - 'ftp://ftp.arb-silva.de/release_138_1/Exports/SILVA_138.1_SSURef_NR99_tax_silva.fasta.gz' + - 'https://ftp.arb-silva.de/release_138_1/Exports/SILVA_138.1_LSURef_NR99_tax_silva.fasta.gz' + - 'https://ftp.arb-silva.de/release_138_1/Exports/SILVA_138.1_SSURef_NR99_tax_silva.fasta.gz' postprocess: function: 'lib.common.filter_fastas' args: 'Drosophila melanogaster' diff --git a/lib/common.py b/lib/common.py index 03f7be4bc..199d5f71c 100644 --- a/lib/common.py +++ b/lib/common.py @@ -323,11 +323,6 @@ def default_postprocess(origfn, newfn): if url.startswith('file:'): url = url.replace('file://', '') shell('cp {url} {tmpfile} 2> {outfile}.log') - elif url.lower().startswith('ftp'): - parsed_url = urlparse(url) - ftp_url = f"{parsed_url.scheme}://{parsed_url.netloc}" # scheme ~ protocol; netloc ~ domain name - ftp_path = parsed_url.path # File path on server - shell('lftp -e "get {ftp_path}; bye" {ftp_url} -O {tmpfile}') # lftp more resilient to certain kinds of ftp errors than wget else: shell("wget 
{url} -O- > {tmpfile}") From 8c4ea197da69ab4d99725885190d547c12aec309 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 20 Dec 2025 13:02:01 -0500 Subject: [PATCH 57/59] wget -> curl --- lib/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/common.py b/lib/common.py index 199d5f71c..27d286555 100644 --- a/lib/common.py +++ b/lib/common.py @@ -324,7 +324,7 @@ def default_postprocess(origfn, newfn): url = url.replace('file://', '') shell('cp {url} {tmpfile} 2> {outfile}.log') else: - shell("wget {url} -O- > {tmpfile}") + shell("curl -L {url} > {tmpfile}") for func, args, kwargs, outfile in funcs: func(tmpinputfiles, outfile, *args, **kwargs) From 7188aaf92df75d5923566357a8f8b9a53f6d5b65 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 20 Dec 2025 16:05:49 -0500 Subject: [PATCH 58/59] large resource class for rnaseq-misc --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 406e39500..11db1aee0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -425,6 +425,7 @@ jobs: rnaseq-misc: <<: *defaults + resource_class: large steps: - checkout - *restore_cache From a4ad836b244a5d343a440acb315f0113e2036f99 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 20 Dec 2025 16:45:56 -0500 Subject: [PATCH 59/59] large resource class for mutect2 --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 11db1aee0..d4dc06b9d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -413,6 +413,7 @@ jobs: variantcalling: <<: *defaults + resource_class: large steps: - checkout - *restore_cache