diff --git a/.circleci/config.yml b/.circleci/config.yml index dc3287660..d4dc06b9d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -141,6 +141,7 @@ variables: cp $ORIG/workflows/rnaseq/run_downstream_test.sh $DEPLOY/workflows/rnaseq/run_downstream_test.sh cp $ORIG/workflows/references/run_test.sh $DEPLOY/workflows/references/run_test.sh # cp $ORIG/workflows/colocalization/run_test.sh $DEPLOY/workflows/colocalization/run_test.sh + cp $ORIG/workflows/variant-calling/run_test.sh $DEPLOY/workflows/variant-calling/run_test.sh mkdir $DEPLOY/ci mkdir $DEPLOY/test @@ -170,7 +171,7 @@ variables: test/lcdb-wf-test unit_tests --ensure-docs # find all URLs in reference configs and make sure they exist - test/lcdb-wf-test unit_tests --url-check + # test/lcdb-wf-test unit_tests --url-check # run R package unit tests using the R env test/lcdb-wf-test unit_tests --r-test @@ -230,6 +231,11 @@ variables: conda activate $LCDBWF_ENV $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=config/config.yaml -j2 -p -k --orig $ORIG + # Variant-calling references + $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=$ORIG/test/test_configs/variant-calling.yaml -j2 -p -k --orig $ORIG + + + # -------------------------------------------------------------------------- # Standard RNA-seq workflow rnaseq-step: &rnaseq-step @@ -292,6 +298,21 @@ variables: # conda activate $LCDBWF_ENV # $DEPLOY/test/lcdb-wf-test colocalization --run-workflow -k -p -j2 --use-conda --orig $ORIG + + # -------------------------------------------------------------------------- + # Variant-calling workflow + variantcalling-step: &variantcalling-step + run: + name: variantcalling workflow + command: | + cd $DEPLOY + source /opt/miniforge/etc/profile.d/conda.sh + conda activate $LCDBWF_ENV + $DEPLOY/test/lcdb-wf-test variantcalling --run-workflow -n + $DEPLOY/test/lcdb-wf-test variantcalling --run-workflow --use-conda -j2 + + tar -zcf /tmp/variantcalling.tar.gz 
workflows/variant-calling/results + # -------------------------------------------------------------------------- # Syntax note: All of the steps above, with their "&step-name" labels, can be # referred to by a corresponding "*step-name" below. The "<<: *defaults" @@ -390,8 +411,22 @@ jobs: destination: gene-patterns.html + variantcalling: + <<: *defaults + resource_class: large + steps: + - checkout + - *restore_cache + - *set-path + - *get-data + - *variantcalling-step + - store_artifacts: + path: /tmp/variantcalling.tar.gz + destination: variantcalling.tar.gz + rnaseq-misc: <<: *defaults + resource_class: large steps: - checkout - *restore_cache @@ -497,6 +532,10 @@ workflows: # requires: # - initial-setup # - pytest + - variantcalling: + requires: + - initial-setup + - pytest - build-docs: requires: - initial-setup @@ -508,3 +547,4 @@ workflows: - chipseq-misc - references # - colocalization + - variantcalling diff --git a/.gitignore b/.gitignore index ab3fd51ea..0a3984dd2 100644 --- a/.gitignore +++ b/.gitignore @@ -66,3 +66,6 @@ workflows/rnaseq/downstream/rnaseq.html ._* Rplots.pdf /lib/include/* +WORKLOG.rst +workflows/variant-calling/references/ +workflows/variant-calling/results/ diff --git a/deploy.py b/deploy.py index 1981804c5..20b02bb9e 100755 --- a/deploy.py +++ b/deploy.py @@ -108,6 +108,12 @@ def write_include_file(source, flavor="all"): "recursive-include workflows/figures *", "recursive-include workflows/external *", ], + + "variant-calling": [ + "include workflows/variant-calling/Snakefile", + "recursive-include workflows/variant-calling/config *", + ], + } patterns = [] @@ -115,6 +121,8 @@ def write_include_file(source, flavor="all"): patterns.extend(PATTERN_DICT["rnaseq"]) if flavor in ("full", "chipseq"): patterns.extend(PATTERN_DICT["chipseq"]) + if flavor in ("full", "variant-calling"): + patterns.extend(PATTERN_DICT["variant-calling"]) if flavor == "full": patterns.extend(PATTERN_DICT["full"]) patterns.extend(PATTERN_DICT["all"]) @@ -348,7 
+356,7 @@ def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="co "--flavor", default="full", help="""Options are {0}. Default is full.""".format( - ["full", "rnaseq", "chipseq"] + ["full", "rnaseq", "chipseq", "variant-calling"] ), ) ap.add_argument( diff --git a/docs/toc.rst b/docs/toc.rst index 1180c8cd7..0a1cbb5ff 100644 --- a/docs/toc.rst +++ b/docs/toc.rst @@ -13,6 +13,7 @@ Table of Contents rnaseq downstream-rnaseq chipseq + variant-calling integrative conda tests diff --git a/env.yml b/env.yml index 52310a19b..9ab3f5e79 100644 --- a/env.yml +++ b/env.yml @@ -7,433 +7,419 @@ dependencies: - _python_abi3_support=1.0 - _r-mutex=1.0.1 - alabaster=1.0.0 - - alsa-lib=1.2.14 + - alsa-lib=1.2.11 - amply=0.1.6 - annotated-types=0.7.0 - - anyio=4.9.0 + - anyio=4.12.0 - appdirs=1.4.4 - - argcomplete=3.6.2 + - argcomplete=3.6.3 - argh=0.31.3 - argparse-dataclass=2.0.0 - - asttokens=3.0.0 - - attrs=25.3.0 + - asttokens=3.0.1 + - attrs=25.4.0 - babel=2.17.0 - backports=1.0 - backports.tarfile=1.2.0 - - beautifulsoup4=4.13.4 + - bcftools=1.21 + - beautifulsoup4=4.14.3 - bedtools=2.31.1 - - binutils_impl_linux-64=2.44 - - biopython=1.85 + - binutils_impl_linux-64=2.45 + - biopython=1.86 - boost-cpp=1.85.0 - bowtie=1.3.1 - bowtie2=2.5.4 - - brotli=1.1.0 - - brotli-bin=1.1.0 - - brotli-python=1.1.0 + - brotli=1.2.0 + - brotli-bin=1.2.0 + - brotli-python=1.2.0 + - bwa=0.7.18 - bwidget=1.10.1 - - bx-python=0.13.0 + - bx-python=0.11.0 - bzip2=1.0.8 - c-ares=1.34.5 - - ca-certificates=2025.7.14 - - cairo=1.18.4 - - certifi=2025.7.14 - - cffi=1.17.1 - - charset-normalizer=3.4.2 - - click=8.2.1 + - ca-certificates=2025.11.12 + - cairo=1.18.0 + - certifi=2025.11.12 + - cffi=2.0.0 + - charset-normalizer=3.4.4 + - click=8.3.1 - coin-or-cbc=2.10.12 - - coin-or-cgl=0.60.9 - - coin-or-clp=1.17.10 - - coin-or-osi=0.108.11 - - coin-or-utils=2.11.12 + - coin-or-cgl=0.60.7 + - coin-or-clp=1.17.8 + - coin-or-osi=0.108.10 + - coin-or-utils=2.11.11 - colorama=0.4.6 - 
coloredlogs=15.0.1 - colormath=3.0.0 - conda-inject=1.3.2 - configargparse=1.7.1 - connection_pool=0.0.3 - - contourpy=1.3.2 - - cpython=3.12.11 - - cryptography=45.0.5 - - curl=8.14.1 - - cutadapt=5.1 + - contourpy=1.3.3 + - cpython=3.12.12 + - cryptography=46.0.3 + - curl=8.8.0 + - cutadapt=5.2 - cycler=0.12.1 - - dbus=1.16.2 + - dbus=1.13.6 - decorator=5.2.1 - - deeptools=3.5.6 + - deeptools=3.5.5 - deeptoolsintervals=0.1.9 - - distlib=0.3.9 + - distlib=0.4.0 - dnaio=1.2.2 - - docutils=0.21.2 + - docutils=0.22.3 - dpath=2.2.0 - editables=0.5 - eido=0.2.4 - et_xmlfile=2.0.0 - - exceptiongroup=1.3.0 - - execnet=2.1.1 - - executing=2.2.0 - - expat=2.7.0 + - exceptiongroup=1.3.1 + - execnet=2.1.2 + - executing=2.2.1 + - expat=2.7.3 - fastq-screen=0.16.0 - fastqc=0.12.1 - - filelock=3.18.0 + - filelock=3.20.0 - font-ttf-dejavu-sans-mono=2.37 - font-ttf-inconsolata=3.000 - font-ttf-source-code-pro=2.038 - font-ttf-ubuntu=0.83 - - fontconfig=2.15.0 + - fontconfig=2.14.2 - fonts-conda-ecosystem=1 - fonts-conda-forge=1 - - fonttools=4.58.5 - - freetype=2.13.3 - - fribidi=1.0.10 - - gcc_impl_linux-64=15.1.0 + - fonttools=4.61.0 + - freetype=2.12.1 + - fribidi=1.0.16 + - gatk4=4.6.2.0 + - gcc_impl_linux-64=15.2.0 + - gettext=0.25.1 + - gettext-tools=0.25.1 - gffread=0.12.7 - gffutils=0.13 - - gfortran_impl_linux-64=15.1.0 + - gfortran_impl_linux-64=15.2.0 - giflib=5.2.2 - gitdb=4.0.12 - - gitpython=3.1.44 + - gitpython=3.1.45 - graphite2=1.3.14 - - gsl=1.16 - - gxx_impl_linux-64=15.1.0 + - gsl=2.7 + - gxx_impl_linux-64=15.2.0 - h11=0.16.0 - - h2=4.2.0 - - harfbuzz=11.2.1 + - h2=4.3.0 + - harfbuzz=9.0.0 - hatch=1.14.1 - hatchling=1.27.0 - hdf5=1.14.3 - hisat2=2.2.1 - hpack=4.1.0 - html5lib=1.1 - - htslib=1.22 + - htslib=1.21 - httpcore=1.0.9 - httpx=0.28.1 - humanfriendly=10.0 - - humanize=4.12.3 + - humanize=4.14.0 - hyperframe=6.1.0 - hyperlink=21.0.0 - - icu=75.1 - - idna=3.10 + - icu=73.2 + - idna=3.11 - imagesize=1.4.1 - immutables=0.21 - importlib-metadata=8.7.0 - 
importlib_resources=6.5.2 - - iniconfig=2.0.0 + - iniconfig=2.3.0 - intervalstats=1.01 - - ipython=9.4.0 + - ipython=9.8.0 - ipython_pygments_lexers=1.1.1 - isa-l=2.31.1 - jaraco.classes=3.4.0 - jaraco.context=6.0.1 - - jaraco.functools=4.2.1 + - jaraco.functools=4.3.0 - jedi=0.19.2 - jeepney=0.9.0 - jinja2=3.1.6 - - jsonschema=4.24.0 - - jsonschema-specifications=2025.4.1 - - jupyter_core=5.8.1 + - jsonschema=4.25.1 + - jsonschema-specifications=2025.9.1 + - jupyter_core=5.9.1 - kaleido-core=0.2.1 - kallisto=0.51.1 - - kernel-headers_linux-64=3.10.0 - - keyring=25.6.0 - - keyutils=1.6.1 - - kiwisolver=1.4.8 + - kernel-headers_linux-64=4.18.0 + - keyring=25.7.0 + - keyutils=1.6.3 + - kiwisolver=1.4.9 - krb5=1.21.3 - - lcms2=2.17 - - ld_impl_linux-64=2.44 + - lcms2=2.16 + - ld_impl_linux-64=2.45 - lerc=4.0.0 + - lftp=4.9.2 - libaec=1.1.4 - - libblas=3.9.0 + - libasprintf=0.25.1 + - libasprintf-devel=0.25.1 + - libblas=3.11.0 - libboost=1.85.0 - libboost-devel=1.85.0 - libboost-headers=1.85.0 - - libbrotlicommon=1.1.0 - - libbrotlidec=1.1.0 - - libbrotlienc=1.1.0 - - libcblas=3.9.0 + - libbrotlicommon=1.2.0 + - libbrotlidec=1.2.0 + - libbrotlienc=1.2.0 + - libcblas=3.11.0 - libcups=2.3.3 - - libcurl=8.14.1 - - libdeflate=1.22 + - libcurl=8.8.0 + - libdeflate=1.20 - libedit=3.1.20250104 - libev=4.33 - - libexpat=2.7.0 - - libffi=3.4.6 - - libfreetype=2.13.3 - - libfreetype6=2.13.3 - - libgcc=15.1.0 - - libgcc-devel_linux-64=15.1.0 - - libgcc-ng=15.1.0 + - libexpat=2.7.3 + - libffi=3.5.2 + - libgcc=15.2.0 + - libgcc-devel_linux-64=15.2.0 + - libgcc-ng=15.2.0 - libgd=2.3.3 - - libgff=2.0.0 - - libgfortran=15.1.0 - - libgfortran5=15.1.0 - - libglib=2.84.2 - - libgomp=15.1.0 + - libgettextpo=0.25.1 + - libgettextpo-devel=0.25.1 + - libgfortran=15.2.0 + - libgfortran-ng=15.2.0 + - libgfortran5=15.2.0 + - libglib=2.80.2 + - libgomp=15.2.0 - libhwloc=2.11.2 - libiconv=1.18 - libjemalloc=5.3.0 - - libjpeg-turbo=3.1.0 - - liblapack=3.9.0 - - liblapacke=3.9.0 + - 
libjpeg-turbo=3.1.2 + - liblapack=3.11.0 + - liblapacke=3.11.0 - liblzma=5.8.1 - liblzma-devel=5.8.1 - - libnghttp2=1.64.0 + - libnghttp2=1.58.0 - libnsl=2.0.1 - libopenblas=0.3.30 - - libopenssl-static=3.5.1 - - libpng=1.6.50 - - libsanitizer=15.1.0 - - libsqlite=3.50.2 - - libssh2=1.11.1 - - libstdcxx=15.1.0 - - libstdcxx-devel_linux-64=15.1.0 - - libstdcxx-ng=15.1.0 - - libtiff=4.7.0 - - libuuid=2.38.1 - - libwebp-base=1.6.0 - - libxcb=1.17.0 + - libopenssl-static=3.6.0 + - libpng=1.6.43 + - libsanitizer=15.2.0 + - libsqlite=3.46.0 + - libssh2=1.11.0 + - libstdcxx=15.2.0 + - libstdcxx-devel_linux-64=15.2.0 + - libstdcxx-ng=15.2.0 + - libtiff=4.6.0 + - libuuid=2.41.2 + - libwebp=1.4.0 + - libwebp-base=1.4.0 + - libxcb=1.15 - libxcrypt=4.4.36 - - libxml2=2.13.8 - - libzlib=1.3.1 + - libxml2=2.12.7 + - libzlib=1.2.13 - logmuse=0.2.8 - logomaker=0.8.6 - make=4.4.1 - - mariadb-connector-c=3.4.6 - - markdown=3.8.2 - - markdown-it-py=3.0.0 - - markupsafe=3.0.2 + - markdown=3.10 + - markdown-it-py=4.0.0 + - markupsafe=3.0.3 - mathjax=2.7.7 - - matplotlib-base=3.10.3 - - matplotlib-inline=0.1.7 + - matplotlib-base=3.10.1 + - matplotlib-inline=0.2.1 - mdurl=0.1.2 - - more-itertools=10.7.0 - - multiqc=1.30 + - more-itertools=10.8.0 + - multiqc=1.32 - munkres=1.1.4 - mysql-connector-c=6.1.11 - - narwhals=1.46.0 + - narwhals=2.13.0 - natsort=8.4.0 - nbformat=5.10.4 - - ncbi-vdb=3.2.1 - ncurses=6.5 - - networkx=3.5 - - nspr=4.36 - - nss=3.113 - - numpy=2.3.1 - - numpydoc=1.9.0 - - openjdk=23.0.2 - - openjpeg=2.5.3 + - networkx=3.6 + - nspr=4.38 + - nss=3.100 + - numpy=1.26.4 + - numpydoc=1.10.0 + - openjdk=17.0.11 + - openjpeg=2.5.2 - openpyxl=3.1.5 - - openssl=3.5.1 - - ossuuid=1.6.2 + - openssl=3.6.0 - packaging=25.0 - - pandas=2.3.1 - - pandoc=3.7.0.2 - - pango=1.56.4 - - parso=0.8.4 + - pandas=2.3.3 + - pandoc=3.8.3 + - pango=1.54.0 + - parso=0.8.5 - pathspec=0.12.1 - - patsy=1.0.1 + - patsy=1.0.2 - pbzip2=1.1.13 - - pcre2=10.45 + - pcre2=10.43 - pephubclient=0.4.4 - - 
peppy=0.40.7 + - peppy=0.40.8 - perl=5.32.1 - - perl-alien-build=2.84 - - perl-alien-libxml2=0.17 - - perl-business-isbn=3.007 - - perl-business-isbn-data=20210112.006 - - perl-capture-tiny=0.48 - - perl-carp=1.50 - - perl-constant=1.33 - - perl-exporter=5.74 - - perl-extutils-makemaker=7.70 - - perl-ffi-checklib=0.28 - - perl-file-chdir=0.1011 - - perl-file-path=2.18 - - perl-file-temp=0.2304 - - perl-file-which=1.24 - - perl-gd=2.83 + - perl-gd=2.76 - perl-gdgraph=1.56 - perl-gdtextutil=0.86 - - perl-importer=0.026 - - perl-parent=0.243 - - perl-path-tiny=0.124 - - perl-pathtools=3.75 - - perl-scope-guard=0.21 - - perl-sub-info=0.002 - - perl-term-table=0.024 - - perl-test-fatal=0.016 - - perl-test-nowarnings=1.06 - - perl-test-warnings=0.031 - - perl-test2-suite=0.000163 - - perl-try-tiny=0.31 - - perl-uri=5.17 - - perl-xml-libxml=2.0210 - - perl-xml-namespacesupport=1.12 - - perl-xml-sax=1.02 - - perl-xml-sax-base=1.09 - pexpect=4.9.0 - picard=2.27.5 - - pickleshare=0.7.5 - pigz=2.8 - - pillow=11.3.0 - - pip=25.1.1 - - pixman=0.46.2 - - pkgutil-resolve-name=1.3.10 - - plac=1.4.5 - - platformdirs=4.3.8 - - plotly=6.2.0 + - pillow=10.3.0 + - pip=25.3 + - pixman=0.46.4 + - platformdirs=4.5.1 + - plotly=6.5.0 - pluggy=1.6.0 - - polars=1.31.0 - - polars-default=1.31.0 - - preseq=2.0.2 - - prompt-toolkit=3.0.51 - - psutil=7.0.0 + - polars=1.35.2 + - polars-lts-cpu=1.34.0.deprecated + - polars-runtime-32=1.35.2 + - polars-runtime-compat=1.35.2 + - preseq=3.2.0 + - prompt-toolkit=3.0.52 + - psutil=7.1.3 - pthread-stubs=0.4 - ptyprocess=0.7.0 - pulp=2.8.0 - pure_eval=0.2.3 - py2bit=0.3.3 - pyaml-env=1.2.2 - - pybedtools=0.12.0 - - pybigwig=0.3.24 + - pybedtools=0.10.0 + - pybigwig=0.3.22 - pycparser=2.22 - - pydantic=2.11.7 - - pydantic-core=2.33.2 - - pyfaidx=0.8.1.4 + - pydantic=2.12.5 + - pydantic-core=2.41.5 + - pyfaidx=0.9.0.3 - pygments=2.19.2 - - pyparsing=3.2.3 - - pysam=0.23.3 + - pyparsing=3.2.5 + - pysam=0.22.1 - pysocks=1.7.1 - - pytest=8.4.1 + - 
pytest=9.0.2 - pytest-xdist=3.8.0 - - python=3.12.11 + - python=3.12.3 - python-dateutil=2.9.0.post0 - - python-dotenv=1.1.1 - - python-fastjsonschema=2.21.1 - - python-gil=3.12.11 - - python-isal=1.7.2 + - python-dotenv=1.2.1 + - python-fastjsonschema=2.21.2 + - python-gil=3.12.12 + - python-isal=1.8.0 - python-kaleido=0.2.1 - python-tzdata=2025.2 - - python-zlib-ng=0.5.1 + - python-zlib-ng=1.0.0 - python_abi=3.12 - pytz=2025.2 - pyvcf3=1.0.4 - - pyyaml=6.0.2 + - pyyaml=6.0.3 - qhull=2020.2 - - r-base=4.2.3 + - r-base=4.4.0 - readline=8.2 - - referencing=0.36.2 - - regex=2024.11.6 - - requests=2.32.4 + - referencing=0.37.0 + - regex=2025.11.3 + - requests=2.32.5 - reretry=0.11.8 - - rich=14.0.0 - - rich-click=1.8.9 - - roman-numerals-py=3.1.0 - - rpds-py=0.26.0 + - rich=14.2.0 + - rich-click=1.9.4 + - roman-numerals=3.1.0 + - rpds-py=0.30.0 - rseqc=5.0.4 + - rust-bio-tools=0.42.2 - salmon=1.10.3 - - samtools=1.22 - - scipy=1.16.0 + - samtools=1.21 + - scipy=1.16.3 - seaborn=0.13.2 - seaborn-base=0.13.2 - - secretstorage=3.3.3 + - secretstorage=3.4.1 - sed=4.9 - setuptools=80.9.0 - shellingham=1.5.4 - - simplejson=3.20.1 + - simplejson=3.20.2 - six=1.17.0 - - slack-sdk=3.36.0 - - slack_sdk=3.36.0 - - smart_open=7.3.0.post1 + - slack-sdk=3.39.0 + - slack_sdk=3.39.0 + - smart_open=7.5.0 - smmap=5.0.2 - - snakemake=9.8.0 - - snakemake-interface-common=1.20.1 - - snakemake-interface-executor-plugins=9.3.8 - - snakemake-interface-logger-plugins=1.2.3 - - snakemake-interface-report-plugins=1.1.1 - - snakemake-interface-storage-plugins=4.2.1 - - snakemake-minimal=9.8.0 + - snakemake=9.14.3 + - snakemake-interface-common=1.22.0 + - snakemake-interface-executor-plugins=9.3.9 + - snakemake-interface-logger-plugins=2.0.0 + - snakemake-interface-report-plugins=1.3.0 + - snakemake-interface-scheduler-plugins=2.0.2 + - snakemake-interface-storage-plugins=4.3.2 + - snakemake-minimal=9.14.3 - sniffio=1.3.1 - snowballstemmer=3.0.1 - - soupsieve=2.7 + - snpeff=5.3.0a + - 
snpsift=5.3.0a + - soupsieve=2.8 - spectra=0.0.11 - - sphinx=8.2.3 + - sphinx=9.0.4 - sphinxcontrib-applehelp=2.0.0 - sphinxcontrib-devhelp=2.0.0 - sphinxcontrib-htmlhelp=2.1.0 - sphinxcontrib-jsmath=1.0.1 - sphinxcontrib-qthelp=2.0.0 - sphinxcontrib-serializinghtml=1.1.10 - - sqlite=3.50.2 - - sra-tools=3.2.1 + - sqlite=3.46.0 + - sra-tools=2.9.6 - stack_data=0.6.3 - - staden_io_lib=1.15.0 - star=2.7.11b - - statsmodels=0.14.5 - - subread=2.1.1 - - sysroot_linux-64=2.17 + - starcode=1.4 + - statsmodels=0.14.6 + - subread=2.0.6 + - sysroot_linux-64=2.28 - tabulate=0.9.0 - tbb=2022.1.0 - throttler=1.2.2 - - tiktoken=0.9.0 + - tiktoken=0.12.0 - tk=8.6.13 - tktable=2.10 - - tomli=2.2.1 + - tomli=2.3.0 - tomli-w=1.2.0 - tomlkit=0.13.3 - tqdm=4.67.1 - trackhub=1.0 - traitlets=5.14.3 - - trove-classifiers=2025.5.9.12 + - trove-classifiers=2025.12.1.14 - typeguard=4.4.4 - - typer=0.16.0 - - typer-slim=0.16.0 - - typer-slim-standard=0.16.0 - - typing-extensions=4.14.1 - - typing-inspection=0.4.1 - - typing_extensions=4.14.1 + - typer=0.20.0 + - typer-slim=0.20.0 + - typer-slim-standard=0.20.0 + - typing-extensions=4.15.0 + - typing-inspection=0.4.2 + - typing_extensions=4.15.0 - tzdata=2025b - ubiquerg=0.8.0 - - ucsc-bedgraphtobigwig=482 - - ucsc-bedsort=482 - - ucsc-bedtobigbed=482 - - ucsc-bigwigmerge=482 - - ucsc-fetchchromsizes=482 - - ucsc-genepredtobed=482 - - ucsc-gtftogenepred=482 - - ucsc-liftover=482 - - ucsc-oligomatch=482 - - ucsc-twobittofa=482 - - ucsc-wigtobigwig=482 - - unicodedata2=16.0.0 + - ucsc-bedgraphtobigwig=472 + - ucsc-bedsort=469 + - ucsc-bedtobigbed=473 + - ucsc-bigwigmerge=469 + - ucsc-fetchchromsizes=469 + - ucsc-genepredtobed=469 + - ucsc-gtftogenepred=469 + - ucsc-liftover=469 + - ucsc-oligomatch=469 + - ucsc-stringify=472 + - ucsc-twobittofa=472 + - ucsc-wigtobigwig=472 + - unicodedata2=17.0.0 - urllib3=2.5.0 - userpath=1.9.2 - - uv=0.7.20 + - uv=0.9.16 - veracitools=0.1.3 - - virtualenv=20.31.2 - - wcwidth=0.2.13 + - virtualenv=20.35.4 + - 
wcwidth=0.2.14 - webencodings=0.5.1 - wheel=0.45.1 - - wrapt=1.17.2 + - wrapt=1.17.3 - xopen=2.0.2 + - xorg-fixesproto=5.0 + - xorg-inputproto=2.3.2 + - xorg-kbproto=1.0.7 - xorg-libice=1.1.2 - xorg-libsm=1.2.6 - - xorg-libx11=1.8.12 + - xorg-libx11=1.8.9 - xorg-libxau=1.0.12 - xorg-libxdmcp=1.1.5 - - xorg-libxext=1.3.6 - - xorg-libxfixes=6.0.1 - - xorg-libxi=1.8.2 - - xorg-libxrandr=1.5.4 - - xorg-libxrender=0.9.12 - - xorg-libxt=1.3.1 + - xorg-libxext=1.3.4 + - xorg-libxfixes=5.0.3 + - xorg-libxi=1.7.10 + - xorg-libxrender=0.9.11 + - xorg-libxt=1.3.0 - xorg-libxtst=1.2.5 + - xorg-recordproto=1.14.2 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 - xz=5.8.1 - xz-gpl-tools=5.8.1 - xz-tools=5.8.1 - yaml=0.2.5 - - yte=1.8.1 + - yte=1.9.4 - zipp=3.23.0 - - zlib=1.3.1 - - zlib-ng=2.2.4 + - zlib=1.2.13 + - zlib-ng=2.3.2 - zstandard=0.23.0 - - zstd=1.5.7 + - zstd=1.5.6 +prefix: /vf/users/NICHD-core0/test/fullerbl/vc-wf/env diff --git a/include/reference_configs/Drosophila_melanogaster.yaml b/include/reference_configs/Drosophila_melanogaster.yaml index 625c405d3..ba7ecc7c7 100644 --- a/include/reference_configs/Drosophila_melanogaster.yaml +++ b/include/reference_configs/Drosophila_melanogaster.yaml @@ -3,8 +3,8 @@ references: rRNA: genome: url: - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' + - 'https://ftp.arb-silva.de/release_138_1/Exports/SILVA_138.1_LSURef_NR99_tax_silva.fasta.gz' + - 'https://ftp.arb-silva.de/release_138_1/Exports/SILVA_138.1_SSURef_NR99_tax_silva.fasta.gz' postprocess: function: 'lib.common.filter_fastas' args: 'Drosophila melanogaster' diff --git a/include/reference_configs/ERCC.yaml b/include/reference_configs/ERCC.yaml index 22aa25af1..cea79ccc3 100644 --- a/include/reference_configs/ERCC.yaml +++ 
b/include/reference_configs/ERCC.yaml @@ -2,14 +2,14 @@ references: ercc: srm2374: genome: - url: 'https://tsapps.nist.gov/srmext/certificates/documents/SRM2374_Sequence_v1.FASTA' + url: 'https://www-s.nist.gov/srmors/certificates/documents/SRM2374_Sequence_v1.FASTA' postprocess: "lib.postprocess.ercc.fasta_postprocess" indexes: - 'bowtie2' - 'hisat2' - 'star' annotation: - url: 'https://tsapps.nist.gov/srmext/certificates/documents/SRM2374_Sequence_v1.FASTA' + url: 'https://www-s.nist.gov/srmors/certificates/documents/SRM2374_Sequence_v1.FASTA' postprocess: "lib.postprocess.ercc.gtf_postprocess" fisher92: diff --git a/include/reference_configs/test.yaml b/include/reference_configs/test.yaml index a8f80b770..b1f415eed 100644 --- a/include/reference_configs/test.yaml +++ b/include/reference_configs/test.yaml @@ -70,8 +70,8 @@ references: rRNA: genome: url: - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' + - 'https://ftp.arb-silva.de/release_138_1/Exports/SILVA_138.1_LSURef_NR99_tax_silva.fasta.gz' + - 'https://ftp.arb-silva.de/release_138_1/Exports/SILVA_138.1_SSURef_NR99_tax_silva.fasta.gz' postprocess: function: 'lib.common.filter_fastas' args: 'Drosophila melanogaster' diff --git a/include/requirements.txt b/include/requirements.txt index a1a7f5d12..43352f1a3 100644 --- a/include/requirements.txt +++ b/include/requirements.txt @@ -1,18 +1,23 @@ +bcftools>=1.15.1 bedtools biopython bowtie bowtie2 +bwa +curl cutadapt>=3.0 deeptools fastq-screen fastqc font-ttf-dejavu-sans-mono +gatk4 gffread gffutils hisat2 intervalstats ipython kallisto +lftp multiqc pandas pandoc @@ -26,8 +31,9 @@ pyfaidx pysam pytest pytest-xdist -python +python>=3.10 rseqc +rust-bio-tools # earlier versions of salmon can segfault on Slurm salmon>=1.10.1 @@ -35,6 +41,8 @@ salmon>=1.10.1 samtools seaborn 
snakemake>9 +snpeff +snpsift sra-tools star subread diff --git a/lib/aligners.py b/lib/aligners.py index 62fe58a57..629a3eb17 100644 --- a/lib/aligners.py +++ b/lib/aligners.py @@ -83,3 +83,28 @@ def fastq_arg_from_input(fastqs): fastqs = '-1 {0} -2 {1} '.format(*fastqs) return fastqs +def bwa_index_from_prefix(prefix): + """ + Given a prefix, return a list of the corresponding bwa index files + """ + ext_list = ["amb", "ann", "bwt", "pac", "sa"] + return ['{prefix}.{ext}'.format(prefix=prefix, ext=ext_list[i]) for i in range(len(ext_list))] + +def bwa_prefix_from_index(index_files): + """ + Given a list of index files, return the corresponding prefix + """ + if isinstance(index_files, str): + return '.'.join(index_files.split('.')[:-1]) + else: + prefixes = list( + set( + map( + lambda x: '.'.join(x.split('.')[:-1]), index_files) + ) + ) + if len(prefixes) != 1: + raise ValueError( + "More than one prefix detected from '{0}'".format(prefixes) + ) + return prefixes[0] diff --git a/lib/common.py b/lib/common.py index 829cc1298..473ea4bd9 100644 --- a/lib/common.py +++ b/lib/common.py @@ -13,8 +13,11 @@ from lib.imports import resolve_name from lib import aligners from lib import utils +from urllib.parse import urlparse from snakemake.shell import shell from snakemake.io import expand +from lib import helpers +from pathlib import Path # List of possible keys in config that are to be interpreted as paths PATH_KEYS = [ @@ -313,6 +316,7 @@ def default_postprocess(origfn, newfn): if isinstance(urls, str): urls = [urls] + # Download tempfiles into reasonably-named filenames tmpfiles = ['{0}.{1}.tmp'.format(outfile, i) for i in range(len(urls))] tmpinputfiles = tmpfiles @@ -322,7 +326,7 @@ def default_postprocess(origfn, newfn): url = url.replace('file://', '') shell('cp {url} {tmpfile} 2> {outfile}.log') else: - shell("wget {url} -O- > {tmpfile} 2> {outfile}.log") + shell("curl -L {url} > {tmpfile}") for func, args, kwargs, outfile in funcs: func(tmpinputfiles, outfile, 
*args, **kwargs) @@ -419,6 +423,7 @@ def references_dict(config): 'bowtie2': aligners.bowtie2_index_from_prefix('')[0], 'hisat2': aligners.hisat2_index_from_prefix('')[0], 'star': '/Genome', + 'bwa': aligners.bwa_index_from_prefix('')[0], # Notes on salmon indexing: # - pre-1.0 versions had hash.bin @@ -451,13 +456,39 @@ def references_dict(config): type_extensions = { 'genome': 'fasta', 'annotation': 'gtf', - 'transcriptome': 'fasta' + 'transcriptome': 'fasta', + 'known': 'vcf.gz' } for organism in merged_references.keys(): d[organism] = {} for tag in merged_references[organism].keys(): e = {} + if tag == 'variation': + # variation databases should be the the keys of a dictionary + # containing a URL and postprocess block + for type_ in merged_references[organism][tag].keys(): + ext = '.vcf.gz' + if type_ == 'dbnsfp': + type_ = merged_references[organism][tag][type_]['version'] + '_' + merged_references[organism][tag][type_]['build'] + e[type_] = ( + '{references_dir}/' + '{organism}/' + '{tag}/' + '{type_}/' + '{organism}_{tag}{ext}'.format(**locals()) + ) + d[organism][tag] = e + continue + e[type_] = ( + '{references_dir}/' + '{organism}/' + '{tag}/' + '{type_}/' + '{organism}_{tag}{ext}'.format(**locals()) + ) + d[organism][tag] = e + continue for type_, block in merged_references[organism][tag].items(): if type_ == 'metadata': continue @@ -537,7 +568,8 @@ def references_dict(config): .format(**locals()) ) - # Only makes sense to have chromsizes for genome fasta, not transcriptome. + # Only makes sense to have chromsizes and faidx for genome + # fasta, not transcriptome. 
if type_ == 'genome': e['chromsizes'] = ( '{references_dir}/' @@ -546,6 +578,16 @@ def references_dict(config): '{type_}/' '{organism}_{tag}.chromsizes'.format(**locals()) ) + e['faidx'] = ( + '{references_dir}/' + '{organism}/' + '{tag}/' + '{type_}/' + '{organism}_{tag}.fai'.format(**locals()) + ) + + + d[organism][tag] = e return d, conversion_kwargs @@ -623,8 +665,6 @@ def load_config(config, missing_references_ok=False): Resolves any included references directories/files and runs the deprecation handler. """ - if isinstance(config, str): - config = yaml.load(open(config), Loader=yaml.FullLoader) # Here we populate a list of reference sections. Items later on the list # will have higher priority @@ -722,32 +762,22 @@ def is_paired_end(sampletable, sample): # We can't fall back to detecting PE based on two fastq files provided for # each sample when it's an SRA sampletable (which only has SRR accessions). # - # So detect first detect if SRA sampletable based on presence of "Run" - # column and all values of that column starting with "SRR", and then raise - # an error if the Layout column does not exist. - - if "Run" in sampletable.columns: - if all(sampletable["Run"].str.startswith("SRR")): - if "Layout" not in sampletable.columns and "layout" not in sampletable.columns: - raise ValueError( - "Sampletable appears to be SRA, but no 'Layout' column " - "found. 
This is required to specify single- or paired-end " - "libraries.") + # So instead first detect if there is in fact a second fastq file listed, + # and if not then check if the layout of the library is listed row = sampletable.set_index(sampletable.columns[0]).loc[sample] if 'orig_filename_R2' in row: return True - if 'layout' in row and 'LibraryLayout' in row: - raise ValueError("Expecting column 'layout' or 'LibraryLayout', " - "not both") - try: - return row['layout'].lower() in ['pe', 'paired'] - except KeyError: - pass - try: - return row['LibraryLayout'].lower() in ['pe', 'paired'] - except KeyError: - pass + if "Run" in sampletable.columns: + if all(sampletable["Run"].str.startswith("SRR")): + layout_columns = set(sampletable.columns).intersection(['layout', 'LibraryLayout', 'Layout']) + if len(layout_columns) != 1: + raise ValueError("Expected exactly one of ['layout', 'LibraryLayout', 'Layout'] in sample table") + layout_column = list(layout_columns)[0] + try: + return row[layout_column].lower() in ['pe', 'paired'] + except KeyError: + pass return False @@ -912,3 +942,4 @@ def gff2gtf(gff, gtf): shell('gzip -d -S .gz.0.tmp {gff} -c | gffread - -T -o- | gzip -c > {gtf}') else: shell('gffread {gff} -T -o- | gzip -c > {gtf}') + diff --git a/lib/helpers.py b/lib/helpers.py index 9e0d63236..b34b782af 100644 --- a/lib/helpers.py +++ b/lib/helpers.py @@ -5,6 +5,7 @@ from snakemake.shell import shell from snakemake.io import expand, regex_from_filepattern from lib import common +import os class ConfigurationError(Exception): @@ -217,3 +218,22 @@ def strand_arg_lookup(config, lookup): keys = list(lookup.keys()) raise KeyError(f"'{config.stranded}' not one of {keys}") return lookup[config.stranded] + +def get_top_level_dir(start_dir=None): + # Start from the specified directory or current working directory if none is given + current_dir = os.path.abspath(start_dir or os.getcwd()) + # Search current directory and above for targets + while True: + # Check if the 
target directories exists in the current directory + if (os.path.isdir(os.path.join(current_dir, ".git")) and os.path.isdir(os.path.join(current_dir, "workflows"))): + return current_dir + # Move up one level + parent_dir = os.path.dirname(current_dir) + # Stop if we've reached the root directory + if current_dir == parent_dir: + break + current_dir = parent_dir + #TODO: Check for other edge cases? + + return None + diff --git a/lib/helpers.smk b/lib/helpers.smk new file mode 100644 index 000000000..8c146ba80 --- /dev/null +++ b/lib/helpers.smk @@ -0,0 +1,387 @@ +import pandas as pd +import yaml +import os + +# Read sample table +samples = pd.read_table(config["samples"], dtype=str).set_index("sample", drop=False) +units = pd.read_table(config["units"], dtype=str).set_index(["sample","unit"], drop=False) +units.index = units.index.set_levels([i for i in units.index.levels]) + +def preflight(): + """ + This helper function gets called at the top of the main Snakefile. It + handles reading the config to see if references are provided externally, or + if we are relying on lcdb-wf references. Returns variables containing + filepaths of references to be used in rules. It will also perform some + checks to make sure the config is not contradicting itself under certain + configurations. 
+ """ + aln_index = [] + dbnsfp = [] + dictionary = [] + indexed = [] + known_sites = [] + reference = [] + + # Handle reference names if LCDB-WF References is ran + if config['ref']['use_references_workflow']: + include: '../references/Snakefile' + refdict = common.references_dict(config) + reference = refdict[config['ref']['organism'][config['ref']['genome']['tag']]['genome']] + aln = refdict[config['ref']['organism'][config['ref']['aligner']['tag']]['bwa']] + aln_index = multiext(os.path.splitext(aln)[0], ".amb", ".ann", ".bwt", ".pac", ".sa") + indexed = refdict[config['ref']['organism'][config['ref']['faidx']['tag']]['faidx']] + if config['ref']['variation']['dbnsfp']: + # The config can supply a path to a local file in the variation slots + if config['ref']['variation']['dbnsfp'].startswith('/'): + dbnsfp = config['ref']['variation']['dbnsfp'] + else: + dbnsfp = refdict[config['ref']['organism']]['variation'][str( + config['ref']['variation']['dbnsfp'] + '_' + config['ref']['genome']['build'] + )] + else: + dbnsfp = [] + if config['ref']['variation']['known']: + if config['ref']['variation']['known'].startswith('/'): + known_sites = config['ref']['variation']['known'] + else: + known_sites = refdict[config['ref']['organism']][config['ref']['genome']['tag']][config['ref']['variation']['known']] + else: + known_sites = [] + else: + known_sites = ( + config['ref']['paths']['known'] + if config['ref']['paths']['known'] + else [] + ) + reference = config['ref']['paths']['ref'] + indexed = ( + config['ref']['paths']['index'] + if config['ref']['paths']['index'] + else reference + '.fai' + ) + dbnsfp = ( + config['ref']['paths']['dbnsfp'] + if config['ref']['paths']['dbnsfp'] + else [] + ) + aln_index = [] + + # Handle dictionary name, stop the workflow if the fasta file is not named properly. 
Stop the workflow if there is no reference
+    if not reference:
+        raise ValueError("You must supply a reference file to workflow.")
+    if reference.endswith('.gz'):
+        dictionary = '.'.join(reference.split('.')[:-2]) + '.dict'
+    else:
+        try:
+            dictionary = '.'.join(reference.split('.')[:-1]) + '.dict'
+        # A malformed reference value (e.g. a non-string) raises TypeError here
+        except TypeError:
+            raise ValueError("There is something wrong with your reference extension. "
+                             "Please make sure your reference has an extension")
+    # Stop the workflow early if bqsr is requested in the config but no
+    # known-variation sites are available (BaseRecalibrator needs them).
+    # Raise instead of assert so the check survives `python -O`.
+    if config['filtering']['bqsr'] and known_sites == []:
+        raise ValueError('Check your config.yaml. You are requiring that bqsr be run, but there is no known sites vcf')
+
+    return aln_index, dbnsfp, dictionary, indexed, known_sites, reference
+
+
+def get_contigs():
+    """
+    Helper function to read the contigs from the fasta index checkpoint rule.
+    These contigs define the regions to split variant calling by for joint-calling.
+    """
+    with checkpoints.genome_index.get().output[0].open() as fai:
+        ser = pd.read_table(fai, header=None, usecols=[0], dtype=str)
+        ser = ser.squeeze()
+        # TODO: make this less brittle, and better support non-Ensembl organisms
+        # Remove all contigs that don't correspond to a chromosome
+        ser = ser[ser.apply(lambda x: len(x)) <= 2]
+        # Remove mitochondrial contig if specified in the config
+        if config["processing"]["remove-mitochondrial"]:
+            return ser[ser != "MT"]
+        else:
+            return ser
+
+
+def get_fastq(wildcards):
+    """
+    Get fastq files of given sample-unit. Sample-unit structure is how technical replicates
+    are handled. This is defined in the sampletable.
+ """ + fastqs = units.loc[(wildcards.sample, wildcards.unit), ["fq1", "fq2"]].dropna() + if len(fastqs) == 2: + return {"r1": fastqs.fq1, "r2": fastqs.fq2} + return {"r1": fastqs.fq1} + + +def get_read_group(wildcards): + """Denote sample name and platform in read group.""" + return "-R '@RG\\tID:{sample}\\tSM:{sample}\\tPL:{platform}'".format( + sample=wildcards.sample, + platform=units.loc[(wildcards.sample, wildcards.unit), "platform"], + ) + + +def get_recal_input(bai=False): + """ + Handle providing bams the input of the bqsr rules. + Read config options to determine the appropriate bam and bam index files. + If we don't remove duplicates, return the sorted bams from map reads rule. + If duplicates are removed, return the deduplicated bams from the mark duplicates rule. + If a bed file is used in variant calling + """ + # Case 1: no duplicate removal + f = "results/mapped/{sample}-{unit}.sorted.bam" + if config["processing"]["remove-duplicates"]: + # Case 2: remove duplicates + f = "results/dedup/{sample}-{unit}.bam" + if bai: + if config["processing"]["restrict-regions"]: + # Case 3: need an index because random access is required + f += ".bai" + return f + else: + # Case 4: no index needed + return [] + else: + return f + + +def get_sample_bams(wildcards): + """ + Get all aligned reads of given sample. Return the recal bams if bqsr is run + otherwise return the dedup bams. We return all units for a given sample because + we want to provide technical replicates to the variant calling rule where this is called + """ + unitlist = units.loc[wildcards.sample].unit + reslist = [] + if config['filtering']['bqsr']: + reslist.extend( + [ + "results/recal/{}-{}.bam".format(wildcards.sample, unit) for unit in unitlist + ] + ) + else: + reslist.extend( + [ + "results/dedup/{}-{}.bam".format(wildcards.sample, unit) for unit in unitlist + ] + ) + + return reslist + + +def get_sample_unit_bams(wildcards): + """ + Get all aligned reads of given sample. 
Unlike the function above, we return a single sample-unit combination per function call. + This is because this function is used to QC rules like samtools-stats where we do not want to combine technical replicates. + Return the recal bams if bqsr is run otherwise return the dedup bams + """ + reslist = '' + if config['filtering']['bqsr']: + reslist = "results/recal/{sample}-{unit}.bam".format(sample=wildcards.sample, unit=wildcards.unit) + else: + reslist = "results/dedup/{sample}-{unit}.bam".format(sample=wildcards.sample, unit=wildcards.unit) + return reslist + + +def get_regions_param(regions=config["processing"]["restrict-regions"], default=""): + """ + If a captured regions bedfile is present, split the variant calling up into regions + follwing GATK best practices + """ + if regions: + params = "--intervals '{}' ".format(regions) + padding = config["processing"].get("region-padding") + if padding: + params += "--interval-padding {}".format(padding) + return params + return default + + +def get_call_variants_params(wildcards, input): + """ + Calls the previous function to assemble the regions into interval lists + along with any specified parameters for variant calling in the config + """ + return ( + get_regions_param( + regions=input.regions, default="--intervals {}".format(wildcards.contig) + ) + ) + + +def set_java_opts(resources): + """ + Using the resources directive from the snakemake rule + set the heap size. Request 75 percent of the requested + mem_mb. 
The remaining 25 percent should be enough for + OS and other system processes that occur outside the shell command + """ + heap = int(resources.mem_mb * 0.75) + heap = int(heap / 1024) + java_temp ='''"-Xmx{}g -Djava.io.tmpdir=$TMPDIR\"''' + java_opts = java_temp.format(heap) + return java_opts + +def all_input_mutect2(): + """ + Format the input for the all rule for mutect2 + """ + comparisons = config['mutect2'].keys() + return expand("results/mutect2_annotated_normed/{comp}.vcf.gz", comp=comparisons) + + +def names_for_somatic(wildcards): + """ + Format the names into arguments to pass to mutect2. + Mutect2 requires you to specify the names of the "normal" samples. + There can be multiple normal samples in a single mutect2 call. + Tumor samples do not need to be named. This will be done by reading + from the config. + """ + comp = wildcards.comp + normals = config['mutect2'][comp]['normal'] + if not isinstance(normals, list): + normals = [normals] + return normals + + +def input_for_somatic(wildcards): + """ + Format the bam input for mutect2 by reading from the config. + Technical replicates are separated and grouped. 
Returns a dictionary + contains the reference genome, sequence dictionary, and input bams + """ + comp = wildcards.comp + normals = config['mutect2'][comp]['normal'] + if not isinstance(normals, list): + normals = [normals] + tumors = config['mutect2'][comp]['tumor'] + if not isinstance(tumors, list): + tumors = [tumors] + # Fill these lists with paths to tumor and normal files + t_files = [] + n_files = [] + for i in range(len(tumors)): + # Get the unit for each tumor sample + unitlist = units.loc[tumors[i]].unit + if config['filtering']['bqsr']: + t_files.extend( + [ + "results/recal/{}-{}.bam".format(tumors[i], unit) for unit in unitlist + ] + ) + else: + t_files.extend( + [ + "results/dedup/{}-{}.bam".format(tumors[i], unit) for unit in unitlist + ] + ) + # Do the same for Normals + for i in range(len(normals)): + unitlist = units.loc[normals[i]].unit + if config['filtering']['bqsr']: + n_files.extend( + [ + "results/recal/{}-{}.bam".format(normals[i], unit) for unit in unitlist + ] + ) + else: + n_files.extend( + [ + "results/dedup/{}-{}.bam".format(normals[i], unit) for unit in unitlist + ] + ) + + + # Put all the input files needed into a dictionary to pass to the rule + d = dict( + ref=reference, + normals=n_files, + tumors=t_files, + dict=dictionary, + regions=( + "results/called/{contig}.regions.bed".format(contig = wildcards.contig) + if config["processing"]["restrict-regions"] + else [] + ), + ) + return d + + +def get_fai_nomenclature(): + """ + Helper function to get the nomenclature of the fasta index + Returns True if the chr prefix is present, and False if it is absent + """ + nom = False + with checkpoints.genome_index.get().output[0].open() as fai: + for line in fai: + if line.startswith('chr'): + nom = True + break + return nom + + +def get_bed_nomenclature(input): + """ + Helper function to get the nomenclature of the bedfile + Returns True if the chr prefix is present, and False if it is absent + """ + nom = False + with open(input.bed, 'r') 
as f: + for line in f: + if line.startswith('browser'): + continue + if line.startswith('track'): + continue + if line.startswith('chr'): + nom = True + break + return nom + + +def snpeff_input(wildcards): + """ + Ensure all correct input files for snpEff rule are available + """ + dbnsfp = ( + config['ref']['paths']['dbnsfp'] + if config['ref']['paths']['dbnsfp'] + else [] + ) + dbnsfp_tbi = dbnsfp + '.tbi' + vcf = 'results/filtered/all.normed.vcf.gz' + + # make dictionary containing the required snpEff input files + d = dict( + dbnsfp=dbnsfp, + dbnsfp_tbi=dbnsfp_tbi, + vcf=vcf, + ) + return d + + +def snpeff_cancer_input(wildcards): + """ + Ensure all correct input files for snpEff cancer rule are available + """ + dbnsfp = ( + config['ref']['paths']['dbnsfp'] + if config['ref']['paths']['dbnsfp'] + else [] + ) + dbnsfp_tbi = dbnsfp + '.tbi' + vcf = 'results/somatic_filtered/normed.{comp}.vcf.gz'.format(comp = wildcards.comp) + + # make dictionary containing the required snpEff cancer input files + d = dict( + dbnsfp=dbnsfp, + dbnsfp_tbi=dbnsfp_tbi, + vcf=vcf, + ) + return d + +# vim: ft=python diff --git a/test/lcdb-wf-test b/test/lcdb-wf-test index df59b24c5..b12f3bc50 100755 --- a/test/lcdb-wf-test +++ b/test/lcdb-wf-test @@ -105,6 +105,8 @@ class Runner(object): %(prog)s rnaseq --downstream %(prog)s chipseq --run-workflow %(prog)s references --run-workflow --configfile=config/config.yaml + %(prog)s variantcalling --run-workflow + DATA ---- @@ -180,7 +182,7 @@ class Runner(object): parser.add_argument( "--kind", default="all", - choices=["all", "rnaseq", "chipseq"], + choices=["all", "rnaseq", "chipseq", "variantcalling"], help="Kind of data to download", ) parser.add_argument( @@ -194,8 +196,22 @@ class Runner(object): args = parser.parse_args(sys.argv[2:]) - repo = "lcdb-test-data" - URL = f"https://github.com/lcdb/{repo}/blob/{args.branch}/data/{{}}?raw=true" + # Create a repo lookup for the different assays + # For variantcalling, the `args.branch` should 
be "main" instead of "master", unless we can fix this + repo_lookup = { + 'rnaseq': { + 'repo': "lcdb-test-data", + 'URL': f"https://github.com/lcdb/{{repo}}/blob/{args.branch}/data/{{}}?raw=true" + }, + 'chipseq': { + 'repo': "lcdb-test-data", + 'URL': f"https://github.com/lcdb/{{repo}}/blob/{args.branch}/data/{{}}?raw=true" + }, + 'variantcalling': { + 'repo': 'lcdb-wf-variant-calling-test-data', + 'URL': f"https://github.com/lcdb/{{repo}}/blob/{args.branch}/data/{{}}?raw=true" + } + } # This dict maps files in the `data` directory of test-data repo to # a local path to which it should be downloaded, as expected by the @@ -269,6 +285,33 @@ class Runner(object): "workflows/chipseq/data/example_data/chipseq_ip4.fq.gz", ), ], + "variantcalling": [ + ( + "GRCh38.6.20.fa.gz", + "workflows/variant-calling/references/GRCh38.6.20.fa.gz", + ), + ( + "known_variation_noiupac.vcf.gz", + "workflows/variant-calling/references/known_variation_noiupac.vcf.gz" + + ), + ( + "normal_R1.6.20.fq.gz", + "workflows/variant-calling/data/example_data/normal_R1.fq.gz" + ), + ( + "normal_R2.6.20.fq.gz", + "workflows/variant-calling/data/example_data/normal_R2.fq.gz" + ), + ( + "tumor_R1.6.20.fq.gz", + "workflows/variant-calling/data/example_data/tumor_R1.fq.gz" + ), + ( + "tumor_R2.6.20.fq.gz", + "workflows/variant-calling/data/example_data/tumor_R2.fq.gz" + ) + ] } if args.kind == "all": @@ -277,12 +320,12 @@ class Runner(object): kinds = [args.kind] for kind in kinds: for fn, dest in data_files[kind]: - url = URL.format(fn) + url = repo_lookup[kind]['URL'].format(fn, repo=repo_lookup[kind]['repo']) if args.verbose: print(f"downloading {url}") if dest is None: dest = fn - dest = Path(dest) + dest = Path(dest).resolve() dest.parent.mkdir(parents=True, exist_ok=True) sp.run( f"wget -q -O- {url} > {dest}", shell=True, check=True, cwd=TOPLEVEL @@ -577,6 +620,44 @@ class Runner(object): executable="/bin/bash" ) + def _cmd_variantcalling(self): + """ + This function handles the 
"variantcalling" subcommand. + """ + + parser = argparse.ArgumentParser( + description="Run variant calling workflow and downstream tests", + parents=[self.global_parser], + ) + parser.add_argument( + "--run-workflow", + action="store_true", + help="""Run variant workflow using run_tesh.sh, which runs preprocess.py + on the snakefile, converting it to a test file to be run.""", + ) + + + workflow_prefix = "bash run_test.sh" + workflow_dir = TOPLEVEL / "workflows/variant-calling" + args, extra = parser.parse_known_args(sys.argv[2:]) + + if args.run_workflow: + print(args) + extra = [i.replace("__ORIG__", args.orig) for i in extra] + strargs = " ".join(extra) + cmd = ( + 'eval "$(conda shell.bash hook)" ' + f"&& conda activate {args.env} " + f"&& (cd {workflow_dir} && {workflow_prefix} {strargs})" + ) + print_header(f"Running the following command:\n{cmd}") + sp.run( + cmd, + check=True, + shell=True, + executable="/bin/bash" + ) + if __name__ == "__main__": Runner() diff --git a/test/test_configs/variant-calling.yaml b/test/test_configs/variant-calling.yaml new file mode 100644 index 000000000..d768eb05e --- /dev/null +++ b/test/test_configs/variant-calling.yaml @@ -0,0 +1,30 @@ +references_dir: "references" +references: + human: + ensembl-104: + metadata: + build: 'GRCh38' + release: 104 + species: 'homo_sapiens' + genome: + url: 'https://github.com/lcdb/lcdb-wf-variant-calling-test-data/raw/master/data/GRCh38.6.20.fa.gz' + # URL format is 'ftp://ftp.ensembl.org/pub/{branch}release-{release}/fasta/{species}/{datatype}/{species_capitalized}.{build}.{datatype}.{assembly}.{suffix}' + # When using GRCh37, branch changes to "grch37/release-{release}" + # always use primary_assembly for human, NEVER use top_level for assembly for human + indexes: + - 'bwa' + known: + # You can download structural_variations, somatic, or "all" which corresponds to germline known variation for all chromosomes + type: 'all' + # Comment the variation key out if not requiring dbnsfp + 
#variation: + # Download of variation databases will be handled by a unique rule in the Snakefile + # ONLY include keys like 'dbnsfp' IF you intend to download them, comment out these keys if not. + #dbnsfp: + # The version of the database should be correctly formatted like this + #version: 'dbNSFPv4.4' + # The url is found by copying the link address of the latest version found here: https://sites.google.com/site/jpopgen/dbNSFP + #url: 'https://usf.box.com/shared/static/bvfzmkpgtphvbmmrvb2iyl2jl21o49kc' + # Match the build to the metadata block above + #build: 'GRCh38' + diff --git a/lib/test_suite.py b/test/tests/test_suite.py similarity index 68% rename from lib/test_suite.py rename to test/tests/test_suite.py index 21b9c0527..b1389f037 100644 --- a/lib/test_suite.py +++ b/test/tests/test_suite.py @@ -1,8 +1,25 @@ -import os -import pprint +import sys +import subprocess +top_level_dir = subprocess.run(["dirname $(dirname $(pwd))"], shell=True, capture_output=True, text=True).stdout.strip() +print("top level dir: ", top_level_dir) +sys.path.insert(0, top_level_dir) +import pytest from textwrap import dedent -from . 
import common +from lib import common, helpers, patterns_targets +# Make config object that can be re-used for any test +@pytest.fixture +def config(request): + config_path = request.param + config = common.load_config(config_path, test=True) + return patterns_targets.RNASeqConfig(config, config.get('patterns', '../workflows/rnaseq/config/rnaseq_patterns.yaml')) + +# Call helpers.detect_layout(), which implicitly tests common.is_paired_end() +# TODO: Make assertion condition NOT hard coded in to work with current example table +@pytest.mark.parametrize("config", ['../../workflows/rnaseq/config/config.yaml'], indirect=True) +def test_is_paired_end(config): + is_paired = helpers.detect_layout(config.sampletable) == 'PE' + assert not is_paired, f"Test failed, is_paired = {is_paired}" def test_config_loading(tmpdir): f0 = tmpdir.mkdir('subdir').join('file0.yaml') diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile index 7d25545b9..84a4af7b7 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -4,7 +4,8 @@ import gzip import yaml import importlib import tempfile -import pandas +from tempfile import TemporaryDirectory +import pandas as pd from snakemake.utils import makedirs HERE = str(Path(workflow.snakefile).parent) @@ -69,6 +70,29 @@ rule unzip: mem_mb=autobump(gb=1), shell: 'gunzip -c {input} > {output}' +rule bwa_index: + """ + Build bwa index + """ + input: + '{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta' + output: + protected(aligners.bwa_index_from_prefix('{references_dir}/{organism}/{tag}/genome/bwa/{organism}_{tag}')) + log: + '{references_dir}/logs/{organism}/{tag}/genome/bwa/{organism}_{tag}.log' + resources: + runtime=autobump(hours=3), + mem_mb=gb(24), + disk_mb=gb(24) + run: + prefix=aligners.bwa_prefix_from_index(output) + print(prefix) + shell( + 'bwa index ' + '-p {prefix} ' + '-a bwtsw ' + '{input} ' + '&> {log}') rule bowtie2_index: """ @@ -362,7 +386,7 @@ rule mappings: 
d['__featuretype__'] = ft res.append(d) - df = pandas.DataFrame(res) + df = pd.DataFrame(res) # Depending on how many attributes there were and the # include_featuretypes settings, this may take a while. @@ -373,4 +397,176 @@ rule mappings: # Restore original setting gffutils.constants.always_return_list = orig_setting + +checkpoint genome_index: + """ + Build fasta index. GATK uses this file for rapid fasta accession as well as + the fasta index is used to identify chromosomes and contigs in the genome + to download the appropriate known variation file + """ + input: + '{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta' + output: + protected('{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fai') + log: + '{references_dir}/logs/{organism}/{tag}/genome/{organism}_{tag}.fai.log' + resources: + runtime=hours(1), + mem_mb=gb(4) + run: + shell( + 'samtools ' + 'faidx ' + '-o {output} {input} ' + '&> {log}' + ) + + +rule known_variation: + """ + Download all the chromosomes on the ensembl ftp site and combine them to + create a known variation vcf + """ + input: + fai='{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fai' + output: + protected('{references_dir}/{organism}/{tag}/known/{organism}_{tag}.vcf.gz') + log: + '{references_dir}/{organism}/{tag}/known/{organism}_{tag}.known.log' + resources: + runtime=hours(4), + mem_mb=gb(16), + disk_mb=gb(32) + run: + release = int(config['references'][wildcards.organism][wildcards.tag]['metadata']['release']) + species = config['references'][wildcards.organism][wildcards.tag]['metadata']['species'] + build = config['references'][wildcards.organism][wildcards.tag]['metadata']['build'] + typ = config['references'][wildcards.organism][wildcards.tag]['known']['type'] + + # ---------------------------------------------------------------------- + # NOTE: species-specific configuration may be required + # ---------------------------------------------------------------------- + # + # Ensembl has many 
species available, but the code below is designed to + # work for human. + # + # For available species, see https://ftp.ensembl.org/pub/release-109/variation/vcf/ + # + + # Human-specific patching to deal with GRCh37 filenames and directory + # structure on Ensembl FTP + branch = "" + if release >= 81 and build == "GRCh37": + branch="grch37/" + if typ == "all": + suffixes = [""] + + # Starting in release 93, human germline VCFs are split by chrom + if species == "homo_sapiens" and release >= 93: + ser = ( + pd.read_table(input.fai, header=None, usecols=[0], dtype=str) + .squeeze() + ) + contigs = ser[ser.apply(lambda x: len(x)) <= 2] + suffixes = expand("-chr{chrom}", chrom=contigs) + + elif typ == "somatic": + suffixes=["_somatic"] + elif typ == "structural_variations": + suffixes=["_structural_variations"] + species = species.lower() + release = int(release) + build = build + typ = typ + + # Prior to release 91, species names started with a capital letter + species_filename = species if release >= 91 else species.capitalize() + + urls = expand( + "ftp://ftp.ensembl.org/pub/{branch}release-{release}/variation/vcf/{species}/{species_filename}{suffix}.{ext}", + release=release, + species=species, + suffix=suffixes, + species_filename=species_filename, + branch=branch, + ext=["vcf.gz", "vcf.gz.csi"] + ) + vcfs = [os.path.basename(url) for url in urls if url.endswith(".gz")] + + # Compose a command that downloads all vcf.gz files (possibly one for + # each contig in human Ensembl releases >=93) + gather = "curl {urls}".format(urls=" ".join(map("-O {}".format, urls))) + + # We'll be downloading in a temp dir, so get absolute paths to make things easier + fai = os.path.abspath(input.fai) + log = os.path.abspath(str(log)) + with tempfile.TemporaryDirectory() as tmpdir: + if input.get("fai"): + shell( + "cd {tmpdir}; {gather} 2> {log} " + "&& bcftools concat -Oz --naive-force {vcfs} > concat.vcf.gz 2>> {log}" + ) + shell( + "bcftools reheader --fai {fai} 
{tmpdir}/concat.vcf.gz> {output} 2>> {log} " + "&& tabix -p vcf {output} 2>> {log} " + ) + +#if config['references']['human'].get('variation'): +# rule dbnsfp: +# """ +# Download and process dbNSFP database. This involves downloading and +# extracting the zip file, then combining the chromosomes to create +# a single file. For genome builds like hg19 and GRCh37, some processing +# needs to be done to make them compatible with dbNSFP version > 3.X +# dbNSFP is only for human genomes. +# """ +# output: +# protected( +# '{{references_dir}}/{{organism}}/{{tag}}/{dbnsfp_version}_{build}/{{organism}}_{{tag}}.vcf.gz'.format( +# dbnsfp_version=config['references']['human']['variation']['dbnsfp']['version'], +# build=config['references']['human']['variation']['dbnsfp']['build'] +# ) +# ) +# log: +# '{{references_dir}}/{{organism}}/{{tag}}/{dbnsfp_version}_{build}/{{organism}}_{{tag}}.log'.format( +# dbnsfp_version=config['references']['human']['variation']['dbnsfp']['version'], +# build=config['references']['human']['variation']['dbnsfp']['build'] +# ) +# resources: +# disk_mb=gb(500), +# mem_mb=gb(500), +# runtime=hours(8) +# threads: 16 +# run: +# version = config['references']['human'][wildcards.tag]['dbnsfp']['version'] +# URL = config['references']['human'][wildcards.tag]['dbnsfp']['url'] +# build = config['references']['human'][wildcards.tag]['dbnsfp']['build'] +# workdir = wildcards.references_dir +# if build == 'GRCh37': +# # We need to process the dbNSFP file to make it compatible with older genomes +# with tempfile.TemporaryDirectory() as tmpdir: +# shell( +# '''(cd {tmpdir}; wget -O- {URL} > dbnsfp.zip && ''' +# '''unzip dbnsfp.zip && zcat dbNSFP*_variant.chr1* | awk "NR<=1" > h && ''' +# '''zgrep -v "^#" dbNSFP*_variant.chr* > all_chrs && ''' +# '''awk '$8 != "." 
' all_chrs > all_chrs_filtered && ''' +# '''sort -S 50% --parallel=12 all_chrs_filtered -k8,8 -k9,9n > all_chrs_filtered_sorted && ''' +# '''cat h all_chrs_filtered_sorted > all_chrs_filtered_sorted_header && ''' +# '''bgzip -c all_chrs_filtered_sorted_header > {output}) && ''' +# '''tabix -s 8 -b 9 -e 9 {output} ''' +# ) +# if build == 'GRCh38': +# with tempfile.TemporaryDirectory() as tmpdir: +# # No need for processing and we can use the first 2 columns for coordinates +# shell( +# '''(cd {tmpdir}; wget -O- {URL} > dbnsfp.zip && ''' +# '''unzip dbnsfp.zip && zcat dbNSFP*_variant.chr1* | awk "NR<=1" > h && ''' +# '''zgrep -v "^#" dbNSFP*_variant.chr* > all_chrs && ''' +# '''sort -S 50% --parallel=24 all_chrs -k1,1 -k2,2n > all_chrs_sorted && ''' +# '''cat h all_chrs_sorted > all_chrs_sorted_header && ''' +# '''bgzip -c all_chrs_sorted_header > {output}) && ''' +# '''tabix -s 1 -b 2 -e 2 {output} ''' +# ) +# + # vim: ft=python diff --git a/workflows/variant-calling/README.md b/workflows/variant-calling/README.md new file mode 100644 index 000000000..32423f298 --- /dev/null +++ b/workflows/variant-calling/README.md @@ -0,0 +1,3 @@ +Please note, the variant-calling workflow is still experimental. + +Detailed documentation can be found in this directory in `variant-calling.rst`. 
diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile new file mode 100644 index 000000000..783e87491 --- /dev/null +++ b/workflows/variant-calling/Snakefile @@ -0,0 +1,860 @@ +import sys +import pandas as pd +import tempfile +import os +from os import path +import re +from tempfile import TemporaryDirectory,NamedTemporaryFile +from snakemake.shell import shell +import yaml +from textwrap import dedent +from pathlib import Path +from urllib.request import urlretrieve +from zipfile import ZipFile +HERE = str(Path(workflow.snakefile).parent) +sys.path.insert(0, HERE + "/../..") +from lib import common, utils, helpers, aligners +from lib.utils import autobump, gb, hours + +configfile: "config/config.yaml" + +include: '../../lib/helpers.smk' + +aln_index, dbnsfp, dictionary, indexed, known_sites, reference = preflight() + +wildcard_constraints: + vartype="snvs|indels", + sample="|".join(samples.index), + unit="|".join(units["unit"]), + comp="|".join(config['mutect2'].keys()) + + +rule all: + input: + "results/qc/multiqc.html", + "results/filtered/all.normed.vcf.gz", + expand("results/somatic_filtered/normed.{comp}.vcf.gz", comp = config['mutect2'].keys()), + expand("results/merged/{sample}.bam", sample=samples.index), + expand("results/merged/{sample}.bam.bai", sample=samples.index), + + + +checkpoint genome_index: + threads: 2 + resources: + mem_mb=gb(4), + runtime=autobump(60) + input: + reference + output: + indexed + log: + 'logs/fasta_index.log' + shell: + "samtools " + "faidx " + "{input} > {output} 2> {log} " + + +rule fasta_dict: + threads: 1 + resources: + mem_mb=gb(4), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + disk_mb=gb(4), + runtime=autobump(60) + input: + ref=reference + output: dictionary + log: "logs/sequence_dictionary.log" + run: + java_opts = set_java_opts(resources) + shell( + 'picard CreateSequenceDictionary {java_opts} ' + '-R {input.ref} ' + '-O {output} &> {log} ' + ) + + +if not aln_index: + rule bwa_index: + 
""" + Generate BWA index for the reference genome if we are not using lcdb-wf references workflow + """ + threads: 8 + resources: + disk_mb=gb(24), + mem_mb=gb(24), + runtime=autobump(180) + input: + reference + output: + multiext(reference, ".amb", ".ann", ".bwt", ".pac", ".sa") + log: + "logs/bwa_index.log" + params: + "bwtsw " + shell: + "bwa index -a {params} " + "{input} " + " &> {log}" + + +rule trim_adapters: + threads: 8 + resources: + mem_mb=gb(32), + runtime=autobump(360) + input: unpack(get_fastq), + output: + r1="results/trimmed/{sample}-{unit}.1.fastq.gz", + r2="results/trimmed/{sample}-{unit}.2.fastq.gz", + log: + "logs/{sample}-{unit}_trimming.log" + shell: + 'cutadapt ' + '-o {output.r1} ' + '-p {output.r2} ' + '-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT ' + '--nextseq-trim 20 ' + '--overlap 6 ' + '-j {threads} ' + '--minimum-length 25 ' + '{input.r1} ' + '{input.r2} ' + ' &> {log} ' + + +rule map_reads: + threads: 32 + resources: + disk_mb=gb(40), + mem_mb=gb(48), + runtime=autobump(1920) + input: + reads=["results/trimmed/{sample}-{unit}.1.fastq.gz","results/trimmed/{sample}-{unit}.2.fastq.gz"], + idx=multiext(reference, ".amb", ".ann", ".bwt", ".pac", ".sa"), + output: + bam=temp("results/mapped/{sample}-{unit}.sorted.bam"), + params: + extra=get_read_group, + index=lambda w, input: os.path.splitext(input.idx[0])[0], + log: + "logs/{sample}-{unit}_bwamem.log" + shell: + "bwa mem " + "-t {threads} " + "{params.extra} " + "{params.index} " + "{input.reads} | " + "samtools view -bh | samtools sort -o {output} -O BAM " + "2> {log}" + + +rule mark_duplicates: + """ + If we run bqsr, then we do not need to save the output of mark duplicates, since those bams will + be recalibrated. 
However, if we don't recalibrate, then we need to save the bams from mark duplicates + and we don't want to mark them as temporary + """ + threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] + resources: + disk_mb=gb(40), + mem_mb=gb(32), + # mem_mb=gb(2), # [ TEST SETTINGS -1 ] + runtime=autobump(720) + input: + bam = "results/mapped/{sample}-{unit}.sorted.bam", + output: + metrics="results/qc/picard/markdups/{sample}-{unit}_marked_dup_metrics.txt", + bam=( + temp("results/dedup/{sample}-{unit}.bam") if config['filtering']['bqsr'] + else "results/dedup/{sample}-{unit}.bam" + ) + + log: + "logs/{sample}-{unit}_mark_dup.log" + params: + rm = ('-REMOVE_DUPLICATES true ' + if config['processing']['remove-duplicates'] + else '') + run: + java_opts = set_java_opts(resources) + shell( + 'picard MarkDuplicates ' + '{java_opts} ' + '-INPUT {input.bam} ' + '-OUTPUT {output.bam} ' + '-ASSUME_SORT_ORDER coordinate ' + '{params.rm} ' + '-METRICS_FILE {output.metrics} ' + ' 2> {log} ' + ) + + +if config["filtering"]['bqsr']: + rule base_recalibrator: + threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] + resources: + mem_mb=gb(8), + # mem_mb=gb(2), # [ TEST SETTINGS -1 ] + disk_mb=gb(40), + runtime=autobump(960) + input: + bam=get_recal_input(bai=False), + bai=get_recal_input(bai=True), + ref=reference, + dict=dictionary, + known=known_sites, + known_idx=known_sites + '.tbi' + output: + recal_table="results/recal/{sample}-{unit}.grp" + log: + "logs/{sample}-{unit}_base_recalibrator.log" + run: + java_opts = set_java_opts(resources) + shell( + 'gatk --java-options {java_opts} BaseRecalibrator ' + '-R {input.ref} ' + '-I {input.bam} ' + '-O {output.recal_table} ' + '--known-sites {input.known} 2> {log}' + ) + + + rule apply_bqsr: + threads: 8 + # threads: 1 # [ TEST SETTINGS -1 ] + resources: + mem_mb=gb(32), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + disk_mb=gb(40), + runtime=autobump(960) + input: + bam=get_recal_input(bai=False), + bai=get_recal_input(bai=True), + ref=reference, + 
dict=dictionary, + recal_table="results/recal/{sample}-{unit}.grp", + output: + bam=protected("results/recal/{sample}-{unit}.bam") + log: + "logs/{sample}-{unit}_apply_bsqr.log" + run: + java_opts = set_java_opts(resources) + shell( + 'gatk --java-options {java_opts} ApplyBQSR ' + '-R {input.ref} ' + '-I {input.bam} ' + '--bqsr-recal-file {input.recal_table} ' + '-O {output.bam} 2> {log}' + ) + + +rule build_bam_index: + resources: + mem_mb=gb(2), + disk_mb=gb(2), + runtime=autobump(30) + input: + bam ="{prefix}.bam" + output: + "{prefix}.bam.bai" + run: + basename = os.path.basename(input.bam) + log = 'logs/' + os.path.splitext(basename)[0] + '_buildbamindex.log' + shell("samtools index {input.bam} > {output} 2> {log}") + + +if config["processing"]["restrict-regions"]: + rule compose_regions: + """ + This command will ONLY work if the chromosome nomeclature matches the format in the reference genome + That is, chromosomes DO NOT have the 'chr' prefix. + + .bed files are formatted like so: + + Some bed files have header lines that can start with the word 'browser' or 'track' per UCSC + + To check this, we will read the lines of the .bed file and compare them to what is in the fasta index. + If we encounter any mismatches, then we exit with division by zero error and a print + statement that explains the bed file needs to be edited to make the nomenclature match. + + The awk command in the shell statement prints the entire lines of the input bed + into distinct files that are each named by the first column (chromosome) + Basically, we are splitting the provided .bed file up into contigs. 
+ """ + resources: + disk_mb=1024, + mem_mb=1024, + runtime=20 + input: + bed = config["processing"]["restrict-regions"], + output: + "results/called/{contig}.regions.bed" + log: + "logs/{contig}_compose_regions.log" + run: + # Check for nomenclature mismatch using helper function in helpers.smk + chr_fai = get_fai_nomenclature + chr_bed = get_bed_nomenclature + if chr_fai != chr_bed: + raise ValueError("Nomenclature mismatch detected. Please review the fasta index file and the .bed files being used. The chromosome format MUST match between the .bed file and the reference. Please edit the bed file. For GRCh38 genomes, there should be no 'chr' prefix.") + shell(''' awk '$1 == "{wildcards.contig}" {{print $0 >> (t "/" $1 ".regions.bed" )}}' t=results/called {input} ''') + + +rule call_variants: + input: + bam=get_sample_bams, + ref=reference, + dict=dictionary, + known=known_sites, + tbi=( + known_sites + '.tbi' if known_sites else [] + ), + regions=( + "results/called/{contig}.regions.bed" + if config["processing"]["restrict-regions"] + else [] + ), + output: + gvcf=protected("results/called/{sample}.{contig}.g.vcf.gz"), + log: "logs/{sample}_{contig}_call_variants.log" + resources: + disk_mb=gb(16), + mem_mb=gb(40), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(hours=8) + threads: 8 + # threads: 1 # [ TEST SETTINGS -1 ] + params: + extra=get_call_variants_params, + pcr=( + '--pcr-indel-model ' + config['processing']['pcr'] + if config['processing']['pcr'] + else '' + ) + run: + java_opts = set_java_opts(resources) + regions = params.extra + bams = input.bam + if isinstance(bams, str): + bams = [bams] + bams = list(map("-I {}".format, bams)) + shell( + 'gatk --java-options {java_opts} HaplotypeCaller {regions} ' + '-R {input.ref} ' + '{bams} ' + '-ERC GVCF ' + '{params.pcr} ' + '-O {output.gvcf} ' + ) + + +rule combine_calls: + threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] + resources: + disk_mb=gb(10), + mem_mb=gb(4), + # mem_mb=gb(4), # [ TEST 
SETTINGS -1 ] + runtime=autobump(720) + input: + ref=reference, + gvcfs=expand("results/called/{sample}.{{contig}}.g.vcf.gz", sample=samples.index), + output: + gvcf="results/called/all.{contig}.g.vcf.gz", + log: "logs/{contig}_combine_calls.log" + run: + java_opts = set_java_opts(resources) + gvcfs=list(map("-V {}".format, input.gvcfs)) + shell( + 'gatk --java-options {java_opts} CombineGVCFs ' + '{gvcfs} ' + '-R {input.ref} ' + '-O {output.gvcf} 2> {log} ' + ) + + +rule genotype_variants: + threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] + resources: + disk_mb=gb(10), + mem_mb=gb(8), + # mem_mb=gb(2), # [ TEST SETTINGS -1 ] + runtime=autobump(480) + input: + ref=reference, + idx="results/called/all.{contig}.g.vcf.gz.tbi", + gvcf="results/called/all.{contig}.g.vcf.gz", + output: + vcf=temp("results/genotyped/all.{contig}.vcf.gz"), + log: + "logs/genotypegvcfs.{contig}.log", + run: + java_opts = set_java_opts(resources) + shell( + 'gatk --java-options {java_opts} GenotypeGVCFs ' + '-V {input.gvcf} ' + '-R {input.ref} ' + '-O {output.vcf} 2> {log}' + ) + + +rule merge_variants: + threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] + resources: + disk_mb=gb(10), + mem_mb=gb(8), + # mem_mb=gb(2), # [ TEST SETTINGS -1 ] + runtime=autobump(480) + input: + vcfs=lambda w: expand( + "results/genotyped/all.{contig}.vcf.gz", contig=get_contigs() + ), + output: + vcf="results/genotyped/all.vcf.gz", + log: + "logs/merge-genotyped.log", + run: + inputs = " ".join("-INPUT {}".format(f) for f in input.vcfs) + java_opts = set_java_opts(resources) + shell( + 'picard' + ' MergeVcfs' + ' {java_opts}' + ' {inputs}' + ' -OUTPUT {output}' + ' 2> {log}' + ) + + +rule tabix_variants: + threads: 2 + resources: + disk_mb=gb(2), + mem_mb=gb(2), + runtime=autobump(30) + input: + vcf="{prefix}.vcf.gz", + output: + "{prefix}.vcf.gz.tbi", + run: + basename = os.path.basename(input.vcf) + log = 'logs/' + os.path.splitext(basename)[0] + '_tabix.log' + shell("tabix -p vcf {input.vcf} 2> {log} ") + + 
rule select_calls:
    """
    Split the joint-genotyped VCF into SNVs or indels (the {vartype}
    wildcard) with GATK SelectVariants, ahead of vartype-specific hard
    filtering.
    """
    threads: 4
    # threads: 1 # [ TEST SETTINGS -1 ]
    resources:
        disk_mb=gb(10),
        mem_mb=gb(4),
        # mem_mb=gb(2), # [ TEST SETTINGS -1 ]
        runtime=autobump(480)
    input:
        ref=reference,
        vcf="results/genotyped/all.vcf.gz",
    output:
        vcf=temp("results/filtered/all.{vartype}.vcf.gz"),
    log:
        "logs/selectvariants_{vartype}.log",
    run:
        java_opts = set_java_opts(resources)
        # BUG FIX: a trailing comma previously made vartype_arg a 1-tuple, so
        # the tuple's repr (parentheses, quotes and all) was interpolated into
        # the gatk command line, breaking the invocation.
        vartype_arg = "--select-type-to-include {}".format(
            "SNP" if wildcards.vartype == "snvs" else "INDEL"
        )
        shell(
            'gatk --java-options {java_opts} SelectVariants '
            '-R {input.ref} '
            '-V {input.vcf} '
            '{vartype_arg} '
            '-O {output.vcf} 2> {log}'
        )


rule hard_filter_calls:
    """
    Apply the hard-filter expressions from config['filtering']['hard'] to the
    vartype-specific VCF with GATK VariantFiltration. Failing records are
    labelled in the FILTER column with the filter name given below.
    """
    threads: 4
    # threads: 1 # [ TEST SETTINGS -1 ]
    resources:
        disk_mb=gb(10),
        mem_mb=gb(4),
        # mem_mb=gb(2), # [ TEST SETTINGS -1 ]
        runtime=autobump(480)
    input:
        ref=reference,
        vcf="results/filtered/all.{vartype}.vcf.gz",
    output:
        vcf=temp("results/filtered/all.{vartype}.hardfiltered.vcf.gz"),
    log:
        "logs/variantfiltration_{vartype}.log",
    run:
        java_opts = set_java_opts(resources)
        # FIX: the filter was previously named with the misspelled, hard-coded
        # key 'snv_hard_filer' for snvs AND indels; name it after the vartype
        # so the FILTER column is accurate.
        filter_arg = {
            '{}_hard_filter'.format(wildcards.vartype):
                config['filtering']['hard'][wildcards.vartype]
        }
        filters = [
            "--filter-name {} --filter-expression '{}'".format(name, expr.replace("'", "\\'"))
            for name, expr in filter_arg.items()
        ]
        shell(
            'gatk --java-options {java_opts} VariantFiltration '
            '-R {input.ref} '
            '-V {input.vcf} '
            '{filters} '
            '-O {output.vcf} 2> {log}'
        )
MergeVcfs' + ' {java_opts}' + ' {inputs}' + ' -OUTPUT {output}' + ' 2> {log}' + ) + + +rule norm: + """ + Split multiallielic variants into multiple biallelic ones. + """ + resources: + mem_mb=gb(16), + runtime=autobump(120) + input: + ref=reference, + vcf="results/filtered/all.final.vcf.gz" + output: + "results/filtered/all.normed.vcf.gz" + log: + "logs/norm-vcf.log" + shell: + "bcftools norm -f {input.ref} " + "-m- " + "{input.vcf} " + "--output-type z " + "--output {output} 2> {log}" + + +rule fastqc: + resources: + mem_mb=gb(12), + runtime=autobump(120), + threads: 8 + input: + unpack(get_fastq), + output: + html="results/qc/fastqc/data/{sample}-{unit}_fastqc.html", + zip="results/qc/fastqc/zip/{sample}-{unit}_fastqc.zip" + log: + "logs/{sample}-{unit}_fastqc.log" + run: + def base_file(file_path): + baseName = Path(path.basename(file_path)) + while baseName.suffix in {'.gz','.bz2','.txt','.fastq','.fq','.sam','.bam'}: + baseName = baseName.with_suffix('') + return str(baseName) + with TemporaryDirectory() as tempdir: + shell( + "fastqc " + "--threads {threads} " + "--noextract " + "--quiet " + "--outdir {tempdir:q} " + "{input:q} " + "&> {log} " + ) + output_base = base_file(input[0]) + html_path = path.join(tempdir, output_base + "_fastqc.html") + zip_path = path.join(tempdir, output_base + "_fastqc.zip") + if output.html != html_path: + shell("mv {html_path:q} {output.html:q}") + if output.zip != zip_path: + shell("mv {zip_path:q} {output.zip:q}") + + +rule samtools_stats: + """ + Run samtools stats + """ + resources: + mem_mb=gb(16), + runtime=autobump(120) + input: + get_sample_unit_bams + output: + "results/qc/samtools-stats/{sample}-{unit}.txt" + log: + "logs/samtools-stats_{sample}-{unit}.log" + shell: + "samtools stats {input} 1> {output} 2> {log} " + + +rule multiqc: + """ + Gather qc metrics and run MultiQC + Get the html output from somatic and germline VCF annotation if specified in the config. 
rule mutect2:
    """
    Use Mutect2 to call somatic variants on individual samples, one per
    contig. Also emits the F1R2 counts tarball consumed by
    LearnReadOrientationModel and the per-contig stats file needed by
    FilterMutectCalls.
    """
    resources:
        disk_mb=gb(40),
        mem_mb=gb(32),
        # mem_mb=gb(4), # [ TEST SETTINGS -1 ]
        runtime=autobump(720)
    input:
        unpack(input_for_somatic)
    output:
        vcf="results/mutect2_called/raw.{comp}.{contig}.vcf.gz",
        stats='results/mutect2_called/raw.{comp}.{contig}.vcf.gz.stats',
        orientation='results/lrom/{contig}_{comp}.tar.gz'
    log:
        "logs/{comp}_{contig}_mutect2_call_variants.log"
    params:
        # BUG FIX: the guard tested the top-level config['PON'] key (which is
        # where config.yaml defines it) but then read
        # config['mutect2']['PON'], which does not exist in the config — a
        # KeyError whenever a panel of normals was actually configured. Read
        # the same key that is tested, and use .get() so a missing key is
        # treated like an empty one.
        pon=(
            '--panel-of-normals ' + config['PON']
            if config.get('PON')
            else ''
        ),
        extra=get_call_variants_params,
        pcr=(
            '--pcr-indel-model ' + config['processing']['pcr']
            if config['processing']['pcr']
            else ''
        )
    # threads: 1 # [ TEST SETTINGS ]
    run:
        java_opts = set_java_opts(resources)
        # Both normals and tumors are passed with -I; the normal sample
        # names are additionally flagged with -normal so Mutect2 knows which
        # inputs are the matched normals.
        normals = " ".join("-I {} ".format(n) for n in input.normals)
        tumors = " ".join("-I {} ".format(t) for t in input.tumors)
        names = names_for_somatic(wildcards)
        formatted_names = " ".join('-normal {} '.format(name) for name in names)
        shell(
            "gatk Mutect2 "
            "--java-options {java_opts} "
            "-R {input.ref} "
            "{normals} "
            "{tumors} "
            "{params.extra} "
            "{formatted_names} "
            "{params.pcr} "
            "--f1r2-tar-gz {output.orientation} "
            "{params.pon} "
            "-O {output.vcf} 2> {log}"
        )
Run LearnReadOrientationModel to get the maximum likelihood estimates of artifact prior probabilities + in the orientation bias mixture model filter + """ + resources: + disk_mb=gb(20), + mem_mb=gb(32), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(120) + input: + orientation=lambda w: expand('results/lrom/{contig}_{{comp}}.tar.gz', contig=get_contigs()) + output: + lrom='results/lrom/artifact-prior-{comp}.tar.gz' + log: + 'logs/lrom_{comp}.log' + # threads: 1 # [ TEST SETTINGS ] + run: + java_opts = set_java_opts(resources) + def get_format_lrom(): + names= ['-I {} '.format(i) for i in input.orientation] + names = ' '.join(names) + return names + lrom_names = get_format_lrom() + + shell( + 'gatk --java-options {java_opts} LearnReadOrientationModel {lrom_names} ' + '-O {output.lrom} &> {log}' + ) + + +rule merge_mutect2_variants: + """ + After individual contigs are called via mutect2, we merge them together here. + """ + resources: + disk_mb=gb(20), + mem_mb=gb(32), + # mem_mb=gb(2), # [ TEST SETTINGS -1 ] + runtime=autobump(120) + input: + vcfs=lambda w: expand( + "results/mutect2_called/raw.{{comp}}.{contig}.vcf.gz", contig=get_contigs() + ), + output: + temp("results/somatic/merged.{comp}.vcf.gz") + log: + "logs/merge_mutect2.{comp}.log", + # threads: 1 # [ TEST SETTINGS ] + run: + inputs = " ".join("-INPUT {}".format(f) for f in input.vcfs) + java_opts = set_java_opts(resources) + shell( + 'picard' + ' MergeVcfs' + ' {java_opts}' + ' {inputs}' + ' -OUTPUT {output}' + ' &> {log}' + ) + + +rule merge_mutect2_stats: + """ + Just like merging VCFs for Mutect2, we also need to merge stats for filtering. 
+ """ + resources: + disk_mb=gb(20), + mem_mb=gb(16), + # mem_mb=gb(2), # [ TEST SETTINGS -1 ] + runtime=autobump(120) + input: + stats=lambda w: expand( + "results/mutect2_called/raw.{{comp}}.{contig}.vcf.gz.stats", contig=get_contigs() + ), + output: + temp("results/somatic/merged.{comp}.vcf.gz.stats") + log: + "logs/merge_mutect2_stats.{comp}.log" + # threads: 1 # [ TEST SETTINGS ] + run: + java_opts = set_java_opts(resources) + inputs = " ".join(" -stats {} ".format(f) for f in input.stats) + shell( + "gatk MergeMutectStats " + "--java-options {java_opts} " + "{inputs} " + "-O {output} " + "&> {log}" + ) + + +rule filter_mutect2_calls: + """ + New versions of Mutect2 have optimized defaults for filtering; we can just use those. + """ + resources: + disk_mb=gb(20), + mem_mb=gb(16), + # mem_mb=gb(2), # [ TEST SETTINGS -1 ] + runtime=autobump(120) + input: + ref=reference, + unfiltered="results/somatic/merged.{comp}.vcf.gz", + stats="results/somatic/merged.{comp}.vcf.gz.stats", + lrom='results/lrom/artifact-prior-{comp}.tar.gz' + output: + "results/somatic_filtered/filtered.{comp}.vcf.gz" + log: + "logs/{comp}.vcf.gz.log" + # threads: 1 # [ TEST SETTINGS ] + run: + java_opts = set_java_opts(resources) + shell( + "gatk FilterMutectCalls " + "--java-options {java_opts} " + "-stats {input.stats} " + "--orientation-bias-artifact-priors {input.lrom} " + "-R {input.ref} " + "-V {input.unfiltered} " + "-O {output}" + ) + + +rule mutect2_norm: + """ + Split multiallielic variants into multiple biallelic ones. + """ + resources: + mem_mb=gb(16), + runtime=autobump(120) + input: + ref=reference, + vcf="results/somatic_filtered/filtered.{comp}.vcf.gz" + output: + "results/somatic_filtered/normed.{comp}.vcf.gz" + log: + "logs/norm-{comp}-vcf.log" + shell: + "bcftools norm -f {input.ref} " + "-m- " + "{input.vcf} " + "--output-type z " + "--output {output} 2> {log}" + +rule merge_bams: + """ + Merge technical replicates together to get single bam files used for validation. 
#!/usr/bin/env python
"""
Compute per-gene and per-variant metrics from a SnpEff-annotated (and
optionally gnomAD-annotated) VCF, writing .tsv and .pkl summaries to the
output directory.
"""
import argparse
import csv
import os
import pickle

import numpy as np
import pandas as pd

# SnpEff "ANN" sub-fields, in order, per the SnpEff input/output docs
# (https://pcingola.github.io/SnpEff/snpeff/inputoutput/). These double as
# output column names, so they must not be renamed.
ANN_KEYS = [
    "ALLELE", "VAR_TYPE", "IMPACT", "GENE", "GENE_ID", "FEATURE_TYPE",
    "FEATURE_ID", "TSCRIPT_BIOTYPE", "EX_INT_RANK_TOTAL", "HGVS.C", "HGVS.P",
    "CDNA_POS_RANK", "CDS_POS_RANK", "PROT_POS_LEN", "FEAT_DIST", "OTHER",
]


def main():
    """Parse command-line arguments and return the argparse namespace."""
    parser = argparse.ArgumentParser(description="A script to compute metrics for variants")

    parser.add_argument("--vcf", "-v", default="results/annotated/filtered_ann.vcf.gz",
                        help="Path to vcf file containing variant information")
    parser.add_argument("--output", "-o", default="results/analyzed",
                        help="Path to the folder that the output files will be written to")
    # BUG FIX: this was action='store_true' with default=True, so the flag
    # could never be turned off. BooleanOptionalAction (Python 3.9+) keeps
    # the same default and the same --gnomad/-g flags while adding
    # --no-gnomad to disable it.
    parser.add_argument("--gnomad", "-g", action=argparse.BooleanOptionalAction, default=True,
                        help="Flag to set if vcf is annotated with gnomad fields AF and AC")

    args = parser.parse_args()
    return args


def _variant_record(variant, samples, gnomad):
    """
    Flatten one cyvcf2 variant into a dict of output columns.

    Returns None for variants that should be skipped: no ANN field, or the
    first annotation's impact below MODERATE.
    """
    ann_raw = variant.INFO.get("ANN")
    if not ann_raw:
        return None
    # Only the first (most severe) annotation is considered.
    ann_fields = ann_raw.split(",")[0].split("|")
    impact = ann_fields[2] if len(ann_fields) > 2 else None
    if impact not in ("MODERATE", "HIGH"):
        return None
    record = {
        "CHROM": variant.CHROM,
        "POS": variant.POS,
        "REF": variant.REF,
        "ALT": variant.ALT,
        "DEPTH": variant.INFO.get("DP"),
        "GENOTYPES": variant.genotypes,
        "INFO_ANN": variant.INFO,
        "ANN": ann_raw,
    }
    for i, key in enumerate(ANN_KEYS):
        record[key] = ann_fields[i] if len(ann_fields) > i else None
    # Per-sample coded genotype; cyvcf2 gt_types codes (despite what the API
    # docs say): 0=hom-ref, 1=het, 2=unknown, 3=hom-alt.
    for i, sample in enumerate(samples):
        record[sample] = variant.gt_types[i]
    if gnomad:
        record["gnomad_AF"] = variant.INFO.get("AF_joint")
        record["gnomad_AC"] = variant.INFO.get("AC_joint")
        record["gnomad_AN"] = variant.INFO.get("AN_joint")
    return record


def _variant_metrics(var, samples, gnomad):
    """
    Compute allele metrics for a single variant row.

    Returns (var_info, var_alleles, alt_alleles): the per-variant summary
    dict, the number of called alleles, and the number of called alternate
    alleles across all samples.
    """
    genotypes = var['GENOTYPES']
    # Human-readable genotype strings; './.' when either allele is uncalled.
    samples_gt = {}
    for i, sample in enumerate(samples):
        a, b = genotypes[i][0], genotypes[i][1]
        if a is None or b is None or a == -1 or b == -1:
            samples_gt[sample] = "./."
        else:
            samples_gt[sample] = f"{a}/{b}"
    all_gt = var[samples]             # coded genotype per sample
    called_gt = all_gt[all_gt != 2]   # drop 'unknown' calls
    var_samples = len(called_gt)      # samples with a called genotype
    var_alt_samples = len(called_gt[(called_gt == 1) | (called_gt == 3)])
    var_alleles = var_samples * 2     # diploid: two alleles per called sample
    alt_alleles = 0                   # het contributes 1 alt, hom-alt 2
    for gt in called_gt:
        if gt == 1:
            alt_alleles += 1
        elif gt == 3:
            alt_alleles += 2
    var_AF = alt_alleles / var_alleles if var_alleles > 0 else 0
    var_info = {
        "var_loc": f"{var['CHROM']}:{var['POS']}",
        "variant-id": f"{var['REF']}-{'/'.join(var['ALT'])}",
        "sample_AF": var_AF,
        "sample_AC": alt_alleles,
        "sample_AN": var_alleles,
        "num_alt_samples": var_alt_samples,
        "total_called_samples": var_samples,
        "depth": var['DEPTH'],
        "var_impact": var['IMPACT'],
        "var_type": var["VAR_TYPE"],
        "genotypes": samples_gt,
    }
    if gnomad:
        # NOTE(review): assumes gnomad_AF is numeric whenever --gnomad is
        # set; round() raises on None — same as the original behavior.
        var_info["gnomad_AF"] = round(var['gnomad_AF'], 3)
        var_info["gnomad_AC"] = int(var['gnomad_AC']) if pd.notna(var['gnomad_AC']) else 0
        var_info["gnomad_AN"] = int(var['gnomad_AN']) if pd.notna(var['gnomad_AN']) else 0
    return var_info, var_alleles, alt_alleles


def _round_sig(df):
    """Round every float cell to 4 significant figures; non-floats untouched."""
    def fmt(x):
        return float(f"{x:.4g}") if isinstance(x, float) and not pd.isnull(x) else x
    # Series.map per column instead of the deprecated DataFrame.applymap.
    return df.apply(lambda col: col.map(fmt))


def process_vcf(args):
    """
    Read the annotated VCF, compute per-variant and per-gene allele metrics,
    and write var_metrics.{tsv,pkl} and gene_var_metrics.{tsv,pkl} under
    args.output.
    """
    # Deferred import: keeps the module importable (e.g. for --help, or unit
    # tests of main()) without cyvcf2 installed.
    from cyvcf2 import VCF

    testing = False  # set True when doing runs against a subsetted vcf file
    output_path = args.output
    os.makedirs(output_path, exist_ok=True)

    print("Reading in data...")
    vcf = VCF(args.vcf, strict_gt=True)
    samples = vcf.samples

    records = []
    for variant in vcf:
        record = _variant_record(variant, samples, args.gnomad)
        if record is not None:
            records.append(record)
    vcf_df = pd.DataFrame(records)

    # Compute metrics for each gene and variant.
    print("Computing variant metrics...")
    genes = vcf_df['GENE'].unique()
    genes_df = pd.DataFrame({'gene': genes})
    genes_df['num_vars'] = pd.Series([pd.NA] * len(genes_df), dtype='Int64')
    # np.nan: the np.NAN alias was removed in NumPy 2.0.
    genes_df['sample_AF'] = np.nan
    genes_df['sample_AC'] = pd.Series([pd.NA] * len(genes_df), dtype='Int64')
    genes_df['sample_AN'] = pd.Series([pd.NA] * len(genes_df), dtype='Int64')
    genes_df['vars'] = pd.Series([None] * len(genes_df), dtype='object')

    for index, gene in enumerate(genes):
        # All rows/variants associated with the current gene.
        cur_gene_df = vcf_df[vcf_df['GENE'] == gene]
        gene_vars = []  # one summary dict per variant ('vars' shadowed the builtin)
        total_alleles = 0
        total_alt_alleles = 0
        for _, var in cur_gene_df.iterrows():
            var_info, var_alleles, alt_alleles = _variant_metrics(var, samples, args.gnomad)
            total_alleles += var_alleles
            total_alt_alleles += alt_alleles
            gene_vars.append(var_info)
        # Alternate allele frequency across all samples for this gene.
        gene_alt_freq = round(total_alt_alleles / total_alleles, 3) if total_alleles > 0 else 0
        genes_df.at[index, 'num_vars'] = len(gene_vars)
        genes_df.at[index, 'sample_AF'] = gene_alt_freq
        genes_df.at[index, 'sample_AC'] = total_alt_alleles
        genes_df.at[index, 'sample_AN'] = total_alleles
        genes_df.at[index, 'vars'] = gene_vars

    genes_df = _round_sig(genes_df)

    # Expand the per-gene variant lists so each variant gets its own row.
    vars_df = genes_df.drop(['num_vars', 'sample_AC', 'sample_AF', 'sample_AN'], axis=1)
    vars_df = vars_df.explode('vars')
    variant_details = pd.json_normalize(vars_df['vars'])
    vars_df = pd.concat([vars_df.drop(columns='vars').reset_index(drop=True), variant_details], axis=1)
    vars_df = _round_sig(vars_df)
    vars_df.rename(columns={
        col: f"POF{col.split('POF')[-1]}_gt"
        for col in vars_df.columns
        if col.startswith('genotypes.POF')
    }, inplace=True)

    # Write the variants dataframe.
    vars_df = vars_df.sort_values(by="sample_AC", ascending=False)
    with open(f'{output_path}/var_metrics.pkl', 'wb') as fh:
        pickle.dump(vars_df, fh)
    vars_df.to_csv(f'{output_path}/var_metrics.tsv', sep="\t", index=False)

    # Write the genes dataframe.
    genes_final_df = genes_df.drop('vars', axis=1)
    genes_final_df = genes_final_df.sort_values(by="sample_AC", ascending=False)
    print("Saving to .pkl and .tsv...")
    prefix = 'shortened_gene_var_metrics' if testing else 'gene_var_metrics'
    with open(f'{output_path}/{prefix}.pkl', 'wb') as fh:
        pickle.dump(genes_final_df, fh)
    # BUG FIX: the tsv previously dumped the *unsorted* genes_df, which still
    # carried the raw 'vars' dict column; write the same frame as the pickle.
    genes_final_df.to_csv(f'{output_path}/{prefix}.tsv', sep="\t", index=False,
                          quoting=csv.QUOTE_MINIMAL)

    print("All done!")


if __name__ == "__main__":
    args = main()
    process_vcf(args)
a/workflows/variant-calling/config/config.yaml b/workflows/variant-calling/config/config.yaml new file mode 100644 index 000000000..3b2ce5305 --- /dev/null +++ b/workflows/variant-calling/config/config.yaml @@ -0,0 +1,79 @@ +samples: config/samples.tsv +units: config/units.tsv +ref: + # Set to true only if you want your references to come from lcdb-wf references + use_references_workflow: false + # Match these to the reference config that is included at the bottom of this file + # Only configure this section if use_references_workflow is set to true + organism: 'human' + genome: + tag: 'ensembl-104' + build: 'GRCh38' + aligner: + index: 'bwa' + tag: 'ensembl-104' + faidx: + index: 'faidx' + tag: 'ensembl-104' + variation: + # Fill these keys in with the name of the variation database that matches the value in the reference config + # Or alternatively, you can provide an ABSOLUTE path to these files locally (paths MUST start with '/') + # If this is the case, you should go edit the lcdb-wf references config to make sure these jobs are not run for no reason. + known: 'known' + dbnsfp: 'dbNSFPv4.4' + # If you are providing your own references, include their paths here. + paths: + # When using BWA, you should not use a top level genome assembly for human, see http://lh3.github.io/2017/11/13/which-human-reference-genome-to-use + ref: 'references/GRCh38.6.20.fa.gz' + known: 'references/known_variation_noiupac.vcf.gz' + index: + dbnsfp: 'references/dbnsfp_6_20.vcf.gz' +processing: + remove-duplicates: true + remove-mitochondrial: true + # See https://gatk.broadinstitute.org/hc/en-us/articles/360036465912-HaplotypeCaller#--pcr-indel-model for pcr + # If you know there was no PCR used to generate your sequencing data, set this to NONE + pcr: + # Point to a bed file, e.g. 
captured regions + restrict-regions: + #'references/exons_subset.bed' + # If regions are restricted, optionally enlarge them by a given value + region-padding: +filtering: + # Set to true in order to apply machine learning based recalibration of + # quality scores instead of hard filtering. + bqsr: true + hard: + # hard filtering as outlined in GATK docs + # (https://gatkforums.broadinstitute.org/gatk/discussion/2806/howto-apply-hard-filters-to-a-call-set) + snvs: + "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" + indels: + "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0" +snpeff: + # MultiQC rule needs these set to collect summary files from the snpeff and snpeff_cancer rule + # Set each to true respectively if you plan on generating annotations for somatic or germline data + somatic: true + germline: true + # Run snpEff databases to see available databases (https://pcingola.github.io/SnpEff/se_commandline/) + # See https://pcingola.github.io/SnpEff/se_build_db/ for docs on building your own database + genome: 'GRCh38.p14' + # Add annotations in the form of a comma-separated string to attach from dbnsfp for snpsift. + # These annotations should be column names in the dbnsfp file. + # Leave this blank if you are not using dbnsfp. + annotations: 'FATHMM_pred,SIFT_pred' +# Supple a panel of normals file if you have one for your genome. It is OK to leave this blank. +# See https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON- for details on the PON file. 
+PON: +mutect2: + # See the docs on how to configure this section + tumor-normal: + tumor: + - 'tumor' + normal: + - 'normal' + + + +include_references: + - '../../include/reference_configs/variant-calling.yaml' diff --git a/workflows/variant-calling/config/samples.tsv b/workflows/variant-calling/config/samples.tsv new file mode 100644 index 000000000..21c6b82e8 --- /dev/null +++ b/workflows/variant-calling/config/samples.tsv @@ -0,0 +1,3 @@ +sample +tumor +normal diff --git a/workflows/variant-calling/config/units.tsv b/workflows/variant-calling/config/units.tsv new file mode 100644 index 000000000..d336d4be9 --- /dev/null +++ b/workflows/variant-calling/config/units.tsv @@ -0,0 +1,3 @@ +sample unit platform fq1 fq2 +tumor 1 Illumina data/example_data/tumor_R1.fq.gz data/example_data/tumor_R2.fq.gz +normal 1 Illumina data/example_data/normal_R1.fq.gz data/example_data/normal_R2.fq.gz diff --git a/workflows/variant-calling/run_test.sh b/workflows/variant-calling/run_test.sh new file mode 100755 index 000000000..7aacb413c --- /dev/null +++ b/workflows/variant-calling/run_test.sh @@ -0,0 +1,3 @@ +set -e +python -m doctest ../../ci/preprocessor.py +python ../../ci/preprocessor.py Snakefile > Snakefile.test && snakemake -s Snakefile.test "$@" diff --git a/workflows/variant-calling/variant-calling.rst b/workflows/variant-calling/variant-calling.rst new file mode 100644 index 000000000..a80cb705b --- /dev/null +++ b/workflows/variant-calling/variant-calling.rst @@ -0,0 +1,647 @@ +.. _Overview: + +LCDB-WF Variant Calling Overview +================================ + +This Snakemake workflow handles detection and annotation of germline and +somatic variants in DNA sequencing reads. Analysis-ready VCFs are generated by +following a GATK best practices workflow. Annotations are attached from +variation databases like dbNSFP using SnpEff. This workflow also provides QC +analysis on the input fastq files as well as some metrics recorded throughout +the variant calling workflow. 
+ +The workflow is primarily designed for human sequencing data, but it will +support other organisms. However, due to limiting references in other +organisms, some of the features of the workflow may not be available. See the +section on `configuring other organisms `__ for details on how to run +the workflow for non-human organisms. + +The workflow can be thought of as having 5 main components: references, +mapping, calling, annotating, and QC. + + + +.. _References: + +References +---------- + +There are a handful of files that can be considered references that are +involved in calling and annotating variants. References can either be provided +externally, or they will be generated inside of the workflow by implementing +the LCDB-WF References workflow. See the section on `configuring references +`__ for details on how to properly provide or generate references. + +- Reference genome + * The reference genome will be downloaded by the LCDB-WF references + workflow if it is not supplied. The workflow supports chromosome + nomenclature from both GRCh38/hg38 and GRCh37/hg19 references. + * By default, Ensembl’s GRCh38 genome build is used because the known + variation file is also provided by Ensembl. +- Known variation + * This is a VCF file that contains sites of “known variation” in an + organism’s genome. This is used to recalibrate bam files using `base + quality score recalibration + `_ + and avoid detection of variants that are extremely common. + * Note that this file is not essential to variant calling and the workflow + can run without it. For some organisms, this type of file is not + available. +- BWA Index Files + * These index files are generated by ``bwa index`` + * The references workflow will build these files, but if the references + workflow is not used (see `configuring references `__), these + files will be generated by the Snakefile at workflow execution. 
+- Sequence Dictionary + * Generated by ``picard CreateSequenceDictionary`` and used by several GATK + commands. +- Fasta Index + * Generated by ``samtools faidx`` and used by the workflow to establish + contigs for joint-calling. +- Variation Databases + * These are (usually) VCFs that contain variant annotations that are used + by SnpEff for annotating VCFs in the workflow. + * the dbNSFP database may be downloaded by the references workflow or you + can download it yourself (see `Downloading dbNSFP `__). It + contains a comprehensive set of annotations for humans. This file can be + provided externally if you are not using the references workflow (see the + section on `Configuring Annotation Databases `__) + * These are also not required and we can still run annotation without the + file present (we just won’t have all the fancy annotations that these + databases provide) + +.. _Mapping: + + +Mapping +------- + +First, adapters are trimmed from the reads using ``cutadapt``. After that, +reads are mapped to the genome using ``bwa mem``. + +After bams are generated, we need to mark and delete duplicates using ``picard +MarkDuplicates``. Duplicate reads are uninformative for variant calling, and +deleting them will speed up the workflow. + +Once duplicates are removed, we recalibrate base quality score using GATK +``BaseRecalibrator`` and ``ApplyBQSR`` provided that a known variation file is +present and the config option for base recalibration is set (see `filtering and +processing `__). + + + +.. _Calling: + +Calling Variants +---------------- + +The workflow contains support for both germline and somatic variant calling. + +For germline variants, this workflow takes advantage of Snakemake +parallelization by splitting up variant calling into regions. By default, these +regions are defined by the contigs identified in the fasta index file. 
However, you can provide a .bed file with regions that you want to restrict
variant calling to, such as in a targeted exon sequencing project (see
`Processing `__).

Once regions are established, variants are called with GATK ``HaplotypeCaller``
to produce a `GVCF
`_
for every sample for every contig. The GVCFs from each sample are combined for
each contig to produce GVCFs that contain variants from ALL samples for each
contig. Finally, GVCFs are genotyped and merged to produce a single VCF
containing mutations in all samples for all contigs.

Somatic variant calling roughly follows the same procedures as above; however,
a bit more configuration is involved. See the section on `Somatic Variant
Calling with Mutect2 `__. Additionally, for filtering somatic variants
we run `LROM
`_
to help estimate artifacts produced by read orientation bias.



.. _Annotation:

Annotation
----------

Annotating variants is the process of looking up a variant in a database and
adding a description or metric about that variant to a section in the ``INFO``
column of a VCF file. `dbNSFP `_
is a popular database that contains annotations from hundreds of sources
compiled into a single VCF. See the section on `Configuring Annotation
Databases `__ for information on how to use dbNSFP and other
variation databases.

Annotations are attached using `SnpEff and SnpSift
`_. These are also useful tools for
downstream analysis, which is not included in the workflow, but discussed
`here `__

Finally, we run ``bcftools norm`` on all final VCFs in order to split
multi-allelic variant sites into a series of biallelic records.



.. _QC:

QC
--

The workflow generates QC metrics at several points and aggregates them using
`MultiQC `_.
These include metrics from: + +- ``samtools stats`` +- ``picard MarkDuplicates`` +- ``FastQC`` +- ``SnpEff`` Variant annotation summary statistics + +MultiQC reads from all of these sources and generates an interactive html +document that allows you to visualize and explore QC data. + + + +.. _Configuration: + +Configuration and config.yaml +============================= + +The config file, ``config.yaml``, is the main point of communication between +the user and the Snakefile that the workflow executes. Users should always +approach this file first before editing the Snakefile. This is how we pass +paths to reference files, arguments to rules in the Snakefile, and configure +patterns for somatic variant calling. Each section in the ``config.yaml`` is +documented here. + + + +.. _samples: + +Input +----- + +In order to pass sequencing data to the workflow, samples need to be configured +into two tables, ``units.tsv`` and ``samples.tsv`` + +``units.tsv`` is configured like this: + + +========= ========================== =================== ================ ============================================ +sample unit platform fq1 fq2 +========= ========================== =================== ================ ============================================ +Sample ID Technical replicate number Sequencing platform Path to R1 fastq Path to R2 fastq (if using paired-end reads) +========= ========================== =================== ================ ============================================ + + +Note: + +- sample names needn’t be unique, but no sample-unit combination can be the + same. Increment the unit starting from 1 for identical sample names to + represent technical replicates. A sampletable with no technical replicates + should have the value for unit set to 1 for every sample. +- platform is used to attach read groups. It should be the sequencing platform + of your data, for example, "Illumina." 
+- fq1 and fq2 represent the R1 reads and R2 reads from a paired-end sample. If + you samples are not paired-end, then leave the value in the fq2 column empty. + +``samples.tsv`` is a single column consisting of the sample names. You can use +``cut -f1 units.tsv > samples.tsv`` to generate this file + + +.. _confall: + +Configuring the All Rule +------------------------ + +The all rule is how we initially generate the DAG for which rules to execute in +which order to generate the final outputs of the workflow. We specify the +outputs we want at the end of the workflow here: + +- Germline variants, annotated, multiallelic-split (with ``bcftools norm``): + “results/annotated/ann.vcf.gz” + + * If you decide to use dbNSFP to attach annotations AFTER you've already + generated the "results/annotated/ann.vcf.gz" file, you need to `configure + dbNSFP `__ and delete “results/annotated/ann.vcf.gz” to + regenerate output. + +- MultiQC: “results/qc/multiqc.html” +- Mutect2 annotated: ``expand("results/mutect2_annotated/snpeff.{comp}.vcf.gz", + comp = config['mutect2'].keys())`` + + * See the `Mutect2 configuration `__ + +If you do not wish to attach annotations to your variants: + +- Germline (multiallelic-split): “results/filtered/all.normed.vcf.gz” +- Somatic + (multiallelic-split): ``expand("results/somatic_filtered/normed.{comp}.vcf.gz", + comp = config['mutect2'].keys())`` + +If for some reason you do not want your variants to be multiallelic-split, then +you will have to edit the ``rule merge_calls`` for germline variants and change +the output to not be marked ``temp()``. For somatic variants, you will have to +edit the ``filter_mutect2_calls`` rule in the same fashion. Specify the output +of those rules in your all rule. + + + +.. 
_confref: + +Configuring References in config.yaml +------------------------------------- + +The ``ref:`` section of the config.yaml is how we determine the patterns of the +reference files generated by the references workflow or pass the paths of +external references to the workflow. + +If you intend on using the LCDB-WF references workflow, set +``use_references_workflow: true``. The Snakefile reads this argument and +determines which references to use. At the bottom of the config file is an +``include_references:`` key that must point to the reference config from the +LCDB-WF references workflow that you wish to use. The structure of the +reference config that you include is like so: + +.. code-block:: yaml + + references: + organism: + tag: + type: + +It is crucial that the values for organism and tag in the ``ref:`` block of +``config.yaml`` match exactly to the reference config. The ``aligner:`` and +``faidx`` sub-fields must also match what is found in the ``indexes:`` subfield +of the ``genome:`` “type” from the above yaml structure in the reference +config. + +For example, if the reference config looks like this: + +.. code-block:: yaml + + references: + human: + ensembl-104: + genome: + url: ‘dummy.fasta.download.url’ + indexes: + - ‘bwa’ + - ‘faidx’ + +Then we will configure the ``config.yaml`` like this: + +.. code-block:: yaml + + ref: + use_references_workflow: true + organism: ‘human’ + genome: + tag: ‘ensembl-104’ + build: 'GRCh38' + aligner: + index: ‘bwa’ + tag: ‘ensembl-104’ + faidx: + index: ‘faidx’ + tag: ‘ensembl-104’ + +The ``build:`` key in the ``genome:`` block must match what is provided in the +metadata of the reference config. + +For variation databases, the versioning on the database used is controlled by +a key-value provided to the reference config under the ``variation:`` “tag”. +This value must match what is given in the ``config.yaml``. 
You must also +supply the build of your genome to the reference config and it has to match the +build you are using for your reference genome. This is because we need to +process the dbNSFP file in the references workflow to make it compatible with +older genomes. To mirror the example above if this is what is in our reference +config: + +.. code-block:: yaml + + references: + human: + known: + type: 'all' + + variation: + dbnsfp: + version: 'dbNSFPv4.4' + url: ‘dummy.download.dbsnfp.url’ + build 'GRCh38' + +Then we will configure the ``config.yaml`` like this: + +.. code-block:: yaml + + ref: + use_references_workflow: true + organism: ‘human’ + + variation: + known: ‘known’ + dbnsfp: ‘dbNSFPv4.4’ + +The workflow supports the option to supply the ``known:`` and ``dbnsfp:`` keys +with an ABSOLUTE path to a file you have locally. The path MUST start with +``"/"``. Make sure to read on below to see what sort of processing and +modifications must be made to these files in order for the workflow to use +them. + +Note that in the reference config, the ``type:`` field under ``known:`` +corresponds to the type of known variation file you wish to download. Available +options are ‘somatic’, ‘structural_variation’, or ‘all’. At the moment, this +workflow does not support structural variant calling. Providing ‘all’ to this +field will download germline known variation for all chromosomes. If your +organism does not have a variation database, then simply leave these fields +empty in the variant calling workflow config, and the workflow will run without +them. + +If you are providing your own references to the workflow, set them in the paths +section of the ref block: + +.. 
code-block:: yaml + + ref: + paths: + ref: path/to/known/genome/fasta + known: path/to/known/variation/file + index: path/to/fasta/index + dbnsfp: path/to/dbnsfp/file + +If you are providing a known variation file that contains `IUPAC-coded alleles +`_, then you must run +``rust-bio-tools`` on your variation file and provide this output as your known +variation file: + +.. code-block:: bash + + rbt vcf-fix-iupac-alleles < input | bcftools view -Oz > known-variation.vcf.gz + + +Of these, the only file that you absolutely must provide always is the genome +fasta. NEVER provide the workflow with a "top level" assembly of the genome +when working with human data. This contains a lot of unaligned contigs and +haplotypes and BWA will not be able to map your reads to the genome. The +workflow accepts both compressed and uncompressed fasta files. Some notes on +references such as the known variation, dbNSFP, BWA index, and fasta index: + +- If you cannot provide the files in the config above, just leave their values + empty in the ``config.yaml`` field. + + * If the known variation file is absent, we will not be able to recalibrate + bases and our bam file processing ends at marking and deleting + duplicates. + * If the dbNSFP file is absent, we will not able to run SnpSift to attach + fields from the dbNSFP file (see `Configuring Annotation Databases + `__), but we will still be able to run SnpEff. + * If the BWA index is not used from the LCDB-WF references workflow, then + it will be created in the Snakefile in the same directory as the provided + reference + * If the fasta index is not provided, it will be created in the same + directory as the provided reference during the workflow. + + +.. _dldbnsfp: + +Downloading dbNSFP +------------------ + +To download the dbNSFP file grab the Box link from the `dbNSFP website +`_. Alternatively, you can use +the `ftp server `_. 
You can grab +the file from here, but the zips are really big, and if you click the link you +will open the ftp server on your machine in finder. You can get the file name +of the file you want and fill in the string like this: + +.. code-block:: bash + + version= + wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFP${version}.zip + # Alternatively, wget + +The most recent version may not be on the ftp server. Watch your disk space +when downloading and processing this file, it will use hundreds of gigabytes. +You may want to do this from a shell script writing to a tmp dir. Do not +attempt to download this on a node that is not suitable for heavy computation + +Pay attention to the comments in the code block. The process differs for +GRCh38/hg38 genomes and GRCh37/hg19 genomes. Recent versions of dbNSFP (v3.X +and later) switched to GRCh38 coordinates. You can still use these later +versions for GRCh37 coordinates, but you have to process the files differently. + +.. code-block:: bash + + unzip + zcat dbNSFP*_variant.chr1* | head -n1 > h + # head works if you run this command line by line, but if you are setting "set -euo pipefail" head may cause the pipe to exit. Alternatively, you can use pipe to awk "NR <= 1" > h + + # For GRCh38/hg38 data (include the genome version somewhere in your output file name for clarity): + zgrep -v "^#chr" dbNSFP*_variant.chr* | sort -k1,1 -k2,2n - | cat h - | bgzip -c > + tabix -s 1 -b 2 -e 2 + + # For GRCh37/hg19 data: + zgrep -h -v "^#chr" dbNSFP*_variant.chr* | awk '$8 != "." ' | sort -k8,8 -k9,9n - | cat h - | bgzip -c > + tabix -s 8 -b 9 -e 9 + +It is highly recommended to run ``sort`` with the ``--parallel=N`` option on +a node with multiple CPUs as this will take a very long time. See the `sort man +page `_ for details on how +to properly parallelize. + + +.. 
_conforg: + +Support for Non-Human Organisms +------------------------------- + +If you wish to call variants on a non-human organism, then you will have to +supply the references yourself following the directions above, or edit the +reference config for your organism in the top level of the cloned LCDB-WF +directory in ``include/reference_configs/``. You can add to the config: +``variant-calling.yaml`` reference config if you are using an Ensembl genome. +The tag should be configured as “ensembl-release”. ONLY include the variation +block in the reference config if your organism has support for an annotation +variation database like dbNSFP Downloading the known variation file in the +references workflow relies on the build, release, and species values specified +in the ``metadata:`` block. + +WARNING: The workflow relies heavily on references downloaded from ensembl, +mainly due to Ensembl housing known variation files. You will run into issues +if your references have a "chr" prefix for chromosome nomenclature. Check the +``get_contigs()`` method in ``lib/helpers.smk`` (from the top-level directory +of LCDB-WF) and see if this will cause any issues with your organism. + + + +.. _fp: + +Filtering and Processing +------------------------ + +There are several filtering and processing configurations that can be provided +to the workflow in the ``processing:`` and ``filtering:`` sections of +``config.yaml``. + +In ``processing:`` there are several options: + +- ``remove-duplicates:`` if ``true`` is set for this option, we remove + duplicates with ``picard MarkDuplicates`` (you should do this unless you are + interested in duplicated reads in your data) +- ``remove-mitochondrial:`` if ``true`` is set for this option, then + mitochondrial contigs will be ignored for variant calling (default) +- ``pcr:`` This option takes a string according to `GATK PCR INDEL model docs + `_. 
+ The default option in the workflow is to not specify this argument, but if + you know that your data is PCR-free sequencing data then you should set this + option to ‘NONE’ for the most specific results. +- ``restrict-regions:`` Supply a capture file in .bed format. Commonly used for + whole exome sequencing (WES) and targeted exome sequencing. + + * ``region-padding:`` Increase the size of the regions specified in the + .bed file by a flat amount of base pairs. Leave both of these options + blank if you do not have a .bed file. + +In ``filtering:`` we have these options: + +- ``bqsr:`` This option controls whether or not we run `GATK base recalibrator + `_. + Basically, if we are able to, we should. What defines if we are able to is + whether or not the known variation file is available for the organism we are + calling variants on. Turning this option to false means that our bam + processing ends after deleting duplicates. +- ``hard:`` This is the hard-filtering option as outlined in `GATK filtering + docs + `_. + These are basic, quality-based filters that remove low-confidence or + low-quality variants from our callset. + + * ``snvs:`` and ``indels:`` We split the variants into SNPs and INDELS + because they require different hard-filtering options. The filter values + are read from these options and can be adjusted. What is present by + default is GATK’s standard recommendation. + + + +.. _confdbnsfp: + +Configuring Annotation Databases in config.yaml +----------------------------------------------- + +The `SnpEff and SnpSift docs `_ are a very +helpful resource for running SnpEff and SnpSift. + +In the ``snpeff:`` section of ``config.yaml`` you can configure arguments that +are passed to the annotation rules. + + +- ``somatic:`` This is how we configure SnpEff summary output for MultiQC to + aggregate the correct output files for its report. 
Set this value to ``true`` + if you are calling somatic variants +- ``germline:`` Same as ``somatic:`` but set to ``true`` if you are calling + germline variants. +- ``genome:`` For help with this value, see SnpEff’s `docs + `_ for which database to use. +- ``annotations:`` This key is an optional configuration that allows you to + specify which fields from dbNSFP you would like to attach to your VCF. + Leaving this field blank will attach ALL fields from dbNSFP (of which there + are many, so it will save you time and computation to specify your fields). + The value given to this key is a comma-separated string (WITHOUT WHITESPACE!) + of the names of the columns in the dbNSFP file you would like to attach. For + example, to get FATHMM and SIFT pathogenic predictions, you would configure + like this: ``annotations: ‘FATHMM_pred,SIFT_pred’`` + + + +.. _mutect: + +Somatic Variant Calling with Mutect2 +------------------------------------- + +When talking about somatic calling, we say “tumor” and “normal” as being the two +components of a Mutect2 contrast. In reality, the relationship does not have to +be tumor and normal tissue. However, the “tumor” and “normal” samples should +come from the same organism or from very genetically similar organisms. In +somatic calling, we are trying to identify regions where the “tumor” sample +differs from the “normal” sample. + +Directly above the ``mutect2:`` section is ``PON:``. PON stands for panel of +normals, and is a file that is very similar in use to a known variation file. +It is a file made from “normal” samples believed to be free from somatic +alterations. Its purpose is to help identify technical artifacts. You can read +more about panel of normals `here +`_. +This file is optional and leaving it blank will not affect our ability to call +somatic variants. + +In the ``mutect2:`` section of the ``config.yaml`` there is (importantly) only +one option. 
In the default config, this single option is ``tumor-normal:`` This +is how we will tell Mutect2 which samples should be analyzed together and which +of those are “normals” or “tumors”. However, this key need not be named +“tumor-normal”. Below is an example configuration: + +.. code-block:: yaml + + mutect2: + patient-1: + tumor: + - ‘p1_tumor’ + normal: + - ‘p1_normal’ + patient-2: + tumor: + - ‘p2_tumor’ + - ‘p2_tumor_metastasis’ + normal: + - ‘p2_normal’ + +The keys one level below ``mutect2:`` correspond to the name of the comparisons +we are making. This is how we establish wildcards in the Snakefile for the +Mutect2 rule. However, the keys inside of the comparison name MUST be named +``tumor:`` and ``normal:``. Their values should be formatted as a list in yaml. +Each value under ``tumor:`` or ``normal:`` corresponds to the sample name found +in the `sampletable `__. The workflow handles combining technical +replicates, so you only have to provide the sample name. + + + +.. _down: + +Downstream Analysis +=================== + + +.. _sliv: + +Slivar for Rare Disease or Pedigrees +------------------------------------ + +Several projects in the past have focused on calling variants on pedigree or +trio data. This is a common archetype in investigating inheritance patterns of +rare diseases. `Slivar `_ is a set of +command-line tools that query and filter VCF file in order to investigate +non-Mendelian and Mendelian inheritance patterns in pedigree data. One of the +most powerful aspects of this tool is the ability to create custom filtering +patterns using basic boolean logic. + + +.. _patho: + +Filtering Annotations +--------------------- + +One of the benefits of SnpSift is the ability to filter your VCF files based on +what annotations are attached. The `docs page +`_ on SnpSift filter has great +documentation on using this. 
Similar to Slivar, you can use arbitrary +expressions and access fields in the VCF such as quality (``QUAL``) and +genotype (``GEN[]``) to enhance your annotation +filters. + +For example, to get all variants marked as “Damaging” by the FATHMM pathogenic +prediction annotation from dbNSFP, you can run this on your annotated vcf: + +``SnpSift filter “( dbNSFP_FATHMM_pred = 'D' )" input.vcf > output.vcf`` +