diff --git a/.circleci/config.yml b/.circleci/config.yml index da38e0592..9014546f5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -5,7 +5,7 @@ variables: # default settings for all steps defaults: &defaults docker: - - image: ubuntu:20.04 + - image: ubuntu:latest # -------------------------------------------------------------------------- # The caching dramatically speeds up testing time, because we can do the @@ -26,9 +26,9 @@ variables: save_cache: &save_cache save_cache: - key: v5-{{ checksum "env.yml" }}-{{ checksum "env-r.yml" }} + key: v0-{{ checksum "env.yml" }}-{{ checksum "env-r.yml" }} paths: - - /opt/mambaforge + - /opt/miniforge # this file is created by sra-tools upon installation by conda, and so # needs to be included in the cache otherwise fastq-dump thinks it's @@ -38,7 +38,7 @@ variables: restore_cache: &restore_cache restore_cache: keys: - - v5-{{ checksum "env.yml" }}-{{ checksum "env-r.yml" }} + - v0-{{ checksum "env.yml" }}-{{ checksum "env-r.yml" }} # -------------------------------------------------------------------------- # The path needs to be set each time; in jobs below this will be called as @@ -48,6 +48,7 @@ variables: name: Set path command: | # x11-utils required to avoid R::png() segfaulting + export DEBIAN_FRONTEND=noninteractive apt update && apt install -y \ curl \ git \ @@ -73,7 +74,7 @@ variables: # Note that if we don't escape \$PATH, we'll be stuck with the exact # PATH defined here, which will break anything needing conda envs. - echo "export PATH=\$PATH:/opt/mambaforge/bin" >> $BASH_ENV + echo "export PATH=\$PATH:/opt/miniforge/bin" >> $BASH_ENV source $BASH_ENV @@ -85,28 +86,16 @@ variables: command: | source $BASH_ENV echo $PATH - # /opt/mambaforge will only exist if there was a cache restore; otherwise we'll make it here. + # /opt/miniforge will only exist if there was a cache restore; otherwise we'll make it here. # - # Use mambaforge which comes with mamba. - if [ ! 
-e /opt/mambaforge ]; then - curl -L https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh > mambaforge.sh - bash mambaforge.sh -b -p /opt/mambaforge - source "/opt/mambaforge/etc/profile.d/conda.sh" - source "/opt/mambaforge/etc/profile.d/mamba.sh" + if [ ! -e /opt/miniforge ]; then + curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" + bash Miniforge3-$(uname)-$(uname -m).sh -b -p /opt/miniforge + source "/opt/miniforge/etc/profile.d/conda.sh" conda activate which conda - which mamba - mamba --version - - # Note that mambaforge doesn't come with the defaults channel, but - # we're adding it here at the beginning to simulate what most users - # probably have locally (and following the bioconda docs). Using - # strict channel priority means we should [theoretically] never - # pull packages from defaults because they all exist on - # conda-forge. - conda config --system --add channels defaults - + conda --version conda config --system --add channels bioconda conda config --system --add channels conda-forge conda config --system --set channel_priority strict @@ -115,10 +104,12 @@ variables: # https://docs.conda.io/projects/conda-build/en/latest/resources/link-scripts.html, # post-link scripts should not depend on any installed or # to-be-installed conda packages...but they do. 
- mamba install -n base r-base yq + # conda install -n base r-base yq - time mamba env create -n $LCDBWF_ENV --file env.yml - time mamba env create -n $LCDBWF_ENV_R --file env-r.yml + time conda env create -n $LCDBWF_ENV --file env.yml + time conda env create -n $LCDBWF_ENV_R --file env-r.yml + conda env export -n $LCDBWF_ENV > /opt/miniforge/env.yml + conda env export -n $LCDBWF_ENV_R > /opt/miniforge/env.yml fi # -------------------------------------------------------------------------- @@ -127,7 +118,7 @@ variables: run: name: Download example data command: | - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV conda info --envs conda config --show @@ -146,45 +137,40 @@ variables: tree $ORIG set +x - # Separately copy over some test-specific files + # Separately copy over some test-specific files that are not part of deploying cp $ORIG/workflows/chipseq/run_test.sh $DEPLOY/workflows/chipseq/run_test.sh cp $ORIG/workflows/rnaseq/run_test.sh $DEPLOY/workflows/rnaseq/run_test.sh cp $ORIG/workflows/rnaseq/run_downstream_test.sh $DEPLOY/workflows/rnaseq/run_downstream_test.sh - cp $ORIG/workflows/references/run_test.sh $DEPLOY/workflows/references/run_test.sh - cp $ORIG/workflows/colocalization/run_test.sh $DEPLOY/workflows/colocalization/run_test.sh mkdir $DEPLOY/ci mkdir $DEPLOY/test - cp $ORIG/test/lcdb-wf-test $DEPLOY/test/lcdb-wf-test - cp $ORIG/test/workflow_test_params.yaml $DEPLOY/test/workflow_test_params.yaml cp $ORIG/ci/get-data.py $DEPLOY/ci/get-data.py # the ./run_test.sh scripts run this cp $ORIG/ci/preprocessor.py $DEPLOY/ci/preprocessor.py - # download example data + # Now we can download example data cd $DEPLOY - test/lcdb-wf-test data --kind=all --verbose + ci/get-data.py # -------------------------------------------------------------------------- # Run the doctests across the included modules pytest-step: &pytest-step run: - name: Run pytest suite and testthat suite + name: Run 
pytest suite and R testthat suite command: | - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV + # run unit tests and doctests for the modules in lib - test/lcdb-wf-test unit_tests --pytest + pytest --doctest-modules lib # Ensure that the chunks in rnaseq.Rmd have matching documentation - test/lcdb-wf-test unit_tests --ensure-docs - - # find all URLs in reference configs and make sure they exist - test/lcdb-wf-test unit_tests --url-check + (cd ci && ./ensure_docs.py) # run R package unit tests using the R env - test/lcdb-wf-test unit_tests --r-test + conda activate $LCDBWF_ENV_R + Rscript -e "devtools::test('lib/lcdbwf', reporter=c('summary', 'fail'), export_all=TRUE)" # -------------------------------------------------------------------------- @@ -194,10 +180,11 @@ variables: name: chipseq workflow command: | cd $DEPLOY/workflows/chipseq - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test chipseq --run-workflow --use-conda -j2 -k -p -r - $DEPLOY/test/lcdb-wf-test chipseq --trackhub + cd $DEPLOY/workflows/chipseq + ./run_test.sh --use-conda -j2 -k -p + python chipseq_trackhub.py config/config.yaml config/hub_config.yaml # -------------------------------------------------------------------------- # Previous versions had an error where chipseq peaks needed to be defined for @@ -207,11 +194,10 @@ variables: run: name: chipseq misc command: | - cd $DEPLOY/workflows/chipseq - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - - ./run_test.sh --use-conda -j2 -k -p -r \ + cd $DEPLOY/workflows/chipseq + ./run_test.sh --use-conda -j2 -k -p \ --configfile $ORIG/test/test_configs/test_chipseq_regression.yaml \ --config sampletable=$ORIG/test/test_configs/chipseq_one_run.tsv \ merged_bigwigs="{}" \ @@ -231,16 +217,6 @@ variables: --until 
bed_to_bigbed fi - # -------------------------------------------------------------------------- - # Standard references workflow. - references-step: &references-step - run: - name: references workflow - command: | - source /opt/mambaforge/etc/profile.d/conda.sh - conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=config/config.yaml -j2 -p -r -k --orig $ORIG - # -------------------------------------------------------------------------- # Standard RNA-seq workflow rnaseq-step: &rnaseq-step @@ -248,24 +224,29 @@ variables: name: rnaseq workflow command: | cd $DEPLOY - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow -n - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --use-conda -j2 -k -p -r --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --trackhub --orig $ORIG + cd workflows/rnaseq + + ./run_test.sh -n \ + --configfile $ORIG/test/test_configs/test_rnaseq_config.yaml + + ./run_test.sh --use-conda -j2 -k -p \ + --configfile $ORIG/test/test_configs/test_rnaseq_config.yaml + + python rnaseq_trackhub.py \ + config/config.yaml config/hub_config.yaml \ + --additional-configs $ORIG/test/test_configs/test_rnaseq_config.yaml - # This run the preprocessor on the Rmd files and stores them - # in a new download-test directory (see the comments in the script - # for details) - $DEPLOY/test/lcdb-wf-test rnaseq --downstream + conda activate $LCDBWF_ENV_R + + # This creates files in `workflows/rnaseq/downstream-test`: + ./run_downstream_test.sh # bundle up the entire directory to be used as an artifact - tar -zcf /tmp/downstream.tar.gz workflows/rnaseq/downstream-test/ - cp workflows/rnaseq/downstream-test/rnaseq.html /tmp/rnaseq.html - cp workflows/rnaseq/downstream-test/functional-enrichment.html /tmp/functional-enrichment.html - cp workflows/rnaseq/downstream-test/gene-patterns.html /tmp/gene-patterns.html - cp 
workflows/rnaseq/data/rnaseq_aggregation/multiqc.html /tmp/rnaseq.html + tar -zcf /tmp/downstream.tar.gz downstream-test/ + cp data/rnaseq_aggregation/multiqc.html /tmp/rnaseq.html # -------------------------------------------------------------------------- # Various tests on RNA-seq workflow that don't warrant the overhead of a new @@ -276,32 +257,27 @@ variables: command: | ORIG=$(pwd) cd $DEPLOY - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - # Check the help for test/lcdb-wf-test to see what args these - # provide; some of them use the --until argument to restrict the - # rules that are run. Note the use of --orig $ORIG to use the test - # configs from the original clone rather than the deployed directory. - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-pe -k -r -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-se -k -r -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --strandedness-pe -k -r -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-2pass -k -r -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-1pass -k -r -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --pe -k -r -p -j2 --use-conda --orig $ORIG + cd workflows/rnaseq + # SRA test + ./run_test.sh -k -p -j2 --use-conda \ + --configfile $ORIG/test/test_configs/test_rnaseq_config.yaml \ + --config sampletable=$ORIG/test/test_configs/test_sra_sampletable.csv + + # SRA SE only + ./run_test.sh -k -p -j2 --use-conda \ + --configfile $ORIG/test/test_configs/test_rnaseq_config.yaml \ + --config sampletable=$ORIG/test/test_configs/test_sra_sampletable_SE_only.tsv + + # PE + ./run_test.sh -k -p -j2 --use-conda \ + --configfile $ORIG/test/test_configs/test_rnaseq_config.yaml \ + --config sampletable=$ORIG/test/test_configs/test_pe_sampletable.tsv - # 
-------------------------------------------------------------------------- - # Standard colocalization workflow - colocalization-step: &colocalization-step - run: - name: colocalization workflow - command: | - cd $DEPLOY/workflows/colocalization - source /opt/mambaforge/etc/profile.d/conda.sh - conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test colocalization --run-workflow -k -r -p -j2 --use-conda --orig $ORIG # -------------------------------------------------------------------------- # Syntax note: All of the steps above, with their "&step-name" labels, can be @@ -342,10 +318,13 @@ jobs: # themselves. - *save_cache + # These files were created during conda setup, and become part of the + # cache. So we should get them as artifacts regardless of if the conda + # setup ran this time. - store_artifacts: - path: /tmp/lcdb-wf-test/env.yaml + path: /opt/miniforge/env.yml - store_artifacts: - path: /tmp/lcdb-wf-test/env-r.yaml + path: /opt/miniforge/env-r.yml pytest: <<: *defaults resource_class: small @@ -365,7 +344,7 @@ jobs: - *get-data - *chipseq-step - store_artifacts: - path: /tmp/lcdb-wf-test/workflows/chipseq/data/chipseq_aggregation/multiqc.html + path: $DEST/workflows/chipseq/data/chipseq_aggregation/multiqc.html chipseq-misc: <<: *defaults @@ -387,19 +366,9 @@ jobs: - store_artifacts: path: /tmp/downstream.tar.gz destination: downstream.tar.gz - - store_artifacts: - path: /tmp/rnaseq.html - destination: rnaseq.html - store_artifacts: path: /tmp/multiqc.html destination: multiqc.html - - store_artifacts: - path: /tmp/functional-enrichment.html - destination: functional-enrichment.html - - store_artifacts: - path: /tmp/gene-patterns.html - destination: gene-patterns.html - rnaseq-misc: <<: *defaults @@ -410,24 +379,6 @@ jobs: - *get-data - *rnaseq-misc-step - colocalization: - <<: *defaults - steps: - - checkout - - *restore_cache - - *set-path - - *get-data - - *colocalization-step - - references: - <<: *defaults - steps: - - checkout - - *restore_cache - 
- *set-path - - *get-data - - *references-step - build-docs: <<: *defaults resource_class: small @@ -438,9 +389,9 @@ jobs: - run: name: Install sphinx command: | - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate lcdb-wf-test - mamba install -y sphinx make yaml + conda install -y sphinx make yaml - run: name: OK for unknown github host command: mkdir -p ~/.ssh/ && echo -e "Host github.com\n\tStrictHostKeyChecking no\n" > ~/.ssh/config @@ -450,30 +401,12 @@ jobs: - run: name: Build and upload docs command: | - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate lcdb-wf-test ci/build-docs.sh - store_artifacts: path: /tmp/docs.tar.gz - report-env: - <<: *defaults - resource_class: small - steps: - - checkout - - *restore_cache - - *set-path - - run: - name: Report environment - command: | - source /opt/mambaforge/etc/profile.d/conda.sh - conda env export -n lcdb-wf-test > /tmp/env.yaml - conda env export -n lcdb-wf-test-r > /tmp/env-r.yaml - - store_artifacts: - path: /tmp/env.yaml - - store_artifacts: - path: /tmp/env-r.yaml - # ---------------------------------------------------------------------------- # This section configures the dependencies across jobs. 
workflows: @@ -492,6 +425,7 @@ workflows: requires: - initial-setup - pytest + - chipseq - rnaseq: requires: - initial-setup @@ -500,22 +434,7 @@ workflows: requires: - initial-setup - pytest - - references: - requires: - - initial-setup - - pytest - - colocalization: - requires: - - initial-setup - - pytest + - rnaseq - build-docs: requires: - initial-setup - - report-env: - requires: - - rnaseq - - rnaseq-misc - - chipseq - - chipseq-misc - - references - - colocalization diff --git a/.gitignore b/.gitignore index ab3fd51ea..b1f7c8ca1 100644 --- a/.gitignore +++ b/.gitignore @@ -66,3 +66,6 @@ workflows/rnaseq/downstream/rnaseq.html ._* Rplots.pdf /lib/include/* + +workflows/*/references + diff --git a/ci/get-data.py b/ci/get-data.py index cd2d356b1..984ed9de8 100755 --- a/ci/get-data.py +++ b/ci/get-data.py @@ -1,37 +1,112 @@ #!/usr/bin/env python +import argparse import os + from snakemake.shell import shell from snakemake.utils import makedirs -shell.executable('/bin/bash') -BRANCH = 'master' -URL = 'https://github.com/lcdb/lcdb-test-data/blob/{0}/data/{{}}?raw=true'.format(BRANCH) +BRANCH = "master" +URL = "https://github.com/lcdb/lcdb-test-data/blob/{0}/data/{{}}?raw=true".format( + BRANCH +) + +TOPLEVEL = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -def _download_file(fn, dest=None): +def _download_file(fn, dest=None, verbose=False): url = URL.format(fn) if dest is None: dest = fn + dest = os.path.join(TOPLEVEL, dest) makedirs(os.path.dirname(dest)) - basename = os.path.basename(fn) - shell('wget -q -O- {url} > {dest}') + if not verbose: + q = "-q" + else: + q = "" + shell(f"wget {q} -O- {url} > {dest}") + if verbose: + print(f"Saved {dest}") return dest -_download_file('rnaseq_samples/sample1/sample1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample1.fq.gz') -_download_file('rnaseq_samples/sample2/sample2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample2.fq.gz') 
-_download_file('rnaseq_samples/sample3/sample3.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample3.fq.gz') -_download_file('rnaseq_samples/sample4/sample4.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample4.fq.gz') +ap = argparse.ArgumentParser() +ap.add_argument("-v", "--verbose", action="store_true", help="Be verbose when downloading") +args = ap.parse_args() + +_download_file( + "rnaseq_samples/sample1/sample1.small_R1.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample1.fq.gz", + args.verbose, +) +_download_file( + "rnaseq_samples/sample2/sample2.small_R1.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample2.fq.gz", + args.verbose, +) +_download_file( + "rnaseq_samples/sample3/sample3.small_R1.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample3.fq.gz", + args.verbose, +) +_download_file( + "rnaseq_samples/sample4/sample4.small_R1.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample4.fq.gz", + args.verbose, +) -_download_file('rnaseq_samples/sample1/sample1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample1PE_1.fq.gz') -_download_file('rnaseq_samples/sample1/sample1.small_R2.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample1PE_2.fq.gz') -_download_file('rnaseq_samples/sample2/sample2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample2PE_1.fq.gz') -_download_file('rnaseq_samples/sample2/sample2.small_R2.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample2PE_2.fq.gz') +_download_file( + "rnaseq_samples/sample1/sample1.small_R1.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample1PE_1.fq.gz", + args.verbose, +) +_download_file( + "rnaseq_samples/sample1/sample1.small_R2.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample1PE_2.fq.gz", + args.verbose, +) +_download_file( + "rnaseq_samples/sample2/sample2.small_R1.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample2PE_1.fq.gz", + args.verbose, +) 
+_download_file( + "rnaseq_samples/sample2/sample2.small_R2.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample2PE_2.fq.gz", + args.verbose, +) -_download_file('chipseq_samples/input_1/input_1.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_input1.fq.gz') -_download_file('chipseq_samples/ip_1/ip_1.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_ip1.fq.gz') -_download_file('chipseq_samples/input_2/input_2.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_input2.fq.gz') -_download_file('chipseq_samples/ip_2/ip_2.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_ip2.fq.gz') -_download_file('chipseq_samples/ip_3/ip_3.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_ip3.fq.gz') -_download_file('chipseq_samples/ip_4/ip_4.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_ip4.fq.gz') -_download_file('chipseq_samples/input_3/input_3.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_input3.fq.gz') +_download_file( + "chipseq_samples/input_1/input_1.tiny_R1.fastq.gz", + "workflows/chipseq/data/example_data/chipseq_input1.fq.gz", + args.verbose, +) +_download_file( + "chipseq_samples/ip_1/ip_1.tiny_R1.fastq.gz", + "workflows/chipseq/data/example_data/chipseq_ip1.fq.gz", + args.verbose, +) +_download_file( + "chipseq_samples/input_2/input_2.tiny_R1.fastq.gz", + "workflows/chipseq/data/example_data/chipseq_input2.fq.gz", + args.verbose, +) +_download_file( + "chipseq_samples/ip_2/ip_2.tiny_R1.fastq.gz", + "workflows/chipseq/data/example_data/chipseq_ip2.fq.gz", + args.verbose, +) +_download_file( + "chipseq_samples/ip_3/ip_3.tiny_R1.fastq.gz", + "workflows/chipseq/data/example_data/chipseq_ip3.fq.gz", + args.verbose, +) +_download_file( + "chipseq_samples/ip_4/ip_4.tiny_R1.fastq.gz", + "workflows/chipseq/data/example_data/chipseq_ip4.fq.gz", + args.verbose, +) +_download_file( + "chipseq_samples/input_3/input_3.tiny_R1.fastq.gz", + 
"workflows/chipseq/data/example_data/chipseq_input3.fq.gz", + args.verbose, +) diff --git a/ci/preprocessor.py b/ci/preprocessor.py index 042bee332..1cd7e5dac 100644 --- a/ci/preprocessor.py +++ b/ci/preprocessor.py @@ -7,54 +7,16 @@ in production. Rather than require users edit files to remove those test-specific patterns, here we keep the test settings commented out and only un-comment when running tests. - -First, we look for any line that matches "# [test settings]" (case insensitive, -with optional surrounding spacing) and an optional signed integer. Any of these -would work: - - >>> assert matches('# [test settings]') - >>> assert matches('#[test settings]') - >>> assert matches('# [ test settings ]') - >>> assert matches('# [ test settings -1]') - >>> assert matches('# [ test settings +2]') - >>> assert matches('# [ TEST SETTINGS +2]') - >>> assert matches('# [ TeSt SeTTiNgS +2 ]') - -If a lines does not match, output it as-is. - -If a line matches, then uncomment it. Specifically, remove the first "#" in the -line; if it was followed by exactly one space, then remove that too. - -If a line matches and a signed integer was provided, then consider it -a relative location, and then comment-out the referred-to line. Example: - - >>> preprocess(''' - ... use this for production - ... # use this for tests # [test settings -1] - ... '''.splitlines(True)) - - # use this for production - use this for tests # [test settings -1] - - -If the matched special string creates the first "#" in the line, then do -nothing to that line but still respect the relative locations. Useful for just -commenting out nearby lines for tests: - - >>> preprocess(''' - ... # [TEST SETTINGS +1] - ... 
comment out for testing'''.splitlines(True)) - - # [TEST SETTINGS +1] - # comment out for testing """ + import re -regexp = re.compile(r'#\s?\[\s?test settings\s?(?P[-+]*\d)?\s*\]') +regexp = re.compile(r"#\s?\[\s?(enable|disable) for test\s?\]") -def matches(line): - return regexp.search(line.lower()) is not None + +def is_commented(line): + return line.strip().startswith("#") def comment_line(line): @@ -66,87 +28,75 @@ def comment_line(line): """ x = [] for i, character in enumerate(line): - if character == ' ': + if character == " ": x.append(character) else: break - x.append('# ') + x.append("# ") x.extend(line[i:]) - return ''.join(x) + return "".join(x) def uncomment_line(line): """ Removes the first instance of "#" from a line; if it was followed by - exactly one space then remove that too. + exactly one space then remove that too . . . UNLESS the *only* comment is the + special character that triggers this behavior, in which case we do nothing. >>> assert uncomment_line('# asdf') == 'asdf' >>> assert uncomment_line('#asdf') == 'asdf' >>> assert uncomment_line('# asdf # but this should be kept') == 'asdf # but this should be kept' >>> assert uncomment_line('# asdf') == ' asdf' >>> assert uncomment_line(' # asdf') == ' asdf' + >>> assert uncomment_line('do nothing') == 'do nothing' + >>> assert uncomment_line('do nothing # [disable for test]') == 'do nothing # [disable for test]' + >>> assert uncomment_line('#uncomment # [disable for test]') == 'uncomment # [disable for test]' """ - first = line.find('#') + first = line.find("#") - # If the first comment is the one that flag the line, then do nothing. + # If the first comment is the one that flagged the line, then do nothing. 
m = regexp.search(line.lower()) if m: if m.start() == first: return line - if line[first + 1] == ' ' and line[first + 2] != ' ': - pattern = '# ' + if line[first + 1] == " " and line[first + 2] != " ": + pattern = "# " else: - pattern = '#' - return line.replace(pattern, '', 1) + pattern = "#" + return line.replace(pattern, "", 1) def preprocess(lines): + result = [] if isinstance(lines, str): lines = [lines] - # These lists will keep track of whether a line should be changed. We need to - # create them ahead of time so that we can use relative indexing from line N to - # modify the state of lines N-1 or N+1 - uncomment = [False for i in range(len(lines))] - comment = [False for i in range(len(lines))] - - for i, line in enumerate(lines): + for line in lines: m = regexp.search(line.lower()) - if m: - # There as at least a "[ test settings ]", so remove comment - uncomment[i] = True - - # Figure out if there was also a relative location to uncomment, - # and keep track of it in the `comment` list. - rel = m.group('rel') - if rel is not None: - rel = int(rel) - comment[i + rel] = True + if not m: + result.append(line) + continue - result = [] - for (c, u, line) in zip(comment, uncomment, lines): - # E.g., in this situation, unclear what should happen: - # - # # [test settings] - # # [test settings -1] - # - if c and u: - raise ValueError("Line {0} is trying to be both commented and uncommented".format(line)) - if c: - result.append(comment_line(line)) - elif u: + action = m.group(1) + if action == "enable" and is_commented(line): result.append(uncomment_line(line)) + elif action == "disable" and not is_commented(line): + result.append(comment_line(line)) else: - result.append(line) - print(''.join(result)) + raise ValueError(f"Inconsistent commenting and action:\n{line}") + + print("".join(result)) if __name__ == "__main__": import argparse + ap = argparse.ArgumentParser(usage=__doc__) - ap.add_argument('infile', help='Input file to modify. 
Modified file printed to stdout.') + ap.add_argument( + "infile", help="Input file to modify. Modified file printed to stdout." + ) args = ap.parse_args() lines = open(args.infile).readlines() preprocess(lines) diff --git a/deploy.py b/deploy.py index 7ad7e1ace..6e98596b2 100755 --- a/deploy.py +++ b/deploy.py @@ -8,14 +8,13 @@ import subprocess as sp import datetime import json -import fnmatch import logging import hashlib from pathlib import Path from distutils import filelist # Determine default staging area, used in help -default_staging = "/tmp/{0}-lcdb-wf-staging".format(os.getenv('USER')) +default_staging = "/tmp/{0}-lcdb-wf-staging".format(os.getenv("USER")) usage = f""" This script assists in the deployment of relevant code from the lcdb-wf @@ -74,52 +73,50 @@ def error(s): logging.error(RED + s + RESET) -def write_include_file(source, flavor='all'): +def write_include_file(source, flavor="all"): # Patterns follow that of MANIFEST.in # (https://packaging.python.org/en/latest/guides/using-manifest-in/), # and distutils.filelist is used below to parse them. 
PATTERN_DICT = { - 'rnaseq': [ - 'include workflows/rnaseq/Snakefile', - 'recursive-include workflows/rnaseq/config *', - 'include workflows/rnaseq/rnaseq_trackhub.py', - 'recursive-include workflows/rnaseq/downstream *.Rmd', - 'recursive-include workflows/rnaseq/downstream *.yaml', + "rnaseq": [ + "include workflows/rnaseq/Snakefile", + "recursive-include workflows/rnaseq/config *", + "include workflows/rnaseq/rnaseq_trackhub.py", + "recursive-include workflows/rnaseq/downstream *.Rmd", + "recursive-include workflows/rnaseq/downstream *.yaml", ], - 'chipseq': [ - 'include workflows/chipseq/Snakefile', - 'recursive-include workflows/chipseq/config *', - 'include workflows/chipseq/chipseq_trackhub.py', + "chipseq": [ + "include workflows/chipseq/Snakefile", + "recursive-include workflows/chipseq/config *", + "include workflows/chipseq/chipseq_trackhub.py", ], - 'all': [ - 'recursive-include wrappers *', - 'recursive-include include *', - 'recursive-include lib *', - 'include env.yml env-r.yml .gitignore', - 'include workflows/references/Snakefile', - 'recursive-include workflows/references/config *', - 'global-exclude __pycache__', + "all": [ + "recursive-include wrappers *", + "recursive-include include *", + "recursive-include lib *", + "include env.yml env-r.yml .gitignore", + "recursive-include scripts *", + "global-exclude __pycache__", + ], + "full": [ + "include workflows/colocalization/Snakefile", + "recursive-include workflows/colocalization/config *", + "recursive-include workflows/colocalization/scripts *", + "recursive-include workflows/figures *", + "recursive-include workflows/external *", ], - 'full': [ - 'include workflows/colocalization/Snakefile', - 'recursive-include workflows/colocalization/config *', - 'recursive-include workflows/colocalization/scripts *', - 'recursive-include workflows/figures *', - 'recursive-include workflows/external *', - ] - } patterns = [] - if flavor in ('full', 'rnaseq'): - patterns.extend(PATTERN_DICT['rnaseq']) - if 
flavor in ('full', 'chipseq'): - patterns.extend(PATTERN_DICT['chipseq']) - if flavor == 'full': - patterns.extend(PATTERN_DICT['full']) - patterns.extend(PATTERN_DICT['all']) + if flavor in ("full", "rnaseq"): + patterns.extend(PATTERN_DICT["rnaseq"]) + if flavor in ("full", "chipseq"): + patterns.extend(PATTERN_DICT["chipseq"]) + if flavor == "full": + patterns.extend(PATTERN_DICT["full"]) + patterns.extend(PATTERN_DICT["all"]) def fastwalk(path): """ @@ -128,13 +125,13 @@ def fastwalk(path): """ path = str(path) for root, dirs, files in os.walk(path, topdown=True): - if 'conda-meta' in dirs: + if "conda-meta" in dirs: dirs[:] = [] files[:] = [] for d in dirs: - yield os.path.join(root, d).replace(path + '/', '') + yield os.path.join(root, d).replace(path + "/", "") for f in files: - yield os.path.join(root, f).replace(path + '/', '') + yield os.path.join(root, f).replace(path + "/", "") f = filelist.FileList() f.allfiles = list(fastwalk(source)) @@ -153,9 +150,9 @@ def fastwalk(path): to_transfer = list(set(under_version_control).intersection(f.files)) include = tempfile.NamedTemporaryFile(delete=False).name - with open(include, 'w') as fout: - fout.write('\n\n') - fout.write('\n'.join(to_transfer)) + with open(include, "w") as fout: + fout.write("\n\n") + fout.write("\n".join(to_transfer)) return include @@ -188,8 +185,8 @@ def check_md5(f): full_here = Path(__file__).resolve() full_there = Path(dest) / "deploy.py" error( - "Files {full_here} and {full_there} do not match! ".format(**locals()) + - "The deploy script you are running appears to be out of date. " + f"Files {full_here} and {full_there} do not match! " + + "The deploy script you are running appears to be out of date. 
" "Please get an updated copy from https://github.com/lcdb/lcdb-wf, perhaps " "with 'wget https://raw.githubusercontent.com/lcdb/lcdb-wf/master/deploy.py'" ) @@ -267,7 +264,7 @@ def deployment_json(source, dest): info("Wrote details of deployment to {log}".format(**locals())) -def build_envs(dest, conda_frontend="mamba"): +def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="conda"): """ Build conda environments. @@ -279,14 +276,22 @@ def build_envs(dest, conda_frontend="mamba"): the command line with --dest) in which the env and env-r yaml files should already exist. Envs will be created in here. + additional_main : list + Other packages to install, e.g., a snakemake plugin needed for + a cluster profile, into the main environment. + + additional_r : list + Other packages to install into the R environment. + conda_frontend : 'mamba' | 'conda' Which front-end to use (terminology borrowed from Snakemake) + """ mapping = [ - ("./env", "env.yml"), - ("./env-r", "env-r.yml"), + ("./env", "env.yml", additional_main), + ("./env-r", "env-r.yml", additional_r), ] - for env, yml in mapping: + for env, yml, additional in mapping: info("Building environment " + os.path.join(dest, env)) try: @@ -308,27 +313,42 @@ def build_envs(dest, conda_frontend="mamba"): p = sp.Popen(cmds, universal_newlines=True, cwd=dest) p.wait() + if additional: + info(f"Adding {additional} to environment") + cmds = [conda_frontend, "install", "-y", "-p", env] + additional + p = sp.Popen(cmds, universal_newlines=True, cwd=dest) + p.wait() + except KeyboardInterrupt: print("") - error("Killing running {conda_frontend} job, '".format(**locals()) + " ".join(cmds)) + error( + "Killing running {conda_frontend} job, '".format(**locals()) + + " ".join(cmds) + ) p.kill() sys.exit(1) if p.returncode: - error("Error running {conda_frontend}, '".format(**locals()) + " ".join(cmds)) + error( + "Error running {conda_frontend}, '".format(**locals()) + " ".join(cmds) + ) sys.exit(1) full_env 
= Path(dest) / env - info("Created env {full_env}".format(**locals())) + info(f"Created env {full_env}") if __name__ == "__main__": + additional_main_from_env_var = os.getenv("LCDBWF_ADDITIONAL_MAIN", []) + ap = argparse.ArgumentParser(usage=usage) ap.add_argument( "--flavor", default="full", - help="""Options are {0}. Default is full.""".format(['full', 'rnaseq', 'chipseq']), + help="""Options are {0}. Default is full.""".format( + ["full", "rnaseq", "chipseq"] + ), ) ap.add_argument( "--dest", help="""Destination directory in which to copy files""", required=True @@ -340,7 +360,7 @@ def build_envs(dest, conda_frontend="mamba"): help=f"""Make a new clone to a staging area (at the location specified by --staging which defaults to {default_staging}) and deploy from there. Useful if using this script as a standalone tool. You can also - use --branch to configure which branch to deploy from that clone.""" + use --branch to configure which branch to deploy from that clone.""", ) ap.add_argument( @@ -367,25 +387,40 @@ def build_envs(dest, conda_frontend="mamba"): ap.add_argument( "--conda-frontend", help="Set program (conda or mamba) to use when creating environments. Default is %(default)s.", - default="mamba", + default="conda", ) ap.add_argument( "--rsync-args", help="Options for rsync when deploying to a new directory. Default is %(default)s.", - default="-rlt" + default="-rlt", ) ap.add_argument( - "--mismatch-ok", - action="store_true", - help="Used for testing") + "--additional-main", + help="""Additional packages to install in main environment (only + relevant with --build-envs). For example, + 'snakemake-executor-plugin-cluster-generic' to support a cluster + profile. 
You can use the env var LCDBWF_ADDITIONAL_MAIN to supply this + argument automatically instead.""", + nargs="+", + ) + ap.add_argument( + "--additional-r", + help="Additional packages to install in R environment (only relevant with --build-envs)", + nargs="+", + ) + + ap.add_argument("--mismatch-ok", action="store_true", help="Used for testing") args = ap.parse_args() dest = args.dest flavor = args.flavor if args.staging and not args.clone: - print("ERROR: --staging was specified but --clone was not. Did you want to use --clone?", file=sys.stderr) - sys.exit(1) + print( + "ERROR: --staging was specified but --clone was not. Did you want to use --clone?", + file=sys.stderr, + ) + sys.exit(1) if args.clone: if args.staging is None: args.staging = default_staging @@ -398,7 +433,22 @@ def build_envs(dest, conda_frontend="mamba"): rsync(include, source, dest, args.rsync_args) deployment_json(source, dest) + if additional_main_from_env_var: + if args.additional_main: + print( + "ERROR: Unset LCDBWF_ADDITIONAL_MAIN env var if you want to use the --additional-main argument." + ) + sys.exit(1) + additional_main = [additional_main_from_env_var] + else: + additional_main = args.additional_main + if args.build_envs: - build_envs(dest, conda_frontend=args.conda_frontend) + build_envs( + dest, + additional_main=additional_main, + additional_r=args.additional_r, + conda_frontend=args.conda_frontend, + ) warning("Deployment complete in {args.dest}".format(**locals())) diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index 45ed38711..000000000 --- a/docs/README.md +++ /dev/null @@ -1,30 +0,0 @@ -This documentation uses [sphinx](http://www.sphinx-doc.org) to buid the documentation. - -The built documentation from the master branch can be found at -https://lcdb.github.io/lcdb-wf. 
If you want to build a local copy of the -documentation: - -- create an environment from the `docs/docs-requirements.txt` file -- activate it -- run the Makefile in `docs` - - -That is: - -```bash -# Create env -conda create -n lcdb-wf-docs \ - --file docs/docs-requirements.txt \ - --channel bioconda \ - --channel conda-forge \ - --channel lcdb - -# activate it -source activate lcdb-wf-docs - -# build the docs -cd docs -make html -``` - -The locally-built docs will be in `docs/_build/html/toc.html`. diff --git a/docs/_static/balloon.min.css b/docs/_static/balloon.min.css deleted file mode 100644 index 268c8a8e4..000000000 --- a/docs/_static/balloon.min.css +++ /dev/null @@ -1 +0,0 @@ -[data-balloon]{position:relative}[data-balloon]:after,[data-balloon]:before{-ms-filter:"progid:DXImageTransform.Microsoft.Alpha(Opacity=0)";filter:alpha(opacity=0);-khtml-opacity:0;-moz-opacity:0;opacity:0;pointer-events:none;-webkit-transition:all .18s ease-out .18s;transition:all .18s ease-out .18s;bottom:100%;left:50%;position:absolute;z-index:10;-webkit-transform:translate(-50%,10px);-ms-transform:translate(-50%,10px);transform:translate(-50%,10px);-webkit-transform-origin:top;-ms-transform-origin:top;transform-origin:top}[data-balloon]:after{background:rgba(17,17,17,.9);border-radius:4px;color:#fff;content:attr(data-balloon);font-size:12px;padding:.5em 1em;white-space:nowrap;margin-bottom:11px}[data-balloon]:before{background:url('data:image/svg+xml;utf8,') no-repeat;background-size:100% 
auto;height:6px;width:18px;content:"";margin-bottom:5px}[data-balloon]:hover:after,[data-balloon]:hover:before{-ms-filter:"progid:DXImageTransform.Microsoft.Alpha(Opacity=100)";filter:alpha(opacity=100);-khtml-opacity:1;-moz-opacity:1;opacity:1;pointer-events:auto;-webkit-transform:translate(-50%,0);-ms-transform:translate(-50%,0);transform:translate(-50%,0)}[data-balloon][data-balloon-break]:after{white-space:normal}[data-balloon-pos=down]:after,[data-balloon-pos=down]:before{bottom:auto;left:50%;top:100%;-webkit-transform:translate(-50%,-10px);-ms-transform:translate(-50%,-10px);transform:translate(-50%,-10px)}[data-balloon-pos=down]:after{margin-top:11px}[data-balloon-pos=down]:before{background:url('data:image/svg+xml;utf8,') no-repeat;background-size:100% auto;height:6px;width:18px;margin-top:5px;margin-bottom:0}[data-balloon-pos=down]:hover:after,[data-balloon-pos=down]:hover:before{-webkit-transform:translate(-50%,0);-ms-transform:translate(-50%,0);transform:translate(-50%,0)}[data-balloon-pos=left]:after,[data-balloon-pos=left]:before{bottom:auto;left:auto;right:100%;top:50%;-webkit-transform:translate(10px,-50%);-ms-transform:translate(10px,-50%);transform:translate(10px,-50%)}[data-balloon-pos=left]:after{margin-right:11px}[data-balloon-pos=left]:before{background:url('data:image/svg+xml;utf8,') no-repeat;background-size:100% auto;height:18px;width:6px;margin-right:5px;margin-bottom:0}[data-balloon-pos=left]:hover:after,[data-balloon-pos=left]:hover:before{-webkit-transform:translate(0,-50%);-ms-transform:translate(0,-50%);transform:translate(0,-50%)}[data-balloon-pos=right]:after,[data-balloon-pos=right]:before{bottom:auto;left:100%;top:50%;-webkit-transform:translate(-10px,-50%);-ms-transform:translate(-10px,-50%);transform:translate(-10px,-50%)}[data-balloon-pos=right]:after{margin-left:11px}[data-balloon-pos=right]:before{background:url('data:image/svg+xml;utf8,') no-repeat;background-size:100% 
auto;height:18px;width:6px;margin-bottom:0;margin-left:5px}[data-balloon-pos=right]:hover:after,[data-balloon-pos=right]:hover:before{-webkit-transform:translate(0,-50%);-ms-transform:translate(0,-50%);transform:translate(0,-50%)}[data-balloon-length]:after{white-space:normal}[data-balloon-length=small]:after{width:80px}[data-balloon-length=medium]:after{width:150px}[data-balloon-length=large]:after{width:260px}[data-balloon-length=xlarge]:after{width:90vw}@media screen and (min-width:768px){[data-balloon-length=xlarge]:after{width:380px}}[data-balloon-length=fit]:after{width:100%} \ No newline at end of file diff --git a/docs/_static/custom.css b/docs/_static/custom.css deleted file mode 100644 index b83f5902d..000000000 --- a/docs/_static/custom.css +++ /dev/null @@ -1,30 +0,0 @@ -pre { - font-size: 0.7em; -} - - -h3 { - font-style: italic; -} - -h2 { - /* text-decoration: underline; */ -} - -code { - background-color: #fff; - font-size: 0.8em; - color: #444; -} - -code.file { - font-style: italic; -} - -/* make fixed sidebar scrollable - from: https://stackoverflow.com/questions/57031848/sphinx-alabaster-theme-scroll-inside-of-fixed-sidebar -*/ -div.sphinxsidebar { - max-height: 90%; - overflow-y: auto; -} diff --git a/docs/autodoc.rst b/docs/autodoc.rst deleted file mode 100644 index 7217f828b..000000000 --- a/docs/autodoc.rst +++ /dev/null @@ -1,9 +0,0 @@ -Module documentation -==================== - -.. toctree:: - :maxdepth: 2 - - lib.common - lib.chipseq - lib.patterns_targets diff --git a/docs/changelog.rst b/docs/changelog.rst index 220399444..9583eafcc 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,6 +1,17 @@ Changelog ========= +v2.0 +---- + +Major release, heavily focused on simplification where possible. This includes +the reference configurations, more streamlined config files, and many fixes and +improvements that have been requested over the years. 
+ +- Requires Snakemake 8+ +- Removed colocalization workflow + + v1.10.3 ------- diff --git a/docs/chipseq.png b/docs/chipseq.png deleted file mode 100644 index 051e0df12..000000000 Binary files a/docs/chipseq.png and /dev/null differ diff --git a/docs/chipseq.rst b/docs/chipseq.rst deleted file mode 100644 index 202e0375c..000000000 --- a/docs/chipseq.rst +++ /dev/null @@ -1,33 +0,0 @@ -.. _chipseq: - -ChIP-seq workflow ------------------ -The ChIP-seq workflow starts with raw FASTQ files and performs various QC steps. It -aligns and prepares BAM and bigWig files, performs peak-calling, and combines -everything together into a track hub for visualization. - -Specifically, the workflow does the following: - - - trims reads with cutadapt - - maps reads with Bowtie2 - - runs FastQC on raw, trimmed, and aligned reads - - Removes multimappers (samtools) and duplicates (Picard MarkDuplicates) - - performs fastq_screen on multiple configured genomes to look for evidence of - cross-contamination - - QC aggregation using MultiQC, along with a custom table for library sizes - - merges technical replicates and then re-deduplicates them - - creates bigWigs from unique, no-dups BAM files - - optionally merges bigWigs to create one signal track for all replicates - - runs deepTools plotFingerprint on grouped IP and input for QC and - evaluation of enrichment - - calls peaks using macs2, spp, and/or sicer, with support for multiple - peak-calling runs using different parameters to assist with assessing - performance and to help make decisions for downstream analysis - - optionally runs a template diffBind RMarkdown file used for differential binding analysis - - converts BED files into bigBed (or bigNarrowPeak where possible) - - builds and optionally uploads a track hub of bigWigs and bigBeds to - visualize peak-calling in UCSC Genome Browser - -To configure a ChIP-seq experiment, see :ref:`config-yaml`. - -.. 
image:: chipseq.png diff --git a/docs/conda.rst b/docs/conda.rst deleted file mode 100644 index 1cf44f84d..000000000 --- a/docs/conda.rst +++ /dev/null @@ -1,209 +0,0 @@ -.. _conda-envs: - -conda and conda envs in `lcdb-wf` -================================= - -Conda basics ------------- - -If you're not familiar with ``conda``, it is a way of keeping software isolated -on a computer in an "environment" (basically a directory with the executables -for all the software you want to use). When you "activate" the environment, it -places that location at the beginning of your ``$PATH`` variable, so that any -executables there are found first. It does not affect any existing installation -of any software on your machine and does not need root privileges. - -If you don't already have conda installed and the Bioconda channel set up, see -the `Bioconda docs `_ for details. - -You'll also probably want `mamba `_. Mamba -is a drop-in replacement for conda that is faster and more robust. In fact, it -is now the default conda front-end for Snakemake. If you don't already have -mamba, you can install it into your base conda environment with: - -.. code-block:: bash - - conda install -n base -c conda-forge mamba - -It's recommended that you install mamba into the base env (just like conda -itself is) so that it behaves like conda. It does *not* need to be installed -into each individual environment. - - -Building the environments -------------------------- - -**It is recommended that you create a separate environment directory for -each project**, rather than a single environment for all projects. That way you -can update packages in each project independently of any others, and yet the -environment will always be close at hand. This is an especially good practice -in shared space as others can easily find and activate the environment specific -to the project. - -.. 
note:: - - We recommend using mamba rather than conda for the speed increase and - ability to more correctly solve environments. See the `snakemake docs - `_ - for more info. - - -If you use the ``--build-envs`` argument when deploying lcdb-wf to a project -directory (see :ref:`setup-proj`), two conda environments will be built in the -directories: ``env``, which has all of the non-R requirements, and ``env-r`` -which has the R packages used in particular for downstream RNA-seq analysis. -These environments will use the fully-pinned environments in ``env.yml`` and -``env-r.yml``. If you've already deployed but didn't use the ``--build-envs`` -argument, then then the equivalent command to run in the deployed directory is: - -.. code-block:: bash - - mamba env create -p ./env --file env.yml - mamba env create -p ./env-r --file env-r.yml - - -.. _conda-troubleshooting: - -Troubleshooting environments ----------------------------- - -Sometimes there is a problem with creating an environment. For example, the -exact package specified in the env yaml might not be available for some reason -(this should not happen, but in practice sometimes it does in corner cases). - -If this happens, you can try a couple things. - -First, some terminology with how packages are specified in the environment -yamls. Here's an example for ``libpng`` version 1.6.37:: - - libpng=1.6.37=hed695b0_2 - |____| |____| |________| - | | | - name | | - version | - build string - -The package name (libpng) and version (1.6.37) are pretty standard and -self-explanatory. The `build` string refers to different built versions of the -*conda package*, but for the same version (1.6.37 in this case) of the package. -For example, if a conda package was built for version 1.1 of a tool, but that -package itself had an error unrelated to the tool, then a fixed build would be -made. The package version would remain the same (1.1) but the build string -would change. 
- -In this example, the build string contains a hash ``hed695b0`` which is a hash -of all the pinned dependencies for this package at packaging time. The -`conda-forge pinning docs -`_ give more detail -on what this pinning is about, but basically if that pinning changes then this -hash will change. The ``_2`` on the end of the build string hash indicates that -this is the third built package (build numbers start at zero) for this version -of ``libpng`` using the same pinning. In other words, there also likely exists -``libpng=1.6.37=hed695b0_1`` and ``libpng=1.6.37=hed695b0_0``. At the time of -this writing, there is also ``libpng-1.6.37-h21135ba_2`` (notice the different -hash) which is the same libpng version but uses different pinnings. - -What does this mean for troubleshooting? - -For any package that seems to be problematic, try editing the respective -environment yaml (e.g., ``env.yml``) to remove the build string (so in the -example above, you would try changing it to just ``libpng=1.6.37``) and try -building the environment again. If that doesn't work, try removing the version -as well (so just ``libpng``). - -Alternatively for very problematic cases or cases where there are multiple -problematic packages, you can try creating an environment with the "loose" -pinning in ``include/requirements.txt`` which effectively does not require any -particular versions with the exception of a few corner cases. Keep in mind that -using that file may cause the environment to take a while to build as conda (or -mamba) solves the dependencies of all the specified packages. - - -Conda envs in lcdb-wf ---------------------- - -Given all of the software used across all of `lcdb-wf`, the environments can -take a lot of time to build because the solver needs to figure out the entire -dependency tree and come up with a solution that works to satisfy the entire -set of specified requirements. 
- -We chose to split the conda environments in two: the **main** environment and the **R** -environment (see :ref:`conda-design-decisions`). These environments are -described by both "strict" and "loose" files. By default we use the "strict" -version, which pins all versions of all packages exactly. This is preferred -wherever possible. However we also provide a "loose" version that is not -specific about versions. The following table describes these files: - -+----------------+--------------------------------+----------------------------------+ -| strict version | loose version | used for | -+================+================================+==================================+ -| ``env.yml`` | ``include/requirements.txt`` | Main Snakefiles | -+----------------+--------------------------------+----------------------------------+ -| ``env-r.yaml`` | ``include/requirements-r.txt`` | Downstream RNA-seq analysis in R | -+----------------+--------------------------------+----------------------------------+ - -When deploying new instances, use the ``--build-envs`` argument which will use -the strict version. Or use the following commands in a deployed directory: - -.. code-block:: bash - - mamba env create -p ./env --file env.yml - mamba env create -p ./env-r --file env-r.yml - -When getting ready to release a new lcdb-wf version, create a new environment -using the loose version to prepare the env and then when tests pass, export it -to yaml. That is: - -.. code-block:: bash - - # use loose version when preparing a new version of lcdb-wf - mamba create -p ./env --file include/requirements.txt - mamba create -p ./env-r --file include/requirements-r.txt - - # then do testing.... - - # when tests pass, export the envs - conda env export -p ./env > env.yml - conda env export -p ./env-r > env-r.yaml - - # commit, push, finalize release - - -.. 
_conda-design-decisions: - -Design decisions ----------------- - -We made the design decision to split the conda envs into two different -environments -- one for R, one for non-R. We found that by by removing the -entire sub-DAG of R packages from the main environment we can dramatically -reduce the creation time. - -We also made the decision to use large top-level environments rather than -smaller environments created for each rule using the ``conda:`` directive. -There are two reasons for this choice. First, it allows us to activate a single -environment to give us access to all the tools used. This streamlines -troubleshooting because we don't have to dig through the ``.snakemake/conda`` -directory to figure out which hash corresponds to which file, but comes with -the up-front cost of creating the environment initially. Second, it simplifies -running the tests on CircleCI, allowing us to cache the env directories as -a whole to be re-used for multiple tests rather than caching the individual -.snakemake directories for each tested workflow. - -Given that the conda and snakemake ecosystem are in flux, this may change in -the future to using small conda environments for each rule separately if it -turns out to be more beneficial to do so. - -.. note:: - - Prior to v1.7, we used requirements.txt files with loose pinning. Moving to - yaml files allows us the option of also installing pip packages if needed. - It also allows us to specify channels directly in the yaml file for - streamlined installation. - - Using strictly-pinned yaml files that are consistently tested will - hopefully result in a more stable experience for users. For example, if you - happen to create an environment around the time of a new R/Bioconductor - release, the environment may not build correctly using a loose pinning. - Other transient issues in the packaging ecosystem can similarly cause - issues. 
diff --git a/docs/conf.py b/docs/conf.py index a8c11dc93..047fd82fa 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,180 +1,35 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- +# Configuration file for the Sphinx documentation builder. # -# lcdb-wf documentation build configuration file, created by -# sphinx-quickstart on Tue Apr 11 11:06:34 2017. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -import os -import sys -sys.path.insert(0, os.path.abspath('.')) -sys.path.insert(0, os.path.abspath('..')) -sys.path.insert(0, os.path.abspath('../lib')) - - -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - 'generate_guide', - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.doctest', - 'sphinx.ext.napoleon', - 'sphinx.ext.todo', - 'sphinx.ext.viewcode', - 'sphinx.ext.githubpages'] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix(es) of source filenames. 
-# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html -# The master toctree document. -master_doc = 'toc' +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information -# General information about the project. project = 'lcdb-wf' -copyright = '2017, Ryan Dale, Justin Fear' -author = 'Ryan Dale, Justin Fear' +copyright = '2025, Ryan Dale' +author = 'Ryan Dale' +release = '2.0' -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = '1.9' -# The full version, including alpha/beta/rc tags. -release = '1.9' +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = "en" +extensions = [] -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This patterns also effect to html_static_path and html_extra_path +templates_path = ['_templates'] exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# If true, `todo` and `todoList` produce output, else they produce nothing. 
-todo_include_todos = True -autoclass_content = "both" -autosummary_generate = True -# -- Options for HTML output ---------------------------------------------- +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = 'alabaster' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -# Default options here: https://github.com/bitprophet/alabaster/blob/master/alabaster/theme.conf -# -html_theme_options = { - 'description': 'Customizable workflows for high-throughput sequencing analysis', - 'show_related': 'true', - 'fixed_sidebar': 'true', - 'sidebar_width': '300px', -} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". +html_theme = 'shibuya' html_static_path = ['_static'] +master_doc = 'toc' -html_sidebars = { - "*": [ - 'about.html', - 'navigation.html', - 'relations.html', - 'searchbox.html', - ] -} -# -- Options for HTMLHelp output ------------------------------------------ - -# Output file base name for HTML help builder. -htmlhelp_basename = 'lcdb-wfdoc' - - -# -- Options for LaTeX output --------------------------------------------- - -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # - # 'papersize': 'letterpaper', - - # The font size ('10pt', '11pt' or '12pt'). - # - # 'pointsize': '10pt', - - # Additional stuff for the LaTeX preamble. 
- # - # 'preamble': '', - - # Latex figure (float) alignment - # - # 'figure_align': 'htbp', +html_theme_options = { + "globaltoc_expand_depth": 1, + "toctree_titles_only": False, + "accent_color": "gold", } - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, 'lcdb-wf.tex', 'lcdb-wf Documentation', - 'Ryan Dale, Justin Fear', 'manual'), -] - - -# -- Options for manual page output --------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'lcdb-wf', 'lcdb-wf Documentation', - [author], 1) -] - - -# -- Options for Texinfo output ------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - (master_doc, 'lcdb-wf', 'lcdb-wf Documentation', - author, 'lcdb-wf', 'One line description of project.', - 'Miscellaneous'), -] diff --git a/docs/config-yaml.rst b/docs/config-yaml.rst deleted file mode 100644 index c80263253..000000000 --- a/docs/config-yaml.rst +++ /dev/null @@ -1,607 +0,0 @@ -.. _config-yaml: - -Config YAML -=========== - -This page details the various configuration options and describes how to -configure a new workflow. - -Note that the ``references:`` section is detailed separately, at -:ref:`references-config`. - -Config files are expected to be in a ``config`` directory next to the -the Snakefile. For example, the RNA-seq workflow at -``workflows/rnaseq/Snakefile`` expects the config file -``workflows/rnaseq/config/config.yaml``. 
- -While it is possible to use Snakemake mechanisms such as ``--config`` to -override a particular config value and ``--configfile`` to update the config -with a different file, it is easiest to edit the existing -``config/config.yaml`` in place. This has the additional benefit of reproducibity -because all of the config information is stored in one place. - -The following table summarizes the config fields, which ones are use for which -workflow, and under what conditions, if any, they are required. Each option -links to a section below with more details on how to use it. - -================================================================================== =================== ================ ================= ========= -Field Used for References Used for RNA-seq Used for ChIP-seq Required -================================================================================== =================== ================ ================= ========= -:ref:`references ` and/or :ref:`include_references ` yes yes yes yes -:ref:`references_dir ` yes yes yes if `REFERENCES_DIR` env var not set -:ref:`sampletable ` . yes yes always -:ref:`organism ` . yes yes always -:ref:`aligner ` . yes yes always -:ref:`stranded ` . yes no usually (see :ref:`stranded `) -:ref:`fastq_screen ` . yes yes if using `fastq_screen` -:ref:`merged_bigwigs ` . yes yes if you want to merge bigwigs -:ref:`gtf ` . yes . always for RNA-seq -:ref:`rrna ` . yes . if rRNA screening desired -:ref:`salmon ` . yes . if Salmon quantification will be run -:ref:`chipseq ` . . yes always for ChIP-seq -================================================================================== =================== ================ ================= ========= - -Example configs ---------------- - -To provide an overview, here are some example config files. 
More detail is -provided later; this is just to provide some context: - -RNA-seq -~~~~~~~ - -The config file for RNA-seq is expected to be in -``workflows/rnaseq/config/config.yaml``: - -.. code-block:: yaml - - references_dir: "/data/references" - sampletable: "config/sampletable.tsv" - organism: 'human' - aligner: - tag: 'gencode-v25' - index: 'hisat2' - rrna: - tag: 'rRNA' - index: 'bowtie2' - gtf: - tag: 'gencode-v25' - - fastq_screen: - - label: Human - organism: human - tag: gencode-v25 - - label: rRNA - organism: human - tag: rRNA - - # Portions have been omitted from "references" section below for - # simplicity; see references config section for details. - - references: - human: - gencode-v25: - genome: - url: 'ftp://.../genome.fa.gz' - indexes: - - 'hisat2' - - 'bowtie2' - annotation: - url: 'ftp://.../annotation.gtf.gz' - - transcriptome: - indexes: - - 'salmon' - - rRNA: - genome: - url: 'https://...' - indexes: - - 'bowtie2' - -ChIP-seq -~~~~~~~~ - -The config file for ChIP-seq is expected to be in -``workflows/chipseq/config/config.yaml``. - -The major differences between ChIP-seq and RNA-seq configs are: - -- ChIP-seq has no ``annotation`` or ``rrna`` fields -- ChIP-seq has an addition section ``chipseq: peak_calling:`` - -.. 
code-block:: yaml - - sampletable: 'config/sampletable.tsv' - organism: 'dmel' - genome: 'dm6' - - aligner: - index: 'bowtie2' - tag: 'test' - - chipseq: - peak_calling: - - - label: gaf-embryo-1 - algorithm: macs2 - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - - - label: gaf-embryo-1 - algorithm: spp - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - - - label: gaf-wingdisc-pooled - algorithm: macs2 - ip: - - gaf-wingdisc-1 - - gaf-wingdisc-2 - control: - - input-wingdisc-1 - - input-wingdisc-2 - - - label: gaf-wingdisc-pooled - algorithm: spp - ip: - - gaf-wingdisc-1 - - gaf-wingdisc-2 - control: - - input-wingdisc-1 - - input-wingdisc-2 - - - label: gaf-wingdisc-pooled-1 - algorithm: epic2 - ip: - - gaf-wingdisc-1 - control: - - input-wingdisc-1 - extra: '' - - - label: gaf-wingdisc-pooled-2 - algorithm: epic2 - ip: - - gaf-wingdisc-2 - control: - - input-wingdisc-2 - extra: '' - - fastq_screen: - - label: Human - organism: human - tag: gencode-v25 - - merged_bigwigs: - input-wingdisc: - - input-wingdisc-1 - - input-wingdisc-2 - gaf-wingdisc: - - gaf-wingdisc-1 - - gaf-wingdisc-2 - gaf-embryo: - - gaf-embryo-1 - - - # Portions have been omitted from "references" section below for - # simplicity; see references config section for details. - - references: - human: - gencode-v25: - genome: - url: 'ftp://.../genome.fa.gz' - indexes: - - 'hisat2' - - 'bowtie2' - annotation: - url: 'ftp://.../annotation.gtf.gz' - - fly: - test: - genome: - url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" - postprocess: 'lib.common.gzipped' - indexes: - - 'bowtie2' - - 'hisat2' - - - -Field descriptions ------------------- -Required for references, RNA-seq and ChIP-seq -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _cfg-references: - -``references`` -`````````````` - This section defines labels for references, where to get FASTA and GTF - files and (optionally) post-process them, and which indexes to build. 
- - Briefly, the example above has a single organism configured ("human"). That - organism has two tags ("gencode-v25" and "rRNA"). - - This is the most complex section and is documented elsewhere (see - :ref:`references-config`). - - -.. _cfg-inc-refs: - -``include_references`` -`````````````````````` - - This section can be used to supplement the ``references`` section with - other reference sections stored elsewhere in files. It's a convenient way - of managing a large amount of references without cluttering the config - file. - - See :ref:`references-config` for more. - - -.. _cfg-references-dir: - -``references_dir`` -`````````````````` - Top-level directory in which to create references. - - If not specified, uses the environment variable ``REFERENCES_DIR``. - - If specified and ``REFERENCES_DIR`` also exists, ``REFERENCES_DIR`` takes - precedence. - - This is useful when multiple people in a group share the same references to - avoid duplicating commonly-used references. Simply point references_dir to - an existing references directory to avoid having to rebuild references. - -Required for RNA-seq and ChIP-seq -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _cfg-sampletable: - -``sampletable`` field -````````````````````` - Path to sampletable file which, at minimum, list sample names and paths to - FASTQ files. The path of this filename is relative to the Snakefile. See - :ref:`sampletable` for more info on the expected contents of the file. - - Example: - - .. code-block:: yaml - - sampletable: "config/sampletable.tsv" - -.. _cfg-organism: - -``organism`` field -`````````````````` - This field selects the top-level section of the ``references`` section that - will be used for the analysis. In RNA-seq example above, "human" is the - only organism configured. In the ChIP-seq example, there is "human" as well - as "fly". - - Example: - - .. code-block:: yaml - - organism: "human" - -.. 
_cfg-aligner: - -``aligner`` config section -`````````````````````````` - This field has two sub-fields, and automatically uses the configured - ``organism`` to select the top-level entry in the references section. - ``tag`` selects the tag from the organism to use, and ``index`` selects - which aligner index to use. The relevant option from the example above - would be "gencode-v25", which configures both bowtie2 and hisat2 indexes to - be built. For RNA-seq we would likely choose "hisat2"; for ChIP-seq - "bowtie2". - - Currently-configured options are ``hisat2``, ``bowtie2``, and ``star``. - - Example: - - .. code-block:: yaml - - aligner: - tag: "gencode-v25" - index: "hisat2" - -Required for RNA-seq -~~~~~~~~~~~~~~~~~~~~ - -.. _cfg-stranded: - -``stranded`` field -`````````````````` - This field specifies the strandedness of the library. This is used by - various rules to set the parameters correctly. For example, - ``featureCounts`` will use ``-s0``, ``-s1``, or ``-s2`` accordingly; - ``kallisto`` will use ``--fr-stranded`` if needed, and so on. - - This field can take the following options: - - =================== =========== - value description - =================== =========== - ``unstranded`` The strand that R1 reads align to has no information about the strand of the gene. - ``fr-firststrand`` R1 reads from plus-strand genes align to the *minus* strand. Also called reverse stranded, dUTP-based. - ``fr-secondstrand`` R1 reads from plus-strand genes align to the *plus* strand. Also called forward stranded. - =================== =========== - - Example: - - .. code-block:: yaml - - stranded: "fr-firststrand" - - Rules that require information about strand will check the config file at - run time and raise an error if this field doesn't exist. - - If you don't know the strandedness of the library, run the Snakefile in - such a way to only run the ``strand_check`` rule: - - .. 
code-block:: bash - - snakemake -j 2 strand_check - - Or, when using the Slurm wrapper on cluster, - - .. code-block:: bash - - sbatch ../../include/WRAPPER_SLURM strand_check - - When complete, there will be a MultiQC HTML file in the ``strand_check/`` - directory that you can inspect to make your choice. - - This will align the first 10,000 reads to the specified reference and run - RSeQC's ``infer_experiment.py`` on the results and then run MultiQC on just - those output files. - - .. versionadded:: 1.8 - -Optional fields -~~~~~~~~~~~~~~~ - -.. _cfg-fastq-screen: - -``fastq_screen`` config section -``````````````````````````````` - - This section configures which Bowtie2 indexes should be used with - `fastq_screen`. It takes the form of a list of dictionaries. Each - dictionary has the keys: - - - `label`: how to label the genome in the output - - `organism`: a configured organism. In the example above, there is only a single configured organism, "human". - - `tag`: a configured tag for that organism. - - Each entry in the list must have a Bowtie2 index configured to be built. - - Example: - - .. code-block:: yaml - - fastq_screen: - - label: Human - organism: human - tag: gencode-v25 - - label: rRNA - organism: human - tag: rRNA - - The above example configures two different indexes to use for fastq_screen: - the human gencode-v25 reference, and the human rRNA reference. - -.. _cfg-merged-bigwigs: - -``merged_bigwigs`` config section -````````````````````````````````` - This section controls optional merging of signal files in bigWig format. - Its format differs depending on RNA-seq or ChIP-seq, due to how strands are - handled in those workflows. - - Here is an RNA-seq example: - - .. 
code-block:: yaml - - merged_bigwigs: - arbitrary_label_to_use: - pos: - - 'sample1' - - 'sample2' - neg: - - 'sample1' - - 'sample2' - - This will result in a single bigWig file called - `arbitrary_label_to_use.bigwig` in the directory - `data/rnaseq_aggregation/merged_bigwigs` (by default; this is configured - using ``config/rnaseq_patterns.yaml``). That file merges together both the - positive and negative signal strands of two samples, `sample1` and `sample2`. The - names "sample1" and "sample2" are sample names defined in the :ref:`sample - table <sampletable>`. - - In other words, if samples 1 and 2 are replicates for a condition, this - gets us a single merged (averaged) track for that condition. - - Here's another RNA-seq example, where we merge the samples again but keep - the strands separate. This will result in two output bigwigs. - - .. code-block:: yaml - - merged_bigwigs: - merged_sense: - sense: - - 'sample1' - - 'sample2' - merged_antisense: - antisense: - - 'sample1' - - 'sample2' - - Here is a ChIP-seq example: - - .. code-block:: yaml - - merged_bigwigs: - arbitrary_label_to_use: - - 'label1' - - 'label2' - - This will result in a single bigWig file called - `arbitrary_label_to_use.bigwig` in the directory - `data/chipseq_aggregation/merged_bigwigs` (by default; this is configured - using ``config/chipseq_patterns.yaml``) that merges together the "label1" - and "label2" bigwigs. - - See :ref:`sampletable` for more info on the relationship between a *sample* - and a *label* when working with ChIP-seq. - - -RNA-seq-only fields -~~~~~~~~~~~~~~~~~~~ -.. _cfg-rrna: - -``rrna`` field -``````````````` - - This field selects the reference tag to use for screening rRNA reads. - Similar to the ``aligner`` field, it takes both a ``tag`` and ``index`` - key. The specified index must have been configured to be built for the - specified tag. It uses the already configured ``organism``. - - Example: - - .. code-block:: yaml - - rrna: - tag: 'rRNA' - index: 'bowtie2' - - -.. 
_cfg-gtf: - -``gtf`` field -````````````` - - This field selects the reference tag to use for counting reads in features. - The tag must have had a ``gtf:`` section specified; see - :ref:`references-config` for details. - - The organism is inherited from the ``organism:`` field. - - Example: - - .. code-block:: yaml - - gtf: - tag: "gencode-v25" - -.. _cfg-salmon: - -``salmon`` field -```````````````` - This field selects the reference tag to use for the Salmon index (if used). - The tag must have had a FASTA configured, and an index for "salmon" must - have been configured to be built for the organism selected with the - ``organism`` config option. - - -ChIP-seq-only fields -~~~~~~~~~~~~~~~~~~~~ - -.. _cfg-chipseq: - -``chipseq`` config section -`````````````````````````` - This section configures the peak-calling stage of the ChIP-seq workflow. It - currently expects a single key, ``peak_calling``, which is a list of - peak-calling runs. - - A peak-calling run is a dictionary configuring a single execution of - a peak-caller which results in a single BED file of called peaks. - A peak-calling run is uniquely described by its ``label`` and - ``algorithm``. This way, we can use the same label (e.g., `gaf-embryo-1`) - across multiple peak-callers to help organize the output. - - The currently-supported peak-callers are ``macs2``, ``spp``, and ``sicer``. - They each have corresponding wrappers in the ``wrappers`` directory. To add - other peak-callers, see :ref:`new-peak-caller`. - - The track hubs will include all of these called peaks which helps with - assessing the peak-calling performance. - - Here is a minimal example of a peak-calling config section. It defines - a single peak-calling run using the `macs2` algorithm. Note that the - ``ip:`` and ``control:`` keys are lists of **labels** from the ChIP-seq - sample table's ``label`` column, **not sample IDs** from the first column. - - .. 
code-block:: yaml - - chipseq: - peak_calling: - - - label: gaf-embryo-1 - algorithm: macs2 - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - - The above peak-calling config will result in a file - ``data/chipseq_peaks/macs2/gaf-embryo-1/peaks.bed`` (that pattern is - defined in ``chipseq_patterns.yaml`` if you need to change it). - - We can specify additional command-line arguments that are passed verbatim - to `macs2` with the ``extra:`` section, for example: - - .. code-block:: yaml - - chipseq: - peak_calling: - - - label: gaf-embryo-1 - algorithm: macs2 - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - extra: '--nomodel --extsize 147' - - - `macs2` supports multiple IP and input files, which internally are merged - by `macs2`. We can supply multiple IP and input labels for biological - replicates to get a set of peaks called on pooled samples. Note that we - give it a different label so it doesn't overwrite the other peak-calling - run we already have configured. - - .. code-block:: yaml - - chipseq: - peak_calling: - - - label: gaf-embryo-1 - algorithm: macs2 - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - extra: '--nomodel --extsize 147' - - - - label: gaf-embryo-pooled - algorithm: macs2 - ip: - - gaf-embryo-1 - - gaf-embryo-2 - control: - - input-embryo-1 - - input-embryo-2 - - - diff --git a/docs/config.rst b/docs/config.rst index 649a3cabc..107fbe288 100644 --- a/docs/config.rst +++ b/docs/config.rst @@ -5,75 +5,549 @@ Configuration ============= -General configuration -~~~~~~~~~~~~~~~~~~~~~ +Configuration happens in two places: + +**Config file:** + +- :ref:`rnaseq-config` +- :ref:`chipseq-config` + +**Sampletable:** + +- :ref:`rnaseq-sampletable` +- :ref:`chipseq-sampletable` + + +.. _configfiles: + +Config file +----------- + +Within a workflow directory, the default config file is expected to be at :file:`config/config.yaml`. + +Config files, at a minimum, specify which reference FASTA to use (:ref:`reference-config`). 
+ +For RNA-seq (:ref:`rnaseq-config`) the config file also specifies a GTF +reference and strandedness of the libraries. + +For ChIP-seq (:ref:`chipseq-config`) the config file specifies peak-calling runs. + +You can override the default config file location when calling snakemake like +this:: + + snakemake --configfile="otherdir/myconfig.yaml" ... + +Snakemake will merge the config file(s) given on the command line with the +default config file (:file:`config/config.yaml`). + +.. _reference-config: + +References section +~~~~~~~~~~~~~~~~~~ + +This section is just about the references part of the config; see +:ref:`rnaseq-config` and :ref:`chipseq-config` for any additional config for +those workflows. + +Using included reference config templates +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The repository includes pre-configured reference genome and annotation +templates in :file:`include/reference_config_templates/` for common model +organisms. These templates provide organism name, genome FASTA URL, and +annotation GTF URL (for RNA-seq). They can be used for both ChIP-seq and +RNA-seq to conveniently fill in the references part of the config. + +This is the easiest way to configure references. There are two ways to use +these templates: + +1. Command-line: Point to the template using ``--configfile`` when calling Snakemake:: + + snakemake --configfile=../../include/reference_config_templates/Homo_sapiens/GENCODE.yaml ... + + This merges the template with your default :file:`config/config.yaml`, + creating new or replacing existing keys. + +2. Copy-paste: Copy the contents from a template file into your + :file:`config/config.yaml` file. + +Otherwise, see the next section for customizing the references section. + +Configuring references +^^^^^^^^^^^^^^^^^^^^^^ + +Both RNA-seq and ChIP-seq need a reference fasta configured, like this: + +.. code-block:: yaml + + genome: + url: + + +RNA-seq also needs a GTF annotation configured, which works similarly: + +.. 
code-block:: yaml + + annotation: + url: + + +The value of ``url`` can be a file (like +``file:///data/references/Homo_sapiens/gencode.fa.gz``) or any FTP or HTTP URL. + +This is useful if you have existing reference files you want to use. + +By default, reference files will be downloaded to the :file:`references` +directory within the current workflow. Aligner indexes will be built here as well. + +For ChIP-seq, the references directory will look like: + +.. code-block:: text + + references/ + ├── genome.fa.gz # Downloaded FASTA + ├── bowtie2/ # bowtie2 index + │   └── genome.*.bt2 + └── genome.chromsizes # chromsizes from fasta + +For RNA-seq, it will look like: + +.. code-block:: text + + references/ + ├── bowtie2/ # bowtie2 index for rRNA + │   └── rrna.*.bt2 + ├── salmon/ # salmon index + ├── star/ # STAR index + ├── annotation.gtf.gz # downloaded GTF + ├── annotation.refflat # GTF converted to refflat + ├── annotation.bed12 # GTF converted to bed12 + ├── annotation.mapping.tsv.gz # TSV of attributes from GTF + ├── genome.fa.gz # downloaded FASTA + ├── genome.fa.fai # chrom sizes + ├── rrna.fa.gz # rRNA sequence for organism from SILVA + └── transcriptome.fa.gz # created from genome FASTA and GTF + + +See :ref:`decisions-references` for a discussion on why it's done this way. You +can control this behavior by using the optional ``references`` entry in the +config file, which will instead look for (and create if needed) the specified +directory. If you do this, keep in mind that each reference directory uses +generic labels like ``genome``, ``annotation``, etc, so using the same +directory for different organisms will cause the files to be overwritten for +the last-run organism. So if you use this approach you should consider putting +your references in directories named after organisms and the versions of +aligners used. + + + +.. 
_rnaseq-config: + +RNA-seq config +~~~~~~~~~~~~~~ + +For RNA-seq, in addition to the genome fasta file described above, you also need: + +- ``annotation``, structured similar to ``genome``, which specifies a gzipped + GTF file. A transcriptome fasta is automatically built from the genome fasta + and this GTF. +- ``organism`` which will be used to screen ribosomal RNA. Technically, this is + searching for the string in the SILVA rRNA database's fasta records. +- ``stranded`` of the libraries, which is used for automatically + configuring strand-specific tools. The options are: + - ``fr-firststrand`` for dUTP libraries + - ``fr-secondstrand`` for ligation libraries + - ``unstranded`` for libraries without strand specificity. + +See https://rnabio.org/module-09-appendix/0009/12/01/StrandSettings for more +info on strandedness. If you don't know ahead of time, you can use +``fr-firststrand`` and inspect the results for RSeQC's infer_experiment in the +MultiQC output. Correct the strandedness in the config, and re-run. Only the +jobs affected by strandedness will be re-run. + +Here is an example for human: + +.. code-block:: yaml + + organism: "Homo sapiens" + genome: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/GRCh38.primary_assembly.genome.fa.gz" + annotation: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/gencode.v49.primary_assembly.annotation.gtf.gz" + stranded: "fr-firststrand" + +In :file:`include/reference_config_templates` you can find configs for common +model organisms. These have both genome and annotation, so you can point +Snakemake to them on the command line. You would still need to specify +strandedness, which can be a config entry in +:file:`config/config.yaml`. Or it could be specified directly on the command +line, like this: + +.. 
code-block:: bash + + snakemake \ + --configfile=../../include/reference_config_templates/Homo_sapiens/GENCODE.yaml \ + --config stranded=fr-firststrand + +(in this case a separate :file:`config/config.yaml` would not be needed, as +long as you use the default :file:`config/sampletable.tsv` as your sampletable) + +.. _chipseq-config: + +ChIP-seq config +~~~~~~~~~~~~~~~ + +For ChIP-seq, in addition to the genome fasta file described above, you also +need a peak-calling section if you want to to run peak-calling. + +The idea is that the ``peak_calling:`` entry in the config is a list. Each item +in the list is a dictionary with the following keys: + +- ``label`` for the peak-calling run. This is intentionally free-form since you + may want to run the same samples through multiple algorithms or different + parameters. Output will be in :file:`data/peak_calling//